From f2ee559cd4d783d3c9887ee8d424fe1e81dc9d9e Mon Sep 17 00:00:00 2001
From: Douglas Q Hawkins
Date: Wed, 27 May 2026 14:34:20 -0400
Subject: [PATCH] Lazy-allocate the error latency histogram on AggregateEntry

Each AggregateEntry allocated two DDSketchHistograms in its constructor
(ok + error latencies). DDSketchHistogram wraps a DDSketch + lazy store,
roughly 60-80 bytes per histogram even when empty. Most spans aren't
errors, so most entries' errorLatencies sit empty for life.

Now the field starts null. recordOneDuration lazy-allocates on the first
error; if no error ever lands on the entry, it stays null and ~80 bytes
of empty-histogram overhead are reclaimed. Across a full 2048-entry
table that's ~150 KB if 95% of entries never error -- the typical case.

For the wire format, SerializingMetricWriter caches the serialized form
of an empty histogram (~17 bytes) on first use and writes those cached
bytes when an entry's errorLatencies is null. The cache is per-writer
(not a global static) so each writer instance picks up the Histograms
factory state at the time of its first report, avoiding races with test
setup that registers the DDSketch factory at varying points.

Trade-off: entries that DO see an error retain the histogram across
clear() (just cleared, not nulled), so always-erroring entries allocate
exactly once. Same total allocation as before for that case.

Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/AggregateEntry.java | 33 +++++++++++++++++-- .../metrics/SerializingMetricWriter.java | 27 ++++++++++++++- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index f407167be37..8d6dc6b72d0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -127,7 +127,15 @@ final class AggregateEntry extends Hashtable.Entry { // Mutable aggregate state -- single-thread (consumer/aggregator) writer. private final Histogram okLatencies = Histogram.newHistogram(); - private final Histogram errorLatencies = Histogram.newHistogram(); + + /** + * Lazily allocated on the first recorded error. Most entries never see an error and keep this + * null for life; {@link SerializingMetricWriter} writes a cached empty-histogram form when null + * to keep the wire payload identical. Once allocated, it survives {@link #clear()} (cleared, not + * nulled) since an entry that errored once tends to error again. + */ + @Nullable private Histogram errorLatencies; + private int errorCount; private int hitCount; private int topLevelCount; @@ -165,7 +173,7 @@ void recordOneDuration(long tagAndDuration) { } if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { tagAndDuration ^= ERROR_TAG; - errorLatencies.accept(tagAndDuration); + errorLatenciesForWrite().accept(tagAndDuration); ++errorCount; } else { okLatencies.accept(tagAndDuration); @@ -193,10 +201,26 @@ Histogram getOkLatencies() { return okLatencies; } + /** + * Returns the entry's error-latency histogram, or {@code null} if no error has been recorded. + * Callers serializing this should treat {@code null} as "emit a cached empty histogram"; see + * {@link SerializingMetricWriter}. 
+ */ + @Nullable Histogram getErrorLatencies() { return errorLatencies; } + /** Lazy-allocates {@link #errorLatencies} on the first error. */ + private Histogram errorLatenciesForWrite() { + Histogram h = errorLatencies; + if (h == null) { + h = Histogram.newHistogram(); + errorLatencies = h; + } + return h; + } + /** * Resets the per-cycle counters and histograms. Label fields ({@code resource}, {@code service}, * ..., {@code peerTagNames}, {@code peerTagValues}) are deliberately left intact -- they're the @@ -210,7 +234,10 @@ void clear() { this.topLevelCount = 0; this.duration = 0; this.okLatencies.clear(); - this.errorLatencies.clear(); + // errorLatencies stays null on entries that never errored. Only clear if it was allocated. + if (this.errorLatencies != null) { + this.errorLatencies.clear(); + } } boolean matches(SpanSnapshot s) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java index 7644ebaf044..c9fb15b4d0c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java @@ -227,7 +227,32 @@ public void add(AggregateEntry entry) { writer.writeBinary(entry.getOkLatencies().serialize()); writer.writeUTF8(ERROR_SUMMARY); - writer.writeBinary(entry.getErrorLatencies().serialize()); + final datadog.metrics.api.Histogram errorLatencies = entry.getErrorLatencies(); + if (errorLatencies != null) { + writer.writeBinary(errorLatencies.serialize()); + } else { + // Entry never saw an error; emit a cached empty-histogram payload so the wire format is + // unchanged without allocating a histogram per entry. + writer.writeBinary(emptyErrorHistogramBytes()); + } + } + + private byte[] emptyHistogramBytesCache; + + /** + * Returns the cached serialized form of an empty histogram. 
Computed lazily on first call so the + * {@link datadog.metrics.api.Histograms} factory has been registered (by the producer-side tracer + * startup or test setup) before we sample its output. + */ + private byte[] emptyErrorHistogramBytes() { + byte[] cached = emptyHistogramBytesCache; + if (cached == null) { + java.nio.ByteBuffer buf = datadog.metrics.api.Histogram.newHistogram().serialize(); + cached = new byte[buf.remaining()]; + buf.get(cached); + emptyHistogramBytesCache = cached; + } + return cached; } @Override