From 36d899b186b061c2448868b092f0cb92b84fdd5c Mon Sep 17 00:00:00 2001
From: Ebrahim Shafiei <prof.shafiei@gmail.com>
Date: Sun, 31 May 2026 00:31:26 +0330
Subject: [PATCH] fix(comparison): adapt decision tolerance to observed page
 jitter

---
 lib/request/comparison.py | 111 +++++++++++++++++++++++++++++++++++---
 1 file changed, 105 insertions(+), 6 deletions(-)

diff --git a/lib/request/comparison.py b/lib/request/comparison.py
index 0c6ab2586c2..b64c9705364 100644
--- a/lib/request/comparison.py
+++ b/lib/request/comparison.py
@@ -34,6 +34,85 @@
 from lib.core.threads import getCurrentThreadData
 from thirdparty import six
 
+MATCH_RATIO_MIN_SAMPLES = 3
+MATCH_RATIO_MAX_SAMPLES = 7
+
+# Multiplier applied to the observed jitter (MAD) when computing the
+# adaptive decision tolerance. Roughly mimics ~2 sigma for a normal
+# distribution while staying robust against outliers.
+JITTER_TOLERANCE_MULTIPLIER = 3.0
+
+def _toBytes(value):
+    if value is None:
+        return b""
+    elif isinstance(value, six.binary_type):
+        return value
+    elif isinstance(value, six.text_type):
+        return getBytes(value, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
+    else:
+        return getBytes(six.text_type(value), kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
+
+def _sampledSimilarity(first, second):
+    """
+    Lightweight fallback similarity for very large responses.
+
+    It avoids expensive full-sequence matching while still comparing actual
+    content (not only response length), reducing false positives.
+    """
+
+    first, second = _toBytes(first), _toBytes(second)
+
+    if first == second:
+        return 1.0
+    elif not first or not second:
+        return float(first == second)
+
+    firstLength, secondLength = len(first), len(second)
+    ratio = 1.0 * min(firstLength, secondLength) / max(firstLength, secondLength)
+
+    window = min(4096, firstLength, secondLength)
+    if not window:
+        return ratio
+
+    similarity = 0.0
+    positions = (0.0, 0.25, 0.5, 0.75, 1.0)
+
+    for position in positions:
+        firstStart = int(max(0, firstLength - window) * position)
+        secondStart = int(max(0, secondLength - window) * position)
+
+        firstChunk = first[firstStart:firstStart + window]
+        secondChunk = second[secondStart:secondStart + window]
+
+        similarity += (1.0 * sum(left == right for left, right in zip(firstChunk, secondChunk)) / window)
+
+    similarity /= len(positions)
+
+    # Favor actual content match while still accounting for size drift.
+    return 0.7 * similarity + 0.3 * ratio
+
+def _median(values):
+    ordered = sorted(values)
+    middle = len(ordered) // 2
+
+    if len(ordered) % 2:
+        return ordered[middle]
+    else:
+        return (ordered[middle - 1] + ordered[middle]) / 2.0
+
+def _mad(values, center):
+    """
+    Median Absolute Deviation around the given center value.
+
+    Used as a robust, outlier-resistant estimate of the natural noise level
+    observed during the matchRatio calibration window.
+    """
+
+    if not values:
+        return 0.0
+
+    return _median([abs(value - center) for value in values])
+
 def comparison(page, headers, code=None, getRatioValue=False, pageLength=None):
     if not isinstance(page, (six.text_type, six.binary_type, type(None))):
         logger.critical("got page of type %s; repr(page)[:200]=%s" % (type(page), repr(page)[:200]))
@@ -60,6 +139,15 @@ def _adjust(condition, getRatioValue):
 
 def _comparison(page, headers, code, getRatioValue, pageLength):
     threadData = getCurrentThreadData()
+    calibrationKey = hash((kb.pageTemplate, conf.textOnly, conf.titles))
+
+    if kb.matchRatio is not None:
+        kb.matchRatioCandidates = []
+        kb.matchRatioCalibrationKey = None
+    elif getattr(kb, "matchRatioCalibrationKey", None) != calibrationKey:
+        kb.matchRatioCandidates = []
+        kb.matchRatioJitter = None
+        kb.matchRatioCalibrationKey = calibrationKey
 
     if kb.testMode:
         threadData.lastComparisonHeaders = listToStrValue(_ for _ in headers.headers if not _.startswith("%s:" % URI_HTTP_HEADER)) if headers else ""
@@ -142,9 +230,7 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
             if not page or not seqMatcher.a:
                 return float(seqMatcher.a == page)
             else:
-                ratio = 1. * len(seqMatcher.a) / len(page)
-                if ratio > 1:
-                    ratio = 1. / ratio
+                ratio = _sampledSimilarity(seqMatcher.a, page)
         else:
             seq1, seq2 = None, None
 
@@ -203,8 +289,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
     # current injected value changes the url page content
     if kb.matchRatio is None:
         if ratio >= LOWER_RATIO_BOUND and ratio <= UPPER_RATIO_BOUND:
-            kb.matchRatio = ratio
-            logger.debug("setting match ratio for current parameter to %.3f" % kb.matchRatio)
+            kb.matchRatioCandidates.append(ratio)
+            kb.matchRatioCandidates = kb.matchRatioCandidates[-MATCH_RATIO_MAX_SAMPLES:]
+
+            if len(kb.matchRatioCandidates) >= MATCH_RATIO_MIN_SAMPLES:
+                kb.matchRatio = round(_median(kb.matchRatioCandidates), 3)
+                kb.matchRatioJitter = round(_mad(kb.matchRatioCandidates, kb.matchRatio), 3)
+                sampleCount = len(kb.matchRatioCandidates)
+                kb.matchRatioCandidates = []
+                kb.matchRatioCalibrationKey = None
+                logger.debug("setting match ratio for current parameter to %.3f (median of %d samples, jitter=%.3f)" % (kb.matchRatio, sampleCount, kb.matchRatioJitter))
 
     if kb.testMode:
         threadData.lastComparisonRatio = ratio
@@ -224,4 +318,9 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
         return None
 
     else:
-        return (ratio - kb.matchRatio) > DIFF_TOLERANCE
+        # Adaptive tolerance: the static DIFF_TOLERANCE acts as a hard floor
+        # for stable pages, while noisy targets get a wider band derived
+        # from the observed jitter (MAD) captured during calibration.
+        jitter = getattr(kb, "matchRatioJitter", None) or 0.0
+        tolerance = max(DIFF_TOLERANCE, JITTER_TOLERANCE_MULTIPLIER * jitter)
+        return (ratio - kb.matchRatio) > tolerance