From 36d899b186b061c2448868b092f0cb92b84fdd5c Mon Sep 17 00:00:00 2001 From: Ebrahim Shafiei Date: Sun, 31 May 2026 00:31:26 +0330 Subject: [PATCH] fix(comparison): adapt decision tolerance to observed page jitter --- lib/request/comparison.py | 111 +++++++++++++++++++++++++++++++++++--- 1 file changed, 105 insertions(+), 6 deletions(-) diff --git a/lib/request/comparison.py b/lib/request/comparison.py index 0c6ab2586c2..b64c9705364 100644 --- a/lib/request/comparison.py +++ b/lib/request/comparison.py @@ -34,6 +34,85 @@ from lib.core.threads import getCurrentThreadData from thirdparty import six +MATCH_RATIO_MIN_SAMPLES = 3 +MATCH_RATIO_MAX_SAMPLES = 7 + +# Multiplier applied to the observed jitter (MAD) when computing the +# adaptive decision tolerance. Roughly mimics ~2 sigma for a normal +# distribution while staying robust against outliers. +JITTER_TOLERANCE_MULTIPLIER = 3.0 + +def _toBytes(value): + if value is None: + return b"" + elif isinstance(value, six.binary_type): + return value + elif isinstance(value, six.text_type): + return getBytes(value, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore") + else: + return getBytes(six.text_type(value), kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore") + +def _sampledSimilarity(first, second): + """ + Lightweight fallback similarity for very large responses. + + It avoids expensive full-sequence matching while still comparing actual + content (not only response length), reducing false positives. + """ + + first, second = _toBytes(first), _toBytes(second) + + if first == second: + return 1.0 + elif not first or not second: + return float(first == second) + + firstLength, secondLength = len(first), len(second) + ratio = 1.0 * min(firstLength, secondLength) / max(firstLength, secondLength) + + window = min(4096, firstLength, secondLength) + if not window: + return ratio + + similarity = 0.0 + positions = (0.0, 0.25, 0.5, 0.75, 1.0) + + for position in positions: + firstStart = int(max(0, firstLength - window) * position) + secondStart = int(max(0, secondLength - window) * position) + + firstChunk = first[firstStart:firstStart + window] + secondChunk = second[secondStart:secondStart + window] + + similarity += (1.0 * sum(left == right for left, right in zip(firstChunk, secondChunk)) / window) + + similarity /= len(positions) + + # Favor actual content match while still accounting for size drift. + return 0.7 * similarity + 0.3 * ratio + +def _median(values): + ordered = sorted(values) + middle = len(ordered) // 2 + + if len(ordered) % 2: + return ordered[middle] + else: + return (ordered[middle - 1] + ordered[middle]) / 2.0 + +def _mad(values, center): + """ + Median Absolute Deviation around the given center value. + + Used as a robust, outlier-resistant estimate of the natural noise level + observed during the matchRatio calibration window. + """ + + if not values: + return 0.0 + + return _median([abs(value - center) for value in values]) + def comparison(page, headers, code=None, getRatioValue=False, pageLength=None): if not isinstance(page, (six.text_type, six.binary_type, type(None))): logger.critical("got page of type %s; repr(page)[:200]=%s" % (type(page), repr(page)[:200])) @@ -60,6 +139,15 @@ def _adjust(condition, getRatioValue): def _comparison(page, headers, code, getRatioValue, pageLength): threadData = getCurrentThreadData() + calibrationKey = hash((kb.pageTemplate, conf.textOnly, conf.titles)) + + if kb.matchRatio is not None: + kb.matchRatioCandidates = [] + kb.matchRatioCalibrationKey = None + elif getattr(kb, "matchRatioCalibrationKey", None) != calibrationKey: + kb.matchRatioCandidates = [] + kb.matchRatioJitter = None + kb.matchRatioCalibrationKey = calibrationKey if kb.testMode: threadData.lastComparisonHeaders = listToStrValue(_ for _ in headers.headers if not _.startswith("%s:" % URI_HTTP_HEADER)) if headers else "" @@ -142,9 +230,7 @@ def _comparison(page, headers, code, getRatioValue, pageLength): if not page or not seqMatcher.a: return float(seqMatcher.a == page) else: - ratio = 1. * len(seqMatcher.a) / len(page) - if ratio > 1: - ratio = 1. / ratio + ratio = _sampledSimilarity(seqMatcher.a, page) else: seq1, seq2 = None, None @@ -203,8 +289,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength): # current injected value changes the url page content if kb.matchRatio is None: if ratio >= LOWER_RATIO_BOUND and ratio <= UPPER_RATIO_BOUND: - kb.matchRatio = ratio - logger.debug("setting match ratio for current parameter to %.3f" % kb.matchRatio) + kb.matchRatioCandidates.append(ratio) + kb.matchRatioCandidates = kb.matchRatioCandidates[-MATCH_RATIO_MAX_SAMPLES:] + + if len(kb.matchRatioCandidates) >= MATCH_RATIO_MIN_SAMPLES: + kb.matchRatio = round(_median(kb.matchRatioCandidates), 3) + kb.matchRatioJitter = round(_mad(kb.matchRatioCandidates, kb.matchRatio), 3) + sampleCount = len(kb.matchRatioCandidates) + kb.matchRatioCandidates = [] + kb.matchRatioCalibrationKey = None + logger.debug("setting match ratio for current parameter to %.3f (median of %d samples, jitter=%.3f)" % (kb.matchRatio, sampleCount, kb.matchRatioJitter)) if kb.testMode: threadData.lastComparisonRatio = ratio @@ -224,4 +318,9 @@ def _comparison(page, headers, code, getRatioValue, pageLength): return None else: - return (ratio - kb.matchRatio) > DIFF_TOLERANCE + # Adaptive tolerance: the static DIFF_TOLERANCE acts as a hard floor + # for stable pages, while noisy targets get a wider band derived + # from the observed jitter (MAD) captured during calibration. + jitter = getattr(kb, "matchRatioJitter", None) or 0.0 + tolerance = max(DIFF_TOLERANCE, JITTER_TOLERANCE_MULTIPLIER * jitter) + return (ratio - kb.matchRatio) > tolerance