Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions lib/request/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,55 @@
from lib.core.threads import getCurrentThreadData
from thirdparty import six

def _toBytes(value):
    """
    Returns byte string representation of the provided value

    None is turned into an empty byte string and binary values are returned
    untouched. Anything else is (when not already text) converted to text
    and handed over to getBytes() together with kb.pageEncoding (or
    DEFAULT_PAGE_ENCODING when that one is not set) and "ignore".
    """

    if value is None:
        return b""

    if isinstance(value, six.binary_type):
        return value

    # Non-text values are stringified first so that a single encoding call
    # covers both remaining cases
    text = value if isinstance(value, six.text_type) else six.text_type(value)

    return getBytes(text, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")

def _sampledSimilarity(first, second):
    """
    Lightweight fallback similarity for very large responses.

    It avoids expensive full-sequence matching while still comparing actual
    content (not only response length), reducing false positives.

    Both arguments are normalized with _toBytes(). Identical inputs score 1.0
    and an empty input compared against a non-empty one scores 0.0. In all
    other cases the returned value is:

        0.7 * (average fraction of positionally equal bytes over five
               sampled windows) + 0.3 * (shorter length / longer length)

    Windows are at most 4096 bytes long and are taken at the start, the
    quarters and the end of each input.
    """

    first, second = _toBytes(first), _toBytes(second)

    if first == second:
        return 1.0
    elif not first or not second:
        # Inputs are already known to differ, hence exactly one of them is
        # empty (no need to compare them once more)
        return 0.0

    firstLength, secondLength = len(first), len(second)
    ratio = 1.0 * min(firstLength, secondLength) / max(firstLength, secondLength)

    # Both inputs are non-empty at this point, so the window is at least one
    # byte long and never longer than either of the inputs
    window = min(4096, firstLength, secondLength)

    # Room that the window has to slide inside each input (loop-invariant and
    # never negative because of the way the window is chosen)
    firstSlack, secondSlack = firstLength - window, secondLength - window

    similarity = 0.0
    positions = (0.0, 0.25, 0.5, 0.75, 1.0)

    for position in positions:
        firstStart = int(firstSlack * position)
        secondStart = int(secondSlack * position)

        firstChunk = first[firstStart:firstStart + window]
        secondChunk = second[secondStart:secondStart + window]

        # Fraction of offsets inside the window holding the very same byte
        similarity += 1.0 * sum(left == right for left, right in zip(firstChunk, secondChunk)) / window

    similarity /= len(positions)

    # Favor actual content match while still accounting for size drift.
    return 0.7 * similarity + 0.3 * ratio

def comparison(page, headers, code=None, getRatioValue=False, pageLength=None):
if not isinstance(page, (six.text_type, six.binary_type, type(None))):
logger.critical("got page of type %s; repr(page)[:200]=%s" % (type(page), repr(page)[:200]))
Expand Down Expand Up @@ -142,9 +191,7 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
if not page or not seqMatcher.a:
return float(seqMatcher.a == page)
else:
ratio = 1. * len(seqMatcher.a) / len(page)
if ratio > 1:
ratio = 1. / ratio
ratio = _sampledSimilarity(seqMatcher.a, page)
else:
seq1, seq2 = None, None

Expand Down