Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 105 additions & 6 deletions lib/request/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,85 @@
from lib.core.threads import getCurrentThreadData
from thirdparty import six

MATCH_RATIO_MIN_SAMPLES = 3
MATCH_RATIO_MAX_SAMPLES = 7

# Multiplier applied to the observed jitter (MAD) when computing the
# adaptive decision tolerance. Roughly mimics ~2 sigma for a normal
# distribution while staying robust against outliers.
JITTER_TOLERANCE_MULTIPLIER = 3.0

def _toBytes(value):
if value is None:
return b""
elif isinstance(value, six.binary_type):
return value
elif isinstance(value, six.text_type):
return getBytes(value, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
else:
return getBytes(six.text_type(value), kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")

def _sampledSimilarity(first, second):
"""
Lightweight fallback similarity for very large responses.

It avoids expensive full-sequence matching while still comparing actual
content (not only response length), reducing false positives.
"""

first, second = _toBytes(first), _toBytes(second)

if first == second:
return 1.0
elif not first or not second:
return float(first == second)

firstLength, secondLength = len(first), len(second)
ratio = 1.0 * min(firstLength, secondLength) / max(firstLength, secondLength)

window = min(4096, firstLength, secondLength)
if not window:
return ratio

similarity = 0.0
positions = (0.0, 0.25, 0.5, 0.75, 1.0)

for position in positions:
firstStart = int(max(0, firstLength - window) * position)
secondStart = int(max(0, secondLength - window) * position)

firstChunk = first[firstStart:firstStart + window]
secondChunk = second[secondStart:secondStart + window]

similarity += (1.0 * sum(left == right for left, right in zip(firstChunk, secondChunk)) / window)

similarity /= len(positions)

# Favor actual content match while still accounting for size drift.
return 0.7 * similarity + 0.3 * ratio

def _median(values):
ordered = sorted(values)
middle = len(ordered) // 2

if len(ordered) % 2:
return ordered[middle]
else:
return (ordered[middle - 1] + ordered[middle]) / 2.0

def _mad(values, center):
"""
Median Absolute Deviation around the given center value.

Used as a robust, outlier-resistant estimate of the natural noise level
observed during the matchRatio calibration window.
"""

if not values:
return 0.0

return _median([abs(value - center) for value in values])

def comparison(page, headers, code=None, getRatioValue=False, pageLength=None):
if not isinstance(page, (six.text_type, six.binary_type, type(None))):
logger.critical("got page of type %s; repr(page)[:200]=%s" % (type(page), repr(page)[:200]))
Expand All @@ -60,6 +139,15 @@ def _adjust(condition, getRatioValue):

def _comparison(page, headers, code, getRatioValue, pageLength):
threadData = getCurrentThreadData()
calibrationKey = hash((kb.pageTemplate, conf.textOnly, conf.titles))

if kb.matchRatio is not None:
kb.matchRatioCandidates = []
kb.matchRatioCalibrationKey = None
elif getattr(kb, "matchRatioCalibrationKey", None) != calibrationKey:
kb.matchRatioCandidates = []
kb.matchRatioJitter = None
kb.matchRatioCalibrationKey = calibrationKey

if kb.testMode:
threadData.lastComparisonHeaders = listToStrValue(_ for _ in headers.headers if not _.startswith("%s:" % URI_HTTP_HEADER)) if headers else ""
Expand Down Expand Up @@ -142,9 +230,7 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
if not page or not seqMatcher.a:
return float(seqMatcher.a == page)
else:
ratio = 1. * len(seqMatcher.a) / len(page)
if ratio > 1:
ratio = 1. / ratio
ratio = _sampledSimilarity(seqMatcher.a, page)
else:
seq1, seq2 = None, None

Expand Down Expand Up @@ -203,8 +289,16 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
# current injected value changes the url page content
if kb.matchRatio is None:
if ratio >= LOWER_RATIO_BOUND and ratio <= UPPER_RATIO_BOUND:
kb.matchRatio = ratio
logger.debug("setting match ratio for current parameter to %.3f" % kb.matchRatio)
kb.matchRatioCandidates.append(ratio)
kb.matchRatioCandidates = kb.matchRatioCandidates[-MATCH_RATIO_MAX_SAMPLES:]

if len(kb.matchRatioCandidates) >= MATCH_RATIO_MIN_SAMPLES:
kb.matchRatio = round(_median(kb.matchRatioCandidates), 3)
kb.matchRatioJitter = round(_mad(kb.matchRatioCandidates, kb.matchRatio), 3)
sampleCount = len(kb.matchRatioCandidates)
kb.matchRatioCandidates = []
kb.matchRatioCalibrationKey = None
logger.debug("setting match ratio for current parameter to %.3f (median of %d samples, jitter=%.3f)" % (kb.matchRatio, sampleCount, kb.matchRatioJitter))

if kb.testMode:
threadData.lastComparisonRatio = ratio
Expand All @@ -224,4 +318,9 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
return None

else:
return (ratio - kb.matchRatio) > DIFF_TOLERANCE
# Adaptive tolerance: the static DIFF_TOLERANCE acts as a hard floor
# for stable pages, while noisy targets get a wider band derived
# from the observed jitter (MAD) captured during calibration.
jitter = getattr(kb, "matchRatioJitter", None) or 0.0
tolerance = max(DIFF_TOLERANCE, JITTER_TOLERANCE_MULTIPLIER * jitter)
return (ratio - kb.matchRatio) > tolerance