-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathdiffPage.py
90 lines (72 loc) · 2.47 KB
/
diffPage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/python
# -*- coding: utf-8 -*-
from difflib import SequenceMatcher
from functools import reduce
import re
def get_filtered_page_content(page, only_text=True, split=" "):
    """
    Return the page content with scripts, styles and comments removed.

    When *only_text* is true, every remaining HTML tag and every tab,
    newline and carriage-return character is removed as well. Each removed
    fragment is replaced by *split*; runs of consecutive *split* are then
    collapsed into a single one, surrounding whitespace and *split*
    characters are stripped, and the result is passed through
    ``htmlunescape``.

    >>> get_filtered_page_content('<html><title>foobar</title><body>test</body></html>')
    'foobar test'
    """
    pattern = r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>"
    if only_text:
        pattern += r"|<[^>]+>|\t|\n|\r"
    ret_val = re.sub(pattern, split, page)

    # An empty separator is "found" in every string and replacing it with
    # itself changes nothing, so the collapse loop would never terminate.
    if split:
        doubled = 2 * split
        while doubled in ret_val:
            ret_val = ret_val.replace(doubled, split)

    return htmlunescape(ret_val.strip().strip(split))
def htmlunescape(value):
    """
    Return *value* with a basic set of HTML entities unescaped.

    Handles the named entities ``&lt;``, ``&gt;``, ``&quot;``, ``&nbsp;``
    and ``&amp;`` plus hexadecimal character references such as ``&#x41;``.
    A hexadecimal reference that cannot be decoded is left untouched.

    >>> htmlunescape('a&lt;b')
    'a<b'
    """
    # ``&amp;`` is replaced last so that ``&amp;lt;`` yields the literal text
    # ``&lt;`` instead of being unescaped twice by the named-entity pass.
    codes = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&nbsp;', ' '),
        ('&amp;', '&'),
    )
    ret_val = value
    for entity, char in codes:
        ret_val = ret_val.replace(entity, char)

    # ``unichr`` only exists on Python 2; ``chr`` is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def _decode_hex(match):
        # Decode one reference; keep the original text when the digits are
        # not valid hexadecimal or the code point is out of range, so one bad
        # reference does not prevent the others from being converted.
        try:
            return to_char(int(match.group(1), 16))
        except (ValueError, OverflowError):
            return match.group(0)

    return re.sub(r"&#x([^ ;]+);", _decode_hex, ret_val)
def get_ratio(first_page, second_page):
    """
    Return the similarity ratio between the contents of two pages.

    Both pages are first run through ``get_filtered_page_content`` with its
    default arguments (so HTML tags are stripped), then compared with
    ``difflib.SequenceMatcher.ratio()``.
    """
    filtered_first, filtered_second = (
        get_filtered_page_content(content)
        for content in (first_page, second_page)
    )
    matcher = SequenceMatcher(None, filtered_first, filtered_second)
    return matcher.ratio()
def split_by_sep(seq):
    """
    Split *seq* into chunks on newline, tab, carriage return, double quote,
    single quote and ``<``.

    The separator characters themselves are dropped and empty chunks are
    kept, so the returned list always has one more element than the number
    of separators found in *seq*.
    """
    chunks = []
    current = []
    for c in seq:
        if c in '\n\t\r"\'<':
            chunks.append(''.join(current))
            current = []
        else:
            # Collect characters in a list and join once per chunk instead of
            # growing a string one character at a time.
            current.append(c)
    chunks.append(''.join(current))
    return chunks
def relative_distance_boolean(a_str, b_str, threshold=0.6):
    """
    Tell whether two strings are at least *threshold* similar.

    A threshold of 0 always yields True and a threshold of exactly 1.0
    requires equality. Otherwise the cheap length-based upper bound is
    checked first; only when it does not rule the pair out are both strings
    tokenised with ``split_by_sep`` and compared through
    ``SequenceMatcher.quick_ratio()``.
    """
    if threshold == 0:
        return True
    if threshold == 1.0:
        return a_str == b_str

    # From here on `shorter` is never longer than `longer`.
    if len(b_str) < len(a_str):
        shorter, longer = b_str, a_str
    else:
        shorter, longer = a_str, b_str
    short_len, long_len = len(shorter), len(longer)

    # An empty string is only similar to another empty string.
    if long_len == 0 or short_len == 0:
        return short_len == long_len

    if long_len == short_len and shorter == longer and threshold <= 1.0:
        return True

    # The lengths alone already cap the achievable ratio.
    if threshold > upper_bound_similarity(shorter, longer):
        return False

    similarity = SequenceMatcher(
        None, split_by_sep(shorter), split_by_sep(longer)
    ).quick_ratio()
    return threshold <= similarity
def upper_bound_similarity(a, b):
    """
    Return ``2 * len(a) / (len(a) + len(b))``.

    With *a* being the shorter of the two sequences this is the largest
    value ``SequenceMatcher.ratio()`` could reach, since the number of
    matching elements cannot exceed ``len(a)``.

    Two empty sequences yield 1.0 (they are identical) instead of raising
    ZeroDivisionError.
    """
    total = len(a) + len(b)
    if total == 0:
        return 1.0
    return (2.0 * len(a)) / total