-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathpost_processing.py
105 lines (84 loc) · 3.37 KB
/
post_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from Levenshtein import distance
# Given two sequences of words, count the positions at which the two sequences match.
def similarity(a, b, use_distance=True):
    ''' Count the positions at which the two sequences agree.

    Elements are compared pairwise from the start of both sequences, up to
    the length of the shorter one. With use_distance=True a pair counts as a
    match when its Levenshtein distance is at most 2; otherwise the two
    elements must be equal.
    Returns the number of matching positions (an int, not a ratio).
    '''
    if use_distance:
        def is_match(left, right):
            return distance(left, right) <= 2
    else:
        def is_match(left, right):
            return left == right

    # zip() stops at the shorter input, like indexing up to min(len(a), len(b)).
    return sum(int(is_match(left, right)) for left, right in zip(a, b))
def combine_characters(a, b):
    ''' Splice string b onto the end of string a where the two overlap best.

    The tail of a is compared character-by-character (exact equality) with b
    at up to MAX_NEW_CHARS trial offsets; the smallest offset with the highest
    match count decides where b replaces the end of a.
    Returns (spliced string, needs_update, number of new characters), where
    needs_update is True when the characters of the result that sit just
    before the new ones differ from the corresponding tail of a.
    '''
    if a == "":
        return b, False, len(b)
    if b == "":
        return a, False, 0

    MAX_NEW_CHARS = 10

    best_score = 0
    best_shift = 0
    for shift in range(min(len(b), MAX_NEW_CHARS)):
        # NOTE(review): when b is longer than a this start index clamps to 0
        # for several shifts, so they all score the same alignment and the
        # smallest shift wins -- confirm that is what callers expect.
        tail_start = max(0, len(a) - len(b) + shift)
        score = similarity(a[tail_start:], b, use_distance=False)
        if score > best_score:
            best_score, best_shift = score, shift

    # No character lined up anywhere: treat b as entirely new text.
    if best_score == 0:
        return a + b, False, len(b)

    # Prevent editing of old text when nothing new is present.
    if best_shift == 0 and len(b) < len(a):
        return a, False, 0

    # best_shift < len(b), so this slice end is always negative (never -0).
    spliced = a[:best_shift - len(b)] + b

    # Compare the stretch of the result just before the new characters with
    # the same-length tail of a to see whether existing text was altered.
    kept_len = len(spliced) - best_shift
    span = min(MAX_NEW_CHARS, len(a), kept_len)
    needs_update = spliced[kept_len - span:kept_len] != a[len(a) - span:]

    return spliced, needs_update, best_shift
def combine_words(a, b):
    ''' Line up the two input strings so that they match as closely as possible
    then splice them together.

    Both strings are split on single spaces (empty tokens dropped). The last
    COMBINATION_WINDOW_LEN words of b are compared against the tail of a at up
    to MAX_NEW_WORDS trial offsets using similarity(); the smallest offset with
    the highest score decides where b replaces the end of a.

    Returns (spliced string, needs_update, number of new words):
      * spliced string -- a with its overlapping tail replaced by b's words
        (or a and b simply concatenated when no overlap is found),
      * needs_update -- True when any of the words just before the new ones
        differ from the corresponding trailing words of a,
      * number of new words -- the winning offset, or the word count of b when
        all of b is new.
    '''
    b_words = [word for word in b.split(' ') if word != '']

    if a == "":
        # All of b is new. Report a word count (not len(b), which would be a
        # character count) so the third value keeps the documented meaning.
        return b, False, len(b_words)
    if b == "":
        return a, False, 0

    COMBINATION_WINDOW_LEN = 15
    MAX_NEW_WORDS = 10

    a_s = [word for word in a.split(' ') if word != '']
    # Only the most recent words of b take part in the alignment search.
    b_s = b_words[-COMBINATION_WINDOW_LEN:]

    max_similarity = 0
    max_similarity_offset = 0
    for offset in range(min(len(b_s), MAX_NEW_WORDS)):
        a_idx = max(0, len(a_s) - len(b_s) + offset)
        sim = similarity(a_s[a_idx:], b_s)
        if sim > max_similarity:
            max_similarity = sim
            max_similarity_offset = offset

    if max_similarity == 0:
        # No overlap found: the whole of b is appended, so every word of b is
        # new (not just those inside the comparison window).
        return a + ' ' + b, False, len(b_words)

    # Prevent editing of old text when nothing new is present.
    if max_similarity_offset == 0 and len(b_s) < len(a_s):
        return a, False, 0

    # Shorten to max overlap length for combination step.
    b_s = b_s[-MAX_NEW_WORDS:]

    # The offset is always smaller than len(b_s), so a_end_idx >= 1 and the
    # negative slice below never degenerates to a_s[:-0].
    a_end_idx = len(b_s) - max_similarity_offset
    out_wordlist = a_s[:-a_end_idx] + b_s

    words_to_check = min(MAX_NEW_WORDS, len(a_s),
                         len(out_wordlist) - max_similarity_offset)
    # Walk backwards from the last word of a (i == 1) and compare it with the
    # word at the same position in the result, i.e. just before the new words.
    # i must start at 1: a_s[-0] is a_s[0], the *first* word, which would make
    # the check compare unrelated words and report spurious updates.
    needs_update = any(
        out_wordlist[-i - max_similarity_offset] != a_s[-i]
        for i in range(1, words_to_check + 1)
    )

    return ' '.join(out_wordlist), needs_update, max_similarity_offset