-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstrip_short_diff.py
More file actions
55 lines (42 loc) · 1.54 KB
/
strip_short_diff.py
File metadata and controls
55 lines (42 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
"""Strip \\DIFadd{...} and \\DIFaddFL{...} markup when the content has fewer than N words."""
import re
import sys
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in a LaTeX fragment.

    LaTeX control words are discarded before counting, together with an
    optional ``*``, one optional ``[...]`` argument and any directly
    following ``{...}`` arguments that contain no nested braces.  Any
    remaining braces, tildes and dollar signs are then deleted (not replaced
    by spaces), and whatever is left is split on whitespace.
    """
    command_re = re.compile(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^{}]*\})*')
    markup_chars_re = re.compile(r'[{}~$]')

    without_commands = command_re.sub('', text)
    words = markup_chars_re.sub('', without_commands).split()
    return len(words)
def _find_closing_brace(text: str, open_pos: int) -> int:
    """Return the index of the ``}`` matching the ``{`` at *open_pos*, or -1.

    Backslash-escaped characters (e.g. ``\\{`` and ``\\}``) are skipped so
    that literal braces in the text do not affect the nesting depth.
    """
    depth = 0
    pos = open_pos
    end = len(text)
    while pos < end:
        ch = text[pos]
        if ch == '\\':
            # Skip the backslash and the character it escapes.
            pos += 2
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return pos
        pos += 1
    return -1


def strip_short(content: str, min_words: int) -> str:
    r"""Remove ``\DIFadd{...}`` / ``\DIFaddFL{...}`` wrappers around short text.

    Each ``\DIFadd{`` or ``\DIFaddFL{`` is paired with its balanced closing
    brace.  When the enclosed body has fewer than *min_words* words (as
    measured by ``count_words``), the wrapper is dropped and only the body is
    kept; otherwise the whole group is copied through unchanged.  Text inside
    a group is not rescanned for further markup.

    Args:
        content: The LaTeX source to process.
        min_words: Groups whose body has fewer words than this are unwrapped.

    Returns:
        The processed text.  If a group has no matching closing brace, the
        text from that group onwards is returned untouched rather than being
        truncated or partially unwrapped.
    """
    pattern = re.compile(r'\\DIFadd(?:FL)?\{')
    pieces = []
    pos = 0
    while pos < len(content):
        m = pattern.search(content, pos)
        if m is None:
            pieces.append(content[pos:])
            break

        # m.end() - 1 is the opening brace that belongs to the command.
        close = _find_closing_brace(content, m.end() - 1)
        if close < 0:
            # Unbalanced input: leave the remainder exactly as it was.
            pieces.append(content[pos:])
            break

        pieces.append(content[pos:m.start()])
        body = content[m.end():close]
        if count_words(body) < min_words:
            pieces.append(body)  # strip markup, keep content
        else:
            pieces.append(content[m.start():close + 1])  # keep markup
        pos = close + 1
    return ''.join(pieces)
if __name__ == '__main__':
    # Filter stdin to stdout; an optional first CLI argument overrides the
    # default threshold of 3 words.
    cli_args = sys.argv[1:]
    min_words = int(cli_args[0]) if cli_args else 3
    content = sys.stdin.read()
    sys.stdout.write(strip_short(content, min_words))