Skip to content

Commit fe2a03e

Browse files
committed
Rework regex in xx-Remove-emails
This changes the regex to be stricter about what it matches in terms of email addresses. That should cut down on the number of false positives from censored swearing (e.g., "f!@#$%%"). Unscientifically, it's also faster, at least on sizable archives.
1 parent cdb6bd6 commit fe2a03e

File tree

1 file changed

+6
-3
lines changed

1 file changed

+6
-3
lines changed

xx-Remove-emails-from-Open-Doors-Tables.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
from prompt_toolkit.formatted_text import FormattedText
77
from prompt_toolkit.shortcuts import clear
88

9+
# This regex is pulled from the HTML5 spec. Though it is technically not
10+
# compliant with RFC 5322 ("a willful violation"), it's good enough for our
11+
# purposes.
912
email_regex = re.compile(
10-
r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
13+
r"([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+)@([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)"
1114
)
1215

1316

@@ -44,7 +47,7 @@ def is_mailto(match) -> bool:
4447
def ask_user_for_action(match) -> str:
4548
start, end = match.span()
4649
raw_email = match.string[start:end]
47-
domain = match.group(5)
50+
domain = match.group(2)
4851
clear()
4952
print_context(match, 50)
5053
while True:
@@ -85,7 +88,7 @@ def return_from_list(match) -> str:
8588
return raw_email
8689
elif address_entry is not None:
8790
return address_entry
88-
domain = match.group(5)
91+
domain = match.group(2)
8992
domain_entry = domains.get(domain)
9093
if domain_entry is True:
9194
return raw_email

0 commit comments

Comments
 (0)