aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-11-27 10:40:32 -0500
committer Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-11-27 10:40:32 -0500
commit a6d26ec0e1e1ab4311572e2c957fa8cf24b9c924 (patch)
tree dc91aef36b064d9ef7390c4459b20325b9a470fa
parent aabb998988810989806dbc2e0dace84d0fa909ed (diff)
download korg-helpers-a6d26ec0e1e1ab4311572e2c957fa8cf24b9c924.tar.gz
Deal with mailman archives that don't mangle
Those do exist!

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x list-archive-collector.py 14
1 files changed, 9 insertions, 5 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py
index d97d078..e798c4c 100755
--- a/list-archive-collector.py
+++ b/list-archive-collector.py
@@ -328,16 +328,20 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks
logger.info(' demangling %s', chunks[-1])
regex = r'^From '
subst = '>From '
- mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE)
+ mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
# Fix pipermail mangling where it changes some email addresses
# to be ' at ' instead of '@'. This is easiest to do with a
# handful of regexes than via actual message body manipulation
- # as parf of the python's email.message object
+ # as part of the python's email.message object
regex = r'(<[^>]+) at ([^>]+>)'
- subst = '\\1@\\2'
- mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE)
+ subst = r'\1@\2'
+ mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
regex = r'^>?(From:? \S+) at (\S+\..*)'
- mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE)
+ mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
+ # Fix any remaining false From escapes
+ regex = r'^>(From\s+\S+@\S+\s+\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d{4})'
+ subst = r'\1'
+ mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
with open(tmpfile, 'wb') as out_fh:
out_fh.write(mboxdata.encode())