diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-11-27 10:40:32 -0500 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-11-27 10:40:32 -0500 |
commit | a6d26ec0e1e1ab4311572e2c957fa8cf24b9c924 (patch) | |
tree | dc91aef36b064d9ef7390c4459b20325b9a470fa | |
parent | aabb998988810989806dbc2e0dace84d0fa909ed (diff) | |
download | korg-helpers-a6d26ec0e1e1ab4311572e2c957fa8cf24b9c924.tar.gz |
Deal with mailman archives that don't mangle email addresses
Those do exist!
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x | list-archive-collector.py | 14 |
1 file changed, 9 insertions, 5 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py index d97d078..e798c4c 100755 --- a/list-archive-collector.py +++ b/list-archive-collector.py @@ -328,16 +328,20 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks logger.info(' demangling %s', chunks[-1]) regex = r'^From ' subst = '>From ' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) # Fix pipermail mangling where it changes some email addresses # to be ' at ' instead of '@'. This is easiest to do with a # handful of regexes than via actual message body manipulation - # as parf of the python's email.message object + # as part of the python's email.message object regex = r'(<[^>]+) at ([^>]+>)' - subst = '\\1@\\2' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + subst = r'\1@\2' + mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) regex = r'^>?(From:? \S+) at (\S+\..*)' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) + # Fix any remaining false From escapes + regex = r'^>(From\s+\S+@\S+\s+\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d{4})' + subst = r'\1' + mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) with open(tmpfile, 'wb') as out_fh: out_fh.write(mboxdata.encode()) |