diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-11-27 10:56:31 -0500 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-11-27 10:56:31 -0500 |
commit | d9d9d4eee641cd007487a7f5c4defd71f2d21ee8 (patch) | |
tree | 375bd06c5612a16dcb4e8a7c074fd4c67017d4dc | |
parent | fa9a573b428dd8e8ce67e4ab0a2c28111ec7988d (diff) | |
download | korg-helpers-d9d9d4eee641cd007487a7f5c4defd71f2d21ee8.tar.gz |
More handling of non-mangled mailman archives
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x | list-archive-collector.py | 28 |
1 files changed, 15 insertions, 13 deletions
```diff
diff --git a/list-archive-collector.py b/list-archive-collector.py
index f122e27..810fb4e 100755
--- a/list-archive-collector.py
+++ b/list-archive-collector.py
@@ -329,19 +329,21 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks
         regex = r'^From '
         subst = '>From '
         mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
-        # Fix pipermail mangling where it changes some email addresses
-        # to be ' at ' instead of '@'. This is easiest to do with a
-        # handful of regexes than via actual message body manipulation
-        # as part of the python's email.message object
         regex = r'(<[^>]+) at ([^>]+>)'
-        subst = r'\1@\2'
-        mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
-        regex = r'^>?(From:? \S+) at (\S+\..*)'
-        mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
-        # Fix any remaining false From escapes
-        regex = r'^>(From\s+\S+@\S+\s+\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d{4})'
-        subst = r'\1'
-        mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
+        if re.search(regex, mboxdata):
+            # Fix pipermail mangling where it changes some email addresses
+            # to be ' at ' instead of '@'. This is easiest to do with a
+            # handful of regexes than via actual message body manipulation
+            # as part of the python's email.message object
+            subst = r'\1@\2'
+            mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
+            regex = r'^>?(From:? \S+) at (\S+\..*)'
+            mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
+        else:
+            # Fix any remaining false From escapes
+            regex = r'^>(From\s+\S+@\S+\s+\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d{4})'
+            subst = r'\1'
+            mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M)
 
         with open(tmpfile, 'wb') as out_fh:
             out_fh.write(mboxdata.encode())
@@ -350,7 +352,7 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks
         tmpmbx = mailbox.mbox(tmpfile)
         for msg in tmpmbx:
             logger.info(' processing: %s', msg.get('Message-Id'))
-            fromline = msg.get('From')
+            fromline = str(msg.get('From', ''))
             if fromline and fromline.find('(') > 0:
                 # Fix bogus From: foo@bar.baz (Foo Barski) -> Foo Barski <foo@bar.baz>
                 matches = re.search(r'(\S+@\S+\.\S+) \((.*)\)$', fromline)
```