diff options
author | Daniel Jordan via RT <kernel-helpdesk@rt.linuxfoundation.org> | 2019-01-24 22:45:26 -0500 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2019-01-25 15:08:05 -0500 |
commit | 45b6f51bb85688ab5db4386a61c766d7481bc85a (patch) | |
tree | 81f76caab6652b080873ae0b5e85e39437ac8500 | |
parent | dc2306ba201b6f4a7c0d9675dcdf71f2549dae28 (diff) | |
download | korg-helpers-45b6f51bb85688ab5db4386a61c766d7481bc85a.tar.gz |
handle multiple To: and Cc: fields in malformed emails
Emails should have at most one To: and one Cc: header, but sometimes
malformed ones have more than that, causing 'notourlist' false
positives and leaving legitimate messages out of the archive.
Collapse multiple headers of the same name into a single header.
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
-rwxr-xr-x | list-archive-maker.py | 35 |
1 file changed, 34 insertions, 1 deletion
diff --git a/list-archive-maker.py b/list-archive-maker.py
index bfd6478..b618a18 100755
--- a/list-archive-maker.py
+++ b/list-archive-maker.py
@@ -175,9 +175,13 @@ def main(sources, outdir, msgids, listids, rejectsfile):
             # Remove headers not in WANTHDRS list and any Received:
             # lines that do not mention the list email address
             newhdrs = []
+            to_list = []
+            to_header_idx = None
+            cc_list = []
+            cc_header_idx = None
             recvtime = None
             is_our_list = False
-            for hdrname, hdrval in msg._headers:
+            for hdrname, hdrval in list(msg._headers):
                 lhdrname = hdrname.lower()
                 if is_nntp and lhdrname.find('original-') == 0:
                     lhdrname = lhdrname.replace('original-', '')
@@ -224,9 +228,38 @@ def main(sources, outdir, msgids, listids, rejectsfile):
                             is_our_list = True
                             break
 
+                # Malformed emails can have multiple to: and cc: fields. Merge
+                # so there's one field for each header type.
+                #
+                # Save the place in newhdrs where the first to or cc list would
+                # have appeared so we can insert the merged list there rather
+                # than strangely at the end.
+
+                elif lhdrname == 'to':
+                    to_list.extend(hdrval.split(','))
+                    msg._headers.remove((hdrname, hdrval))
+                    if to_header_idx is None:
+                        to_header_idx = len(newhdrs)
+
+                elif lhdrname == 'cc':
+                    cc_list.extend(hdrval.split(','))
+                    msg._headers.remove((hdrname, hdrval))
+                    if cc_header_idx is None:
+                        cc_header_idx = len(newhdrs)
+
                 else:
                     newhdrs.append((hdrname, hdrval))
 
+            if len(to_list) > 0:
+                to_header = ('To', ', '.join(to_list))
+                msg._headers.append(to_header)
+                newhdrs.insert(to_header_idx, to_header)
+
+            if len(cc_list) > 0:
+                cc_header = ('Cc', ', '.join(cc_list))
+                msg._headers.append(cc_header)
+                newhdrs.insert(cc_header_idx, cc_header)
+
             if not is_our_list:
                 # Sometimes a message is cc'd to multiple mailing lists and the
                 # archives only contain a copy of the message that was delivered to a