diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2021-09-15 16:43:17 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2021-09-15 16:43:17 -0400 |
commit | 75071b1c24883374f54bc1359729172808b62395 (patch) | |
tree | b11c31df8d18a65f31a35f2cea11d653578563e9 | |
parent | f6ac8bf765f605cb49d6dd4fc47e40fa3ef29eee (diff) | |
download | korg-helpers-75071b1c24883374f54bc1359729172808b62395.tar.gz |
Add initial support for mailman3 archives
Hyperkitty mangles things in similar, but slightly different ways from
pipermail.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x | list-archive-collector.py | 131 |
1 file changed, 105 insertions, 26 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py index 1f73df6..b3186ad 100755 --- a/list-archive-collector.py +++ b/list-archive-collector.py @@ -18,6 +18,7 @@ import sys import mailbox import email.utils import email.policy +import email.header import time import re import quopri @@ -58,6 +59,25 @@ EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length logger = logging.getLogger(__name__) +def clean_header(hdrval): + if hdrval is None: + return '' + + decoded = '' + for hstr, hcs in email.header.decode_header(hdrval): + if hcs is None: + hcs = 'utf-8' + try: + decoded += hstr.decode(hcs, errors='replace') + except LookupError: + # Try as utf-u + decoded += hstr.decode('utf-8', errors='replace') + except (UnicodeDecodeError, AttributeError): + decoded += hstr + new_hdrval = re.sub(r'\n?\s+', ' ', decoded) + return new_hdrval.strip() + + def get_requests_session(): global REQSESSION if REQSESSION is None: @@ -296,30 +316,70 @@ def parse_pipermail_index(pipermail_url): soup = BeautifulSoup(index, features='lxml') - mboxes = [] + mboxes = set() for tag in soup.find_all('a'): # we are looking for a href that ends with .txt.gz if 'href' in tag.attrs and tag.attrs['href'].find('.txt') > 1: - mboxes.append(os.path.join(pipermail_url, tag.attrs['href'])) + mboxes.add(os.path.join(pipermail_url, tag.attrs['href'])) + + return mboxes + +def parse_hyperkitty_index(hyperkitty_url): + logger.info('Grabbing hyperkitty index from %s', hyperkitty_url) + rses = get_requests_session() + resp = rses.get(hyperkitty_url) + index = resp.content + + soup = BeautifulSoup(index, features='lxml') + + mboxes = set() + for tag in soup.find_all('a'): + # we are looking for a href that has year/month notation + if 'href' in tag.attrs: + matches = re.search(r'.*/(\d{4})/(\d{1,2})/', tag.attrs['href']) + if matches: + year, month = matches.groups() + year = int(year) + month = int(month) + fromdate = '%d-%02d-01' % (year, month) + if month == 12: + 
todate = '%d-01-01' % (year+1) + else: + todate = '%d-%02d-01' % (year, month+1) + archurl = '%s/export/export.mbox.gz?start=%s&end=%s' % (hyperkitty_url.rstrip('/'), fromdate, todate) + mboxes.add(archurl) + + # return {f'{hyperkitty_url}/export/export.mbox.gz?start=2021-09-01&end=2021-10-01'} return mboxes def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checkspam): tmpfile = mkstemp('pipermail')[1] chunks = pipermail_url.split('/') - - logger.info(' grabbing %s', chunks[-1]) - rses = get_requests_session() - # XXX: this can be horribly large - resp = rses.get(pipermail_url, stream=True) - if resp.content.startswith(b'\x1f\x8b'): - with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as uncompressed: - mboxdata = uncompressed.read().decode('utf-8', errors='replace') + if pipermail_url[0] == '/': + with open(pipermail_url, 'rb') as fh: + if pipermail_url[-3:] == '.gz': + with gzip.GzipFile(fileobj=fh) as uncompressed: + mboxdata = uncompressed.read().decode('utf-8', errors='replace') + else: + mboxdata = fh.read().decode('utf-8', errors='replace') else: - mboxdata = resp.content.decode('utf-8', errors='replace') - - resp.close() + logger.info(' grabbing %s', chunks[-1]) + rses = get_requests_session() + # XXX: this can be horribly large + try: + resp = rses.get(pipermail_url, stream=True) + if resp.content.startswith(b'\x1f\x8b'): + with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as uncompressed: + mboxdata = uncompressed.read().decode('utf-8', errors='replace') + else: + mboxdata = resp.content.decode('utf-8', errors='replace') + + resp.close() + except EOFError: + logger.info(' corrupted month: %s, skipped', chunks[-1]) + return # Pipermail does a nasty thing where it doesn't properly handle # lines in the body that start with "From ". 
First, we add ">" to @@ -336,10 +396,13 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks regex = r'(<[^>]+) at ([^>]+>)' subst = r'\1@\2' mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) - regex = r'^>?(From:? \S+) at (\S+\..*)' + regex = r'(<[^>]+)\(a\)([^>]+>)' + subst = r'\1@\2' + mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) + regex = r'^>?((?:From|To):? \S+) at (\S+\..*)' mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) # Fix any remaining false From escapes - regex = r'^>(From\s+\S+\s+\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d{4})' + regex = r'^>(From\s+\S+[@-]\S+\s+\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d{4})' subst = r'\1' mboxdata = re.sub(regex, subst, mboxdata, 0, flags=re.M) @@ -356,13 +419,18 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks logger.info(' error parsing message %d, skipped', mkey) continue - fromline = str(msg.get('From', '')) - if fromline and fromline.find('(') > 0: + oldfrom = str(msg.get('From', '')) + if oldfrom: + newfrom = clean_header(oldfrom) + # Fix any leftover at-escaping + newfrom = newfrom.replace(' at ', '@') # Fix bogus From: foo@bar.baz (Foo Barski) -> Foo Barski <foo@bar.baz> - matches = re.search(r'(\S+@\S+\.\S+) \((.*)\)$', fromline) + matches = re.search(r'(\S+@\S+\.\S+) \((.*)\)$', newfrom) if matches: gr = matches.groups() - msg.replace_header('From', f'{gr[1]} <{gr[0]}>') + newfrom = f'{gr[1]} <{gr[0]}>' + if newfrom != oldfrom: + msg.replace_header('From', newfrom) if listid: msg['List-Id'] = f'<{listid}>' @@ -372,7 +440,10 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks if not msg.get('To'): msg['To'] = toaddr - + # Fix in-reply-to + irt = msg.get('in-reply-to') + if irt and irt[0] != '<': + msg.replace_header('In-Reply-To', f'<{irt}>') add_msg_to_mbx(msg, mbx, checkspam) tmpmbx.close() @@ -488,13 +559,19 @@ def get_mailman(args): if not args.to: args.to = args.listid.replace('.', '@', 1) - 
months = parse_pipermail_index(args.url) - if not months: - print('Could not find any .txt.gz files listed at %s' % args.url) - sys.exit(1) mbx = mailbox.mbox(args.out) - for month in months: - grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam) + if args.url[0] == '/': + grab_pipermail_archive(args.url, mbx, args.listid, args.to, args.lookaside, args.checkspam) + else: + if args.mailman3: + months = parse_hyperkitty_index(args.url) + else: + months = parse_pipermail_index(args.url) + if not months: + print('Could not find any .txt.gz files listed at %s' % args.url) + sys.exit(1) + for month in months: + grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam) def get_nntp(args): @@ -565,6 +642,8 @@ if __name__ == '__main__': formatter_class=argparse.ArgumentDefaultsHelpFormatter) sp_mm.add_argument('-u', '--url', required=True, help='Mailman archive index URL') + sp_mm.add_argument('-3', '--mailman3', action='store_true', default=False, + help='This is a mailman3 site') sp_mm.set_defaults(func=get_mailman) sp_marc = subparsers.add_parser('marcinfo', help='Collect marc.info archives', |