diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2022-10-20 14:11:29 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2022-10-20 14:11:29 -0400 |
commit | 354fc16e397312c8972c6b48e7645d2c98d71db3 (patch) | |
tree | 63cf3ce0686bd08aa79e42c2aa68afbe226cb308 | |
parent | a72a55b06660b249c82cdba32a4dc54e8a7ea2f3 (diff) | |
download | korg-helpers-354fc16e397312c8972c6b48e7645d2c98d71db3.tar.gz |
list-collectors: small improvements
Add a few small improvements to the list-archive-collector and
list-archive-maker scripts, making it easier to deal with partial
archives.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x | list-archive-collector.py | 137 | ||||
-rwxr-xr-x | list-archive-maker.py | 112 |
2 files changed, 136 insertions, 113 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py index b3186ad..99e22be 100755 --- a/list-archive-collector.py +++ b/list-archive-collector.py @@ -25,23 +25,25 @@ import quopri import base64 import gzip import io -import nntplib import requests import logging import subprocess +import argparse try: - import cchardet as chardet # noqa + import cchardet as chardet # noqa except ImportError: - import chardet + import chardet # noqa from tempfile import mkstemp from bs4 import BeautifulSoup # noqa from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry +from urllib3 import Retry + +from typing import Optional, Union, List, Set from email import charset -charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa +charset.add_charset('utf-8', None) # Used for our requests session REQSESSION = None @@ -59,7 +61,7 @@ EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length logger = logging.getLogger(__name__) -def clean_header(hdrval): +def clean_header(hdrval: str) -> Optional[str]: if hdrval is None: return '' @@ -78,7 +80,7 @@ def clean_header(hdrval): return new_hdrval.strip() -def get_requests_session(): +def get_requests_session() -> requests.Session: global REQSESSION if REQSESSION is None: REQSESSION = requests.session() @@ -94,14 +96,14 @@ def get_requests_session(): return REQSESSION -def lore_get_message(msgid): +def lore_get_message(msgid: str) -> email.message.Message: # See where we're redirected rurl = f'https://lore.kernel.org/r/{msgid}' rses = get_requests_session() resp = rses.head(rurl) if resp.status_code < 300 or resp.status_code > 400: # Not known on lore - return None + raise LookupError # Pop msgid from the end of the redirect msgurl = resp.headers['Location'] + 'raw' resp.close() @@ -111,33 +113,7 @@ def lore_get_message(msgid): return msg -# Turned off for now -def patchwork_get_headers(msgid): - url = f'https://patchwork.kernel.org/api/1.2/patches/' - params = 
[ - ('msgid', msgid), - ] - rses = get_requests_session() - resp = rses.get(url, params=params, stream=False) - if resp.status_code > 200: - return None - - jj = resp.json() - if not len(jj): - return None - - # we only care about one - p_id = jj[0].get('id') - resp = rses.get(f'{url}{p_id}', stream=False) - if resp.status_code > 200: - return None - - logger.info(' found on patchwork') - jj = resp.json() - return jj.get('headers') - - -def lookaside_fillin(msg): +def lookaside_fillin(msg: email.message.Message) -> bool: wanthdrs = [ 'To', 'Cc', @@ -147,12 +123,10 @@ def lookaside_fillin(msg): 'X-Mailer', ] msgid = str(msg.get('Message-Id')).strip('<>') - lmsg = lore_get_message(msgid) - if not lmsg: + try: + lmsg = lore_get_message(msgid) + except LookupError: return False - # lmsg = patchwork_get_headers(msgid) - # if not lmsg: - # return False for wanthdr in wanthdrs: if not msg.get(wanthdr) and lmsg.get(wanthdr): @@ -161,7 +135,8 @@ def lookaside_fillin(msg): return True -def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): +def marc_get_message(marc_list_id: str, msgnum: str, listid: str, toaddr: str, + lookaside: bool) -> email.message.Message: rses = get_requests_session() url = f'{MARCURL}/?l={marc_list_id}&m={msgnum}&q=mbox' logger.info(' grabbing message %s', msgnum) @@ -179,7 +154,7 @@ def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): if not msg.get('Message-Id'): logger.info(' No message-id, ignored') # Can't use it anyway - return None + raise LookupError hdrs = list() @@ -227,7 +202,7 @@ def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): return msg -def check_if_spam(bmsg): +def check_if_spam(bmsg: bytes) -> bool: if not os.path.exists('/usr/bin/spamc'): return False @@ -243,7 +218,16 @@ def check_if_spam(bmsg): return True -def add_msg_to_mbx(msg, mbx, checkspam): +def add_msg_to_mbx(msg: email.message.Message, mbx: Union[mailbox.Mailbox, mailbox.Maildir], + checkspam: bool, cleansubj: 
Optional[str]) -> None: + oldsubj = clean_header(msg.get('Subject', '')) + if cleansubj and cleansubj in oldsubj: + # We only remove it if it's ^thatsring, or ^Re: thatstring + if oldsubj.startswith(cleansubj): + msg.replace_header('Subject', oldsubj.replace(cleansubj, '', 1).strip()) + if oldsubj.startswith(f'Re: {cleansubj}'): + msg.replace_header('Subject', oldsubj.replace(f'Re: {cleansubj}', 'Re:', 1).strip()) + if msg.get_default_type() == 'text/plain': try: payload = msg.get_payload(decode=True) @@ -261,6 +245,7 @@ def add_msg_to_mbx(msg, mbx, checkspam): logger.info(' spam: %s', msg['Subject']) return + logger.info(' added: %s', msg['Subject']) mbx.add(bmsg) except: # noqa # Throw it out, because life is too short to figure out all possible ways @@ -269,7 +254,7 @@ def add_msg_to_mbx(msg, mbx, checkspam): return -def marc_get_full_thread(marc_list_id, thread_id): +def marc_get_full_thread(marc_list_id: str, thread_id: str) -> List[str]: cp = 1 rses = get_requests_session() msgnums = list() @@ -308,7 +293,7 @@ def marc_get_full_thread(marc_list_id, thread_id): return msgnums -def parse_pipermail_index(pipermail_url): +def parse_pipermail_index(pipermail_url: str) -> Set[str]: logger.info('Grabbing pipermail index from %s', pipermail_url) rses = get_requests_session() resp = rses.get(pipermail_url) @@ -325,7 +310,7 @@ def parse_pipermail_index(pipermail_url): return mboxes -def parse_hyperkitty_index(hyperkitty_url): +def parse_hyperkitty_index(hyperkitty_url: str) -> Set[str]: logger.info('Grabbing hyperkitty index from %s', hyperkitty_url) rses = get_requests_session() resp = rses.get(hyperkitty_url) @@ -354,7 +339,8 @@ def parse_hyperkitty_index(hyperkitty_url): return mboxes -def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checkspam): +def grab_pipermail_archive(pipermail_url: str, mbx: Union[mailbox.Mailbox, mailbox.Maildir], + args: argparse.Namespace) -> None: tmpfile = mkstemp('pipermail')[1] chunks = pipermail_url.split('/') 
if pipermail_url[0] == '/': @@ -432,25 +418,25 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks if newfrom != oldfrom: msg.replace_header('From', newfrom) - if listid: - msg['List-Id'] = f'<{listid}>' + if args.listid: + msg['List-Id'] = f'<{args.listid}>' - if lookaside: + if args.lookaside: lookaside_fillin(msg) if not msg.get('To'): - msg['To'] = toaddr + msg['To'] = args.to # Fix in-reply-to irt = msg.get('in-reply-to') if irt and irt[0] != '<': msg.replace_header('In-Reply-To', f'<{irt}>') - add_msg_to_mbx(msg, mbx, checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) tmpmbx.close() os.unlink(tmpfile) -def get_marcinfo(args): +def get_marcinfo(args: argparse.Namespace) -> None: global MARCNICE if args.nice < 0.5: @@ -521,7 +507,7 @@ def get_marcinfo(args): break cp += 1 - mbx = mailbox.mbox(args.out) + mbx = get_outbox(args) for thdnum in thdnums: tnums = marc_get_full_thread(marc_list_id, thdnum) # last message starts the thread @@ -531,8 +517,9 @@ def get_marcinfo(args): if tnum in msgnums: msgnums.remove(tnum) time.sleep(MARCNICE) - msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside) - if not msg: + try: + msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside) + except LookupError: continue if not irt: @@ -541,7 +528,7 @@ def get_marcinfo(args): msg['References'] = irt msg['In-Reply-To'] = irt - add_msg_to_mbx(msg, mbx, args.checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) logger.info('Grabbing remaining unthreaded messages') for msgnum in msgnums: @@ -550,18 +537,18 @@ def get_marcinfo(args): if not msg: continue - add_msg_to_mbx(msg, mbx, args.checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) mbx.close() -def get_mailman(args): +def get_mailman(args: argparse.Namespace) -> None: if not args.to: args.to = args.listid.replace('.', '@', 1) - mbx = mailbox.mbox(args.out) + mbx = get_outbox(args) if 
args.url[0] == '/': - grab_pipermail_archive(args.url, mbx, args.listid, args.to, args.lookaside, args.checkspam) + grab_pipermail_archive(args.url, mbx, args) else: if args.mailman3: months = parse_hyperkitty_index(args.url) @@ -571,10 +558,11 @@ def get_mailman(args): print('Could not find any .txt.gz files listed at %s' % args.url) sys.exit(1) for month in months: - grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam) + grab_pipermail_archive(month, mbx, args) -def get_nntp(args): +def get_nntp(args: argparse.Namespace) -> None: + import nntplib # Expect in format nntp://news.gmane.org/gmane.linux.network logger.info('Connecting to %s', args.url) chunks = args.url.split('/') @@ -584,7 +572,7 @@ def get_nntp(args): resp, count, first, last, name = server.group(group) total = int(last) - mbx = mailbox.mbox(args.out) + mbx = get_outbox(args) aid = 1 while aid <= total: try: @@ -601,9 +589,9 @@ def get_nntp(args): try: msg.replace_header('List-Id', f'<{args.listid}>') except KeyError: - msg.add_header('List-Id', f'<{args.listid}') + msg.add_header('List-Id', f'<{args.listid}>') - add_msg_to_mbx(msg, mbx, args.checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) except nntplib.NNTPTemporaryError: # Ignore one-off article failures -- probably deletes @@ -614,9 +602,14 @@ def get_nntp(args): mbx.close() -if __name__ == '__main__': - import argparse +def get_outbox(args: argparse.Namespace) -> Union[mailbox.Mailbox, mailbox.Maildir]: + if args.as_maildir: + logger.info('Will output into maildir %s', args.out) + return mailbox.Maildir(args.out) + return mailbox.mbox(args.out) + +if __name__ == '__main__': # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Collect external mail archives into a local mbox", @@ -635,6 +628,10 @@ if __name__ == '__main__': help='Run spamc to check messages for spam before adding') parser.add_argument('-o', '--out', required=True, help='Filename of the mailbox 
file to write out') + parser.add_argument('-m', '--as-maildir', action='store_true', default=False, + help='Output as maildir instead of mbox') + parser.add_argument('-c', '--clean-subject', + help='Remove this string from subjects (e.g. [listname])') subparsers = parser.add_subparsers(help='sub-command help', dest='subcmd') diff --git a/list-archive-maker.py b/list-archive-maker.py index 393e294..801d840 100755 --- a/list-archive-maker.py +++ b/list-archive-maker.py @@ -27,9 +27,12 @@ import mailbox import email.utils import email.policy import fnmatch +import argparse + +from typing import Tuple, List, Set from email import charset -charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa +charset.add_charset('utf-8', None) # Set our own policy EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None) @@ -62,7 +65,8 @@ WANTHDRS = {'return-path', __VERSION__ = '2.0' -def formataddr(pair): + +def formataddr(pair: Tuple[str, str]) -> str: try: return email.utils.formataddr(pair) except UnicodeEncodeError: @@ -70,11 +74,12 @@ def formataddr(pair): # drop the real name then. return email.utils.formataddr((None, pair[1])) -def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): - outboxes = {} - writecount = {} - seenids = [] - knownset = set(msgids) + +def process_archives(sources: List[str], outdir: str, knownset: Set[str], listids: List[str], + rejectsfile: str, asmaildir: bool, extrahdrs: List[Tuple[str, str]]) -> Set[str]: + outboxes = dict() + writecount = dict() + seenids = set() if asmaildir: outbox = mailbox.Maildir(outdir) @@ -84,7 +89,7 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): # convert listids into email addresses by replacing the first '.' to '@'. # if you're working with a mailing list that has a non-standard list-id, you # can specify the list email address as part of the listids to satisfy this check. 
- eaddrs = [] + eaddrs = list() for listid in listids: if listid.find('@') < 0: eaddrs.append(listid.replace('.', '@', 1)) @@ -150,9 +155,9 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): # Remove headers not in WANTHDRS list and any Received: # lines that do not mention the list email address - newhdrs = [] - to = [] - cc = [] + newhdrs = list() + to = list() + cc = list() recvtime = None is_our_list = False for hdrname, hdrval in list(msg._headers): # noqa @@ -203,13 +208,13 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): # so there's one field for each header type. # # Save the place in newhdrs where the first to or cc list would - # have appeared so we can insert the merged list there rather + # have appeared, so we can insert the merged list there rather # than strangely at the end. elif lhdrname == 'to': for pair in email.utils.getaddresses([hdrval]): if pair[1] in cc: - # already in Cc, so no need to add it to To + # already in Cc, so no need to add it to "To" continue to.append(formataddr(pair)) @@ -256,6 +261,9 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): notourlist += 1 continue + if extrahdrs: + newhdrs += extrahdrs + msg._headers = newhdrs msgdate = recvtime @@ -278,10 +286,10 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): try: outbox.add(msg.as_string(policy=EMLPOLICY).encode()) - seenids.append(msgid) + seenids.add(msgid) knownset.add(msgid) writecount[mboxname] += 1 - except: + except: # noqa # Oh well, toss it pass @@ -298,66 +306,84 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): print(' %s: %s new (%s total)' % (os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname]))) outboxes[mboxname].close() - return seenids else: print('No new messages found.') - return None + + return seenids -def main(args): - if not args.asmaildir and not 
os.path.isdir(args.exportdir): +def main(args: argparse.Namespace): + if not args.as_maildir and not os.path.isdir(args.exportdir): os.mkdir(args.exportdir) - if args.knownids and os.path.exists(args.knownids): - with open(args.knownids, 'r') as fh: - knownids = fh.read().splitlines() - fh.close() - print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids)) + if args.known_ids and os.path.exists(args.known_ids): + if args.known_ids.endswith('.sqlite3'): + import sqlite3 + dbconn = sqlite3.connect(args.known_ids, sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) + cur = dbconn.cursor() + rows = cur.execute('SELECT mid FROM msgmap').fetchall() + knownids = {x[0] for x in rows} + else: + with open(args.known_ids, 'r') as fh: + knownids = set(fh.read().splitlines()) + fh.close() + print('Loaded %s message-ids from "%s"' % (len(knownids), args.known_ids)) else: # should we load message-ids from existing mailboxes found in the export dir? # right now we're just appending to them, which is probably not expected behaviour. 
- knownids = [] + knownids = set() if not args.source: print('You have to specify at least one source') sys.exit(1) - # Make list ID matching case insensitive to match more mail - listids = [listid.lower() for listid in args.listids] + # Make list ID matching case-insensitive to match more mail + if args.list_ids: + listids = [listid.lower() for listid in args.list_ids] + else: + listids = ['*'] + + extrahdrs = list() + if args.extrahdrs: + for hdr in args.extrahdrs: + name, val = hdr.split(':', maxsplit=1) + if val.strip(): + extrahdrs.append((name.strip(), val.strip())) - newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.asmaildir) + newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.as_maildir, + extrahdrs) - if newids is None or not args.knownids: + if newids is None or not args.known_ids or args.known_ids.endswith('.sqlite3'): sys.exit(0) - new_idlist = knownids + newids - print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids))) - with open(args.knownids, 'w') as fh: - fh.write('\n'.join(new_idlist)) + knownids.update(newids) + print('Wrote %s msgids into %s (%s new)' % (len(knownids), args.known_ids, len(newids))) + with open(args.known_ids, 'w') as fh: + fh.write('\n'.join(knownids)) fh.close() if __name__ == '__main__': - import argparse - # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Make a mbox of LKML messages we haven't yet archived", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument('-source', nargs='+', + parser.add_argument('-s', '--source', nargs='+', help=('Mbox file with archives, can be multiple. 
' 'Paths with trailing "/" will be treated as maildirs.')) - parser.add_argument('-exportdir', required=True, default='list-archives', + parser.add_argument('-e', '--exportdir', required=True, default='list-archives', help='Export dir where to put sanitized archives') - parser.add_argument('-asmaildir', action='store_true', default=False, + parser.add_argument('-m', '--as-maildir', action='store_true', default=False, help='Export as maildir instead of mailboxes') - parser.add_argument('-knownids', - help='File with known Message-IDs (one per line)') - parser.add_argument('-listids', required=True, nargs='+', - help='List ID to match, can be multiple') - parser.add_argument('-rejected', + parser.add_argument('-k', '--known-ids', + help='File with known Message-IDs (one per line, or msgmap.sqlite3)') + parser.add_argument('-l', '--list-ids', nargs='+', + help='Limit to just these list-ids (can be multiple)') + parser.add_argument('-r', '--rejected', help='Mailbox file where to save messages that were rejected ' '(adds X-Import-Rejected-Reason header)') + parser.add_argument('-x', '--extrahdrs', nargs='+', metavar='FULLHDR', + help='Extra headers to inject into each message') main(parser.parse_args()) |