diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2022-10-20 14:11:29 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2022-10-20 14:11:29 -0400 |
commit | 354fc16e397312c8972c6b48e7645d2c98d71db3 (patch) | |
tree | 63cf3ce0686bd08aa79e42c2aa68afbe226cb308 | |
parent | a72a55b06660b249c82cdba32a4dc54e8a7ea2f3 (diff) | |
download | korg-helpers-354fc16e397312c8972c6b48e7645d2c98d71db3.tar.gz |
list-collectors: small improvements
Add a few small improvements to the list-archive-collector and
list-archive-maker scripts, making it easier to deal with partial
archives.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x | list-archive-collector.py | 137 | ||||
-rwxr-xr-x | list-archive-maker.py | 112 |
2 files changed, 136 insertions, 113 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py index b3186ad..99e22be 100755 --- a/list-archive-collector.py +++ b/list-archive-collector.py @@ -25,23 +25,25 @@ import quopri import base64 import gzip import io -import nntplib import requests import logging import subprocess +import argparse try: - import cchardet as chardet # noqa + import cchardet as chardet # noqa except ImportError: - import chardet + import chardet # noqa from tempfile import mkstemp from bs4 import BeautifulSoup # noqa from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry +from urllib3 import Retry + +from typing import Optional, Union, List, Set from email import charset -charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa +charset.add_charset('utf-8', None) # Used for our requests session REQSESSION = None @@ -59,7 +61,7 @@ EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length logger = logging.getLogger(__name__) -def clean_header(hdrval): +def clean_header(hdrval: str) -> Optional[str]: if hdrval is None: return '' @@ -78,7 +80,7 @@ def clean_header(hdrval): return new_hdrval.strip() -def get_requests_session(): +def get_requests_session() -> requests.Session: global REQSESSION if REQSESSION is None: REQSESSION = requests.session() @@ -94,14 +96,14 @@ def get_requests_session(): return REQSESSION -def lore_get_message(msgid): +def lore_get_message(msgid: str) -> email.message.Message: # See where we're redirected rurl = f'https://lore.kernel.org/r/{msgid}' rses = get_requests_session() resp = rses.head(rurl) if resp.status_code < 300 or resp.status_code > 400: # Not known on lore - return None + raise LookupError # Pop msgid from the end of the redirect msgurl = resp.headers['Location'] + 'raw' resp.close() @@ -111,33 +113,7 @@ def lore_get_message(msgid): return msg -# Turned off for now -def patchwork_get_headers(msgid): - url = f'https://patchwork.kernel.org/api/1.2/patches/' - params = 
[ - ('msgid', msgid), - ] - rses = get_requests_session() - resp = rses.get(url, params=params, stream=False) - if resp.status_code > 200: - return None - - jj = resp.json() - if not len(jj): - return None - - # we only care about one - p_id = jj[0].get('id') - resp = rses.get(f'{url}{p_id}', stream=False) - if resp.status_code > 200: - return None - - logger.info(' found on patchwork') - jj = resp.json() - return jj.get('headers') - - -def lookaside_fillin(msg): +def lookaside_fillin(msg: email.message.Message) -> bool: wanthdrs = [ 'To', 'Cc', @@ -147,12 +123,10 @@ def lookaside_fillin(msg): 'X-Mailer', ] msgid = str(msg.get('Message-Id')).strip('<>') - lmsg = lore_get_message(msgid) - if not lmsg: + try: + lmsg = lore_get_message(msgid) + except LookupError: return False - # lmsg = patchwork_get_headers(msgid) - # if not lmsg: - # return False for wanthdr in wanthdrs: if not msg.get(wanthdr) and lmsg.get(wanthdr): @@ -161,7 +135,8 @@ def lookaside_fillin(msg): return True -def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): +def marc_get_message(marc_list_id: str, msgnum: str, listid: str, toaddr: str, + lookaside: bool) -> email.message.Message: rses = get_requests_session() url = f'{MARCURL}/?l={marc_list_id}&m={msgnum}&q=mbox' logger.info(' grabbing message %s', msgnum) @@ -179,7 +154,7 @@ def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): if not msg.get('Message-Id'): logger.info(' No message-id, ignored') # Can't use it anyway - return None + raise LookupError hdrs = list() @@ -227,7 +202,7 @@ def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): return msg -def check_if_spam(bmsg): +def check_if_spam(bmsg: bytes) -> bool: if not os.path.exists('/usr/bin/spamc'): return False @@ -243,7 +218,16 @@ def check_if_spam(bmsg): return True -def add_msg_to_mbx(msg, mbx, checkspam): +def add_msg_to_mbx(msg: email.message.Message, mbx: Union[mailbox.Mailbox, mailbox.Maildir], + checkspam: bool, cleansubj: 
Optional[str]) -> None: + oldsubj = clean_header(msg.get('Subject', '')) + if cleansubj and cleansubj in oldsubj: + # We only remove it if it's ^thatsring, or ^Re: thatstring + if oldsubj.startswith(cleansubj): + msg.replace_header('Subject', oldsubj.replace(cleansubj, '', 1).strip()) + if oldsubj.startswith(f'Re: {cleansubj}'): + msg.replace_header('Subject', oldsubj.replace(f'Re: {cleansubj}', 'Re:', 1).strip()) + if msg.get_default_type() == 'text/plain': try: payload = msg.get_payload(decode=True) @@ -261,6 +245,7 @@ def add_msg_to_mbx(msg, mbx, checkspam): logger.info(' spam: %s', msg['Subject']) return + logger.info(' added: %s', msg['Subject']) mbx.add(bmsg) except: # noqa # Throw it out, because life is too short to figure out all possible ways @@ -269,7 +254,7 @@ def add_msg_to_mbx(msg, mbx, checkspam): return -def marc_get_full_thread(marc_list_id, thread_id): +def marc_get_full_thread(marc_list_id: str, thread_id: str) -> List[str]: cp = 1 rses = get_requests_session() msgnums = list() @@ -308,7 +293,7 @@ def marc_get_full_thread(marc_list_id, thread_id): return msgnums -def parse_pipermail_index(pipermail_url): +def parse_pipermail_index(pipermail_url: str) -> Set[str]: logger.info('Grabbing pipermail index from %s', pipermail_url) rses = get_requests_session() resp = rses.get(pipermail_url) @@ -325,7 +310,7 @@ def parse_pipermail_index(pipermail_url): return mboxes -def parse_hyperkitty_index(hyperkitty_url): +def parse_hyperkitty_index(hyperkitty_url: str) -> Set[str]: logger.info('Grabbing hyperkitty index from %s', hyperkitty_url) rses = get_requests_session() resp = rses.get(hyperkitty_url) @@ -354,7 +339,8 @@ def parse_hyperkitty_index(hyperkitty_url): return mboxes -def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checkspam): +def grab_pipermail_archive(pipermail_url: str, mbx: Union[mailbox.Mailbox, mailbox.Maildir], + args: argparse.Namespace) -> None: tmpfile = mkstemp('pipermail')[1] chunks = pipermail_url.split('/') 
if pipermail_url[0] == '/': @@ -432,25 +418,25 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks if newfrom != oldfrom: msg.replace_header('From', newfrom) - if listid: - msg['List-Id'] = f'<{listid}>' + if args.listid: + msg['List-Id'] = f'<{args.listid}>' - if lookaside: + if args.lookaside: lookaside_fillin(msg) if not msg.get('To'): - msg['To'] = toaddr + msg['To'] = args.to # Fix in-reply-to irt = msg.get('in-reply-to') if irt and irt[0] != '<': msg.replace_header('In-Reply-To', f'<{irt}>') - add_msg_to_mbx(msg, mbx, checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) tmpmbx.close() os.unlink(tmpfile) -def get_marcinfo(args): +def get_marcinfo(args: argparse.Namespace) -> None: global MARCNICE if args.nice < 0.5: @@ -521,7 +507,7 @@ def get_marcinfo(args): break cp += 1 - mbx = mailbox.mbox(args.out) + mbx = get_outbox(args) for thdnum in thdnums: tnums = marc_get_full_thread(marc_list_id, thdnum) # last message starts the thread @@ -531,8 +517,9 @@ def get_marcinfo(args): if tnum in msgnums: msgnums.remove(tnum) time.sleep(MARCNICE) - msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside) - if not msg: + try: + msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside) + except LookupError: continue if not irt: @@ -541,7 +528,7 @@ def get_marcinfo(args): msg['References'] = irt msg['In-Reply-To'] = irt - add_msg_to_mbx(msg, mbx, args.checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) logger.info('Grabbing remaining unthreaded messages') for msgnum in msgnums: @@ -550,18 +537,18 @@ def get_marcinfo(args): if not msg: continue - add_msg_to_mbx(msg, mbx, args.checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) mbx.close() -def get_mailman(args): +def get_mailman(args: argparse.Namespace) -> None: if not args.to: args.to = args.listid.replace('.', '@', 1) - mbx = mailbox.mbox(args.out) + mbx = get_outbox(args) if 
args.url[0] == '/': - grab_pipermail_archive(args.url, mbx, args.listid, args.to, args.lookaside, args.checkspam) + grab_pipermail_archive(args.url, mbx, args) else: if args.mailman3: months = parse_hyperkitty_index(args.url) @@ -571,10 +558,11 @@ def get_mailman(args): print('Could not find any .txt.gz files listed at %s' % args.url) sys.exit(1) for month in months: - grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam) + grab_pipermail_archive(month, mbx, args) -def get_nntp(args): +def get_nntp(args: argparse.Namespace) -> None: + import nntplib # Expect in format nntp://news.gmane.org/gmane.linux.network logger.info('Connecting to %s', args.url) chunks = args.url.split('/') @@ -584,7 +572,7 @@ def get_nntp(args): resp, count, first, last, name = server.group(group) total = int(last) - mbx = mailbox.mbox(args.out) + mbx = get_outbox(args) aid = 1 while aid <= total: try: @@ -601,9 +589,9 @@ def get_nntp(args): try: msg.replace_header('List-Id', f'<{args.listid}>') except KeyError: - msg.add_header('List-Id', f'<{args.listid}') + msg.add_header('List-Id', f'<{args.listid}>') - add_msg_to_mbx(msg, mbx, args.checkspam) + add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject) except nntplib.NNTPTemporaryError: # Ignore one-off article failures -- probably deletes @@ -614,9 +602,14 @@ def get_nntp(args): mbx.close() -if __name__ == '__main__': - import argparse +def get_outbox(args: argparse.Namespace) -> Union[mailbox.Mailbox, mailbox.Maildir]: + if args.as_maildir: + logger.info('Will output into maildir %s', args.out) + return mailbox.Maildir(args.out) + return mailbox.mbox(args.out) + +if __name__ == '__main__': # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Collect external mail archives into a local mbox", @@ -635,6 +628,10 @@ if __name__ == '__main__': help='Run spamc to check messages for spam before adding') parser.add_argument('-o', '--out', required=True, help='Filename of the mailbox 
file to write out') + parser.add_argument('-m', '--as-maildir', action='store_true', default=False, + help='Output as maildir instead of mbox') + parser.add_argument('-c', '--clean-subject', + help='Remove this string from subjects (e.g. [listname])') subparsers = parser.add_subparsers(help='sub-command help', dest='subcmd') diff --git a/list-archive-maker.py b/list-archive-maker.py index 393e294..801d840 100755 --- a/list-archive-maker.py +++ b/list-archive-maker.py @@ -27,9 +27,12 @@ import mailbox import email.utils import email.policy import fnmatch +import argparse + +from typing import Tuple, List, Set from email import charset -charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa +charset.add_charset('utf-8', None) # Set our own policy EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None) @@ -62,7 +65,8 @@ WANTHDRS = {'return-path', __VERSION__ = '2.0' -def formataddr(pair): + +def formataddr(pair: Tuple[str, str]) -> str: try: return email.utils.formataddr(pair) except UnicodeEncodeError: @@ -70,11 +74,12 @@ def formataddr(pair): # drop the real name then. return email.utils.formataddr((None, pair[1])) -def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): - outboxes = {} - writecount = {} - seenids = [] - knownset = set(msgids) + +def process_archives(sources: List[str], outdir: str, knownset: Set[str], listids: List[str], + rejectsfile: str, asmaildir: bool, extrahdrs: List[Tuple[str, str]]) -> Set[str]: + outboxes = dict() + writecount = dict() + seenids = set() if asmaildir: outbox = mailbox.Maildir(outdir) @@ -84,7 +89,7 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): # convert listids into email addresses by replacing the first '.' to '@'. # if you're working with a mailing list that has a non-standard list-id, you # can specify the list email address as part of the listids to satisfy this check. 
- eaddrs = [] + eaddrs = list() for listid in listids: if listid.find('@') < 0: eaddrs.append(listid.replace('.', '@', 1)) @@ -150,9 +155,9 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): # Remove headers not in WANTHDRS list and any Received: # lines that do not mention the list email address - newhdrs = [] - to = [] - cc = [] + newhdrs = list() + to = list() + cc = list() recvtime = None is_our_list = False for hdrname, hdrval in list(msg._headers): # noqa @@ -203,13 +208,13 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): # so there's one field for each header type. # # Save the place in newhdrs where the first to or cc list would - # have appeared so we can insert the merged list there rather + # have appeared, so we can insert the merged list there rather # than strangely at the end. elif lhdrname == 'to': for pair in email.utils.getaddresses([hdrval]): if pair[1] in cc: - # already in Cc, so no need to add it to To + # already in Cc, so no need to add it to "To" continue to.append(formataddr(pair)) @@ -256,6 +261,9 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): notourlist += 1 continue + if extrahdrs: + newhdrs += extrahdrs + msg._headers = newhdrs msgdate = recvtime @@ -278,10 +286,10 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): try: outbox.add(msg.as_string(policy=EMLPOLICY).encode()) - seenids.append(msgid) + seenids.add(msgid) knownset.add(msgid) writecount[mboxname] += 1 - except: + except: # noqa # Oh well, toss it pass @@ -298,66 +306,84 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir): print(' %s: %s new (%s total)' % (os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname]))) outboxes[mboxname].close() - return seenids else: print('No new messages found.') - return None + + return seenids -def main(args): - if not args.asmaildir and not 
os.path.isdir(args.exportdir): +def main(args: argparse.Namespace): + if not args.as_maildir and not os.path.isdir(args.exportdir): os.mkdir(args.exportdir) - if args.knownids and os.path.exists(args.knownids): - with open(args.knownids, 'r') as fh: - knownids = fh.read().splitlines() - fh.close() - print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids)) + if args.known_ids and os.path.exists(args.known_ids): + if args.known_ids.endswith('.sqlite3'): + import sqlite3 + dbconn = sqlite3.connect(args.known_ids, sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) + cur = dbconn.cursor() + rows = cur.execute('SELECT mid FROM msgmap').fetchall() + knownids = {x[0] for x in rows} + else: + with open(args.known_ids, 'r') as fh: + knownids = set(fh.read().splitlines()) + fh.close() + print('Loaded %s message-ids from "%s"' % (len(knownids), args.known_ids)) else: # should we load message-ids from existing mailboxes found in the export dir? # right now we're just appending to them, which is probably not expected behaviour. 
- knownids = [] + knownids = set() if not args.source: print('You have to specify at least one source') sys.exit(1) - # Make list ID matching case insensitive to match more mail - listids = [listid.lower() for listid in args.listids] + # Make list ID matching case-insensitive to match more mail + if args.list_ids: + listids = [listid.lower() for listid in args.list_ids] + else: + listids = ['*'] + + extrahdrs = list() + if args.extrahdrs: + for hdr in args.extrahdrs: + name, val = hdr.split(':', maxsplit=1) + if val.strip(): + extrahdrs.append((name.strip(), val.strip())) - newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.asmaildir) + newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.as_maildir, + extrahdrs) - if newids is None or not args.knownids: + if newids is None or not args.known_ids or args.known_ids.endswith('.sqlite3'): sys.exit(0) - new_idlist = knownids + newids - print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids))) - with open(args.knownids, 'w') as fh: - fh.write('\n'.join(new_idlist)) + knownids.update(newids) + print('Wrote %s msgids into %s (%s new)' % (len(knownids), args.known_ids, len(newids))) + with open(args.known_ids, 'w') as fh: + fh.write('\n'.join(knownids)) fh.close() if __name__ == '__main__': - import argparse - # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Make a mbox of LKML messages we haven't yet archived", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument('-source', nargs='+', + parser.add_argument('-s', '--source', nargs='+', help=('Mbox file with archives, can be multiple. 
' 'Paths with trailing "/" will be treated as maildirs.')) - parser.add_argument('-exportdir', required=True, default='list-archives', + parser.add_argument('-e', '--exportdir', required=True, default='list-archives', help='Export dir where to put sanitized archives') - parser.add_argument('-asmaildir', action='store_true', default=False, + parser.add_argument('-m', '--as-maildir', action='store_true', default=False, help='Export as maildir instead of mailboxes') - parser.add_argument('-knownids', - help='File with known Message-IDs (one per line)') - parser.add_argument('-listids', required=True, nargs='+', - help='List ID to match, can be multiple') - parser.add_argument('-rejected', + parser.add_argument('-k', '--known-ids', + help='File with known Message-IDs (one per line, or msgmap.sqlite3)') + parser.add_argument('-l', '--list-ids', nargs='+', + help='Limit to just these list-ids (can be multiple)') + parser.add_argument('-r', '--rejected', help='Mailbox file where to save messages that were rejected ' '(adds X-Import-Rejected-Reason header)') + parser.add_argument('-x', '--extrahdrs', nargs='+', metavar='FULLHDR', + help='Extra headers to inject into each message') main(parser.parse_args()) |