diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-02-04 21:27:36 -0500 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-02-04 21:27:36 -0500 |
commit | f734b1a74939eb58041d15bbdb22f3df00814548 (patch) | |
tree | 904a9493159e9973d391d1aefce0fee410c97447 | |
parent | bb4aa13576587848f955248dfa0d742f4f655b85 (diff) | |
download | korg-helpers-f734b1a74939eb58041d15bbdb22f3df00814548.tar.gz |
Add a feature to fetch newer series revisions
You can now pass "-c/--check-newer-revisions" to query lore.kernel.org
for newer revisions of a series.
I'm not overly fond of how this is done -- it will need rewriting once
lore/public-inbox supports proper search-based API. However, it does its
job and is what maintainers are asking for.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x | get-lore-mbox.py | 281 |
1 files changed, 224 insertions, 57 deletions
diff --git a/get-lore-mbox.py b/get-lore-mbox.py index 5f45da4..146ad2f 100755 --- a/get-lore-mbox.py +++ b/get-lore-mbox.py @@ -17,8 +17,11 @@ import fnmatch import time import requests +import urllib.parse +import xml.etree.ElementTree as ET import gzip +from tempfile import mkstemp from email import charset charset.add_charset('utf-8', None) logger = logging.getLogger('get-lore-mbox') @@ -131,6 +134,16 @@ def get_msgid_from_stdin(): sys.exit(1) +def get_pi_thread_by_url(t_mbx_url, savefile): + resp = requests.get(t_mbx_url) + t_mbox = gzip.decompress(resp.content) + resp.close() + with open(savefile, 'wb') as fh: + logger.debug('Saving %s', savefile) + fh.write(t_mbox) + return savefile + + def get_pi_thread_by_msgid(msgid, config, cmdargs): wantname = cmdargs.wantname outdir = cmdargs.outdir @@ -144,20 +157,19 @@ def get_pi_thread_by_msgid(msgid, config, cmdargs): canonical = resp.headers['Location'].rstrip('/') resp.close() t_mbx_url = '%s/t.mbox.gz' % canonical - logger.critical('Grabbing thread from %s', t_mbx_url) - resp = requests.get(t_mbx_url) - t_mbox = gzip.decompress(resp.content) - resp.close() if wantname: savefile = os.path.join(outdir, wantname) else: # Save it into msgid.mbox savefile = '%s.t.mbx' % msgid savefile = os.path.join(outdir, savefile) - with open(savefile, 'wb') as fh: - logger.debug('Saving %s', savefile) - fh.write(t_mbox) - return savefile + + loc = urllib.parse.urlparse(t_mbx_url) + if cmdargs.useproject: + t_mbx_url = '%s://%s/%s/%s/t.mbox.gz' % ( + loc.scheme, loc.netloc, cmdargs.useproject, msgid) + logger.critical('Grabbing thread from %s', loc.netloc) + return get_pi_thread_by_url(t_mbx_url, savefile) def get_plain_part(msg, ensurediff=False): @@ -170,7 +182,7 @@ def get_plain_part(msg, ensurediff=False): if body is None: continue body = body.decode('utf-8', errors='replace') - if ensurediff and not re.search(r'^---.*\n\+\+\+', body, re.MULTILINE): + if ensurediff and not body_contains_diff(body): continue break return body 
@@ -205,66 +217,67 @@ def mbox_to_am(mboxfile, config, cmdargs): count = len(mbx) logger.info('Analyzing %s messages in the thread', count) am_kept = list() - msgid_map = dict() slug = None cover_keys = dict() sorted_keys = [None, None] trailer_map = dict() - expected_count = 1 cur_vn = None vn = None multiple_revisions = False + msgid_map = dict() + irt_map = dict() + # Go through the mbox once to build a message map: for key, msg in mbx.items(): msgid = get_clean_msgid(msg) + irtid = get_clean_msgid(msg, header='In-Reply-To') msgid_map[msgid] = key - subject = re.sub(r'\s+', ' ', msg['Subject']) - logger.debug('Looking at msg %s: %s', key, subject) - - # Start by looking at prefixes in the subject - matches = re.search(r'\[PATCH([^\]]*)\]', subject, re.IGNORECASE) - if not matches: - # if the key is 0, it may be a cover letter - if key == 0: - body = get_plain_part(msg) - if re.search(r'^\s*\d+\sfile.*\d+ insertion.*\d+ deletion', body, re.MULTILINE | re.IGNORECASE): - # Looks like a cover letter, so keep it in mind, unless we find - # something else better suited to be a cover letter - logger.debug(' Probaby a cover letter') - cover_keys[1] = key - # Ignoring this message + if irtid is not None: + if irtid not in irt_map.keys(): + irt_map[irtid] = list() + irt_map[irtid].append(key) + # Go through it slowly now + for key, msg in mbx.items(): + subj_info = get_subject_info(msg['Subject']) + logger.debug('Looking at msg %s: %s', key, subj_info['full_subject']) + body = get_plain_part(msg) + msgid = get_clean_msgid(msg) + irtid = get_clean_msgid(msg, header='In-Reply-To') + has_diffstat = body_contains_diffstat(body) + has_diff = body_contains_diff(body) + + # if it has no in-reply-to, but other messages I-R-T to it, then + # it's probably a cover letter that doesn't follow the standard 00/NN notation + if irtid is None and msgid in irt_map.keys(): + logger.debug(' Probaby a cover letter') + cover_keys[subj_info['revision']] = key continue - cur_count = 1 - 
expected_count = 1 - new_vn = 1 - for prefix in matches.groups()[0].split(): - # Does it match \d/\d? - if re.search(r'\d/\d', prefix): - cur, expected = prefix.split('/') - cur_count = int(cur) - expected_count = int(expected) - # Is does it have a v\d? - matches = re.search(r'v(\d+)', prefix, re.IGNORECASE) - if matches: - new_vn = int(matches.groups()[0]) + if subj_info['revision_inferred'] and irtid is not None: + # Grab revision info from the cover letter + cover_subj_info = get_subject_info(mbx[msgid_map[irtid]]['Subject']) + subj_info['revision'] = cover_subj_info['revision'] + # Make sure sorted_keys has enough members + if len(sorted_keys) < subj_info['expected'] + 1: + sorted_keys = [None] * (subj_info['expected'] + 1) + + new_vn = subj_info['revision'] if cur_vn is None or new_vn > cur_vn: - if new_vn != 1: - if wantver and wantver != new_vn: - logger.info('Found series revision: v%s (ignored)', new_vn) - else: - logger.info('Found series revision: v%s', new_vn) + if wantver and wantver != new_vn: + logger.info('Found series revision: v%s (ignored)', new_vn) + else: + logger.info('Found series revision: v%s', new_vn) if cur_vn is not None and new_vn > cur_vn: multiple_revisions = True if wantver is None or wantver == new_vn: # Blow away anything we currently have in sorted_keys - sorted_keys = [None] * (expected_count + 1) + sorted_keys = [None] * (subj_info['expected'] + 1) slug = None cur_vn = new_vn elif vn is None: cur_vn = new_vn if wantver is not None and wantver != cur_vn: - logger.debug(' Ignoring v%s: %s', cur_vn, subject) + logger.debug(' Ignoring v%s: %s', cur_vn, subj_info['full_subject']) continue vn = cur_vn @@ -281,25 +294,21 @@ def mbox_to_am(mboxfile, config, cmdargs): slug = '%s_%s' % (prefix, author) if cur_vn != 1: slug = 'v%s_%s' % (cur_vn, slug) - logger.debug(' Processing: %s', subject) - # If the count is 00/NN, it's the cover letter - if cur_count == 0 and cur_vn not in cover_keys.keys(): + # If the counter is 0, it's definitely 
the cover letter + if subj_info['counter'] == 0 and cur_vn not in cover_keys.keys(): # Found the cover letter logger.debug(' Found a cover letter for v%s', cur_vn) am_kept.append(key) - sorted_keys[cur_count] = key + sorted_keys[subj_info['revision']] = key cover_keys[cur_vn] = key continue - body = get_plain_part(msg) - # Do we have a '^---' followed by '^+++' in the body anywhere? - if re.search(r'^---.*\n\+\+\+', body, re.MULTILINE): - # Contains a diff + if has_diff: # Do we already have a match for this, though? - if sorted_keys[cur_count] is None: + if sorted_keys[subj_info['counter']] is None: am_kept.append(key) - sorted_keys[cur_count] = key + sorted_keys[subj_info['counter']] = key continue # Do we have something that looks like a new trailer? matches = re.search(r'^\s*([\w-]+: .*<\S+>)\s*$', body, re.MULTILINE) @@ -347,7 +356,7 @@ def mbox_to_am(mboxfile, config, cmdargs): at = 1 for key in sorted_keys[1:]: if key is None: - logger.error(' ERROR: missing [%s/%s]!', at, expected_count) + logger.error(' ERROR: missing [%s/%s]!', at, len(sorted_keys)-1) have_missing = True else: msg = mbx[key] @@ -412,6 +421,156 @@ def mbox_to_am(mboxfile, config, cmdargs): return am_filename +def get_subject_info(subject): + subject = re.sub(r'\s+', ' ', subject).strip() + subject_info = { + 'full_subject': subject, + 'reply': False, + 'resend': False, + 'rfc': False, + 'revision': 1, + 'revision_inferred': True, + 'counter': 1, + 'expected': 1, + 'prefixes': list(), + 'subject': None, + } + # Is it a reply? 
+ if re.search(r'^\w+:\s*\[', subject): + subject_info['reply'] = True + subject = re.sub(r'^\w+:\s*\[', '[', subject) + + # Find all [foo] in the title + while subject.find('[') == 0: + matches = re.search(r'^\[([^\]]*)\]', subject) + for chunk in matches.groups()[0].split(): + if re.search(r'^\d+/\d+$', chunk): + counters = chunk.split('/') + subject_info['counter'] = int(counters[0]) + subject_info['expected'] = int(counters[1]) + elif re.search(r'^v\d+$', chunk, re.IGNORECASE): + subject_info['revision'] = int(chunk[1:]) + subject_info['revision_inferred'] = False + elif chunk.lower() == 'rfc': + subject_info['rfc'] = True + elif chunk.lower() == 'resend': + subject_info['resend'] = True + subject_info['prefixes'].append(chunk.lower()) + subject = re.sub(r'^\s*\[[^\]]*\]\s*', '', subject) + subject_info['subject'] = subject + + return subject_info + + +def body_contains_diffstat(body): + if re.search(r'^\s*\d+\sfile.*\d+ insertion.*\d+ deletion', body, re.MULTILINE | re.IGNORECASE): + return True + return False + + +def body_contains_diff(body): + if re.search(r'^---.*\n\+\+\+', body, re.MULTILINE): + return True + return False + + +def get_newest_series(mboxfile, cmdargs): + # Open the mbox and find the latest series mentioned in it + mbx = mailbox.mbox(mboxfile) + base_msg = None + latest_revision = None + seen_msgids = list() + seen_covers = list() + for key, msg in mbx.items(): + msgid = get_clean_msgid(msg) + seen_msgids.append(msgid) + subj_info = get_subject_info(msg['Subject']) + # Ignore replies or counters above 1 + if subj_info['reply'] or subj_info['counter'] > 1: + continue + if latest_revision is None or subj_info['revision'] > latest_revision: + # New revision + latest_revision = subj_info['revision'] + if subj_info['counter'] == 0: + # And a cover letter, nice. 
This is the easy case + base_msg = msg + seen_covers.append(latest_revision) + continue + if subj_info['counter'] == 1: + if latest_revision not in seen_covers: + # A patch/series without a cover letter + base_msg = msg + + # Get subject info from base_msg again + subj_info = get_subject_info(base_msg['Subject']) + if not len(subj_info['prefixes']): + logger.debug('Not checking for new revisions: no prefixes on the cover letter.') + mbx.close() + return + base_msgid = get_clean_msgid(base_msg) + fromeml = email.utils.getaddresses(base_msg.get_all('from', []))[0][1] + msgdate = email.utils.parsedate_tz(str(base_msg['Date'])) + startdate = time.strftime('%Y%m%d', msgdate[:9]) + listarc = base_msg.get_all('List-Archive')[-1].strip('<>') + q = 's:"%s" AND f:"%s" AND d:%s..' % (subj_info['subject'], fromeml, startdate) + queryurl = '%s?%s' % (listarc, urllib.parse.urlencode({'q': q, 'x': 'A', 'o': '-1'})) + logger.critical('Checking for newer revisions on %s', listarc) + logger.debug('Query URL: %s', queryurl) + resp = requests.get(queryurl) + # try to parse it + tree = ET.fromstring(resp.content) + resp.close() + ns = {'atom': 'http://www.w3.org/2005/Atom'} + entries = tree.findall('atom:entry', ns) + + for entry in entries: + title = entry.find('atom:title', ns).text + subj_info = get_subject_info(title) + if subj_info['reply'] or subj_info['counter'] > 1: + logger.debug('Ignoring result (not interesting): %s', title) + continue + link = entry.find('atom:link', ns).get('href') + if subj_info['revision'] < latest_revision: + logger.debug('Ignoring result (not new revision): %s', title) + continue + if link.find('/%s/' % base_msgid) > 0: + logger.debug('Ignoring result (same thread as ours):%s', title) + continue + if subj_info['revision'] == 1 and subj_info['revision'] == latest_revision: + # Someone sent a separate message with an identical title but no new vX in the subject line + # It's *probably* a new revision. 
+ logger.debug('Likely a new revision: %s', title) + elif subj_info['revision'] > latest_revision: + logger.debug('Definitely a new revision [v%s]: %s', subj_info['revision'], title) + else: + logger.debug('No idea what this is: %s', title) + continue + t_mbx_url = '%st.mbox.gz' % link + savefile = mkstemp('get-lore-mbox')[1] + nt_mboxfile = get_pi_thread_by_url(t_mbx_url, savefile) + nt_mbx = mailbox.mbox(nt_mboxfile) + # Append all of these to the existing mailbox + new_adds = 0 + for nt_msg in nt_mbx: + nt_msgid = get_clean_msgid(nt_msg) + if nt_msgid in seen_msgids: + logger.debug('Duplicate message, skipping') + continue + nt_subject = re.sub(r'\s+', ' ', nt_msg['Subject']) + logger.debug('Adding: %s', nt_subject) + new_adds += 1 + mbx.add(nt_msg) + seen_msgids.append(nt_msgid) + nt_mbx.close() + if new_adds: + logger.info('Added %s messages from thread: %s', new_adds, title) + logger.debug('Removing temporary %s', nt_mboxfile) + os.unlink(nt_mboxfile) + + # We close the mbox, since we'll be reopening it later + mbx.close() + + def main(cmdargs): logger.setLevel(logging.DEBUG) @@ -437,6 +596,10 @@ def main(cmdargs): msgid = msgid.strip('<>') config = get_config_from_git() mboxfile = get_pi_thread_by_msgid(msgid, config, cmdargs) + + if mboxfile and cmdargs.checknewer: + get_newest_series(mboxfile, cmdargs) + if mboxfile and cmdargs.amready: # Move it into -thread threadmbox = '%s-thread' % mboxfile @@ -457,6 +620,10 @@ if __name__ == '__main__': help='Message ID to process, or pipe a raw message') parser.add_argument('-o', '--outdir', default='.', help='Output into this directory') + parser.add_argument('-p', '--use-project', dest='useproject', default=None, + help='Use a specific project instead of guessing (linux-mm, linux-hardening, etc)') + parser.add_argument('-c', '--check-newer-revisions', dest='checknewer', action='store_true', default=False, + help='Check if newer patch revisions exist') parser.add_argument('-a', '--am-ready', dest='amready', 
action='store_true', default=False, help='Make an mbox ready for git am') parser.add_argument('-t', '--apply-cover-trailers', dest='covertrailers', action='store_true', default=False, |