aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-03-09 12:07:25 -0400
committer Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-03-09 12:07:25 -0400
commit 84df4e5487b9c3e131774ae6bf3c50794b822265 (patch)
tree 3f8e0b81f400f8532ca153a66e923103b5faf18d
parent d82c47fe759772cadcda358224cf77ad1b529711 (diff)
download korg-helpers-84df4e5487b9c3e131774ae6bf3c50794b822265.tar.gz
Implement strict thread matching
Public-Inbox will return loosely matched threads, which is almost never what we want for get-lore-mbox. This version implements strict thread matching to make sure that we only pay attention to messages that have clear references to the message-id passed as parameter.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x get-lore-mbox.py 51
1 files changed, 49 insertions, 2 deletions
diff --git a/get-lore-mbox.py b/get-lore-mbox.py
index cca213f..572bc43 100755
--- a/get-lore-mbox.py
+++ b/get-lore-mbox.py
@@ -30,7 +30,7 @@ charset.add_charset('utf-8', None)
emlpolicy = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None)
logger = logging.getLogger('get-lore-mbox')
-VERSION = '0.2.13'
+VERSION = '0.2.14'
# You can use bash-style globbing here
WANTHDRS = [
@@ -744,7 +744,54 @@ def get_pi_thread_by_msgid(msgid, config, cmdargs, session):
loc.scheme, loc.netloc, cmdargs.useproject, msgid)
logger.debug('Will query: %s', t_mbx_url)
logger.critical('Grabbing thread from %s', loc.netloc)
- return get_pi_thread_by_url(t_mbx_url, savefile, session)
+ pi_mbx = get_pi_thread_by_url(t_mbx_url, '%s-loose' % savefile, session)
+ return get_strict_thread(pi_mbx, msgid, savefile)
+
+
+def get_strict_thread(pi_mbx, msgid, savefile):
+ pmbx = mailbox.mbox(pi_mbx)
+ smbx = mailbox.mbox(savefile)
+ want = {msgid}
+ got = set()
+ seen = set()
+ while True:
+ for msg in pmbx:
+ c_msgid = LoreMessage.get_clean_msgid(msg)
+ seen.add(c_msgid)
+ if c_msgid in got:
+ continue
+
+ refs = list()
+ for ref in msg.get('References', msg.get('In-Reply-To', '')).split():
+ ref = ref.strip().strip('<>')
+ if ref in got or ref in want:
+ want.add(c_msgid)
+ elif len(ref):
+ refs.append(ref)
+
+ if c_msgid in want:
+ smbx.add(msg)
+ got.add(c_msgid)
+ want.update(refs)
+ want.discard(c_msgid)
+ logger.debug('Kept in thread: %s', c_msgid)
+
+ # Remove any entries not in "seen" (missing messages)
+ for c_msgid in set(want):
+ if c_msgid not in seen:
+ want.remove(c_msgid)
+ if not len(want):
+ break
+
+ if not len(smbx):
+ return None
+
+ if len(pmbx) > len(smbx):
+ logger.info('Reduced thread to strict matches only (%s->%s)', len(pmbx), len(smbx))
+ pmbx.close()
+ smbx.close()
+ os.unlink(pi_mbx)
+ return savefile
def mbox_to_am(mboxfile, config, cmdargs):