Abstract out our own get_payload for better charset support

When we use msg.get_payload(decode=True), we can't blindly call .decode() on that, because we need to pay attention to the charset of the message. We're already doing various checks for this elsewhere, so move that logic into a static method and use that whenever we need to get the payload of a message that we didn't construct ourselves.

Reported-by: Rob Herring <robh@kernel.org>
Link: https://msgid.link/CAL_JsqJULTWSv8Ww3g=gdLTUqpcgJRD5HFXO_qsUK7L0JN7caw@mail.gmail.com
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2023-02-27 13:20:56 -0500
committer: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2023-02-27 13:24:37 -0500
commit: ebd05d6210e3421af2918dc178985789cc5fc390 (patch)
tree: dc45d4dc3e0d9ad94ee25c415009d2730285b7ea
parent: ada3021c64dd484e53219eb3af55da6c8f25d0ec (diff)
download: b4-ebd05d6210e3421af2918dc178985789cc5fc390.tar.gz
3 files changed, 46 insertions, 39 deletions
diff --git a/b4/__init__.py b/b4/__init__.py
index 4d5a6c9..674bff2 100644
--- a/b4/__init__.py
+++ b/b4/__init__.py
@@ -1040,37 +1040,7 @@ class LoreMessage:
             self.date = self.date.replace(tzinfo=datetime.timezone.utc)
 
         # walk until we find the first text/plain part
-        mcharset = self.msg.get_content_charset()
-        if not mcharset:
-            mcharset = 'utf-8'
-        self.charset = mcharset
-
-        for part in msg.walk():
-            cte = part.get_content_type()
-            if cte.find('/plain') < 0 and cte.find('/x-patch') < 0:
-                continue
-            payload = part.get_payload(decode=True)
-            if payload is None:
-                continue
-            pcharset = part.get_content_charset()
-            if not pcharset:
-                pcharset = mcharset
-            try:
-                payload = payload.decode(pcharset, errors='replace')
-                self.charset = pcharset
-            except LookupError:
-                # what kind of encoding is that?
-                # Whatever, we'll use utf-8 and hope for the best
-                payload = payload.decode('utf-8', errors='replace')
-                part.set_param('charset', 'utf-8')
-                self.charset = 'utf-8'
-            if self.body is None:
-                self.body = payload
-                continue
-            # If we already found a body, but we now find something that contains a diff,
-            # then we prefer this part
-            if DIFF_RE.search(payload):
-                self.body = payload
+        self.body, self.charset = LoreMessage.get_payload(self.msg)
 
         if self.body is None:
             # Woah, we didn't find any usable parts
@@ -1397,6 +1367,43 @@ class LoreMessage:
         return '\n'.join(out)
 
     @staticmethod
+    def get_payload(msg: email.message.Message) -> Tuple[str, str]:
+        # walk until we find the first text/plain part
+        mcharset = msg.get_content_charset()
+        if not mcharset:
+            mcharset = 'utf-8'
+
+        mbody = None
+        for part in msg.walk():
+            cte = part.get_content_type()
+            if cte.find('/plain') < 0 and cte.find('/x-patch') < 0:
+                continue
+            payload = part.get_payload(decode=True)
+            if payload is None:
+                continue
+            pcharset = part.get_content_charset()
+            if not pcharset:
+                pcharset = mcharset
+            try:
+                payload = payload.decode(pcharset, errors='replace')
+                mcharset = pcharset
+            except LookupError:
+                # what kind of encoding is that?
+                # Whatever, we'll use utf-8 and hope for the best
+                payload = payload.decode('utf-8', errors='replace')
+                part.set_param('charset', 'utf-8')
+                mcharset = 'utf-8'
+            if mbody is None:
+                mbody = payload
+                continue
+            # If we already found a body, but we now find something that contains a diff,
+            # then we prefer this part
+            if DIFF_RE.search(payload):
+                mbody = payload
+
+        return mbody, mcharset
+
+    @staticmethod
     def clean_header(hdrval):
         if hdrval is None:
             return ''
diff --git a/b4/ez.py b/b4/ez.py
index 74afddc..3971b6f 100644
--- a/b4/ez.py
+++ b/b4/ez.py
@@ -826,7 +826,7 @@ def update_trailers(cmdargs: argparse.Namespace) -> None:
         if not msg:
             continue
         commit_map[commit] = msg
-        body = msg.get_payload(decode=True).decode()
+        body, charset = b4.LoreMessage.get_payload(msg)
         patchid = b4.LoreMessage.get_patch_id(body)
         ls = b4.LoreSubject(msg.get('subject'))
         by_subject[ls.subject] = commit
@@ -876,7 +876,8 @@ def update_trailers(cmdargs: argparse.Namespace) -> None:
                 logger.debug('No match for %s', lmsg.full_subject)
                 continue
 
-            parts = b4.LoreMessage.get_body_parts(commit_map[commit].get_payload(decode=True).decode())
+            mbody, mcharset = b4.LoreMessage.get_payload(commit_map[commit])
+            parts = b4.LoreMessage.get_body_parts(mbody)
             for fltr in addtrailers:
                 if fltr not in parts[2]:
                     if commit not in updates:
@@ -1045,7 +1046,7 @@ def add_cover(csubject: b4.LoreSubject, msgid_tpt: str, patches: List[Tuple[str,
 
 def mixin_cover(cbody: str, patches: List[Tuple[str, email.message.Message]]) -> None:
     msg = patches[0][1]
-    pbody = msg.get_payload(decode=True).decode()
+    pbody, pcharset = b4.LoreMessage.get_payload(msg)
     pheaders, pmessage, ptrailers, pbasement, psignature = b4.LoreMessage.get_body_parts(pbody)
     cheaders, cmessage, ctrailers, cbasement, csignature = b4.LoreMessage.get_body_parts(cbody)
     nbparts = list()
@@ -1311,7 +1312,7 @@ def cmd_send(cmdargs: argparse.Namespace) -> None:
         for commit, msg in patches:
             if not msg:
                 continue
-            body = msg.get_payload(decode=True).decode()
+            body, charset = b4.LoreMessage.get_payload(msg)
             btrs, junk = b4.LoreMessage.find_trailers(body)
             for btr in btrs:
                 if btr.type != 'person':
@@ -1909,7 +1910,7 @@ def cmd_prep(cmdargs: argparse.Namespace) -> None:
                 if b4.LoreMessage.get_clean_msgid(msg) == msgid:
                     # Prepare annotated tag body from the cover letter
                     lsubject = b4.LoreSubject(msg.get('subject'))
-                    cbody = msg.get_payload(decode=True).decode()
+                    cbody, charset = b4.LoreMessage.get_payload(msg)
                     prefixes = lsubject.get_extra_prefixes()
                     if prefixes:
                         subject = '[%s] %s' % (' '.join(prefixes), lsubject.subject)
diff --git a/b4/mbox.py b/b4/mbox.py
index 0b0fc40..fb9d092 100644
--- a/b4/mbox.py
+++ b/b4/mbox.py
@@ -536,9 +536,8 @@ def get_extra_series(msgs: list, direction: int = 1, wantvers: Optional[int] = N
             continue
 
         if not lsub.reply:
-            payload = msg.get_payload(decode=True)
-            if isinstance(payload, bytes):
-                payload = payload.decode()
+            payload, charset = b4.LoreMessage.get_payload(msg)
+            if payload:
                 matches = re.search(r'^change-id:\s+(\S+)', payload, flags=re.I | re.M)
                 if matches:
                     logger.debug('Found change-id %s', matches.groups()[0])
author	Konstantin Ryabitsev <konstantin@linuxfoundation.org>	2023-02-27 13:20:56 -0500
committer	Konstantin Ryabitsev <konstantin@linuxfoundation.org>	2023-02-27 13:24:37 -0500
commit	ebd05d6210e3421af2918dc178985789cc5fc390 (patch)
tree	dc45d4dc3e0d9ad94ee25c415009d2730285b7ea
parent	ada3021c64dd484e53219eb3af55da6c8f25d0ec (diff)
download	b4-ebd05d6210e3421af2918dc178985789cc5fc390.tar.gz