Try both utf-8 and windows-1252 for decoding email

Recent submissions from Cirrus were classified as spam by the lore analysis robot script. This is because Cirrus encoded its email as windows-1252, which failed to decode as UTF-8. Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
author: Mario Limonciello <mario.limonciello@amd.com> 2023-11-16 10:42:10 -0600
committer: Mario Limonciello <mario.limonciello@amd.com> 2023-11-16 10:42:10 -0600
commit: 8228c2222fcf5791fe5643252e4d248839c199e9 (patch)
tree: 936f598573e4b45f62cb73159e655ba77d26ded7
parent: 58ec43257cffef033c3210d92d3dd57ac431f262 (diff)
download: linux-firmware-8228c2222fcf5791fe5643252e4d248839c199e9.tar.gz
1 files changed, 22 insertions, 9 deletions
diff --git a/contrib/process_linux_firmware.py b/contrib/process_linux_firmware.py
index 668e35c0..ea108391 100755
--- a/contrib/process_linux_firmware.py
+++ b/contrib/process_linux_firmware.py
@@ -34,6 +34,8 @@ content_types = {
 def classify_content(content):
     # load content into the email library
     msg = email.message_from_string(content)
+    decoded = None
+    body = None
 
     # check the subject
     subject = msg["Subject"]
@@ -42,17 +44,28 @@ def classify_content(content):
     if "PATCH" in subject:
         return ContentType.PATCH
 
-    for part in msg.walk():
-        if part.get_content_type() == "text/plain":
+    if msg.is_multipart():
+        for part in msg.walk():
+            if part.get_content_type() == "text/plain":
+                body = part.get_payload(decode=True)
+    else:
+        body = msg.get_payload(decode=True)
+
+    if body:
+        for encoding in ["utf-8", "windows-1252"]:
             try:
-                body = part.get_payload(decode=True).decode("utf-8")
-                for key in content_types.keys():
-                    if key in body:
-                        return content_types[key]
-                break
-            except UnicodeDecodeError as e:
-                logging.warning("Failed to decode email: %s, treating as SPAM" % e)
+                decoded = body.decode(encoding)
                 break
+            except UnicodeDecodeError:
+                pass
+
+    if decoded:
+        for key in content_types.keys():
+            if key in decoded:
+                return content_types[key]
+    else:
+        logging.warning("Failed to decode email: %s, treating as SPAM", body)
+
     return ContentType.SPAM
author	Mario Limonciello <mario.limonciello@amd.com>	2023-11-16 10:42:10 -0600
committer	Mario Limonciello <mario.limonciello@amd.com>	2023-11-16 10:42:10 -0600
commit	8228c2222fcf5791fe5643252e4d248839c199e9 (patch)
tree	936f598573e4b45f62cb73159e655ba77d26ded7
parent	58ec43257cffef033c3210d92d3dd57ac431f262 (diff)
download	linux-firmware-8228c2222fcf5791fe5643252e4d248839c199e9.tar.gz