Bläddra i källkod

fix utf8 decode error

Yuxin Wu 1 år sedan
förälder
incheckning
6d9e4a00a9
2 ändrade filer med 15 tillägg och 1 borttagning
  1. 1 0
      requirements.txt
  2. 14 1
      wechat/parser.py

+ 1 - 0
requirements.txt

@@ -6,3 +6,4 @@ requests
 pysqlcipher3>=1.0.3
 csscompressor
 numpy
+ipython

+ 14 - 1
wechat/parser.py

@@ -26,7 +26,11 @@ class WeChatDBParser(object):
         """ db_fname: a decoded EnMicroMsg.db"""
         self.db_fname = db_fname
         self.db_conn = sqlite3.connect(self.db_fname)
+        self.db_conn_bytes = sqlite3.connect(self.db_fname)
+        # https://stackoverflow.com/questions/22751363/sqlite3-operationalerror-could-not-decode-to-utf-8-column
+        self.db_conn_bytes.text_factory = lambda b: b
         self.cc = self.db_conn.cursor()
+
         self.contacts = {}      # username -> nickname
         self.contacts_rev = defaultdict(list)
         self.msgs_by_chat = defaultdict(list)
@@ -53,7 +57,7 @@ SELECT username,conRemark,nickname FROM rcontact
 
     def _parse_msg(self):
         msgs_tot_cnt = 0
-        db_msgs = self.cc.execute(
+        db_msgs = self.db_conn_bytes.cursor().execute(
 """
 SELECT {} FROM message
 """.format(','.join(WeChatDBParser.FIELDS)))
@@ -136,11 +140,20 @@ SELECT {} FROM message
     def _parse_msg_row(self, row):
         """ parse a record of message into my format"""
         values = dict(zip(WeChatDBParser.FIELDS, row))
+        try:
+            values['content'].decode()
+        except:
+            logger.warning(f"Invalid byte sequence in message content (type={values['type']})")
+            values['content'] = 'FAILED TO DECODE'
         if values['content']:
             values['content'] = ensure_unicode(values['content'])
         else:
             values['content'] = ''
         values['createTime'] = datetime.fromtimestamp(values['createTime']/ 1000)
+
+        values['talker'] = values['talker'].decode()
+        if values['imgPath']:
+            values['imgPath'] = values['imgPath'].decode()
         values['chat'] = values['talker']
         try:
             if values['chat'].endswith('@chatroom'):