Yuxin Wu 5 років тому
батько
коміт
95826392df

+ 0 - 4
common/progress.py

@@ -1,8 +1,4 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: progress.py
-# Date: Wed Jun 17 23:59:52 2015 +0800
-# Author: Yuxin Wu
 
 import time
 import sys

+ 4 - 13
common/textutil.py

@@ -1,23 +1,14 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: utils.py
-# Date: Wed Jun 17 23:59:25 2015 +0800
-# Author: Yuxin Wu
 
 import hashlib
 import base64
 
-def ensure_bin_str(s):
-    if type(s) == str:
-        return s
-    if type(s) == unicode:
-        return s.encode('utf-8')
-
 def ensure_unicode(s):
     if type(s) == str:
-        return s.decode('utf-8')
-    if type(s) == unicode:
         return s
+    elif type(s) == bytes:
+        return s.decode('utf-8')
+    raise TypeError(f"type of string is {type(s)}")
 
 
 def md5(s):
@@ -27,7 +18,7 @@ def md5(s):
 
 def get_file_b64(fname):
     data = open(fname, 'rb').read()
-    return base64.b64encode(data)
+    return base64.b64encode(data).decode('ascii')
 
 def safe_filename(fname):
     filename = ensure_unicode(fname)

+ 1 - 5
common/timer.py

@@ -1,8 +1,4 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: timer.py
-# Date: Wed Jun 17 23:25:54 2015 +0800
-# Author: Yuxin Wu
 
 import time, functools
 from collections import defaultdict
@@ -20,7 +16,7 @@ class TotalTimer(object):
         self.times = defaultdict(float)
 
     def __del__(self):
-        for k, v in self.times.iteritems():
+        for k, v in self.times.items():
             logger.info("{} took {} seconds in total.".format(k, v))
 
 _total_timer = TotalTimer()

+ 19 - 24
dump-audio.py

@@ -1,16 +1,13 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# File: dump-audio.py
-# Author: Yuxin Wu
 
 import sys
+import base64
 import argparse
 
-from common.textutil import ensure_unicode
 from wechat.parser import WeChatDBParser
+from wechat.msg import TYPE_SPEAK
 from wechat.res import Resource
-from wechat.render import HTMLRender
-from wechat.libchathelper import LibChatHelper
 
 def get_args():
     parser = argparse.ArgumentParser()
@@ -23,27 +20,25 @@ def get_args():
 
 if __name__ == '__main__':
     args = get_args()
-
-    name = ensure_unicode(args.name)
-    output_file = args.output
-
     parser = WeChatDBParser(args.db)
     res = Resource(parser, args.res, '')
 
-    if name and name in parser.msgs_by_chat:
-        msgs = parser.msgs_by_chat[name]
-    else:
-        sys.stderr.write(u"Valid Contacts: {}\n".format(u'\n'.join(parser.msgs_by_chat.keys())))
-        sys.stderr.write(u"Couldn't find that contact {}.".format(name));
+    try:
+        chatid = parser.get_id_by_nickname(args.name)
+    except KeyError:
+        sys.stderr.write(u"Valid Contacts: {}\n".format('\n'.join(parser.all_chat_nicknames)))
+        sys.stderr.write(u"Couldn't find the chat {}.".format(args.name));
         sys.exit(1)
-    print "Number of Messages: ", len(msgs)
+
+    msgs = parser.msgs_by_chat[chatid]
+    print(f"Number of Messages for {args.name}: ", len(msgs))
     assert len(msgs) > 0
 
-    libchat = LibChatHelper(parser, res)
-    msgs = libchat.convert_msgs(msgs)
-    voices = [m.sound for m in msgs if m.sound]
-    for idx, v in enumerate(voices):
-        p = v.find(':')
-        v = v[p:]
-        with open('/{}/{:04d}.mp3'.format(args.output, idx), 'wb') as f:
-            f.write(v)
+    voice_msgs = [m for m in msgs if m.type == TYPE_SPEAK]
+    for idx, m in enumerate(voice_msgs):
+        audio_str, duration = res.get_voice_mp3(m.imgPath)
+        audio_bytes = base64.b64decode(audio_str)
+        outf = f'/{args.output}/{idx:04d}-{duration:.1f}s.mp3'
+        with open(outf, 'wb') as f:
+            f.write(audio_bytes)
+        print(f"Audio written to {outf}")

+ 8 - 7
dump-html.py

@@ -1,17 +1,17 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: dump-html.py
-# Date: Wed Mar 25 17:44:20 2015 +0800
-# Author: Yuxin Wu
 
 import sys
 import argparse
+import logging
 
 from common.textutil import ensure_unicode
 from wechat.parser import WeChatDBParser
 from wechat.res import Resource
 from wechat.render import HTMLRender
 
+logger = logging.getLogger("wechat")
+
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('name', help='name of contact')
@@ -39,7 +39,7 @@ if __name__ == '__main__':
         sys.exit(1)
     res = Resource(parser, args.res, args.avt)
     msgs = parser.msgs_by_chat[chatid]
-    print "Number of Messages: ", len(msgs)
+    logger.info(f"Number of Messages: {len(msgs)}")
     assert len(msgs) > 0
 
     render = HTMLRender(parser, res)
@@ -47,10 +47,11 @@ if __name__ == '__main__':
 
     if len(htmls) == 1:
         with open(output_file, 'w') as f:
-            print >> f, htmls[0].encode('utf-8')
+            f.write(htmls[0])
     else:
         assert output_file.endswith(".html")
         basename = output_file[:-5]
         for idx, html in enumerate(htmls):
             with open(basename + '.{}'.format(idx) + '.html', 'w') as f:
-                print >> f, html.encode('utf-8')
+                f.write(html)
+    res.emoji_cache.flush()

+ 31 - 29
dump-msg.py

@@ -1,38 +1,40 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: dump-msg.py
-# Date: Mon May 25 15:23:05 2015 +0800
-# Author: Yuxin Wu
 
+import logging
 from wechat.parser import WeChatDBParser
 from common.textutil import safe_filename
 import sys, os
 
-if len(sys.argv) != 3:
-    sys.exit("Usage: {0} <path to decoded_database.db> <output_dir>".format(sys.argv[0]))
+logger = logging.getLogger("wechat")
 
-db_file = sys.argv[1]
-output_dir = sys.argv[2]
-try:
-    os.mkdir(output_dir)
-except:
-    pass
-if not os.path.isdir(output_dir):
-    sys.exit("Error creating directory {}".format(output_dir))
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        sys.exit("Usage: {0} <path to decoded_database.db> <output_dir>".format(sys.argv[0]))
 
-parser = WeChatDBParser(db_file)
+    db_file = sys.argv[1]
+    output_dir = sys.argv[2]
+    try:
+        os.mkdir(output_dir)
+    except:
+        pass
+    if not os.path.isdir(output_dir):
+        sys.exit("Error creating directory {}".format(output_dir))
 
-for chatid, msgs in parser.msgs_by_chat.iteritems():
-    name = parser.contacts[chatid]
-    if len(name) == 0:
-        print u"Chat {} doesn't have a valid display name".format(chatid)
-        name = str(id(chatid))
-    print u"Writing msgs for {}".format(name)
-    safe_name = safe_filename(name)
-    outf = os.path.join(output_dir, safe_name + '.txt')
-    if os.path.isfile(outf):
-        print(u"File {} exists! Skip contact {}".format(outf, name))
-        continue
-    with open(outf, 'w') as f:
-        for m in msgs:
-            print >> f, m
+    parser = WeChatDBParser(db_file)
+
+    for chatid, msgs in parser.msgs_by_chat.items():
+        name = parser.contacts[chatid]
+        if len(name) == 0:
+            logger.info(f"Chat {chatid} doesn't have a valid display name.")
+            name = str(id(chatid))
+        logger.info(f"Writing msgs for {name}")
+        safe_name = safe_filename(name)
+        outf = os.path.join(output_dir, safe_name + '.txt')
+        if os.path.isfile(outf):
+            logger.info(f"File {outf} exists! Skip contact {name}")
+            continue
+        with open(outf, 'w') as f:
+            for m in msgs:
+                f.write(str(m))
+                f.write("\n")

+ 14 - 13
emoji-cache-tool.py

@@ -1,9 +1,7 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# File: emoji-cache-tool.py
-# Author: Yuxin Wu
 
-import cPickle as pickle
+import pickle
 import sys
 import os
 import imghdr
@@ -11,23 +9,26 @@ import base64
 
 if __name__ == '__main__':
     if len(sys.argv) != 3:
-        print """\
+        print("""\
 Usage:
  {} unpack output-dir
  {} pack input-dir
-""".format(sys.argv[0], sys.argv[0])
+""".format(sys.argv[0], sys.argv[0]))
         sys.exit(1)
 
     if sys.argv[1] == 'unpack':
-        with open('emoji.cache') as f:
+        with open('emoji.cache', 'rb') as f:
             dic = pickle.load(f)
         outdir = sys.argv[2]
         assert os.path.isdir(outdir)
-        for md5, img in dic.iteritems():
+        for md5, img in dic.items():
+            data = img[0]
+            if not isinstance(data, bytes):
+                data = data.encode('ascii')
             name = os.path.join(outdir, md5 + '.' + img[1].lower())
-            print name
+            print(name)
             with open(name, 'wb') as f:
-                f.write(base64.decodestring(img[0]))
+                f.write(base64.decodebytes(data))
     elif sys.argv[1] == 'pack':
         ret = {}
         indir = sys.argv[2]
@@ -36,10 +37,10 @@ Usage:
             try:
                 md5, format = fname.split('.')
             except:
-                print "Unable to parse", fname
+                print("Unable to parse", fname)
                 continue
-            with open(os.path.join(indir, fname)) as f:
-                b64 = base64.encodestring(f.read())
+            with open(os.path.join(indir, fname), 'rb') as f:
+                b64 = base64.encodebytes(f.read()).decode('ascii')
             ret[md5] = (b64, format)
         with open('emoji.cache', 'wb') as f:
             pickle.dump(ret, f)

+ 2 - 2
libchat/create_db.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 # File: create_table.py
 # Date: Wed Mar 25 16:43:22 2015 +0800
@@ -10,7 +10,7 @@ import os
 from libchat import SqliteLibChat
 
 if len(sys.argv) != 2:
-    print "Usage: {} <DB file name>"
+    print("Usage: {} <DB file name>")
     sys.exit()
 
 db_name = sys.argv[1]

+ 3 - 6
libchat/libchat.py

@@ -1,8 +1,5 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: libchat.py
-# Date: Sun Apr 12 21:08:51 2015 +0900
-# Author: Yuxin Wu
 import sqlite3
 import os
 from datetime import datetime
@@ -99,7 +96,7 @@ class SqliteLibChat(object):
         else:
             self.c.execute("SELECT * FROM message WHERE {}".format(
                 ' AND '.join(["{} = {}".format(k, v)
-                              for k, v in predicate.iteritems()])))
+                              for k, v in predicate.items()])))
         for row in self.c.fetchall():
             yield ChatMsg(*SqliteLibChat.postfilter(row))
 
@@ -113,5 +110,5 @@ if __name__ == '__main__':
 
     for k in db.iterate_all_msg():
         from IPython import embed; embed()
-        print k
+        print(k)
 

+ 3 - 3
list-chats.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 # File: list-chats.py
 # Author: Yuxin Wu <[email protected]>
@@ -6,7 +6,7 @@
 from wechat.parser import WeChatDBParser
 import sys
 if len(sys.argv) != 2:
-    print "Usage: {} db_file".format(sys.argv[0])
+    print("Usage: {} db_file".format(sys.argv[0]))
     sys.exit(1)
 
 db_file = sys.argv[1]
@@ -14,4 +14,4 @@ db_file = sys.argv[1]
 parser = WeChatDBParser(db_file)
 chats = parser.msgs_by_chat.keys()
 for k in chats:
-    print parser.contacts[k], '\t', k
+    print(parser.contacts[k], '\t', k)

+ 2 - 2
plot-num-msg-by-time.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 # File: plot-num-msg-by-time.py
 # Date: Wed Mar 25 17:44:39 2015 +0800
@@ -39,7 +39,7 @@ plt.show()
 # I'm in a different time zone in this period:
 #TZ_DELTA = {(datetime(2014, 7, 13), datetime(2014, 10, 1)): -15}
 #def real_hour(x):
-    #for k, v in TZ_DELTA.iteritems():
+    #for k, v in TZ_DELTA.items():
         #if x > k[0] and x < k[1]:
             #print x
             #return (x.hour + v + 24) % 24

+ 1 - 1
wechat/__init__.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 
 import logging

+ 19 - 30
wechat/audio.py

@@ -1,17 +1,12 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: audio.py
-# Date: Fri Jun 26 10:42:41 2015 +0800
-# Author: Yuxin Wu
 
 import os
-from subprocess import PIPE, Popen, call
 import logging
 logger = logging.getLogger(__name__)
 
-import pysox
-
 from common.textutil import get_file_b64
+from common.procutil import subproc_succ
 
 SILK_DECODER = os.path.join(os.path.dirname(__file__),
                             '../third-party/silk/decoder')
@@ -23,56 +18,50 @@ def parse_wechat_audio_file(file_name):
     try:
         return do_parse_wechat_audio_file(file_name)
     except Exception as e:
-        logger.error("Pase audio file {} error!".format(file_name))
-        logger.error(e)
+        logger.exception("Error when parsing audio file {}".format(file_name))
         return "", 0
 
 def do_parse_wechat_audio_file(file_name):
-    """ return a mp3 base64 string, and the duration"""
+    """ return a mp3 stored in base64 unicode string, and the duration"""
     if not file_name: return "", 0
 
     mp3_file = os.path.join('/tmp',
                             os.path.basename(file_name)[:-4] + '.mp3')
-    with open(file_name) as f:
+    with open(file_name, 'rb') as f:
         header = f.read(10)
-    if 'AMR' in header:
-        # maybe this is faster than calling sox from command line?
+    if b'AMR' in header:
+        raise NotImplementedError("AMR decoding not implemented because it seems deprecated since WeChat6.0+")
+        # The below is python2 only. It should be equivalent to using sox from command line?
+        import pysox
         infile = pysox.CSoxStream(file_name)
         outfile = pysox.CSoxStream(mp3_file, 'w', infile.get_signal())
         chain = pysox.CEffectsChain(infile, outfile)
         chain.flow_effects()
         outfile.close()
-
         signal = infile.get_signal().get_signalinfo()
         duration = signal['length'] * 1.0 / signal['rate']
-    elif 'SILK' in header:
+    elif b'SILK' in header:
         raw_file = os.path.join('/tmp',
                                 os.path.basename(file_name)[:-4] + '.raw')
-        proc = Popen('{0} {1} {2}'.format(SILK_DECODER,
-                                                file_name, raw_file),
-                    shell=True, stdout=PIPE, stderr=PIPE)
-        stdout = proc.communicate()[0]
-        for line in stdout.split('\n'):
-            if 'File length' in line:
+        cmd = '{0} {1} {2}'.format(SILK_DECODER, file_name, raw_file)
+        out = subproc_succ(cmd)
+        for line in out.split(b'\n'):
+            if b'File length' in line:
                 duration = float(line[13:-3].strip())
                 break
         else:
             raise RuntimeError("Error decoding silk audio file!")
 
-        # I don't know how to do this with pysox
-        proc = call('sox -r 24000 -e signed -b 16 -c 1 {} {}'.format(
-            raw_file, mp3_file), shell=True)
+        # TODO don't know how to do this with python
+        subproc_succ('sox -r 24000 -e signed -b 16 -c 1 {} {}'.format(raw_file, mp3_file))
         os.unlink(raw_file)
     else:
         raise NotImplementedError("Unsupported Audio Format! This is a bug!")
-    try:
-        mp3_string = get_file_b64(mp3_file)
-        os.unlink(mp3_file)
-    except:
-        raise RuntimeError("Failed to decode audio file: {}".format(file_name))
+    mp3_string = get_file_b64(mp3_file)
+    os.unlink(mp3_file)
     return mp3_string, duration
 
 if __name__ == '__main__':
     import sys
     fname = sys.argv[1]
-    print parse_wechat_audio_file(fname)[1]
+    print(parse_wechat_audio_file(fname)[1])

+ 7 - 12
wechat/avatar.py

@@ -1,11 +1,7 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: avatar.py
-# Date: Wed Nov 29 03:27:16 2017 -0800
-# Author: Yuxin Wu
 
 from PIL import Image
-import cStringIO
+import io
 import glob
 import os
 import numpy as np
@@ -13,7 +9,7 @@ import logging
 import sqlite3
 logger = logging.getLogger(__name__)
 
-from common.textutil import ensure_bin_str, md5
+from common.textutil import ensure_unicode, md5
 
 
 class AvatarReader(object):
@@ -40,7 +36,7 @@ class AvatarReader(object):
         if not self._use_avt:
             return None
 
-        username = ensure_bin_str(username)
+        username = ensure_unicode(username).encode('utf-8')
         filename = md5(username)
         dir1, dir2 = filename[:2], filename[2:4]
         filename = os.path.join(dir1, dir2,
@@ -58,13 +54,12 @@ class AvatarReader(object):
                     else:
                         return None
             except TypeError:
-                logger.warn("Avatar for {} not found in avatar database.".format(username))
+                logger.warning("Avatar for {} not found in avatar database.".format(username))
                 return None
         except Exception as e:
             raise
-            print e
-            logger.warn("Failed to retrieve avatar!")
-            return None
+            # logger.exception("Failed to retrieve avatar!")
+            # return None
 
 
     def read_img(self, pos, size):
@@ -77,7 +72,7 @@ class AvatarReader(object):
             with open(fname, 'rb') as f:
                 f.seek(start_pos)
                 data = f.read(size)
-                im = Image.open(cStringIO.StringIO(data))
+                im = Image.open(io.BytesIO(data))
                 return im
         except IOError as e:
             logger.warn("Cannot read avatar from {}: {}".format(fname, str(e)))

+ 3 - 7
wechat/libchathelper.py

@@ -1,8 +1,4 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: libchathelper.py
-# Date: Wed Nov 29 03:44:54 2017 -0800
-# Author: Yuxin Wu
 
 import base64
 from pyquery import PyQuery
@@ -65,8 +61,8 @@ class LibChatHelper(object):
     def _get_sound(self, msg):
         if msg.type == TYPE_SPEAK:
             audio_str, duration = self.res.get_voice_mp3(msg.imgPath)
-            return '{}:{}'.format(duration, base64.b64decode(audio_str))
-        return ''
+            return base64.b64decode(audio_str)
+        return b''
 
     def _get_extra(self, msg):
         ret = {}
@@ -83,7 +79,7 @@ class LibChatHelper(object):
         if img:
             # TODO don't use b64, directly return image content
             img = base64.b64decode(img)
-# TODO do we need to save format?
+        # TODO do we need to save format or voice duration?
         sound = self._get_sound(msg)
         extra = self._get_extra(msg)
 

+ 16 - 9
wechat/msg.py

@@ -1,8 +1,4 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: msg.py
-# Date: Thu Jun 18 00:01:00 2015 +0800
-# Author: Yuxin Wu
 TYPE_MSG = 1
 TYPE_IMG = 3
 TYPE_SPEAK = 34
@@ -16,6 +12,7 @@ TYPE_WX_VIDEO = 62  # video took by wechat
 TYPE_SYSTEM = 10000
 TYPE_CUSTOM_EMOJI = 1048625
 TYPE_REDENVELOPE = 436207665
+TYPE_MONEY_TRANSFER = 419430449  # 微信转账
 TYPE_LOCATION_SHARING = -1879048186
 TYPE_APP_MSG = 16777265
 
@@ -40,7 +37,7 @@ class WeChatMsg(object):
         return False
 
     def __init__(self, values):
-        for k, v in values.iteritems():
+        for k, v in values.items():
             setattr(self, k, v)
         if self.type not in _KNOWN_TYPES:
             logger.warn("Unhandled message type: {}".format(self.type))
@@ -101,6 +98,16 @@ class WeChatMsg(object):
             except:
                 pass
             return u"[RED ENVELOPE]"
+        elif self.type == TYPE_MONEY_TRANSFER:
+            data_to_parse = io.BytesIO(self.content.encode('utf-8'))
+            try:
+                for event, elem in ET.iterparse(data_to_parse, events=('end',)):
+                    if elem.tag == 'des':
+                        title = elem.text
+                        return u"[Money Transfer]\n{}".format(title)
+            except:
+                pass
+            return u"[Money Transfer]"
         else:
             # TODO replace smiley with text
             return self.content
@@ -113,14 +120,14 @@ class WeChatMsg(object):
         return msg
 
     def __repr__(self):
-        ret = u"{}|{}:{}:{}".format(
+        ret = "{}|{}:{}:{}".format(
             self.type,
             self.talker_nickname if not self.isSend else 'me',
             self.createTime,
-            ensure_unicode(self.msg_str())).encode('utf-8')
+            ensure_unicode(self.msg_str()))
         if self.imgPath:
-            ret = u"{}|img:{}".format(ensure_unicode(ret.strip()), self.imgPath)
-            return ret.encode('utf-8')
+            ret = "{}|img:{}".format(ensure_unicode(ret.strip()), self.imgPath)
+            return ret
         else:
             return ret
 

+ 0 - 4
wechat/msgslice.py

@@ -1,8 +1,4 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: msgslice.py
-# Date: Thu Jan 08 00:15:49 2015 +0800
-# Author: Yuxin Wu
 
 class MessageSlicerByTime(object):
     """ Separate messages into slices by time,

+ 3 - 7
wechat/parser.py

@@ -1,8 +1,4 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: parser.py
-# Date: Thu Jun 18 00:03:53 2015 +0800
-# Author: Yuxin Wu
 
 import sqlite3
 from collections import defaultdict
@@ -51,7 +47,7 @@ SELECT username,conRemark,nickname FROM rcontact
             else:
                 self.contacts[username] = ensure_unicode(nickname)
 
-        for k, v in self.contacts.iteritems():
+        for k, v in self.contacts.items():
             self.contacts_rev[v].append(k)
         logger.info("Found {} names in `contact` table.".format(len(self.contacts)))
 
@@ -70,7 +66,7 @@ SELECT {} FROM message
             if not WeChatMsg.filter_type(msg.type):
                 self.msgs_by_chat[msg.chat].append(msg)
 
-        for k, v in self.msgs_by_chat.iteritems():
+        for k, v in self.msgs_by_chat.items():
             self.msgs_by_chat[k] = sorted(v, key=lambda x: x.createTime)
             msgs_tot_cnt += len(v)
         logger.info("Found {} message records.".format(msgs_tot_cnt))
@@ -166,7 +162,7 @@ SELECT {} FROM message
 
     @property
     def all_chat_nicknames(self):
-        return [self.contacts[k] for k in self.all_chat_ids]
+        return [self.contacts[k] for k in self.all_chat_ids if len(self.contacts[k])]
 
     def get_id_by_nickname(self, nickname):
         l = self.contacts_rev[nickname]

+ 4 - 5
wechat/render.py

@@ -1,8 +1,5 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: render.py
-# Date: Wed Nov 29 03:53:55 2017 -0800
-# Author: Yuxin Wu
 
 import os
 import base64
@@ -36,7 +33,7 @@ TEMPLATES_FILES = {TYPE_MSG: "TP_MSG",
                    TYPE_CUSTOM_EMOJI: "TP_EMOJI",
                    TYPE_LINK: "TP_MSG"}
 TEMPLATES = {k: ensure_unicode(open(os.path.join(STATIC_PATH, '{}.html'.format(v))).read())
-    for k, v in TEMPLATES_FILES.iteritems()}
+    for k, v in TEMPLATES_FILES.items()}
 
 class HTMLRender(object):
     def __init__(self, parser, res=None):
@@ -99,6 +96,7 @@ class HTMLRender(object):
             format_dict['nickname'] = '>\n       <pre align=\'left\'>'+msg.talker_nickname+'</pre'
         else:
             format_dict['nickname'] = ' '
+
         def fallback():
             template = TEMPLATES[TYPE_MSG]
             content = msg.msg_str()
@@ -150,6 +148,7 @@ class HTMLRender(object):
                 content = u'URL:<a target="_blank" href="{0}">{0}</a>'.format(url)
                 format_dict['content'] = content
                 return template.format(**format_dict)
+        # TODO handle TYPE_VIDEO_FILE
         elif msg.type == TYPE_WX_VIDEO:
             # TODO: fetch video from resource
             return fallback()

+ 25 - 25
wechat/res.py

@@ -1,21 +1,17 @@
-#!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: res.py
-# Date: Wed Nov 29 03:43:50 2017 -0800
-# Author: Yuxin Wu
 
 import glob
 import os
 import re
 from PIL import Image
-import cStringIO
+import io
 import base64
 import logging
 logger = logging.getLogger(__name__)
 import imghdr
 from multiprocessing import Pool
 import atexit
-import cPickle as pickle
+import pickle
 import requests
 
 from .avatar import AvatarReader
@@ -36,26 +32,29 @@ class EmojiCache(object):
     def __init__(self, fname):
         self.fname = fname
         if os.path.isfile(fname):
-            self.dic = pickle.load(open(fname))
+            with open(fname, 'rb') as f:
+                self.dic = pickle.load(f)
         else:
             self.dic = {}
 
         self._curr_size = len(self.dic)
 
     def query(self, md5):
-        return self.dic.get(md5, (None, None))
+        data, format = self.dic.get(md5, (None, None))
+        if data is not None and not isinstance(data, str):
+            data = data.decode('ascii')
+        return data, format
 
     def fetch(self, md5, url):
         try:
             logger.info("Requesting emoji {} from {} ...".format(md5, url))
             r = requests.get(url).content
-            im = Image.open(cStringIO.StringIO(r))
+            im = Image.open(io.BytesIO(r))
             format = im.format.lower()
-            ret = (base64.b64encode(r), format)
+            ret = (base64.b64encode(r).decode('ascii'), format)
             self.dic[md5] = ret
 
-            if len(self.dic) == self._curr_size + 10:
-                self._curr_size = len(self.dic)
+            if len(self.dic) >= self._curr_size + 10:
                 self.flush()
             return ret
         except Exception as e:
@@ -63,8 +62,10 @@ class EmojiCache(object):
             return None, None
 
     def flush(self):
-        with open(self.fname, 'wb') as f:
-            pickle.dump(self.dic, f)
+        if len(self.dic) > self._curr_size:
+            self._curr_size = len(self.dic)
+            with open(self.fname, 'wb') as f:
+                pickle.dump(self.dic, f)
 
 class Resource(object):
     """ multimedia resources in chat"""
@@ -86,7 +87,7 @@ class Resource(object):
         self.avt_reader = AvatarReader(res_dir, avt_db)
 
     def get_voice_filename(self, imgpath):
-        fname = md5(imgpath)
+        fname = md5(imgpath.encode('ascii'))
         dir1, dir2 = fname[:2], fname[2:4]
         ret = os.path.join(self.voice_dir, dir1, dir2,
                            'msg_{}.amr'.format(imgpath))
@@ -107,21 +108,19 @@ class Resource(object):
         """ for speed.
         msgs: a collection of WeChatMsg, to cache for later fetch"""
         voice_paths = [msg.imgPath for msg in msgs if msg.type == TYPE_SPEAK]
+        # NOTE: remove all the caching code to debug serial decoding
         self.voice_cache_idx = {k: idx for idx, k in enumerate(voice_paths)}
         pool = Pool(3)
         atexit.register(lambda x: x.terminate(), pool)
         self.voice_cache = [pool.apply_async(parse_wechat_audio_file,
                                              (self.get_voice_filename(k),)) for k in voice_paths]
-# single-threaded version, for debug
-        #self.voice_cache = map(parse_wechat_audio_file,
-                             #(self.get_voice_filename(k) for k in voice_paths))
 
     def get_avatar(self, username):
-        """ return base64 string"""
+        """ return base64 unicode string"""
         im = self.avt_reader.get_avatar(username)
         if im is None:
             return ""
-        buf = cStringIO.StringIO()
+        buf = io.BytesIO()
         try:
             im.save(buf, 'JPEG', quality=JPEG_QUALITY)
         except IOError:
@@ -131,7 +130,7 @@ class Resource(object):
             except IOError:
                 return ""
         jpeg_str = buf.getvalue()
-        return base64.b64encode(jpeg_str)
+        return base64.b64encode(jpeg_str).decode('ascii')
 
     def _get_img_file(self, fnames):
         """ fnames: a list of filename to search for
@@ -167,7 +166,7 @@ class Resource(object):
                 logger.warn("Found big image but not thumbnail: {}".format(fname))
                 return (name, "")
         big = cands[-1]
-        ths = filter(name_is_thumbnail, [k[0] for k in cands])
+        ths = list(filter(name_is_thumbnail, [k[0] for k in cands]))
         if not ths:
             return (big[0], "")
         return (big[0], ths[0])
@@ -187,10 +186,11 @@ class Resource(object):
             if not img_file.endswith('jpg') and \
                imghdr.what(img_file) != 'jpeg':
                 im = Image.open(open(img_file, 'rb'))
-                buf = cStringIO.StringIO()
+                buf = io.BytesIO()
                 im.convert('RGB').save(buf, 'JPEG', quality=JPEG_QUALITY)
-                return base64.b64encode(buf.getvalue())
+                return base64.b64encode(buf.getvalue()).decode('ascii')
             return get_file_b64(img_file)
+
         big_file = get_jpg_b64(big_file)
         if big_file:
             return big_file
@@ -224,7 +224,7 @@ class Resource(object):
         return get_file_b64(f), imghdr.what(f)
 
     def get_emoji_by_md5(self, md5):
-        """ :returns: (b64 img, format)"""
+        """ :returns: (b64 unicode img, format)"""
         assert md5, md5
         if md5 in self.parser.internal_emojis:
             # TODO this seems broken

+ 7 - 10
wechat/smiley.py

@@ -1,8 +1,5 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: smiley.py
-# Date: Thu Jun 18 00:02:43 2015 +0800
-# Author: Yuxin Wu
 
 import os
 import re
@@ -59,7 +56,7 @@ class SmileyProvider(object):
         # some extra smiley from javascript on wx.qq.com
         extra_smiley = json.load(open(TENCENT_EXTRASMILEY_FILE))
         extra_smiley = {u'[' + k + u']': v for k, v in
-                            extra_smiley.iteritems()}
+                            extra_smiley.items()}
         self.tencent_smiley.update(extra_smiley)
 
         # 1f35c -> "\ue340"
@@ -69,14 +66,14 @@ class SmileyProvider(object):
         # u'\ue415' -> 'e415'       # for android
         unicode_smiley_dict = json.load(open(UNICODE_SMILEY_FILE))
         self.unicode_smiley = {(self.unichar(int(k, 16))): hex(ord(v))[2:] for k, v in
-                                unicode_smiley_dict.iteritems()}
+                                unicode_smiley_dict.items()}
         self.unicode_smiley.update({v: hex(ord(v))[2:] for _, v in
-                                unicode_smiley_dict.iteritems()})
+                                unicode_smiley_dict.items()})
         self.used_smiley_id = set()
 
     def unichar(self, i):
         try:
-            return unichr(i)
+            return chr(i)
         except ValueError:
             return struct.pack('i', i).decode('utf-32')
 
@@ -88,7 +85,7 @@ class SmileyProvider(object):
         if not UNICODE_SMILEY_RE.findall(msg):
         # didn't find the code
             return msg
-        for k, v in self.unicode_smiley.iteritems():
+        for k, v in self.unicode_smiley.items():
             if k in msg:
                 msg = msg.replace(k, self.gen_replace_elem(v))
         return msg
@@ -97,7 +94,7 @@ class SmileyProvider(object):
         if (not '[' in msg or not ']' in msg) \
            and (not '/:' in msg) and (not '/' in msg):
             return msg
-        for k, v in self.tencent_smiley.iteritems():
+        for k, v in self.tencent_smiley.items():
             if k in msg:
                 msg = msg.replace(k, self.gen_replace_elem(v))
         return msg

+ 1 - 4
wechat/static/parse_tencent_smiley.py

@@ -1,8 +1,5 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-# File: parse_tencent_smiley.py
-# Date: Sat Dec 27 00:15:14 2014 +0800
-# Author: Yuxin Wu
 
 import xml.etree.ElementTree as ET
 import os

+ 0 - 3
wechat/static/see_smiley_name.sh

@@ -1,6 +1,3 @@
 #!/bin/bash -e
-# File: see_smiley_name.sh
-# Date: Sun Jan 11 21:37:06 2015 +0800
-# Author: Yuxin Wu
 
 cat tencent-smiley.json | jq 'to_entries | group_by(.value) | .[] | "---------",.[].key'