瀏覽代碼

group message with proper handling of user name

Yuxin Wu 10 年之前
父節點
當前提交
28c7d3d599
共有 8 個文件被更改,包括 72 次插入和 61 次删除
  1. 2 2
      dump-html.py
  2. 1 1
      dump-msg.py
  3. 2 2
      plot-num-msg-by-time.py
  4. 2 2
      wechat/libchathelper.py
  5. 8 30
      wechat/msg.py
  6. 46 13
      wechat/parser.py
  7. 9 9
      wechat/render.py
  8. 2 2
      wechat/static/TP_INDEX.html

+ 2 - 2
dump-html.py

@@ -23,9 +23,9 @@ parser = WeChatDBParser(db_file)
 res = Resource(resource_dir, avt_db)
 
 try:
-    msgs = parser.msgs_by_talker[name]
+    msgs = parser.msgs_by_chat[name]
 except:
-    sys.stderr.write(u"Valid Contacts: {}\n".format(u'\n'.join(parser.msgs_by_talker.keys())))
+    sys.stderr.write(u"Valid Contacts: {}\n".format(u'\n'.join(parser.msgs_by_chat.keys())))
     sys.stderr.write(u"Couldn't find that contact {}.".format(name));
     sys.exit(1)
 

+ 1 - 1
dump-msg.py

@@ -22,7 +22,7 @@ if not os.path.isdir(output_dir):
 
 parser = WeChatDBParser(db_file)
 
-for name, msgs in parser.msgs_by_talker.iteritems():
+for name, msgs in parser.msgs_by_chat.iteritems():
     print u"Writing msgs for {}".format(name)
     safe_name = safe_filename(name)
     with open(os.path.join(output_dir, safe_name + '.txt'), 'w') as f:

+ 2 - 2
plot-num-msg-by-time.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python2
 # -*- coding: UTF-8 -*-
-# File: plot_num_msg_by_time.py
+# File: plot-num-msg-by-time.py
 # Date: Wed Mar 25 17:44:39 2015 +0800
 # Author: Yuxin Wu <[email protected]>
 
@@ -20,7 +20,7 @@ name = ensure_unicode(sys.argv[2])
 every_k_days = 2
 
 parser = WeChatDBParser(db_file)
-msgs = parser.msgs_by_talker[name]
+msgs = parser.msgs_by_chat[name]
 times = [x.createTime for x in msgs]
 start_time = times[0]
 diffs = [(x - start_time).days for x in times]

+ 2 - 2
wechat/libchathelper.py

@@ -78,9 +78,9 @@ class LibChatHelper(object):
         return json.dumps(ret)
 
     def _convert_msg(self, msg):
-        sender = 'me' if msg.isSend else msg.get_msg_talker_id()
+        sender = 'me' if msg.isSend else msg.talker
         chatroom = msg.get_chatroom()
-        text = msg.content_no_first_line if msg.type == TYPE_MSG else ''
+        text = msg.content if msg.type == TYPE_MSG else ''
         img, format = self._get_image(msg)
         if img:
             # TODO don't use b64, directly return image content

+ 8 - 30
wechat/msg.py

@@ -21,7 +21,6 @@ TYPE_APP_MSG = 16777265
 _KNOWN_TYPES = [eval(k) for k in dir() if k.startswith('TYPE_')]
 
 import re
-from datetime import datetime
 from pyquery import PyQuery
 import logging
 logger = logging.getLogger(__name__)
@@ -30,7 +29,6 @@ from common.textutil import ensure_unicode
 
 
 class WeChatMsg(object):
-    FIELDS = ["msgSvrId","type","isSend","createTime","talker","content","imgPath"]
 
     @staticmethod
     def filter_type(tp):
@@ -38,21 +36,13 @@ class WeChatMsg(object):
             return True
         return False
 
-    def __init__(self, row):
-        """ row: a tuple corresponding to FIELDS"""
-        assert len(row) == len(WeChatMsg.FIELDS)
-        for f, v in zip(WeChatMsg.FIELDS, row):
-            setattr(self, f, v)
+    def __init__(self, values):
+        for k, v in values.iteritems():
+            setattr(self, k, v)
         if self.type not in _KNOWN_TYPES:
             logger.warn("Unhandled message type: {}".format(self.type))
             # only to supress repeated warning:
             _KNOWN_TYPES.append(self.type)
-        self.createTime = datetime.fromtimestamp(self.createTime / 1000)
-        self.talker_name = None
-        if self.content:
-            self.content = ensure_unicode(self.content)
-        else:
-            self.content = u""
 
     def msg_str(self):
         if self.type == TYPE_LOCATION:
@@ -97,29 +87,22 @@ class WeChatMsg(object):
             return "LOCATION SHARING"
         elif self.type == TYPE_EMOJI:
             # TODO add emoji name
-            return self.content_no_first_line
+            return self.content
         else:
             # TODO replace smiley with text
-            return self.content_no_first_line
-
-    @property
-    def content_no_first_line(self):
-        if not self.is_chatroom():
             return self.content
-        return self.content[self.content.find('\n')+1:]
 
     @property
     def content_xml_ready(self):
         # remove xml headers to avoid possible errors it may create
         header = re.compile(r'<\?.*\?>')
-        msg = header.sub("", self.content_no_first_line)
+        msg = header.sub("", self.content)
         return msg
 
     def __repr__(self):
         ret = u"{}|{}:{}:{}".format(
             self.type,
-            (self.talker if not self.talker_name else self.talker_name) \
-                if not self.isSend else 'me',
+            self.talker if not self.isSend else 'me',
             self.createTime,
             ensure_unicode(self.msg_str())).encode('utf-8')
         if self.imgPath:
@@ -132,16 +115,11 @@ class WeChatMsg(object):
         return self.createTime < r.createTime
 
     def is_chatroom(self):
-        return self.talker.endswith('@chatroom')
-
-    def get_msg_talker_id(self):
-        if not self.is_chatroom():
-            return self.talker
-        return self.content[:self.content.find(':')]
+        return self.talker != self.chat
 
     def get_chatroom(self):
         if self.is_chatroom():
-            return self.talker[:-9]
+            return self.chat
         else:
             return ''
 

+ 46 - 13
wechat/parser.py

@@ -7,6 +7,7 @@
 import sqlite3
 from collections import defaultdict
 import itertools
+from datetime import datetime
 import logging
 logger = logging.getLogger(__name__)
 
@@ -23,13 +24,15 @@ rcontact
 """
 
 class WeChatDBParser(object):
+    FIELDS = ["msgSvrId","type","isSend","createTime","talker","content","imgPath"]
+
     def __init__(self, db_fname):
         """ db_fname: EnMicroMsg.db"""
         self.db_fname = db_fname
         self.db_conn = sqlite3.connect(self.db_fname)
         self.cc = self.db_conn.cursor()
         self.contacts = {}
-        self.msgs_by_talker = defaultdict(list)
+        self.msgs_by_chat = defaultdict(list)
         self.emojis = {}
         self.internal_emojis = {}
         self._parse()
@@ -46,6 +49,7 @@ SELECT username,conRemark,nickname FROM rcontact
             else:
                 self.contacts[username] = ensure_unicode(nickname)
 
+        self.contacts_rev = {v: k for k, v in self.contacts.iteritems()}
         logger.info("Found {} contacts.".format(len(self.contacts)))
 
     def _parse_msg(self):
@@ -53,19 +57,18 @@ SELECT username,conRemark,nickname FROM rcontact
         db_msgs = self.cc.execute(
 """
 SELECT {} FROM message
-""".format(','.join(WeChatMsg.FIELDS)))
+""".format(','.join(WeChatDBParser.FIELDS)))
         for row in db_msgs:
-            msg = WeChatMsg(row)
+            values = self._parse_row(row)
+            if not values:
+                continue
+            msg = WeChatMsg(values)
+            # TODO keep system message?
             if not WeChatMsg.filter_type(msg.type):
-                self.msgs_by_talker[msg.talker].append(msg)
-
-        # It's possible that messages are kept in database after contacts been deleted
-        # TODO handle this with a random contact name
-        self.msgs_by_talker = {self.contacts[k]: sorted(v, key=lambda x: x.createTime)
-                           for k, v in self.msgs_by_talker.iteritems() if k in self.contacts}
-        for k, v in self.msgs_by_talker.iteritems():
-            for msg in v:
-                msg.talker_name = ensure_unicode(k)
+                self.msgs_by_chat[msg.chat].append(msg)
+
+        for k, v in self.msgs_by_chat.iteritems():
+            self.msgs_by_chat[k] = sorted(v, key=lambda x: x.createTime)
             msgs_tot_cnt += len(v)
         logger.info("Found {} message records.".format(msgs_tot_cnt))
 
@@ -84,7 +87,7 @@ SELECT {} FROM message
     def _find_msg_by_type(self, msgs=None):
         ret = []
         if msgs is None:
-            msgs = itertools.chain.from_iterable(self.msgs_by_talker.itervalues())
+            msgs = itertools.chain.from_iterable(self.msgs_by_chat.itervalues())
         for msg in msgs:
             if msg.type == 34:
                 ret.append(msg)
@@ -114,3 +117,33 @@ SELECT {} FROM message
         self._parse_msg()
         self._parse_imginfo()
         self._parse_emoji()
+
+    # process the values in a row
+    def _parse_row(self, row):
+        values = dict(zip(WeChatDBParser.FIELDS, row))
+        if values['content']:
+            values['content'] = ensure_unicode(values['content'])
+        else:
+            values['content'] = u''
+        values['createTime'] = datetime.fromtimestamp(values['createTime']/ 1000)
+        values['chat'] = values['talker']
+        try:
+            if values['chat'].endswith('@chatroom'):
+                values['chat'] = self.contacts[values['chat']]
+                content = values['content']
+                talker = content[:content.find(':')]
+                try:
+                    values['talker'] = self.contacts[talker]
+                    values['content'] = content[content.find('\n') + 1:]
+                except KeyError:
+                    # system messages have no talker
+                    values['talker'] = u''
+            else:
+                tk_id = values['talker']
+                values['chat'] = self.contacts[tk_id]
+                values['talker'] = self.contacts[tk_id]
+        except KeyError:
+            # It's possible that messages are kept in database after contacts been deleted
+            logger.warn("Unknown contact, probably deleted: {}".format(tk_id))
+            return None
+        return values

+ 9 - 9
wechat/render.py

@@ -92,7 +92,7 @@ class HTMLRender(object):
     def render_msg(self, msg):
         """ render a message, return the html block"""
         # TODO for chatroom, add nickname on avatar
-        sender = u'you ' + msg.get_msg_talker_id() if not msg.isSend else 'me'
+        sender = u'you ' + msg.talker if not msg.isSend else 'me'
         format_dict = {'sender_label': sender,
                        'time': msg.createTime }
         def fallback():
@@ -174,7 +174,7 @@ class HTMLRender(object):
         # string operation is extremely slow
         return self.html.format(extra_css=self.all_css,
                             extra_js=self.all_js,
-                            talker=msgs[0].talker_name,
+                            chat=msgs[0].chat,
                             messages=u''.join(blocks)
                            )
 
@@ -184,25 +184,25 @@ class HTMLRender(object):
         css = avatar_tpl.format(name='me', avatar=my_avatar)
 
         for talker in talkers:
-            avatar = self.res.get_avatar(talker)
+            avatar = self.res.get_avatar(self.parser.contacts_rev[talker])
             css += avatar_tpl.format(name=talker, avatar=avatar)
         self.css_string.append(css)
 
     def render_msgs(self, msgs):
-        """ render msgs of one friend, return a list of html"""
-        talker_id = msgs[0].talker
+        """ render msgs of one chat, return a list of html"""
+        chat = msgs[0].chat
         if msgs[0].is_chatroom():
             talkers = set()
             for msg in msgs:
-                talkers.add(msg.get_msg_talker_id())
+                talkers.add(msg.talker)
         else:
-            talkers = set([talker_id])
+            talkers = set([chat])
         self.prepare_avatar_css(talkers)
 
         self.res.cache_voice_mp3(msgs)
 
-        logger.info(u"Rendering {} messages of {}({})".format(
-            len(msgs), self.parser.contacts[talker_id], talker_id))
+        logger.info(u"Rendering {} messages of {}".format(
+            len(msgs), chat))
 
         self.prgs = ProgressReporter("Render", total=len(msgs))
         slice_by_size = MessageSlicerBySize().slice(msgs)

+ 2 - 2
wechat/static/TP_INDEX.html

@@ -2,7 +2,7 @@
 <head>
   <meta http-equiv="Content-Type" content="text/html;charset=utf8">
   <meta http-equiv="X-UA-Compatible" content="IE=edge">
-  <title>Chat with {talker}</title>
+  <title>Chat with {chat}</title>
   {extra_css}
   {extra_js}
 </head>
@@ -12,7 +12,7 @@
     <div class="chatMainPanel" id="chatMainPanel" style="padding-top:40px;">
       <div class="chatTitle" style="margin-top: -40px;">
         <div class="chatNameWrap">
-          <p class="chatName" id="messagePanelTitle">{talker}</p>
+          <p class="chatName" id="messagePanelTitle">{chat}</p>
         </div>
       </div>
       <div class="chatScorll" style="position: relative; font-family:initial;">