parser.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. #!/usr/bin/env python2
  2. # -*- coding: UTF-8 -*-
  3. # File: parser.py
  4. # Date: Thu Jun 18 00:03:53 2015 +0800
  5. # Author: Yuxin Wu <[email protected]>
  6. import sqlite3
  7. from collections import defaultdict
  8. import itertools
  9. from datetime import datetime
  10. import logging
  11. logger = logging.getLogger(__name__)
  12. from .msg import WeChatMsg, TYPE_SYSTEM
  13. from common.textutil import ensure_unicode
""" Tables of concern:
emojiinfo
imginfo2
addr_upload2
chatroom
message
rcontact
"""
  22. class WeChatDBParser(object):
  23. FIELDS = ["msgSvrId","type","isSend","createTime","talker","content","imgPath"]
  24. def __init__(self, db_fname):
  25. """ db_fname: a decoded EnMicroMsg.db"""
  26. self.db_fname = db_fname
  27. self.db_conn = sqlite3.connect(self.db_fname)
  28. self.cc = self.db_conn.cursor()
  29. self.contacts = {} # username -> nickname
  30. self.contacts_rev = defaultdict(list)
  31. self.msgs_by_chat = defaultdict(list)
  32. self.emoji_groups = {}
  33. self.emoji_url = {}
  34. self.internal_emojis = {}
  35. self._parse()
  36. def _parse_contact(self):
  37. contacts = self.cc.execute(
  38. """
  39. SELECT username,conRemark,nickname FROM rcontact
  40. """)
  41. for row in contacts:
  42. username, remark, nickname = row
  43. if remark:
  44. self.contacts[username] = ensure_unicode(remark)
  45. else:
  46. self.contacts[username] = ensure_unicode(nickname)
  47. for k, v in self.contacts.iteritems():
  48. self.contacts_rev[v].append(k)
  49. logger.info("Found {} names in `contact` table.".format(len(self.contacts)))
  50. def _parse_msg(self):
  51. msgs_tot_cnt = 0
  52. db_msgs = self.cc.execute(
  53. """
  54. SELECT {} FROM message
  55. """.format(','.join(WeChatDBParser.FIELDS)))
  56. for row in db_msgs:
  57. values = self._parse_msg_row(row)
  58. if not values:
  59. continue
  60. msg = WeChatMsg(values)
  61. # TODO keep system message?
  62. if not WeChatMsg.filter_type(msg.type):
  63. self.msgs_by_chat[msg.chat].append(msg)
  64. for k, v in self.msgs_by_chat.iteritems():
  65. self.msgs_by_chat[k] = sorted(v, key=lambda x: x.createTime)
  66. msgs_tot_cnt += len(v)
  67. logger.info("Found {} message records.".format(msgs_tot_cnt))
  68. def _parse_userinfo(self):
  69. userinfo_q = self.cc.execute(""" SELECT id, value FROM userinfo """)
  70. userinfo = dict(userinfo_q)
  71. self.username = userinfo[2]
  72. logger.info("Your username is: {}".format(self.username))
  73. def _parse_imginfo(self):
  74. imginfo_q = self.cc.execute("""SELECT msgSvrId, bigImgPath FROM ImgInfo2""")
  75. self.imginfo = {k: v for (k, v) in imginfo_q
  76. if not v.startswith('SERVERID://')}
  77. logger.info("Found {} hd image records.".format(len(self.imginfo)))
  78. def _find_msg_by_type(self, msgs=None):
  79. ret = []
  80. if msgs is None:
  81. msgs = itertools.chain.from_iterable(self.msgs_by_chat.itervalues())
  82. for msg in msgs:
  83. if msg.type == 34:
  84. ret.append(msg)
  85. return sorted(ret)
  86. def _parse_emoji(self):
  87. # wechat provided emojis
  88. emojiinfo_q = self.cc.execute(
  89. """ SELECT md5, groupid FROM EmojiInfoDesc """)
  90. for row in emojiinfo_q:
  91. md5, group = row
  92. self.emoji_groups[md5] = group
  93. NEEDED_EMOJI_CATALOG = [49, 50, 17]
  94. emojiinfo_q = self.cc.execute(
  95. """ SELECT md5, catalog, name, cdnUrl FROM EmojiInfo""")
  96. for row in emojiinfo_q:
  97. md5, catalog, name, cdnUrl = row
  98. if cdnUrl:
  99. self.emoji_url[md5] = cdnUrl
  100. if catalog not in NEEDED_EMOJI_CATALOG:
  101. continue
  102. self.internal_emojis[md5] = name
  103. def _parse(self):
  104. self._parse_userinfo()
  105. self._parse_contact()
  106. self._parse_msg()
  107. self._parse_imginfo()
  108. self._parse_emoji()
  109. # process the values in a row
  110. def _parse_msg_row(self, row):
  111. """ parse a record of message into my format"""
  112. values = dict(zip(WeChatDBParser.FIELDS, row))
  113. if values['content']:
  114. values['content'] = ensure_unicode(values['content'])
  115. else:
  116. values['content'] = u''
  117. values['createTime'] = datetime.fromtimestamp(values['createTime']/ 1000)
  118. values['chat'] = values['talker']
  119. try:
  120. if values['chat'].endswith('@chatroom'):
  121. values['chat_nickname'] = self.contacts[values['chat']]
  122. content = values['content']
  123. if values['isSend'] == 1:
  124. values['talker'] = self.username
  125. elif values['type'] == TYPE_SYSTEM:
  126. values['talker'] = u'SYSTEM'
  127. else:
  128. talker = content[:content.find(':')]
  129. values['talker'] = talker
  130. values['talker_nickname'] = self.contacts.get(talker, talker)
  131. values['content'] = content[content.find('\n') + 1:]
  132. else:
  133. tk_id = values['talker']
  134. values['chat'] = tk_id
  135. values['chat_nickname'] = self.contacts[tk_id]
  136. values['talker'] = tk_id
  137. values['talker_nickname'] = self.contacts[tk_id]
  138. except KeyError:
  139. # It's possible that messages are kept in database after contacts been deleted
  140. logger.warn("Unknown contact: {}".format(values.get('talker', '')))
  141. return None
  142. return values
  143. @property
  144. def all_chat_ids(self):
  145. return self.msgs_by_chat.keys()
  146. @property
  147. def all_chat_nicknames(self):
  148. return [self.contacts[k] for k in self.all_chat_ids]
  149. def get_id_by_nickname(self, nickname):
  150. l = self.contacts_rev[nickname]
  151. if len(l) == 0:
  152. raise KeyError("No contacts have nickname {}".format(nickname))
  153. if len(l) > 1:
  154. logger.warn("More than one contacts have nickname {}! Using the first contact".format(nickname))
  155. return l[0]