parser.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. #!/usr/bin/env python2
  2. # -*- coding: UTF-8 -*-
  3. # File: parser.py
  4. # Date: Thu Jun 18 00:03:53 2015 +0800
  5. # Author: Yuxin Wu <[email protected]>
  6. import sqlite3
  7. from collections import defaultdict
  8. import itertools
  9. from datetime import datetime
  10. import logging
  11. logger = logging.getLogger(__name__)
  12. from .msg import WeChatMsg
  13. from common.textutil import ensure_unicode
  14. """ tables in concern:
  15. emojiinfo
  16. imginfo2
  17. addr_upload2
  18. chatroom
  19. message
  20. rcontact
  21. """
  22. class WeChatDBParser(object):
  23. FIELDS = ["msgSvrId","type","isSend","createTime","talker","content","imgPath"]
  24. def __init__(self, db_fname):
  25. """ db_fname: a decrypted EnMicroMsg.db"""
  26. self.db_fname = db_fname
  27. self.db_conn = sqlite3.connect(self.db_fname)
  28. self.cc = self.db_conn.cursor()
  29. self.contacts = {}
  30. self.msgs_by_chat = defaultdict(list)
  31. self.emoji_groups = {}
  32. self.emoji_url = {}
  33. self.internal_emojis = {}
  34. self._parse()
  def _parse_contact(self):
      """Load the display name of every row in the ``rcontact`` table.

      Fills ``self.contacts`` (username -> remark if one is set, otherwise
      nickname) and ``self.contacts_rev`` (display name -> username).
      """
      rows = self.cc.execute(
          "SELECT username,conRemark,nickname FROM rcontact")
      for username, remark, nickname in rows:
          # A user-assigned remark takes precedence over the nickname.
          self.contacts[username] = ensure_unicode(remark or nickname)

      # Display names are not guaranteed unique; when two usernames share
      # one, the entry visited last wins.
      # items() instead of iteritems() keeps this working on Python 2 and 3.
      self.contacts_rev = {
          name: username for username, name in self.contacts.items()}
      logger.info("Found %d contacts.", len(self.contacts))
  def _parse_msg(self):
      """Load the ``message`` table into ``self.msgs_by_chat``.

      Each row goes through ``_parse_msg_row``; rows it rejects and messages
      for which ``WeChatMsg.filter_type`` returns a true value are dropped.
      The remaining messages are grouped by ``msg.chat`` and every group is
      sorted by ``createTime``.
      """
      query = "SELECT {} FROM message".format(
          ','.join(WeChatDBParser.FIELDS))
      for row in self.cc.execute(query):
          values = self._parse_msg_row(row)
          if not values:
              continue
          msg = WeChatMsg(values)
          # TODO keep system message?
          if not WeChatMsg.filter_type(msg.type):
              self.msgs_by_chat[msg.chat].append(msg)

      # Sort each chat in place rather than rebinding dict entries while
      # iterating; values() works on both Python 2 and 3.
      for chat_msgs in self.msgs_by_chat.values():
          chat_msgs.sort(key=lambda m: m.createTime)

      msgs_tot_cnt = sum(
          len(chat_msgs) for chat_msgs in self.msgs_by_chat.values())
      logger.info("Found %d message records.", msgs_tot_cnt)
  def _parse_userinfo(self):
      """Read the ``userinfo`` table and store the entry with id 2.

      The value found under id 2 is kept as ``self.username``; a missing
      entry raises ``KeyError``.
      """
      rows = self.cc.execute(""" SELECT id, value FROM userinfo """)
      settings = {key: value for key, value in rows}
      self.username = settings[2]
      logger.info("Your username is: {}".format(self.username))
  def _parse_imginfo(self):
      """Collect usable HD image paths from the ``ImgInfo2`` table.

      Fills ``self.imginfo`` with msgSvrId -> bigImgPath for every row whose
      path is set and does not start with ``'SERVERID://'``.
      """
      rows = self.cc.execute("""SELECT msgSvrId, bigImgPath FROM ImgInfo2""")
      # A NULL bigImgPath comes back as None; skip it instead of crashing
      # on None.startswith.
      self.imginfo = {
          svr_id: path for svr_id, path in rows
          if path is not None and not path.startswith('SERVERID://')}
      logger.info("Found %d hd image records.", len(self.imginfo))
  76. def _find_msg_by_type(self, msgs=None):
  77. ret = []
  78. if msgs is None:
  79. msgs = itertools.chain.from_iterable(self.msgs_by_chat.itervalues())
  80. for msg in msgs:
  81. if msg.type == 34:
  82. ret.append(msg)
  83. return sorted(ret)
  84. def _parse_emoji(self):
  85. # wechat provided emojis
  86. emojiinfo_q = self.cc.execute(
  87. """ SELECT md5, groupid FROM EmojiInfoDesc """)
  88. for row in emojiinfo_q:
  89. md5, group = row
  90. self.emoji_groups[md5] = group
  91. NEEDED_EMOJI_CATALOG = [49, 50, 17]
  92. emojiinfo_q = self.cc.execute(
  93. """ SELECT md5, catalog, name, cdnUrl FROM EmojiInfo""")
  94. for row in emojiinfo_q:
  95. md5, catalog, name, cdnUrl = row
  96. if cdnUrl:
  97. self.emoji_url[md5] = cdnUrl
  98. if catalog not in NEEDED_EMOJI_CATALOG:
  99. continue
  100. self.internal_emojis[md5] = name
  def _parse(self):
      """Run every table parser once, in a fixed order."""
      # Contacts are loaded before messages because _parse_msg_row resolves
      # talker ids through self.contacts.
      stages = (
          self._parse_userinfo,
          self._parse_contact,
          self._parse_msg,
          self._parse_imginfo,
          self._parse_emoji,
      )
      for stage in stages:
          stage()
  107. # process the values in a row
  108. def _parse_msg_row(self, row):
  109. """ parse a record of message into my format"""
  110. values = dict(zip(WeChatDBParser.FIELDS, row))
  111. if values['content']:
  112. values['content'] = ensure_unicode(values['content'])
  113. else:
  114. values['content'] = u''
  115. values['createTime'] = datetime.fromtimestamp(values['createTime']/ 1000)
  116. values['chat'] = values['talker']
  117. try:
  118. if values['chat'].endswith('@chatroom'):
  119. values['chat'] = self.contacts[values['chat']]
  120. content = values['content']
  121. talker = content[:content.find(':')]
  122. try:
  123. values['talker'] = self.contacts[talker]
  124. values['content'] = content[content.find('\n') + 1:]
  125. except KeyError:
  126. # system messages have no talker
  127. values['talker'] = u''
  128. else:
  129. tk_id = values['talker']
  130. values['chat'] = self.contacts[tk_id]
  131. values['talker'] = self.contacts[tk_id]
  132. except KeyError:
  133. # It's possible that messages are kept in database after contacts been deleted
  134. logger.warn("Unknown contact, probably deleted: {}".format(tk_id))
  135. return None
  136. return values