parser.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. # -*- coding: UTF-8 -*-
  2. import sqlite3
  3. from collections import defaultdict
  4. import itertools
  5. from datetime import datetime
  6. import logging
  7. logger = logging.getLogger(__name__)
  8. from .msg import WeChatMsg, TYPE_SYSTEM
  9. from .common.textutil import ensure_unicode
  10. """ tables in concern:
  11. emojiinfo
  12. imginfo2
  13. addr_upload2
  14. chatroom
  15. message
  16. rcontact
  17. """
  18. class WeChatDBParser(object):
  19. FIELDS = ["msgSvrId","type","isSend","createTime","talker","content","imgPath"]
  20. def __init__(self, db_fname):
  21. """ db_fname: a decoded EnMicroMsg.db"""
  22. self.db_fname = db_fname
  23. self.db_conn = sqlite3.connect(self.db_fname)
  24. self.cc = self.db_conn.cursor()
  25. self.contacts = {} # username -> nickname
  26. self.contacts_rev = defaultdict(list)
  27. self.msgs_by_chat = defaultdict(list)
  28. self.emoji_groups = {}
  29. self.emoji_info = {}
  30. self.emoji_encryption_key = None
  31. self._parse()
  32. def _parse_contact(self):
  33. contacts = self.cc.execute(
  34. """
  35. SELECT username,conRemark,nickname FROM rcontact
  36. """)
  37. for row in contacts:
  38. username, remark, nickname = row
  39. if remark:
  40. self.contacts[username] = ensure_unicode(remark)
  41. else:
  42. self.contacts[username] = ensure_unicode(nickname)
  43. for k, v in self.contacts.items():
  44. self.contacts_rev[v].append(k)
  45. logger.info("Found {} names in `contact` table.".format(len(self.contacts)))
  46. def _parse_msg(self):
  47. msgs_tot_cnt = 0
  48. db_msgs = self.cc.execute(
  49. """
  50. SELECT {} FROM message
  51. """.format(','.join(WeChatDBParser.FIELDS)))
  52. for row in db_msgs:
  53. values = self._parse_msg_row(row)
  54. if not values:
  55. continue
  56. msg = WeChatMsg(values)
  57. # TODO keep system message?
  58. if not WeChatMsg.filter_type(msg.type):
  59. self.msgs_by_chat[msg.chat].append(msg)
  60. for k, v in self.msgs_by_chat.items():
  61. self.msgs_by_chat[k] = sorted(v, key=lambda x: x.createTime)
  62. msgs_tot_cnt += len(v)
  63. logger.info("Found {} message records.".format(msgs_tot_cnt))
  64. def _parse_userinfo(self):
  65. userinfo_q = self.cc.execute(""" SELECT id, value FROM userinfo """)
  66. userinfo = dict(userinfo_q)
  67. self.username = userinfo.get(2, None)
  68. if self.username is None:
  69. logger.error("Cannot find username in userinfo table!")
  70. self.username = input("Please enter your username:")
  71. logger.info("Your username is: {}".format(self.username))
  72. def _parse_imginfo(self):
  73. imginfo_q = self.cc.execute("""SELECT msgSvrId, bigImgPath FROM ImgInfo2""")
  74. self.imginfo = {k: v for (k, v) in imginfo_q
  75. if not v.startswith('SERVERID://')}
  76. logger.info("Found {} hd image records.".format(len(self.imginfo)))
  77. def _find_msg_by_type(self, msgs=None):
  78. ret = []
  79. if msgs is None:
  80. msgs = itertools.chain.from_iterable(self.msgs_by_chat.itervalues())
  81. for msg in msgs:
  82. if msg.type == 34:
  83. ret.append(msg)
  84. return sorted(ret)
  85. def _parse_emoji(self):
  86. # wechat provided emojis
  87. query = self.cc.execute(
  88. """ SELECT md5, groupid FROM EmojiInfoDesc """)
  89. for row in query:
  90. md5, group = row
  91. self.emoji_groups[md5] = group
  92. try:
  93. query = self.cc.execute(
  94. """ SELECT md5, catalog, name, cdnUrl, encrypturl, aeskey FROM EmojiInfo""")
  95. except: # old database does not have cdnurl
  96. pass
  97. else:
  98. for row in query:
  99. md5, catalog, name, cdnUrl, encrypturl, aeskey = row
  100. if cdnUrl or encrypturl:
  101. self.emoji_info[md5] = (catalog, cdnUrl, encrypturl, aeskey)
  102. def _parse(self):
  103. self._parse_userinfo()
  104. self._parse_contact()
  105. self._parse_msg()
  106. self._parse_imginfo()
  107. self._parse_emoji()
  108. def get_emoji_encryption_key(self):
  109. # obtain local encryption key in a special entry in the database
  110. # this also equals to md5(imei)
  111. query = self.cc.execute("SELECT md5 FROM EmojiInfo where catalog == 153")
  112. results = list(query)
  113. if len(results):
  114. assert len(results) == 1, "Found > 1 encryption keys in EmojiInfo. This is a bug!"
  115. return results[0][0]
  116. return None
  117. # process the values in a row
  118. def _parse_msg_row(self, row):
  119. """ parse a record of message into my format"""
  120. values = dict(zip(WeChatDBParser.FIELDS, row))
  121. if values['content']:
  122. values['content'] = ensure_unicode(values['content'])
  123. else:
  124. values['content'] = ''
  125. values['createTime'] = datetime.fromtimestamp(values['createTime']/ 1000)
  126. values['chat'] = values['talker']
  127. try:
  128. if values['chat'].endswith('@chatroom'):
  129. values['chat_nickname'] = self.contacts[values['chat']]
  130. content = values['content']
  131. if values['isSend'] == 1:
  132. values['talker'] = self.username
  133. elif values['type'] == TYPE_SYSTEM:
  134. values['talker'] = 'SYSTEM'
  135. else:
  136. talker = content[:content.find(':')]
  137. values['talker'] = talker
  138. values['talker_nickname'] = self.contacts.get(talker, talker)
  139. values['content'] = content[content.find('\n') + 1:]
  140. else:
  141. tk_id = values['talker']
  142. values['chat'] = tk_id
  143. values['chat_nickname'] = self.contacts[tk_id]
  144. values['talker'] = tk_id
  145. values['talker_nickname'] = self.contacts[tk_id]
  146. except KeyError:
  147. # It's possible that messages are kept in database after contacts been deleted
  148. logger.warn("Unknown contact: {}".format(values.get('talker', '')))
  149. return None
  150. return values
  151. @property
  152. def all_chat_ids(self):
  153. return self.msgs_by_chat.keys()
  154. @property
  155. def all_chat_nicknames(self):
  156. return [self.contacts[k] for k in self.all_chat_ids if len(self.contacts[k])]
  157. def get_id_by_nickname(self, nickname):
  158. """
  159. Get chat id by nickname.
  160. """
  161. l = self.contacts_rev[nickname]
  162. if len(l) == 0:
  163. raise KeyError("No contacts have nickname {}".format(nickname))
  164. if len(l) > 1:
  165. logger.warn("More than one contacts have nickname {}! Using the first contact".format(nickname))
  166. return l[0]
  167. def get_chat_id(self, nick_name_or_id):
  168. """
  169. Get the unique chat id by either chat id itself, or the nickname of the chat.
  170. """
  171. if nick_name_or_id in self.contacts:
  172. return nick_name_or_id
  173. else:
  174. return self.get_id_by_nickname(nick_name_or_id)