#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# File: parser.py
# Date: Sat Dec 20 15:58:16 2014 +0800
# Author: Yuxin Wu <[email protected]>
  6. import sqlite3
  7. from collections import defaultdict
  8. import itertools
  9. import logging
  10. logger = logging.getLogger(__name__)
  11. from .msg import WeChatMsg
  12. from .utils import ensure_unicode
  13. """ tables in concern:
  14. emojiinfo
  15. imginfo2
  16. addr_upload2
  17. chatroom
  18. message
  19. rcontact
  20. """
  21. class WeChatDBParser(object):
  22. def __init__(self, db_fname):
  23. """ db_fname: EnMicroMsg.db"""
  24. self.db_fname = db_fname
  25. self.db_conn = sqlite3.connect(self.db_fname)
  26. self.cc = self.db_conn.cursor()
  27. self.contacts = {}
  28. self.msgs_by_talker = defaultdict(list)
  29. self.parse()
  30. def _parse_contact(self):
  31. contacts = self.cc.execute(
  32. """
  33. SELECT username,conRemark,nickname FROM rcontact
  34. """)
  35. for row in contacts:
  36. username, remark, nickname = row
  37. if remark:
  38. self.contacts[username] = ensure_unicode(remark)
  39. else:
  40. self.contacts[username] = ensure_unicode(nickname)
  41. logger.info("Found {} contacts.".format(len(self.contacts)))
  42. def _parse_msg(self):
  43. msgs_tot_cnt = 0
  44. db_msgs = self.cc.execute(
  45. """
  46. SELECT {} FROM message
  47. """.format(','.join(WeChatMsg.FIELDS)))
  48. for row in db_msgs:
  49. msg = WeChatMsg(row)
  50. if not WeChatMsg.filter_type(msg.type):
  51. self.msgs_by_talker[msg.talker].append(msg)
  52. self.msgs_by_talker = dict([
  53. (self.contacts[k], sorted(v, key=lambda x: x.createTime))
  54. for k, v in self.msgs_by_talker.iteritems()])
  55. for k, v in self.msgs_by_talker.iteritems():
  56. for msg in v:
  57. msg.talker_name = ensure_unicode(k)
  58. msgs_tot_cnt += len(v)
  59. logger.info("Found {} message records.".format(msgs_tot_cnt))
  60. def _parse_userinfo(self):
  61. userinfo_q = self.cc.execute(""" SELECT id, value FROM userinfo """)
  62. userinfo = dict(userinfo_q)
  63. self.username = userinfo[2]
  64. logger.info("Your username is: {}".format(self.username))
  65. def _parse_imginfo(self):
  66. imginfo_q = self.cc.execute("""SELECT msgSvrId, bigImgPath FROM ImgInfo2""")
  67. self.imginfo = dict([(k, v) for (k, v) in imginfo_q
  68. if not v.startswith('SERVERID://')])
  69. logger.info("Found {} big images records.".format(len(self.imginfo)))
  70. def _find_msg_by_type(self, msgs=None):
  71. ret = []
  72. if msgs is None:
  73. msgs = itertools.chain.from_iterable(self.msgs_by_talker.itervalues())
  74. for msg in msgs:
  75. if msg.type == 34:
  76. ret.append(msg)
  77. return sorted(ret)
  78. def parse(self):
  79. self._parse_userinfo()
  80. self._parse_contact()
  81. self._parse_msg()
  82. self._parse_imginfo()