msg.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. # -*- coding: UTF-8 -*-
  2. TYPE_MSG = 1
  3. TYPE_IMG = 3
  4. TYPE_SPEAK = 34
  5. TYPE_NAMECARD = 42
  6. TYPE_VIDEO_FILE = 43
  7. TYPE_EMOJI = 47
  8. TYPE_LOCATION = 48
  9. TYPE_LINK = 49 # link share OR file from web, see https://github.com/ppwwyyxx/wechat-dump/issues/52
  10. TYPE_VOIP = 50
  11. TYPE_WX_VIDEO = 62 # video took by wechat
  12. TYPE_SYSTEM = 10000
  13. TYPE_CUSTOM_EMOJI = 1048625
  14. TYPE_REDENVELOPE = 436207665
  15. TYPE_MONEY_TRANSFER = 419430449 # 微信转账
  16. TYPE_LOCATION_SHARING = -1879048186
  17. TYPE_REPLY = 822083633 # 回复的消息.
  18. TYPE_FILE = 1090519089
  19. TYPE_QQMUSIC = 1040187441
  20. TYPE_APP_MSG = 16777265
  21. _KNOWN_TYPES = tuple([eval(k) for k in dir() if k.startswith('TYPE_')])
  22. import re
  23. import json
  24. import io
  25. import html
  26. from pyquery import PyQuery
  27. import xml.etree.ElementTree as ET
  28. import logging
  29. logger = logging.getLogger(__name__)
  30. class WeChatMsg(object):
  31. @staticmethod
  32. def filter_type(tp):
  33. if tp in [TYPE_SYSTEM]:
  34. return True
  35. return False
  36. def __init__(self, values):
  37. for k, v in values.items():
  38. setattr(self, k, v)
  39. self.known_type = self.type in _KNOWN_TYPES
  40. def msg_str(self):
  41. if self.type == TYPE_IMG:
  42. return "Image"
  43. elif self.type == TYPE_SPEAK:
  44. return "Voice"
  45. if self.type == TYPE_LOCATION:
  46. try:
  47. pq = PyQuery(self.content_xml_ready, parser='xml')
  48. loc = pq('location').attr
  49. label = loc['label']
  50. poiname = loc['poiname']
  51. if poiname:
  52. label = poiname
  53. return "LOCATION:" + label + " ({},{})".format(loc['x'], loc['y'])
  54. except:
  55. return "LOCATION: unknown"
  56. elif self.type == TYPE_LINK:
  57. pq = PyQuery(self.content_xml_ready)
  58. url = pq('url').text()
  59. if not url:
  60. # TODO: see https://github.com/ppwwyyxx/wechat-dump/issues/52 for
  61. # more logic to implement
  62. title = pq('title').text()
  63. if title: # may not be correct
  64. return "FILE:{}".format(title)
  65. return "Link"
  66. return "URL:{}".format(url)
  67. elif self.type == TYPE_NAMECARD:
  68. pq = PyQuery(self.content_xml_ready, parser='xml')
  69. msg = pq('msg').attr
  70. name = msg['nickname']
  71. if not name:
  72. name = msg['alias']
  73. if not name:
  74. name = ""
  75. return "NAMECARD: {}".format(self.content_xml_ready)
  76. elif self.type == TYPE_APP_MSG:
  77. pq = PyQuery(self.content_xml_ready, parser='xml')
  78. return pq('title').text()
  79. elif self.type == TYPE_VIDEO_FILE:
  80. return "VIDEO FILE"
  81. elif self.type == TYPE_WX_VIDEO:
  82. return "WeChat VIDEO"
  83. elif self.type == TYPE_VOIP:
  84. return "REQUEST VIDEO CHAT"
  85. elif self.type == TYPE_LOCATION_SHARING:
  86. return "LOCATION SHARING"
  87. elif self.type == TYPE_EMOJI:
  88. # TODO add emoji name
  89. if self.content.lstrip().startswith("<"):
  90. return "Emoji"
  91. if not self.content:
  92. return "Emoji"
  93. return self.content
  94. elif self.type == TYPE_CUSTOM_EMOJI:
  95. return "Emoji"
  96. elif self.type == TYPE_REDENVELOPE:
  97. data_to_parse = io.BytesIO(self.content.encode('utf-8'))
  98. try:
  99. for event, elem in ET.iterparse(data_to_parse, events=('end',)):
  100. if elem.tag == 'sendertitle':
  101. title = elem.text
  102. return "[RED ENVELOPE]\n{}".format(title)
  103. except:
  104. pass
  105. return "[RED ENVELOPE]"
  106. elif self.type == TYPE_MONEY_TRANSFER:
  107. data_to_parse = io.BytesIO(self.content.encode('utf-8'))
  108. try:
  109. for event, elem in ET.iterparse(data_to_parse, events=('end',)):
  110. if elem.tag == 'des':
  111. title = elem.text
  112. return "[Money Transfer]\n{}".format(title)
  113. except:
  114. pass
  115. return "[Money Transfer]"
  116. elif self.type == TYPE_REPLY:
  117. pq = PyQuery(self.content_xml_ready)
  118. titles = pq('title')
  119. if len(titles) == 0:
  120. return self.content_xml_ready
  121. msg = titles[0].text
  122. # TODO parse reply.
  123. return msg
  124. elif self.type == TYPE_FILE:
  125. pq = PyQuery(self.content_xml_ready)
  126. titles = pq('title')
  127. if len(titles) == 0:
  128. return self.content_xml_ready
  129. return "FILE:" + titles[0].text
  130. elif self.type == TYPE_QQMUSIC:
  131. pq = PyQuery(self.content_xml_ready)
  132. title = pq('title')[0].text
  133. singer = pq('des')[0].text
  134. url = html.unescape(pq('url')[0].text)
  135. return json.dumps(dict(
  136. title=title, singer=singer, url=url
  137. ))
  138. else:
  139. # TODO replace smiley with text
  140. return self.content
  141. def reply_info(self):
  142. """Parse TYPE_REPLY payload.
  143. Returns: {title, ref_name, ref_content, ref_type, ref_svrid}.
  144. """
  145. if self.type != TYPE_REPLY:
  146. return None
  147. def _one_line(text: str, *, max_len: int) -> str:
  148. text = re.sub(r"\s+", " ", (text or "")).strip()
  149. if len(text) > max_len:
  150. return text[: max_len - 1] + "…"
  151. return text
  152. xml = self.content_xml_ready
  153. idx = xml.find("<msg")
  154. if idx != -1:
  155. xml = xml[idx:]
  156. try:
  157. pq = PyQuery(xml, parser="xml")
  158. except Exception:
  159. return None
  160. title = html.unescape(pq("appmsg > title").text() or pq("title").eq(0).text() or "")
  161. ref_name_raw = pq("refermsg displayname").text() or pq("refermsg fromusr").text() or ""
  162. ref_name = _one_line(html.unescape(ref_name_raw), max_len=80)
  163. ref_content_raw = pq("refermsg content").text() or ""
  164. ref_svrid_i = None
  165. ref_svrid = pq("refermsg svrid").text() or pq("refermsg svrId").text()
  166. if ref_svrid:
  167. try:
  168. ref_svrid_i = int(ref_svrid)
  169. except Exception:
  170. ref_svrid_i = None
  171. ref_type_i = None
  172. ref_type = pq("refermsg type").text()
  173. if ref_type:
  174. try:
  175. ref_type_i = int(ref_type)
  176. except Exception:
  177. ref_type_i = None
  178. ref_content_fallback = html.unescape(ref_content_raw or "")
  179. if ref_type_i is None:
  180. ref_content = ref_content_fallback
  181. else:
  182. try:
  183. ref_content = WeChatMsg({"type": ref_type_i, "content": ref_content_fallback}).msg_str()
  184. except Exception:
  185. ref_content = ref_content_fallback
  186. ref_content = _one_line(ref_content, max_len=200)
  187. if not title and not ref_name and not ref_content:
  188. return None
  189. return {
  190. "title": title.strip(),
  191. "ref_name": ref_name,
  192. "ref_content": ref_content,
  193. "ref_type": ref_type_i,
  194. "ref_svrid": ref_svrid_i,
  195. }
  196. @property
  197. def content_xml_ready(self):
  198. # remove xml headers to avoid possible errors it may create
  199. header = re.compile(r'<\?.*\?>')
  200. msg = header.sub("", self.content)
  201. return msg
  202. def __repr__(self):
  203. ret = "{}|{}:{}:{}".format(
  204. self.type,
  205. self.talker_nickname if not self.isSend else 'me',
  206. self.createTime,
  207. self.msg_str())
  208. if self.imgPath:
  209. ret = "{}|img:{}".format(ret.strip(), self.imgPath)
  210. return ret
  211. else:
  212. return ret
  213. def __eq__(self, r):
  214. return self.createTime == r.createTime and \
  215. self.talker == r.talker and \
  216. self.isSend == r.isSend
  217. # imgPath might change after migration.
  218. def __lt__(self, r):
  219. return self.createTime < r.createTime
  220. def is_chatroom(self):
  221. return self.talker != self.chat
  222. def get_chatroom(self):
  223. if self.is_chatroom():
  224. return self.chat
  225. else:
  226. return ''
  227. def get_emoji_product_id(self):
  228. assert self.type == TYPE_EMOJI, "Wrong call to get_emoji_product_id()!"
  229. pq = PyQuery(self.content_xml_ready, parser='xml')
  230. emoji = pq('emoji')
  231. if not emoji:
  232. return None
  233. return emoji.attrs['productid']