2
0

emoji.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. import os
  2. from pathlib import Path
  3. import logging
  4. import io
  5. import requests
  6. import base64
  7. from PIL import Image
  8. import pickle
  9. from Crypto.Cipher import AES
  10. from .wxgf import WxgfDecoder, is_wxgf_buffer
  11. from .parser import WeChatDBParser
  12. from .common.imgutil import what as img_what
  13. from .common.textutil import md5 as get_md5_hex, get_file_b64, get_file_md5
  14. LIB_PATH = os.path.dirname(os.path.abspath(__file__))
  15. DEFAULT_EMOJI_CACHE = os.path.join(LIB_PATH, '..', 'emoji.cache')
  16. logger = logging.getLogger(__name__)
  17. def _get_aes_key(md5):
  18. # ascii representation of the first half of md5 is used as aes key
  19. assert len(md5) == 32
  20. return md5[:16].encode('ascii')
  21. class EmojiReader:
  22. def __init__(self,
  23. resource_dir: str,
  24. parser: WeChatDBParser,
  25. *,
  26. wxgf_decoder: WxgfDecoder,
  27. cache_file: str=None):
  28. """
  29. Args:
  30. resource_dir: path to resource/
  31. parser: Database parser
  32. wxgf_decoder: Wxgf image decoder
  33. cache_file: a cache file to store emoji downloaded from URLs.
  34. default to a emoji.cache file under wechat-dump.
  35. """
  36. self.emoji_dir = Path(resource_dir) / 'emoji'
  37. assert self.emoji_dir.is_dir(), self.emoji_dir
  38. self.parser = parser
  39. self.emoji_info = parser.emoji_info or {}
  40. # mapping from md5 to the (cdnurl, encrypturl, aeskey)
  41. # columns in EmojiInfo table.
  42. self.cache_file = cache_file or DEFAULT_EMOJI_CACHE
  43. self.wxgf_decoder = wxgf_decoder
  44. # cache stores md5 -> (base64str, format)
  45. if os.path.isfile(self.cache_file):
  46. with open(self.cache_file, "rb") as f:
  47. self._cache = pickle.load(f)
  48. else:
  49. self._cache = {}
  50. self._cache_size = len(self._cache)
  51. self.encryption_key = parser.get_emoji_encryption_key()
  52. if self.encryption_key is not None:
  53. self.encryption_key = _get_aes_key(self.encryption_key)
  54. def get_emoji(self, md5):
  55. """ Returns: (b64 encoded img string, format) """
  56. assert md5, f"Invalid md5 {md5}!"
  57. # check cache
  58. img, format = self._cache_query(md5)
  59. if format:
  60. return img, format
  61. # check resource/
  62. subdir = self.parser.emoji_groups.get(md5, '')
  63. dir_to_search = self.emoji_dir / subdir
  64. img, format = self._search_in_res(dir_to_search, md5, False)
  65. if format:
  66. return img, format
  67. emoji_info = self.emoji_info.get(md5, None)
  68. if emoji_info:
  69. catalog, cdnurl, encrypturl, aeskey = emoji_info
  70. img, format = self._fetch(md5, cdnurl, encrypturl, aeskey)
  71. if format:
  72. return img, format
  73. img, format = self._search_in_res(dir_to_search, md5, True)
  74. if format:
  75. logger.info(f"Using fallback for emoji {md5}")
  76. return img, format
  77. else:
  78. emoji_in_table = emoji_info is not None
  79. msg = "not in database" if not emoji_in_table else f"group='{subdir}'"
  80. logger.warning(f"Cannot find emoji {md5}: {msg}")
  81. return None, None
  82. def _cache_query(self, md5):
  83. data, format = self._cache.get(md5, (None, None))
  84. if data is not None and not isinstance(data, str):
  85. data = data.decode('ascii')
  86. return data, format
  87. def _cache_add(self, md5, values):
  88. self._cache[md5] = values
  89. if len(self._cache) >= self._cache_size + 15:
  90. self.flush_cache()
  91. def flush_cache(self):
  92. if len(self._cache) > self._cache_size:
  93. self._cache_size = len(self._cache)
  94. with open(self.cache_file, 'wb') as f:
  95. pickle.dump(self._cache, f, protocol=-1)
  96. def _search_in_res(self, dir, md5, allow_fallback=False):
  97. if allow_fallback:
  98. candidates = dir.glob(f'{md5}*')
  99. # There are misc low-quality matches, e.g.:
  100. # 'md5_{0..15}' for each frame of gif, non-animated md5_thumb, md5_cover
  101. # candidates = [k for k in candidates if not re.match('.*_[0-9]+$', k)]
  102. # candidates = [k for k in candidates if (not k.endswith('_cover') and not k.endswith('_thumb')))]
  103. else:
  104. if (dir / md5).is_file():
  105. candidates = [dir / md5]
  106. else:
  107. candidates = []
  108. def get_data_no_fallback(fname):
  109. if img_what(fname):
  110. data_md5 = get_file_md5(fname)
  111. if data_md5 == md5:
  112. return get_file_b64(fname), img_what(fname)
  113. try:
  114. content = self._decode_emoji(fname)
  115. data_md5 = get_md5_hex(content)
  116. if data_md5 != md5:
  117. if is_wxgf_buffer(content):
  118. content = self.wxgf_decoder.decode_with_cache(fname, content)
  119. if content is None:
  120. if not self.wxgf_decoder.has_server():
  121. logger.warning("Cannot decode wxgf emojis. Install ffmpeg+ffprobe or provide a wxgf decoder server with --wxgf-server.")
  122. raise ValueError("Failed to decode wxgf file.")
  123. else:
  124. raise ValueError("Decoded data mismatch md5!")
  125. im = Image.open(io.BytesIO(content))
  126. return (base64.b64encode(content).decode('ascii'), im.format.lower())
  127. except Exception as e:
  128. logger.error(f"Error decoding emoji {fname} : {str(e)}")
  129. def get_data_fallback(fname):
  130. if not img_what(fname):
  131. return # fallback files are not encrypted
  132. return get_file_b64(fname), img_what(fname)
  133. get_data_func = get_data_fallback if allow_fallback else get_data_no_fallback
  134. results = [(x, get_data_func(x)) for x in candidates]
  135. results = [(a, b) for a, b in results if b is not None]
  136. # maybe sort candidates by heuristics?
  137. if len(results):
  138. return results[0][1]
  139. return (None, None)
  140. def _decode_emoji(self, fname):
  141. cipher = AES.new(self.encryption_key, AES.MODE_ECB)
  142. with open(fname, 'rb') as f:
  143. head = f.read(1024)
  144. plain_head = cipher.decrypt(head)
  145. data = plain_head + f.read()
  146. return data
  147. def _fetch(self, md5, cdnurl, encrypturl, aeskey):
  148. ret = None
  149. if cdnurl:
  150. try:
  151. logger.info("Requesting emoji {} from {} ...".format(md5, cdnurl))
  152. r = requests.get(cdnurl).content
  153. emoji_md5 = get_md5_hex(r)
  154. im = Image.open(io.BytesIO(r))
  155. ret = (base64.b64encode(r).decode('ascii'), im.format.lower())
  156. if emoji_md5 == md5:
  157. self._cache_add(md5, ret)
  158. return ret
  159. else:
  160. raise ValueError("Emoji MD5 from CDNURL does not match")
  161. except Exception:
  162. logger.debug("Error processing cdnurl {}".format(cdnurl))
  163. if encrypturl:
  164. try:
  165. logger.info("Requesting encrypted emoji {} from {} ...".format(md5, encrypturl))
  166. buf = requests.get(encrypturl).content
  167. if buf == b'':
  168. logger.error(f"Failed to download emoji {md5}")
  169. return None, None
  170. aeskey = bytes.fromhex(aeskey)
  171. cipher = AES.new(aeskey, AES.MODE_CBC, iv=aeskey)
  172. decoded_buf = cipher.decrypt(buf)
  173. im = Image.open(io.BytesIO(decoded_buf))
  174. ret = (base64.b64encode(decoded_buf).decode('ascii'), im.format.lower())
  175. self._cache_add(md5, ret)
  176. return ret
  177. except Exception:
  178. logger.exception("Error processing encrypturl {}".format(encrypturl))
  179. if ret is not None:
  180. # ret may become something with wrong md5. Try it anyway, but don't cache.
  181. return ret
  182. return None, None
  183. if __name__ == "__main__":
  184. logger.setLevel(logging.DEBUG)
  185. handler = logging.StreamHandler()
  186. logger.addHandler(handler)
  187. class Dummy():
  188. def _cache_add(self, md5, ret):
  189. pass
  190. # test decryption
  191. md5 = '5a7fc462d63ef845e6d99c1523bbc91e'
  192. encurl = 'http://emoji.qpic.cn/wx_emoji/CQmBgayyMuvscRVEKN9s4HyTjKVU9iacqqhyCpdtqOVcCql5JaibjDFg/'
  193. enckey = '8ba7f51f9f3ac58cf8ed937fc90200a6'
  194. b64, format = EmojiReader._fetch(Dummy(), md5, None, encurl, enckey)
  195. print("format=", format)