res.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. # -*- coding: UTF-8 -*-
  2. import glob
  3. import os
  4. import re
  5. from PIL import Image
  6. import io
  7. import base64
  8. import logging
  9. logger = logging.getLogger(__name__)
  10. import imghdr
  11. from multiprocessing import Pool
  12. import atexit
  13. import pickle
  14. import requests
  15. from .avatar import AvatarReader
  16. from .common.textutil import md5, get_file_b64
  17. from .common.timer import timing
  18. from .msg import TYPE_SPEAK
  19. from .audio import parse_wechat_audio_file
  20. LIB_PATH = os.path.dirname(os.path.abspath(__file__))
  21. INTERNAL_EMOJI_DIR = os.path.join(LIB_PATH, 'static', 'internal_emoji')
  22. VOICE_DIRNAME = 'voice2'
  23. IMG_DIRNAME = 'image2'
  24. EMOJI_DIRNAME = 'emoji'
  25. VIDEO_DIRNAME = 'video'
  26. JPEG_QUALITY = 50
  27. class EmojiCache(object):
  28. def __init__(self, fname):
  29. self.fname = fname
  30. if os.path.isfile(fname):
  31. with open(fname, 'rb') as f:
  32. self.dic = pickle.load(f)
  33. else:
  34. self.dic = {}
  35. self._curr_size = len(self.dic)
  36. def query(self, md5):
  37. data, format = self.dic.get(md5, (None, None))
  38. if data is not None and not isinstance(data, str):
  39. data = data.decode('ascii')
  40. return data, format
  41. def fetch(self, md5, url):
  42. try:
  43. logger.info("Requesting emoji {} from {} ...".format(md5, url))
  44. r = requests.get(url).content
  45. im = Image.open(io.BytesIO(r))
  46. format = im.format.lower()
  47. ret = (base64.b64encode(r).decode('ascii'), format)
  48. self.dic[md5] = ret
  49. if len(self.dic) >= self._curr_size + 10:
  50. self.flush()
  51. return ret
  52. except Exception as e:
  53. logger.exception("Error processing emoji from {}".format(url))
  54. return None, None
  55. def flush(self):
  56. if len(self.dic) > self._curr_size:
  57. self._curr_size = len(self.dic)
  58. with open(self.fname, 'wb') as f:
  59. pickle.dump(self.dic, f)
  60. class Resource(object):
  61. """ multimedia resources in chat"""
  62. def __init__(self, parser, res_dir, avt_db):
  63. def check(subdir):
  64. dir_to_check = os.path.join(res_dir, subdir)
  65. assert os.path.isdir(dir_to_check), f"No such directory: {dir_to_check}"
  66. [check(k) for k in ['', IMG_DIRNAME, EMOJI_DIRNAME, VOICE_DIRNAME]]
  67. self.emoji_cache = EmojiCache(
  68. os.path.join(os.path.dirname(os.path.abspath(__file__)),
  69. '..', 'emoji.cache'))
  70. self.res_dir = res_dir
  71. self.parser = parser
  72. self.voice_cache_idx = {}
  73. self.img_dir = os.path.join(res_dir, IMG_DIRNAME)
  74. self.voice_dir = os.path.join(res_dir, VOICE_DIRNAME)
  75. self.emoji_dir = os.path.join(res_dir, EMOJI_DIRNAME)
  76. self.video_dir = os.path.join(res_dir, VIDEO_DIRNAME)
  77. self.avt_reader = AvatarReader(res_dir, avt_db)
  78. def get_voice_filename(self, imgpath):
  79. fname = md5(imgpath.encode('ascii'))
  80. dir1, dir2 = fname[:2], fname[2:4]
  81. ret = os.path.join(self.voice_dir, dir1, dir2,
  82. 'msg_{}.amr'.format(imgpath))
  83. if not os.path.isfile(ret):
  84. logger.error("Voice file not found for {}".format(imgpath))
  85. return ""
  86. return ret
  87. def get_voice_mp3(self, imgpath):
  88. """ return mp3 and duration, or empty string and 0 on failure"""
  89. idx = self.voice_cache_idx.get(imgpath)
  90. if idx is None:
  91. return parse_wechat_audio_file(
  92. self.get_voice_filename(imgpath))
  93. return self.voice_cache[idx].get()
  94. def cache_voice_mp3(self, msgs):
  95. """ for speed.
  96. msgs: a collection of WeChatMsg, to cache for later fetch"""
  97. voice_paths = [msg.imgPath for msg in msgs if msg.type == TYPE_SPEAK]
  98. # NOTE: remove all the caching code to debug serial decoding
  99. self.voice_cache_idx = {k: idx for idx, k in enumerate(voice_paths)}
  100. pool = Pool(3)
  101. atexit.register(lambda x: x.terminate(), pool)
  102. self.voice_cache = [pool.apply_async(parse_wechat_audio_file,
  103. (self.get_voice_filename(k),)) for k in voice_paths]
  104. def get_avatar(self, username):
  105. """ return base64 unicode string"""
  106. im = self.avt_reader.get_avatar(username)
  107. if im is None:
  108. logger.warning(f"Avatar for {username} is missing.")
  109. return ""
  110. buf = io.BytesIO()
  111. try:
  112. im.save(buf, 'JPEG', quality=JPEG_QUALITY)
  113. except IOError:
  114. try:
  115. # sometimes it works the second time...
  116. im.save(buf, 'JPEG', quality=JPEG_QUALITY)
  117. except IOError:
  118. return ""
  119. jpeg_str = buf.getvalue()
  120. return base64.b64encode(jpeg_str).decode('ascii')
  121. def _get_img_file(self, fnames):
  122. """ fnames: a list of filename to search for
  123. return (filename, filename) of (big, small) image.
  124. could be empty string.
  125. """
  126. cands = []
  127. for fname in fnames:
  128. dir1, dir2 = fname[:2], fname[2:4]
  129. dirname = os.path.join(self.img_dir, dir1, dir2)
  130. if not os.path.isdir(dirname):
  131. logger.warn("Directory not found: {}".format(dirname))
  132. continue
  133. for f in os.listdir(dirname):
  134. if fname in f:
  135. full_name = os.path.join(dirname, f)
  136. size = os.path.getsize(full_name)
  137. if size > 0:
  138. cands.append((full_name, size))
  139. if not cands:
  140. return ("", "")
  141. cands = sorted(cands, key=lambda x: x[1])
  142. def name_is_thumbnail(name):
  143. return os.path.basename(name).startswith('th_') \
  144. and not name.endswith('hd')
  145. if len(cands) == 1:
  146. name = cands[0][0]
  147. if name_is_thumbnail(name):
  148. # thumbnail
  149. return ("", name)
  150. else:
  151. logger.warn("Found big image but not thumbnail: {}".format(fname))
  152. return (name, "")
  153. big = cands[-1]
  154. ths = list(filter(name_is_thumbnail, [k[0] for k in cands]))
  155. if not ths:
  156. return (big[0], "")
  157. return (big[0], ths[0])
  158. def get_img(self, fnames):
  159. """
  160. :params fnames: possible file paths
  161. :returns: two base64 jpg string
  162. """
  163. fnames = [k for k in fnames if k] # filter out empty string
  164. big_file, small_file = self._get_img_file(fnames)
  165. def get_jpg_b64(img_file):
  166. if not img_file:
  167. return None
  168. if not img_file.endswith('jpg') and \
  169. imghdr.what(img_file) != 'jpeg':
  170. im = Image.open(open(img_file, 'rb'))
  171. buf = io.BytesIO()
  172. im.convert('RGB').save(buf, 'JPEG', quality=JPEG_QUALITY)
  173. return base64.b64encode(buf.getvalue()).decode('ascii')
  174. return get_file_b64(img_file)
  175. big_file = get_jpg_b64(big_file)
  176. if big_file:
  177. return big_file
  178. return get_jpg_b64(small_file)
  179. def _get_res_emoji(self, md5, pack_id, allow_cover=False):
  180. """
  181. pack_id: can be None
  182. allow_cover: Cover is non-animated. Can be used as a fallback.
  183. """
  184. path = os.path.join(self.emoji_dir, pack_id or '')
  185. candidates = glob.glob(os.path.join(path, '{}*'.format(md5)))
  186. candidates = [k for k in candidates if not re.match('.*_[0-9]+$', k)]
  187. def try_use(f):
  188. if not f: return None
  189. if not imghdr.what(f[0]): # cannot recognize file type
  190. return None
  191. return f[0]
  192. candidates = [k for k in candidates if (allow_cover or (not k.endswith('_cover') and not k.endswith('_thumb')))]
  193. for cand in candidates:
  194. if imghdr.what(cand):
  195. return get_file_b64(cand), imghdr.what(cand)
  196. return None, None
  197. def _get_internal_emoji(self, fname):
  198. f = os.path.join(INTERNAL_EMOJI_DIR, fname)
  199. return get_file_b64(f), imghdr.what(f)
  200. def get_emoji_by_md5(self, md5):
  201. """ :returns: (b64 unicode img, format)"""
  202. assert md5, md5
  203. if md5 in self.parser.internal_emojis:
  204. emoji_img, format = self._get_internal_emoji(self.parser.internal_emojis[md5])
  205. return emoji_img, format
  206. else:
  207. # check cache
  208. img, format = self.emoji_cache.query(md5)
  209. if format:
  210. return img, format
  211. # check resource/emoji/ dir
  212. group = self.parser.emoji_groups.get(md5, None)
  213. emoji_img, format = self._get_res_emoji(md5, group)
  214. if format:
  215. return emoji_img, format
  216. # check url
  217. url = self.parser.emoji_url.get(md5, None)
  218. if url:
  219. emoji_img, format = self.emoji_cache.fetch(md5, url)
  220. if format:
  221. return emoji_img, format
  222. # check resource/emoji dir again, for cover
  223. emoji_img, format = self._get_res_emoji(md5, group, allow_cover=True)
  224. if format:
  225. return emoji_img, format
  226. # TODO: first 1k in emoji is encrypted
  227. logger.warn("Cannot get emoji {} in group {}".format(md5, group))
  228. return None, None
  229. def get_video(self, videoid):
  230. video_file = os.path.join(self.video_dir, videoid + ".mp4")
  231. video_thumbnail_file = os.path.join(self.video_dir, videoid + ".jpg")
  232. if os.path.exists(video_file):
  233. return video_file
  234. elif os.path.exists(video_thumbnail_file):
  235. return video_thumbnail_file
  236. return ""