parse_tencent_smiley.py 971 B

12345678910111213141516171819202122232425262728293031323334
  1. #!/usr/bin/env python2
  2. # -*- coding: UTF-8 -*-
  3. # File: parse_tencent_smiley.py
  4. # Date: Sat Dec 27 00:15:14 2014 +0800
  5. # Author: Yuxin Wu <[email protected]>
  6. import xml.etree.ElementTree as ET
  7. import os
  8. import json
  # Build a JSON lookup table (smiley code -> index) from the smiley.xml that
  # sits next to this script, and write it to tencent-smiley.json in the
  # current working directory.

  # Resolve smiley.xml relative to this file rather than the working directory.
  script_dir = os.path.dirname(os.path.abspath(__file__))
  tree = ET.parse(os.path.join(script_dir, 'smiley.xml'))
  root = tree.getroot()

  # Number of codes every processed 'smiley_values' element must contain.
  EXPECTED_SMILEY_COUNT = 105

  # Maps each smiley code to its position inside its 'smiley_values' list.
  # Codes that occur in several lists keep the index of the last list seen.
  smileys = {}
  for child in root:
      name = child.attrib['name']
      if 'smiley_values' not in name:
          continue
      if '_th' in name:
          continue    # ignore thailand language
      lst = [c.text for c in child]
      # Explicit check instead of `assert`: assertions are stripped when
      # Python runs with -O, which would let a malformed list through.
      if len(lst) != EXPECTED_SMILEY_COUNT:
          raise ValueError(
              'expected %d smiley codes in %r, found %d'
              % (EXPECTED_SMILEY_COUNT, name, len(lst)))
      for idx, v in enumerate(lst):
          # NOTE(review): under Python 2, ElementTree presumably returns a
          # plain `str` only for ASCII-only text and `unicode` otherwise;
          # only the `str` values get the cleanup below -- confirm.
          if isinstance(v, str):
              # two code appears in the xml.. don't know why
              v = v.strip('"')
              v = v.replace('&lt;', '<')
              v = v.replace('&amp;', '&')
              # Normalise to unicode so every key has the same type.
              v = v.decode('utf-8')
          smileys[v] = idx

  with open('tencent-smiley.json', 'w') as f:
      json.dump(smileys, f)