/Tools/scripts/parseentities.py

http://unladen-swallow.googlecode.com/ · Python · 64 lines · 51 code · 6 blank · 7 comment · 13 complexity · 0d749724bf0aa2ffe2a42cd0abfb76cf MD5 · raw file

  1. #!/usr/local/bin/python
  2. """ Utility for parsing HTML entity definitions available from:
  3. http://www.w3.org/ as e.g.
  4. http://www.w3.org/TR/REC-html40/HTMLlat1.ent
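  5. Input is read from the file named by the first command line argument (default: stdin);
  6. output is written to the file named by the second argument (default: stdout) in the form of a Python snippet defining a dictionary "entitydefs" mapping literal
  7. entity name to character or numeric entity.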
  8. Marc-Andre Lemburg, mal@lemburg.com, 1999.
  9. Use as you like. NO WARRANTIES.
  10. """
  11. import re,sys
  12. import TextTools
  13. entityRE = re.compile('<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
  14. def parse(text,pos=0,endpos=None):
  15. pos = 0
  16. if endpos is None:
  17. endpos = len(text)
  18. d = {}
  19. while 1:
  20. m = entityRE.search(text,pos,endpos)
  21. if not m:
  22. break
  23. name,charcode,comment = m.groups()
  24. d[name] = charcode,comment
  25. pos = m.end()
  26. return d
  27. def writefile(f,defs):
  28. f.write("entitydefs = {\n")
  29. items = defs.items()
  30. items.sort()
  31. for name,(charcode,comment) in items:
  32. if charcode[:2] == '&#':
  33. code = int(charcode[2:-1])
  34. if code < 256:
  35. charcode = "'\%o'" % code
  36. else:
  37. charcode = repr(charcode)
  38. else:
  39. charcode = repr(charcode)
  40. comment = TextTools.collapse(comment)
  41. f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
  42. f.write('\n}\n')
  43. if __name__ == '__main__':
  44. if len(sys.argv) > 1:
  45. infile = open(sys.argv[1])
  46. else:
  47. infile = sys.stdin
  48. if len(sys.argv) > 2:
  49. outfile = open(sys.argv[2],'w')
  50. else:
  51. outfile = sys.stdout
  52. text = infile.read()
  53. defs = parse(text)
  54. writefile(outfile,defs)