You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
1.7 KiB

  1. #!/usr/bin/env python3
""" Utility for parsing HTML entity definitions available from:

      http://www.w3.org/ as e.g.
      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from stdin, output is written to stdout in the form of a
    Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
  11. import re,sys
  12. import TextTools
  13. entityRE = re.compile('<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
  14. def parse(text,pos=0,endpos=None):
  15. pos = 0
  16. if endpos is None:
  17. endpos = len(text)
  18. d = {}
  19. while 1:
  20. m = entityRE.search(text,pos,endpos)
  21. if not m:
  22. break
  23. name,charcode,comment = m.groups()
  24. d[name] = charcode,comment
  25. pos = m.end()
  26. return d
  27. def writefile(f,defs):
  28. f.write("entitydefs = {\n")
  29. items = sorted(defs.items())
  30. for name, (charcode,comment) in items:
  31. if charcode[:2] == '&#':
  32. code = int(charcode[2:-1])
  33. if code < 256:
  34. charcode = "'\%o'" % code
  35. else:
  36. charcode = repr(charcode)
  37. else:
  38. charcode = repr(charcode)
  39. comment = TextTools.collapse(comment)
  40. f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
  41. f.write('\n}\n')
  42. if __name__ == '__main__':
  43. if len(sys.argv) > 1:
  44. infile = open(sys.argv[1])
  45. else:
  46. infile = sys.stdin
  47. if len(sys.argv) > 2:
  48. outfile = open(sys.argv[2],'w')
  49. else:
  50. outfile = sys.stdout
  51. text = infile.read()
  52. defs = parse(text)
  53. writefile(outfile,defs)