| 1 | n/a | #!/usr/bin/env python3 |
|---|
| 2 | n/a | """ Utility for parsing HTML entity definitions available from: |
|---|
| 3 | n/a | |
|---|
| 4 | n/a | http://www.w3.org/ as e.g. |
|---|
| 5 | n/a | http://www.w3.org/TR/REC-html40/HTMLlat1.ent |
|---|
| 6 | n/a | |
|---|
| 7 | n/a | Input is read from the file named by the first argument (default: stdin). |
|---|
| 8 | n/a | Output is written to the file named by the second argument (default: stdout) as a |
|---|
| 9 | n/a | Python snippet defining a dictionary "entitydefs" mapping literal entity name to character or numeric entity. |
|---|
| 10 | n/a | |
|---|
| 11 | n/a | Marc-Andre Lemburg, mal@lemburg.com, 1999. |
|---|
| 12 | n/a | Use as you like. NO WARRANTIES. |
|---|
| 13 | n/a | |
|---|
| 14 | n/a | """ |
|---|
| 15 | n/a | import re,sys |
|---|
| 16 | n/a | |
|---|
# Matches one SGML entity declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# and captures the entity name, the quoted value and the comment text
# (the comment may span several lines).
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text, pos=0, endpos=None):
    """Collect the entity declarations found in *text*.

    Only ``text[pos:endpos]`` is scanned; *pos* defaults to the start of
    *text* and *endpos* to its end.

    Returns a dictionary mapping each entity name to a
    ``(charcode, comment)`` tuple holding the strings captured by
    ``entityRE``.  When a name is declared more than once, the last
    declaration wins.
    """
    if endpos is None:
        endpos = len(text)
    # finditer() yields successive non-overlapping matches starting at
    # *pos*, which is what a manual "search, then continue from m.end()"
    # loop does; unlike the earlier code, the caller's *pos* is honoured
    # instead of being reset to 0.
    return {
        m.group(1): (m.group(2), m.group(3))
        for m in entityRE.finditer(text, pos, endpos)
    }
|---|
| 33 | n/a | |
|---|
def _numeric_code(charcode):
    """Return the code point named by a numeric character reference.

    Accepts decimal (``&#160;``) and hexadecimal (``&#xA0;``) references,
    with or without the trailing semicolon.  Returns None when *charcode*
    is not a numeric reference or its digits cannot be parsed.
    """
    if not charcode.startswith('&#'):
        return None
    digits = charcode[2:]
    if digits.endswith(';'):
        digits = digits[:-1]
    try:
        if digits[:1] in ('x', 'X'):
            return int(digits[1:], 16)
        return int(digits)
    except ValueError:
        return None


def writefile(f, defs):
    """Write *defs* to *f* as Python source defining ``entitydefs``.

    *f* is any object with a ``write(str)`` method.  *defs* maps entity
    names to ``(charcode, comment)`` tuples, as returned by ``parse()``.

    Entries are written sorted by name, one per line.  A numeric
    reference below 256 is written as a one-character string literal
    using an octal escape; every other value (larger code points,
    non-numeric values, unparseable references) is written as the
    ``repr()`` of the original string.  Whitespace runs in the comment
    are collapsed to single spaces so it fits on one line.
    """
    f.write("entitydefs = {\n")
    for name, (charcode, comment) in sorted(defs.items()):
        code = _numeric_code(charcode)
        if code is not None and code < 256:
            literal = r"'\%o'" % code
        else:
            literal = repr(charcode)
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, literal, comment))
    f.write('\n}\n')
|---|
| 50 | n/a | |
|---|
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # A missing infile means stdin; a missing outfile means stdout.
    if len(sys.argv) > 1:
        # The with-block closes the input file as soon as it has been read.
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()
    defs = parse(text)
    # The output file is opened only after the input has been read and
    # parsed, so naming the same path for both does not truncate the
    # input before it is read.
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)
|---|