1 | n/a | #!/usr/bin/env python3 |
---|
2 | n/a | """ Utility for parsing HTML entity definitions available from: |
---|
3 | n/a | |
---|
4 | n/a | http://www.w3.org/ as e.g. |
---|
5 | n/a | http://www.w3.org/TR/REC-html40/HTMLlat1.ent |
---|
6 | n/a | |
---|
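7 | n/a | Input is read from stdin (or from the file named by the first command-line argument), output is written to stdout (or to the file named by the second argument) in form of a |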
---|
8 | n/a | Python snippet defining a dictionary "entitydefs" mapping literal |
---|
9 | n/a | entity name to character or numeric entity. |
---|
10 | n/a | |
---|
11 | n/a | Marc-Andre Lemburg, mal@lemburg.com, 1999. |
---|
12 | n/a | Use as you like. NO WARRANTIES. |
---|
13 | n/a | |
---|
14 | n/a | """ |
---|
15 | n/a | import re,sys |
---|
16 | n/a | |
---|
# Matches one SGML entity declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# Group 1 is the entity name, group 2 the quoted value and group 3 the
# comment text, which may span several lines (hence the explicit "\n").
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text, pos=0, endpos=None):
    """Collect the entity declarations found in *text*.

    Only the region between the offsets *pos* and *endpos* is scanned;
    *endpos* defaults to the end of *text*.

    Returns a dictionary mapping each entity name to a
    ``(charcode, comment)`` tuple, both taken verbatim from the
    declaration.  If a name is declared more than once, the last
    declaration wins.
    """
    # NOTE: an earlier version reset ``pos`` to 0 here, which silently
    # discarded the caller's start offset.
    if endpos is None:
        endpos = len(text)
    return {
        match.group(1): (match.group(2), match.group(3))
        for match in entityRE.finditer(text, pos, endpos)
    }
---|
33 | n/a | |
---|
def writefile(f,defs):
    """Write *defs* to *f* as Python source defining ``entitydefs``.

    *f* is any object with a ``write`` method.  *defs* maps entity names
    to ``(charcode, comment)`` tuples.  Entries are written in sorted
    order, one per line, with the comment appended after a ``#``.

    A charcode starting with ``&#`` is read as a decimal character
    reference (the text between ``&#`` and the final character).  Codes
    below 256 are emitted as a one-character string literal using an
    octal escape; every other charcode is emitted as its ``repr``.
    """
    f.write("entitydefs = {\n")
    for entity, (value, description) in sorted(defs.items()):
        literal = None
        if value[:2] == '&#':
            number = int(value[2:-1])
            if number < 256:
                literal = r"'\%o'" % number
        if literal is None:
            # Not a numeric reference, or one outside the 8-bit range:
            # keep the original text as a quoted string.
            literal = repr(value)
        # Collapse newlines and runs of blanks so the comment fits on one line.
        summary = ' '.join(description.split())
        f.write(" '%s':\t%s, \t# %s\n" % (entity, literal, summary))
    f.write('\n}\n')
---|
50 | n/a | |
---|
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # Input defaults to stdin and output to stdout when the corresponding
    # argument is omitted.
    if len(sys.argv) > 1:
        # Close the input file as soon as its contents have been read.
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()
    defs = parse(text)
    # The output file is opened only after parsing succeeded, so a failure
    # while reading or parsing does not leave a truncated output file behind.
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)
---|