1 | n/a | #!/usr/bin/env python3 |
---|
2 | n/a | """ |
---|
3 | n/a | Utility for parsing HTML5 entity definitions available from: |
---|
4 | n/a | |
---|
5 | n/a | http://dev.w3.org/html5/spec/entities.json |
---|
6 | n/a | |
---|
7 | n/a | Written by Ezio Melotti and Iuliia Proskurnia. |
---|
8 | n/a | |
---|
9 | n/a | """ |
---|
10 | n/a | |
---|
11 | n/a | import os |
---|
12 | n/a | import sys |
---|
13 | n/a | import json |
---|
14 | n/a | from urllib.request import urlopen |
---|
15 | n/a | from html.entities import html5 |
---|
16 | n/a | |
---|
# Location of the JSON file with the HTML5 entity definitions; it is
# downloaded with get_json() when this script is run.
entities_url = 'http://dev.w3.org/html5/spec/entities.json'
---|
18 | n/a | |
---|
def get_json(url):
    """Fetch the resource at *url* and return it parsed as JSON.

    The response body is read in full, decoded as UTF-8 and handed to
    json.loads(); whatever object that produces is returned.
    """
    with urlopen(url) as response:
        raw = response.read()
    text = raw.decode('utf-8')
    return json.loads(text)
---|
24 | n/a | |
---|
def create_dict(entities):
    """Build the html5 mapping from the decoded JSON object.

    Each key of *entities* has its leading '&' removed and is mapped to
    the 'characters' entry of the corresponding value.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
---|
31 | n/a | |
---|
def compare_dicts(old, new):
    """Print a summary of how *new* differs from *old*.

    Three sections are printed, each only when it is non-empty: names
    present only in *new* (added), names present only in *old*
    (removed), and names present in both whose values differ
    (modified).  Entries inside a section are listed in sorted order.
    """
    old_names = old.keys()
    new_names = new.keys()

    only_in_new = new_names - old_names
    if only_in_new:
        print('{} entitie(s) have been added:'.format(len(only_in_new)))
        for key in sorted(only_in_new):
            print(' {!r}: {!r}'.format(key, new[key]))

    only_in_old = old_names - new_names
    if only_in_old:
        print('{} entitie(s) have been removed:'.format(len(only_in_old)))
        for key in sorted(only_in_old):
            print(' {!r}: {!r}'.format(key, old[key]))

    # (name, old value, new value) for every shared name whose value differs
    modified = {
        (key, old[key], new[key])
        for key in old_names & new_names
        if old[key] != new[key]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for key, before, after in sorted(modified):
            print(' {!r}: {!r} -> {!r}'.format(key, before, after))
---|
52 | n/a | |
---|
def write_items(entities, file=None):
    """Write the items of the dictionary to *file* as an ``html5 = {...}`` literal.

    *entities* maps entity names to their character strings.  *file* is
    passed straight to print(); the default of None makes print() use
    whatever sys.stdout is at call time, so a redirected stdout is
    honoured (a ``file=sys.stdout`` default would be frozen when the
    function is defined).
    """
    # The keys in the generated dictionary are sorted in a
    # case-insensitive way; when two keys differ only by case, the
    # uppercase version comes first so that the result looks like:
    # ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on the (lowercased, original) pair gives exactly that
    # order: the original spelling breaks ties, and uppercase letters
    # compare lower than their lowercase versions.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print('html5 = {', file=file)
    for name in keys:
        # {!a} escapes non-ASCII characters, so the output is pure ASCII.
        print(' {!r}: {!a},'.format(name, entities[name]), file=file)
    print('}', file=file)
---|
69 | n/a | |
---|
70 | n/a | |
---|
if __name__ == '__main__':
    # Command-line behaviour:
    #   without args  print a diff between html.entities.html5 and new_html5
    #   with --create print the new html5 dict
    #   with --patch  patch the Lib/html/entities.py file
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        # The path is relative: the script is expected to be run from a
        # directory that contains Lib/html/entities.py.
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        with open(fname) as f1, open(temp_fname, 'w') as f2:
            skip = False
            for line in f1:
                if line.startswith('html5 = {'):
                    # emit the regenerated dict in place of the old one
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # os.replace() swaps the new file in with a single call, so there
        # is no window where fname is missing (as there is with
        # os.remove() followed by os.rename()).
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))
---|