| 1 | n/a | #!/usr/bin/env python3 |
|---|
| 2 | n/a | """ |
|---|
| 3 | n/a | Utility for parsing HTML5 entity definitions available from: |
|---|
| 4 | n/a | |
|---|
| 5 | n/a | http://dev.w3.org/html5/spec/entities.json |
|---|
| 6 | n/a | |
|---|
| 7 | n/a | Written by Ezio Melotti and Iuliia Proskurnia. |
|---|
| 8 | n/a | |
|---|
| 9 | n/a | """ |
|---|
| 10 | n/a | |
|---|
| 11 | n/a | import os |
|---|
| 12 | n/a | import sys |
|---|
| 13 | n/a | import json |
|---|
| 14 | n/a | from urllib.request import urlopen |
|---|
| 15 | n/a | from html.entities import html5 |
|---|
| 16 | n/a | |
|---|
# Location of the JSON file with the HTML5 entity definitions; this is the
# URL handed to get_json() when the script runs.
# NOTE(review): plain-http address -- confirm it still serves the file.
entities_url = 'http://dev.w3.org/html5/spec/entities.json'
|---|
| 18 | n/a | |
|---|
def get_json(url):
    """Fetch the resource at *url* and return it parsed as JSON.

    The response body is read in full, decoded as UTF-8 and handed to
    json.loads(); whatever object that produces is returned.
    """
    with urlopen(url) as response:
        payload = response.read()
    text = payload.decode('utf-8')
    return json.loads(text)
|---|
| 24 | n/a | |
|---|
def create_dict(entities):
    """Build the html5 mapping from the decoded JSON object.

    Each key of *entities* has its leading '&' characters stripped and is
    mapped to the 'characters' item of the corresponding value.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
|---|
| 31 | n/a | |
|---|
def compare_dicts(old, new):
    """Print a report of how *new* differs from *old*.

    Three sections are printed, each only when it is non-empty and each
    sorted: names present only in *new* (added), names present only in
    *old* (removed), and names present in both whose values differ
    (modified, shown as old -> new).
    """
    old_names = old.keys()
    new_names = new.keys()

    added_names = sorted(new_names - old_names)
    if added_names:
        print('{} entitie(s) have been added:'.format(len(added_names)))
        for key in added_names:
            print(' {!r}: {!r}'.format(key, new[key]))

    removed_names = sorted(old_names - new_names)
    if removed_names:
        print('{} entitie(s) have been removed:'.format(len(removed_names)))
        for key in removed_names:
            print(' {!r}: {!r}'.format(key, old[key]))

    # (name, old value, new value) for every shared name whose value changed
    modified = {
        (key, old[key], new[key])
        for key in old_names & new_names
        if old[key] != new[key]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for entry in sorted(modified):
            print(' {!r}: {!r} -> {!r}'.format(*entry))
|---|
| 52 | n/a | |
|---|
def write_items(entities, file=None):
    """Write the items of the dictionary in the specified file.

    The output is the source text of an ``html5 = {...}`` dict literal with
    one ``name: value`` item per line.  Names are written with repr() and
    values with ascii(), so non-ASCII characters appear as escapes.

    *entities* maps entity names to their character(s).
    *file* is the text stream to write to; None (the default) means the
    sys.stdout that is current when the function is called, so output
    redirection installed after import is honoured.
    """
    # The keys in the generated dictionary should be sorted in a
    # case-insensitive way, however, when two keys are equal ignoring case,
    # the uppercase version should come first so that the result looks
    # like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on (lowercased name, name) does this in a single pass: the
    # second element only breaks ties, and uppercase sorts before lowercase.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print('html5 = {', file=file)
    for name in keys:
        print(' {!r}: {!a},'.format(name, entities[name]), file=file)
    print('}', file=file)
|---|
| 69 | n/a | |
|---|
| 70 | n/a | |
|---|
if __name__ == '__main__':
    # Command-line behaviour:
    #   without args  print a diff between html.entities.html5 and new_html5
    #   with --create print the new html5 dict
    #   with --patch  patch the Lib/html/entities.py file
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Read and write as UTF-8 explicitly (the default encoding of Python
        # source files) so the result does not depend on the locale.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith('html5 = {'):
                    # emit the regenerated dict in place of the old one
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # Swap the new file in with a single call, overwriting the old one,
        # instead of deleting the target first and renaming afterwards.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            # The file patched by --patch is entities.py (see fname above).
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))
|---|