| 1 | n/a | """ Codec for the Punycode encoding, as specified in RFC 3492 |
|---|
| 2 | n/a | |
|---|
| 3 | n/a | Written by Martin v. Löwis. |
|---|
| 4 | n/a | """ |
|---|
| 5 | n/a | |
|---|
| 6 | n/a | import codecs |
|---|
| 7 | n/a | |
|---|
| 8 | n/a | ##################### Encoding ##################################### |
|---|
| 9 | n/a | |
|---|
def segregate(str):
    """3.1 Basic code point segregation

    Return a pair (base, extended).  base is a bytes object holding every
    character of str whose ordinal is below 128, in original order.
    extended is a sorted list of the distinct remaining characters.
    """
    code_points = [ord(ch) for ch in str]
    basic = bytes(cp for cp in code_points if cp < 128)
    non_basic = {chr(cp) for cp in code_points if cp >= 128}
    return basic, sorted(non_basic)
|---|
| 21 | n/a | |
|---|
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
|---|
| 29 | n/a | |
|---|
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    The scan starts at pos + 1.  While scanning, index is incremented once
    for every character that compares lower than char.  On a match the
    function returns (index + 1, position of the match in str).  If the end
    of str is reached without a match, it returns (-1, -1).
    """
    length = len(str)
    cursor = pos + 1
    while cursor != length:
        current = str[cursor]
        if current == char:
            return index + 1, cursor
        if current < char:
            # Only characters below char count towards the selective index.
            index += 1
        cursor += 1
    return (-1, -1)
|---|
| 47 | n/a | |
|---|
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Walk the characters of extended in the given order and, for every
    occurrence of each one in str, append one delta to the returned list.
    """
    deltas = []
    previous_code = 0x80
    previous_index = -1
    for ext_char in extended:
        code = ord(ext_char)
        # Moving on to a new code point costs (curlen + 1) steps for every
        # code point value skipped since the previous one.
        pending = (selective_len(str, code) + 1) * (code - previous_code)
        index, pos = selective_find(str, ext_char, -1, -1)
        while index != -1:
            pending += index - previous_index
            deltas.append(pending - 1)
            previous_index = index
            # Later occurrences of the same character only pay for the
            # distance from the previous occurrence.
            pending = 0
            index, pos = selective_find(str, ext_char, index, pos)
        previous_code = code

    return deltas
|---|
| 69 | n/a | |
|---|
def T(j, bias):
    """Return 36 * (j + 1) - bias, clamped to the range 1..26."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(max(36 * (j + 1) - bias, 1), 26)
|---|
| 76 | n/a | |
|---|
# Digit alphabet: values 0-25 map to "a"-"z", values 26-35 to "0"-"9".
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a sequence of base-36 digits, using T(j, bias)
    as the threshold for digit position j.  Return the digits as bytes.
    """
    encoded = bytearray()
    position = 0
    remaining = N
    threshold = T(position, bias)
    while remaining >= threshold:
        # Emit one non-final digit and carry the quotient to the next position.
        quotient, remainder = divmod(remaining - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        remaining = quotient
        position += 1
        threshold = T(position, bias)
    # A digit below the threshold terminates the number.
    encoded.append(digits[remaining])
    return bytes(encoded)
|---|
| 90 | n/a | |
|---|
def adapt(delta, first, numchars):
    """Compute the next bias from the delta that was just processed.

    delta is scaled down by 700 when first is true and by 2 otherwise, then
    increased by delta // numchars.  It is then repeatedly divided by 35
    while it exceeds 455.  Each such division adds 36 to the result.
    """
    scaled = delta // (700 if first else 2)
    scaled += scaled // numchars
    # ((base - tmin) * tmax) // 2 == 455
    offset = 0
    while scaled > 455:
        scaled //= 35  # base - tmin
        offset += 36
    return offset + (36 * scaled // (scaled + 38))
|---|
| 104 | n/a | |
|---|
| 105 | n/a | |
|---|
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer, adapting
    the bias after each one, and return the concatenated digits as bytes.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    chunks = []
    bias = 72
    is_first = True
    handled = baselen
    for delta in deltas:
        chunks.append(generate_generalized_integer(delta, bias))
        handled += 1
        bias = adapt(delta, is_first, handled)
        is_first = False
    return b"".join(chunks)
|---|
| 116 | n/a | |
|---|
def punycode_encode(text):
    """Encode text as Punycode and return the result as bytes.

    When text contains basic (ASCII) characters, they come first and are
    followed by a "-" delimiter and the encoded deltas.  Otherwise only the
    encoded deltas are returned.
    """
    basic, non_basic = segregate(text)
    encoded_deltas = generate_integers(len(basic),
                                       insertion_unsort(text, non_basic))
    if not basic:
        return encoded_deltas
    return b"-".join((basic, encoded_deltas))
|---|
| 124 | n/a | |
|---|
| 125 | n/a | ##################### Decoding ##################################### |
|---|
| 126 | n/a | |
|---|
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from extended, starting
    at index extpos.  Digits must be "A"-"Z" (values 0-25) or "0"-"9"
    (values 26-35).

    Return a pair (newpos, value), where newpos is the index just past the
    last character consumed.  When errors is not "strict" and the input is
    truncated or contains an invalid digit, value is None.

    Raise UnicodeError when errors is "strict" and the input is truncated
    or contains an invalid digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending character,
            # so report extended[extpos-1].  Indexing extended[extpos] would
            # name the wrong character, or raise IndexError at end of input.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold is the last digit of this number.
            return extpos, result
        w = w * (36 - t)
        j += 1
|---|
| 155 | n/a | |
|---|
| 156 | n/a | |
|---|
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding (decoding direction)

    Decode the deltas stored in extended one at a time and insert the
    resulting characters into base.  Return the resulting string.

    If a delta cannot be decoded and errors is not "strict", the string
    built so far is returned.  A code point above U+10FFFF raises
    UnicodeError when errors is "strict" and is replaced by "?" otherwise.
    """
    code_point = 0x80
    insert_at = -1
    bias = 72
    cursor = 0
    total = len(extended)
    while cursor < total:
        next_cursor, delta = decode_generalized_number(extended, cursor,
                                                       bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        insert_at += delta + 1
        # Each full pass over the len(base) + 1 insertion slots advances
        # the code point by one; the remainder is the insertion position.
        wraps, insert_at = divmod(insert_at, len(base) + 1)
        code_point += wraps
        if code_point > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code_point)
            code_point = ord('?')
        base = base[:insert_at] + chr(code_point) + base[insert_at:]
        bias = adapt(delta, (cursor == 0), len(base))
        cursor = next_cursor
    return base
|---|
| 181 | n/a | |
|---|
| 182 | n/a | def punycode_decode(text, errors): |
|---|
| 183 | n/a | if isinstance(text, str): |
|---|
| 184 | n/a | text = text.encode("ascii") |
|---|
| 185 | n/a | if isinstance(text, memoryview): |
|---|
| 186 | n/a | text = bytes(text) |
|---|
| 187 | n/a | pos = text.rfind(b"-") |
|---|
| 188 | n/a | if pos == -1: |
|---|
| 189 | n/a | base = "" |
|---|
| 190 | n/a | extended = str(text, "ascii").upper() |
|---|
| 191 | n/a | else: |
|---|
| 192 | n/a | base = str(text[:pos], "ascii", errors) |
|---|
| 193 | n/a | extended = str(text[pos+1:], "ascii").upper() |
|---|
| 194 | n/a | return insertion_sort(base, extended, errors) |
|---|
| 195 | n/a | |
|---|
| 196 | n/a | ### Codec APIs |
|---|
| 197 | n/a | |
|---|
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return (encoded bytes, len(input)).  errors is not consulted."""
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return (decoded string, len(input)).

        Raise UnicodeError if errors is not 'strict', 'replace' or 'ignore'.
        """
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
|---|
| 209 | n/a | |
|---|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input in one step."""

    def encode(self, input, final=False):
        """Return input encoded as Punycode.  final is not consulted."""
        encoded = punycode_encode(input)
        return encoded
|---|
| 213 | n/a | |
|---|
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input in one step."""

    def decode(self, input, final=False):
        """Return input decoded from Punycode.  final is not consulted.

        Raise UnicodeError if self.errors is not 'strict', 'replace' or
        'ignore'.
        """
        if self.errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, self.errors)
        raise UnicodeError("Unsupported error handling "+self.errors)
|---|
| 219 | n/a | |
|---|
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter."""
|---|
| 222 | n/a | |
|---|
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader."""
|---|
| 225 | n/a | |
|---|
| 226 | n/a | ### encodings module API |
|---|
| 227 | n/a | |
|---|
def getregentry():
    """Return the codecs.CodecInfo describing the 'punycode' codec."""
    encode = Codec().encode
    decode = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
|---|