Source code

Revision control

Copy as Markdown

Other Tools

# Copyright 2013-2014 The rust-url developers.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# You can get the latest idna table from
import collections
import itertools
print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// Generated by make_idna_table.py
''')
txt = open("IdnaMappingTable.txt")
def escape_char(c):
return "\\u{%x}" % ord(c[0])
def char(s):
return chr(int(s, 16))
strtab = collections.OrderedDict()
strtab_offset = 0
def strtab_slice(s):
global strtab, strtab_offset
if s in strtab:
return strtab[s]
else:
utf8_len = len(s.encode('utf8'))
c = (strtab_offset, utf8_len)
strtab[s] = c
strtab_offset += utf8_len
return c
def rust_slice(s):
start = s[0]
length = s[1]
start_lo = start & 0xff
start_hi = start >> 8
assert length <= 255
assert start_hi <= 255
return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)
ranges = []
for line in txt:
# remove comments
line, _, _ = line.partition('#')
# skip empty lines
if len(line.strip()) == 0:
continue
fields = line.split(';')
if fields[0].strip() == 'D800..DFFF':
continue # Surrogates don't occur in Rust strings.
first, _, last = fields[0].strip().partition('..')
if not last:
last = first
mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
unicode_str = None
if len(fields) > 2:
if fields[2].strip():
unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
elif mapping == "Deviation":
unicode_str = u''
if len(fields) > 3:
assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
assert mapping == 'Valid', mapping
mapping = 'DisallowedIdna2008'
ranges.append((first, last, mapping, unicode_str))
def mergeable_key(r):
mapping = r[2]
# These types have associated data, so we should not merge them.
if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
return r
assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
return mapping
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
optimized_ranges = []
for (k, g) in grouped_ranges:
group = list(g)
if len(group) == 1:
optimized_ranges.append(group[0])
continue
# Assert that nothing in the group has an associated unicode string.
for g in group:
if g[3] is not None and len(g[3]) > 2:
assert not g[3][2].strip()
# Assert that consecutive members of the group don't leave gaps in
# the codepoint space.
a, b = itertools.tee(group)
next(b, None)
for (g1, g2) in zip(a, b):
last_char = int(g1[1], 16)
next_char = int(g2[0], 16)
if last_char + 1 == next_char:
continue
# There's a gap where surrogates would appear, but we don't have to
# worry about that gap, as surrogates never appear in Rust strings.
# Assert we're seeing the surrogate case here.
assert last_char == 0xd7ff
assert next_char == 0xe000
optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
def is_single_char_range(r):
(first, last, _, _) = r
return first == last
# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
current = []
for r in ranges:
if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
current.append(r)
continue
if len(current) != 0:
ret = current
current = [r]
yield ret
continue
current.append(r)
ret = current
current = []
yield ret
yield current
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
SINGLE_MARKER = 1 << 15
print("static TABLE: &[(char, u16)] = &[")
offset = 0
for ranges in optimized_ranges:
assert offset < SINGLE_MARKER
block_len = len(ranges)
single = SINGLE_MARKER if block_len == 1 else 0
index = offset | single
offset += block_len
start = escape_char(char(ranges[0][0]))
print(" ('%s', %s)," % (start, index))
print("];\n")
print("static MAPPING_TABLE: &[Mapping] = &[")
for ranges in optimized_ranges:
for (first, last, mapping, unicode_str) in ranges:
if unicode_str is not None:
mapping += rust_slice(strtab_slice(unicode_str))
print(" %s," % mapping)
print("];\n")
def escape_str(s):
return [escape_char(c) for c in s]
print("static STRING_TABLE: &str = \"%s\";"
% '\\\n '.join(itertools.chain(*[escape_str(s) for s in strtab.keys()])))