def lines_get(f):
    '''Parse a file like object, removing comments and returning a list of
    lines.

    Everything from the first '#' to the end of a line is discarded, as is the
    line terminator. Lines that are empty or hold only whitespace once the
    comment has been removed are dropped. The text of every kept line is
    returned as-is (surrounding whitespace is preserved).

    @f is any object providing readlines().
    '''
    result = []
    for raw in f.readlines():
        # Only remove a terminator that is really there: the final line of a
        # file frequently has none, and blindly slicing off the last character
        # would corrupt its last field.
        if raw.endswith('\n'):
            raw = raw[:-1]
        text = raw.partition('#')[0]
        # Whitespace-only leftovers (e.g. an indented comment line) carry no
        # data, so they are skipped together with truly empty lines.
        if text.strip():
            result.append(text)
    return result
| |
def line_split(line):
    '''Break a line into its semicolon separated fields.

    Whitespace surrounding each field is removed. Empty fields are kept, so
    the result always has one more element than there are semicolons.
    '''
    fields = []
    for field in line.split(';'):
        fields.append(field.strip())
    return fields
| |
def codepoints_parse(token):
    '''Parse a Unicode style code-point or code-point range.

    @token is either a single hexadecimal number ("0041") or two hexadecimal
    numbers joined by ".." ("0041..005A"). Returns an int for the first form
    and a (start, end) tuple of ints for the second.

    Raises ValueError if @token has more than one ".." separator or if any
    piece is not valid hexadecimal.
    '''
    pieces = token.split('..')
    # split() never returns an empty list, so anything beyond two pieces is
    # the only malformed shape left to reject here.
    if len(pieces) > 2:
        raise ValueError(token)

    values = [int(piece, 16) for piece in pieces]
    if len(values) == 1:
        return values[0]
    return (values[0], values[1])
| |
def unicode_file_parse(input, map, default_value = None):
    '''Parse a file like object, @input, whose lines hold two semicolon
    separated columns: a code-point (or code-point range) and a key that is
    looked up in the dict @map.

    Returns a list of (start, end, value) tuples in file order, where a single
    code-point is reported as a range with start == end. Entries whose mapped
    value equals @default_value are left out.

    Raises ValueError for a line that does not have exactly two columns (or
    whose code-point column cannot be parsed) and KeyError when the second
    column is not a key of @map.
    '''
    ranges = []
    for columns in (line_split(text) for text in lines_get(input)):
        if len(columns) != 2:
            raise ValueError(columns)

        codepoint_field, key = columns
        codepoints = codepoints_parse(codepoint_field)
        value = map[key]
        if value == default_value:
            # The default is implied for anything not listed, so there is no
            # need to record it.
            continue

        if isinstance(codepoints, int):
            # Normalise a lone code-point into a one element range.
            start = end = codepoints
        else:
            start, end = codepoints[0], codepoints[1]

        ranges.append((start, end, value))

    return ranges
| |
def sort_and_merge(ranges):
    '''Given a list of (start, end, value), merge elements where the ranges are
    continuous and the values are the same.

    @ranges is sorted in place as a side effect. Two neighbouring elements are
    merged only when the second one starts exactly one past the end of the
    first and both carry an equal value. Returns the merged elements as a new
    list.
    '''
    ranges.sort()
    merged = []
    for item in ranges:
        if merged:
            last = merged[-1]
            if last[1] + 1 == item[0] and last[2] == item[2]:
                # Extend the range being built instead of starting a new one.
                merged[-1] = (last[0], item[1], item[2])
                continue
        merged.append(item)

    return merged