xref: /llvm-project/llvm/utils/unicode-case-fold.py (revision b71edfaa4ec3c998aadb35255ce2f60bba2940b0)
13b17b84bSPavel Labath#!/usr/bin/env python
23b17b84bSPavel Labath"""
33b17b84bSPavel LabathUnicode case folding database conversion utility
43b17b84bSPavel Labath
53b17b84bSPavel LabathParses the database and generates a C++ function which implements the case
63b17b84bSPavel Labathfolding algorithm. The database entries are of the form:
73b17b84bSPavel Labath
83b17b84bSPavel Labath  <code>; <status>; <mapping>; # <name>
93b17b84bSPavel Labath
103b17b84bSPavel Labath<status> can be one of four characters:
113b17b84bSPavel Labath  C - Common mappings
123b17b84bSPavel Labath  S - mappings for Simple case folding
133b17b84bSPavel Labath  F - mappings for Full case folding
143b17b84bSPavel Labath  T - special case for Turkish I characters
153b17b84bSPavel Labath
163b17b84bSPavel LabathRight now this generates a function which implements simple case folding (C+S
173b17b84bSPavel Labathentries).
183b17b84bSPavel Labath"""
193b17b84bSPavel Labath
204a27478aSSerge Gueltonfrom __future__ import print_function
214a27478aSSerge Guelton
223b17b84bSPavel Labathimport sys
233b17b84bSPavel Labathimport re
24*b71edfaaSTobias Hieta
253026e912SSerge Gueltontry:
263026e912SSerge Guelton    from urllib.request import urlopen
273026e912SSerge Gueltonexcept ImportError:
283026e912SSerge Guelton    from urllib2 import urlopen
293026e912SSerge Guelton
303b17b84bSPavel Labath
313b17b84bSPavel Labath# This variable will body of the mappings function
323b17b84bSPavel Labathbody = ""
333b17b84bSPavel Labath
343b17b84bSPavel Labath# Reads file line-by-line, extracts Common and Simple case fold mappings and
353b17b84bSPavel Labath# returns a (from_char, to_char, from_name) tuple.
363b17b84bSPavel Labathdef mappings(f):
373b17b84bSPavel Labath    previous_from = -1
38*b71edfaaSTobias Hieta    expr = re.compile(r"^(.*); [CS]; (.*); # (.*)")
393b17b84bSPavel Labath    for line in f:
403b17b84bSPavel Labath        m = expr.match(line)
41*b71edfaaSTobias Hieta        if not m:
42*b71edfaaSTobias Hieta            continue
433b17b84bSPavel Labath        from_char = int(m.group(1), 16)
443b17b84bSPavel Labath        to_char = int(m.group(2), 16)
453b17b84bSPavel Labath        from_name = m.group(3)
463b17b84bSPavel Labath
473b17b84bSPavel Labath        if from_char <= previous_from:
483b17b84bSPavel Labath            raise Exception("Duplicate or unsorted characters in input")
493b17b84bSPavel Labath        yield from_char, to_char, from_name
503b17b84bSPavel Labath        previous_from = from_char
513b17b84bSPavel Labath
52*b71edfaaSTobias Hieta
533b17b84bSPavel Labath# Computes the shift (to_char - from_char) in a mapping.
543b17b84bSPavel Labathdef shift(mapping):
553b17b84bSPavel Labath    return mapping[1] - mapping[0]
563b17b84bSPavel Labath
57*b71edfaaSTobias Hieta
583b17b84bSPavel Labath# Computes the stride (from_char2 - from_char1) of two mappings.
593b17b84bSPavel Labathdef stride2(mapping1, mapping2):
603b17b84bSPavel Labath    return mapping2[0] - mapping1[0]
613b17b84bSPavel Labath
62*b71edfaaSTobias Hieta
633b17b84bSPavel Labath# Computes the stride of a list of mappings. The list should have at least two
643b17b84bSPavel Labath# mappings. All mappings in the list are assumed to have the same stride.
653b17b84bSPavel Labathdef stride(block):
663b17b84bSPavel Labath    return stride2(block[0], block[1])
673b17b84bSPavel Labath
683b17b84bSPavel Labath
693b17b84bSPavel Labath# b is a list of mappings. All the mappings are assumed to have the same
703b17b84bSPavel Labath# shift and the stride between adjecant mappings (if any) is constant.
713b17b84bSPavel Labathdef dump_block(b):
723b17b84bSPavel Labath    global body
733b17b84bSPavel Labath
743b17b84bSPavel Labath    if len(b) == 1:
753b17b84bSPavel Labath        # Special case for handling blocks of length 1. We don't even need to
763b17b84bSPavel Labath        # emit the "if (C < X) return C" check below as all characters in this
773b17b84bSPavel Labath        # range will be caught by the "C < X" check emitted by the first
783b17b84bSPavel Labath        # non-trivial block.
793b17b84bSPavel Labath        body += "  // {2}\n  if (C == {0:#06x})\n    return {1:#06x};\n".format(*b[0])
803b17b84bSPavel Labath        return
813b17b84bSPavel Labath
823b17b84bSPavel Labath    first = b[0][0]
833b17b84bSPavel Labath    last = first + stride(b) * (len(b) - 1)
843b17b84bSPavel Labath    modulo = first % stride(b)
853b17b84bSPavel Labath
863b17b84bSPavel Labath    # All characters before this block map to themselves.
873b17b84bSPavel Labath    body += "  if (C < {0:#06x})\n    return C;\n".format(first)
883b17b84bSPavel Labath    body += "  // {0} characters\n".format(len(b))
893b17b84bSPavel Labath
903b17b84bSPavel Labath    # Generic pattern: check upper bound (lower bound is checked by the "if"
913b17b84bSPavel Labath    # above) and modulo of C, return C+shift.
923b17b84bSPavel Labath    pattern = "  if (C <= {0:#06x} && C % {1} == {2})\n    return C + {3};\n"
933b17b84bSPavel Labath
943b17b84bSPavel Labath    if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
953b17b84bSPavel Labath        # Special case:
963b17b84bSPavel Labath        # We can elide the modulo-check because the expression "C|1" will map
973b17b84bSPavel Labath        # the intervening characters to themselves.
983b17b84bSPavel Labath        pattern = "  if (C <= {0:#06x})\n    return C | 1;\n"
993b17b84bSPavel Labath    elif stride(b) == 1:
1003b17b84bSPavel Labath        # Another special case: X % 1 is always zero, so don't emit the
1013b17b84bSPavel Labath        # modulo-check.
1023b17b84bSPavel Labath        pattern = "  if (C <= {0:#06x})\n    return C + {3};\n"
1033b17b84bSPavel Labath
1043b17b84bSPavel Labath    body += pattern.format(last, stride(b), modulo, shift(b[0]))
1053b17b84bSPavel Labath
106*b71edfaaSTobias Hieta
1073b17b84bSPavel Labathcurrent_block = []
1083026e912SSerge Gueltonf = urlopen(sys.argv[1])
1093b17b84bSPavel Labathfor m in mappings(f):
1103b17b84bSPavel Labath    if len(current_block) == 0:
1113b17b84bSPavel Labath        current_block.append(m)
1123b17b84bSPavel Labath        continue
1133b17b84bSPavel Labath
1143b17b84bSPavel Labath    if shift(current_block[0]) != shift(m):
1153b17b84bSPavel Labath        # Incompatible shift, start a new block.
1163b17b84bSPavel Labath        dump_block(current_block)
1173b17b84bSPavel Labath        current_block = [m]
1183b17b84bSPavel Labath        continue
1193b17b84bSPavel Labath
120*b71edfaaSTobias Hieta    if len(current_block) == 1 or stride(current_block) == stride2(
121*b71edfaaSTobias Hieta        current_block[-1], m
122*b71edfaaSTobias Hieta    ):
1233b17b84bSPavel Labath        current_block.append(m)
1243b17b84bSPavel Labath        continue
1253b17b84bSPavel Labath
1263b17b84bSPavel Labath    # Incompatible stride, start a new block.
1273b17b84bSPavel Labath    dump_block(current_block)
1283b17b84bSPavel Labath    current_block = [m]
1293b17b84bSPavel Labathf.close()
1303b17b84bSPavel Labath
1313b17b84bSPavel Labathdump_block(current_block)
1323b17b84bSPavel Labath
133*b71edfaaSTobias Hietaprint(
134*b71edfaaSTobias Hieta    "//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//"
135*b71edfaaSTobias Hieta)
136*b71edfaaSTobias Hietaprint("//")
137*b71edfaaSTobias Hietaprint("// This file was generated by utils/unicode-case-fold.py from the Unicode")
138*b71edfaaSTobias Hietaprint("// case folding database at")
139*b71edfaaSTobias Hietaprint("//   ", sys.argv[1])
140*b71edfaaSTobias Hietaprint("//")
141*b71edfaaSTobias Hietaprint("// To regenerate this file, run:")
142*b71edfaaSTobias Hietaprint("//   utils/unicode-case-fold.py \\")
1434a27478aSSerge Gueltonprint('//     "{}" \\'.format(sys.argv[1]))
144*b71edfaaSTobias Hietaprint("//     > lib/Support/UnicodeCaseFold.cpp")
145*b71edfaaSTobias Hietaprint("//")
146*b71edfaaSTobias Hietaprint(
147*b71edfaaSTobias Hieta    "//===----------------------------------------------------------------------===//"
148*b71edfaaSTobias Hieta)
149*b71edfaaSTobias Hietaprint("")
1504a27478aSSerge Gueltonprint('#include "llvm/Support/Unicode.h"')
151*b71edfaaSTobias Hietaprint("")
1524a27478aSSerge Gueltonprint("int llvm::sys::unicode::foldCharSimple(int C) {")
1534a27478aSSerge Gueltonprint(body)
1544a27478aSSerge Gueltonprint("  return C;")
1554a27478aSSerge Gueltonprint("}")
156