xref: /netbsd-src/external/gpl3/gcc/dist/contrib/unicode/from_glibc/utf8_gen.py (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
3# Copyright (C) 2014-2020 Free Software Foundation, Inc.
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
18# <https://www.gnu.org/licenses/>.
19
'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt
       -p PropList.txt --unicode_version <version>

The result is written to a file named “UTF-8” in the current directory.
'''
29
30import argparse
31import sys
32import re
33import unicode_utils
34
35# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
36# sections 3.11 and 4.4.
37
# Short names of the leading consonants (choseong), indexed by the
# initial-jamo index of a precomposed Hangul syllable (19 entries).
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# Short names of the vowels (jungseong), indexed by the medial-jamo
# index of a precomposed Hangul syllable (21 entries).
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# Short names of the trailing consonants (jongseong), indexed by the
# final-jamo index of a precomposed Hangul syllable (28 entries, the
# first one meaning “no final consonant”).
#
# Entry 5 belongs to U+11AC HANGUL JONGSEONG NIEUN-CIEUC, whose short
# name is “NJ” (e.g. U+AC05 is HANGUL SYLLABLE GANJ), not “NI”.
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
53
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    start and end are the first and the last code point of the range
    as hexadecimal strings (both inclusive), name is the name of the
    range, e.g. “<CJK Ideograph Extension A>”.

    Hangul syllables are expanded to one line per code point, all
    other ranges are written in chunks of at most 64 code points.
    '''
    first = int(start, 16)
    last = int(end, 16)
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for code_point in range(first, last + 1):
            # Decompose the offset from U+AC00 into the indices of the
            # initial, medial and final jamo (19 x 21 x 28 syllables).
            initial_medial, final = divmod(code_point - 0xAC00, 28)
            initial, medial = divmod(initial_medial, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[initial] \
                                   + JAMO_MEDIAL_SHORT_NAME[medial] \
                                   + JAMO_FINAL_SHORT_NAME[final]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(code_point),
                convert_to_hex(code_point),
                hangul_syllable_name))
        return
    # UnicodeData.txt contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
    #
    # The upper bound of range() is exclusive, so “last + 1” is needed
    # to not lose the final code point when the length of the range is
    # one more than a multiple of 64 (or when first == last).
    for chunk_start in range(first, last + 1, 64):
        chunk_end = min(chunk_start + 63, last)
        if chunk_start == chunk_end:
            # Only one code point is left over, write it as a single
            # character instead of a degenerate range.
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(chunk_start),
                convert_to_hex(chunk_start),
                name))
        else:
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(chunk_start),
                unicode_utils.ucs_symbol(chunk_end),
                convert_to_hex(chunk_start),
                name))
103
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Blank lines in flines are ignored. A ValueError is raised if a
    “…, Last>” line is not directly preceded by a “…, First>” line.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    # Fields of the pending “…, First>” line, None if there is none.
    range_start = None
    for line in flines:
        if not line.strip():
            # Tolerate blank lines (e.g. at the end of the file)
            # instead of failing with an IndexError below.
            continue
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # ”Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        is_surrogate = 'Surrogate,' in fields[1]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not is_surrogate:
            range_start = fields
            continue
        if fields[1].endswith(', Last>') and not is_surrogate:
            if range_start is None:
                raise ValueError(
                    'End of range without start of range: '
                    + line.strip())
            # Strip the “, Last>” suffix (7 characters) from the name
            # and close the angle bracket again.
            process_range(range_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            range_start = None
            continue
        range_start = None
        if is_surrogate:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        code_point = int(fields[0], 16)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(code_point),
            convert_to_hex(code_point),
            fields[1]))
169
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point is an integer; the return value is a string with one
    “/x**” group per byte of the UTF-8 encoding.
    '''
    # In Python3, a plain .encode('UTF-8') refuses to encode
    # surrogates (U+D800..U+DFFF). The “surrogatepass” error handler
    # encodes them like any other code point of the same size, which
    # works for *all* surrogates and not only for the few range
    # boundaries listed in UnicodeData.txt.
    return ''.join(
        '/x{:02x}'.format(byte)
        for byte in chr(code_point).encode('UTF-8', 'surrogatepass'))
189
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file

    The header consists of the charmap declarations, two comment
    lines and the “CHARMAP” keyword which opens the section.
    '''
    header_parts = (
        "<code_set_name> UTF-8\n",
        "<comment_char> %\n",
        "<escape_char> /\n",
        "<mb_cur_min> 1\n",
        "<mb_cur_max> 6\n\n",
        "% CHARMAP generated using utf8_gen.py\n",
        "% alias ISO-10646/UTF-8\n",
        "CHARMAP\n",
    )
    for part in header_parts:
        outfile.write(part)
200
def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file

    unicode_version is the version string which is mentioned in the
    first comment line. The header ends with the “WIDTH” keyword
    which opens the section.
    '''
    # There is no separate entry for the “ZERO WIDTH …” characters,
    # they are already covered by the general category Cf.
    header_parts = (
        '% Character width according to Unicode {:s}.\n'.format(
            unicode_version),
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; '
        'generated from PropList.txt or\n',
        '%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' UnicodeData.txt"\n',
        '% - Format control characters have width 0; generated from\n',
        "%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        "WIDTH\n",
    )
    for part in header_parts:
        outfile.write(part)
219
def _code_points_of(field):
    '''Returns the range of code points described by a first field
    like “XXXX” or “XXXX..YYYY” of EastAsianWidth.txt or PropList.txt.'''
    first, _, last = field.partition('..')
    # int() ignores the padding whitespace around the hex digits.
    return range(int(first, 16), int(last or first, 16) + 1)

def process_width(outfile, ulines, elines, plines):
    '''Writes the body of the WIDTH section to outfile.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    Only code points whose width differs from the default width 1
    are written, consecutive code points with the same width are
    merged into one “<Uxxxx>...<Uyyyy>” line.
    '''
    width_dict = {}
    for line in elines:
        for key in _code_points_of(line.split(";")[0]):
            width_dict[key] = 2

    # Non-spacing and format characters override the East Asian width.
    for line in ulines:
        if not line.strip():
            # Tolerate blank lines instead of failing with IndexError.
            continue
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        for key in _code_points_of(line.split(";")[0]):
            # default width is 1; such a character may have no entry
            # at all, so a plain “del” could raise a KeyError here.
            width_dict.pop(key, None)

    # handle special cases for compatibility
    # https://www.cs.tut.fi/~jkorpela/shy.html
    width_dict.pop(0x00AD, None) # default width is 1
    for key in range(0x1160, 0x1200):
        # Hangul jungseong and jongseong:
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            width_dict[key] = 0
    for key in range(0xD7B0, 0xD800):
        # Hangul jungseong and jongseong:
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            width_dict[key] = 0
    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

    # Merge consecutive code points of equal width into runs, each
    # run is a list [first code point, last code point, width].
    runs = []
    for key in sorted(width_dict):
        width = width_dict[key]
        if runs and runs[-1][1] + 1 == key and runs[-1][2] == width:
            runs[-1][1] = key
        else:
            runs.append([key, key, width])

    for first, last, width in runs:
        if first == last:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first), width))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                unicode_utils.ucs_symbol(last),
                width))
301
if __name__ == "__main__":
    # Command line driver: reads the three Unicode data files and
    # writes the CHARMAP and WIDTH sections to a file named “UTF-8”
    # in the current directory.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    # “--east_asian_with_file” is a historical misspelling which is
    # kept for compatibility; the correctly spelled option is an alias
    # that stores into the same attribute.
    PARSER.add_argument(
        '-e', '--east_asian_with_file', '--east_asian_width_file',
        dest='east_asian_with_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    # The Unicode data files are encoded in UTF-8; do not depend on
    # the encoding of the locale the script happens to run in.
    with open(ARGS.unicode_data_file, mode='r',
              encoding='utf-8') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_with_file, mode='r',
              encoding='utf-8') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r',
              encoding='utf-8') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w', encoding='utf-8') as OUTFILE:
        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")
372