#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2020 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py [-u UnicodeData.txt] [-e EastAsianWidth.txt] \
           [-p PropList.txt] --unicode_version <version>

It will output a UTF-8 file in the current directory.
'''

import argparse
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
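
# A quick worked example of how these tables combine (this mirrors the
# name derivation done in process_range() below). For U+D4DB:
#
#     >>> index2, index3 = divmod(0xD4DB - 0xAC00, 28)
#     >>> index1, index2 = divmod(index2, 21)
#     >>> (index1, index2, index3)
#     (17, 16, 15)
#
# which picks 'P', 'WI', and 'LH' from the three tables above, giving
# the name “HANGUL SYLLABLE PWILH”.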

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #   ranges, so they become printable and carry a width. Comment
        #   out surrogate ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + JAMO_INITIAL_SHORT_NAME[index1] \
                + JAMO_MEDIAL_SHORT_NAME[index2] \
                + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))
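
# To illustrate the splitting loop above with the CJK Ideograph
# Extension A range 0x3400..0x4DB5 (a sketch; column spacing in the
# real file may differ): chunks start at 0x3400, 0x3440, …, and the
# last chunk is truncated at the actual end of the range:
#
#     <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
#     <U3440>..<U347F> /xe3/x91/x80 <CJK Ideograph Extension A>
#     …
#     <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>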

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    the UnicodeData.txt file and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (field 10, counting from 0, in
        # UnicodeData.txt) for them instead.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting the UTF-8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table:
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
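
# Reference values for convert_to_hex(), matching the samples in the
# process_charmap() docstring (the surrogate value comes from the
# table above, the others from the regular UTF-8 encoding):
#
#     >>> convert_to_hex(0x0010)
#     '/x10'
#     >>> convert_to_hex(0x3400)
#     '/xe3/x90/x80'
#     >>> convert_to_hex(0xD800)
#     '/xed/xa0/x80'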

def write_header_charmap(outfile):
    '''Writes the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, already covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1. They are all in the “Cf” category
        # and therefore already in width_dict, so we can just delete
        # them again:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        # Hangul jungseong and jongseong:
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            width_dict[key] = 0
    for key in list(range(0xD7B0, 0xD800)):
        # Hangul jungseong and jongseong:
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        # Yijing hexagram symbols:
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
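
# The run-length grouping above collapses consecutive code points of
# equal width into one WIDTH line. For example, the combining
# diacritical marks U+0300..U+036F (contiguous, all width 0) should
# come out as a single line (a sketch, not verified against a
# particular Unicode version):
#
#     <U0300>...<U036F>	0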

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt,
        and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section to the
        # UTF-8 file:
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section to the
        # UTF-8 file:
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")
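
# Example invocation, relying on the default input file names above
# (the version number is a placeholder; pass the Unicode version of
# the data files actually used):
#
#     python3 utf8_gen.py --unicode_version 13.0.0
#
# The generated UTF-8 file is written to the current directory.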