1#!/usr/bin/env python 2# ===----------------------------------------------------------------------===## 3# 4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5# See https://llvm.org/LICENSE.txt for license information. 6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7# 8# ===----------------------------------------------------------------------===## 9 10# The code is based on 11# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py 12# 13# Copyright (c) Microsoft Corporation. 14# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 15 16from io import StringIO 17from pathlib import Path 18from dataclasses import dataclass 19from typing import Optional 20import re 21import sys 22 23 24@dataclass 25class PropertyRange: 26 lower: int = -1 27 upper: int = -1 28 prop: str = None 29 30 31@dataclass 32class Entry: 33 lower: int = -1 34 offset: int = -1 35 36 37LINE_REGEX = re.compile( 38 r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)" 39) 40 41 42def filterProperty(element: PropertyRange) -> Optional[PropertyRange]: 43 ### Matches property predicate? 44 if element.prop in ["W", "F"]: 45 return element 46 47 ### Matches hardcode ranges predicate? 48 49 # Yijing Hexagram Symbols 50 if element.lower >= 0x4DC0 and element.upper <= 0x4DFF: 51 return element 52 53 # Miscellaneous Symbols and Pictographs 54 if element.lower >= 0x1F300 and element.upper <= 0x1F5FF: 55 return element 56 57 # Supplemental Symbols and Pictographs 58 if element.lower >= 0x1F900 and element.upper <= 0x1F9FF: 59 return element 60 61 return None 62 63 64def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: 65 result = PropertyRange() 66 if m := LINE_REGEX.match(inputLine): 67 lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") 68 result.lower = int(lower_str, base=16) 69 result.upper = result.lower 70 if upper_str is not None: 71 result.upper = int(upper_str, base=16) 72 return result 73 74 else: 75 return None 76 77 78def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 79 """ 80 Merges overlapping and consecutive ranges to one range. 81 82 Since the input properties are filtered the exact property isn't 83 interesting anymore. The properties in the output are merged to aid 84 debugging. 85 Merging the ranges results in fewer ranges in the output table, 86 reducing binary and improving lookup performance. 87 """ 88 result = list() 89 for x in input: 90 if ( 91 len(result) 92 and x.lower > result[-1].lower 93 and x.lower <= result[-1].upper + 1 94 ): 95 result[-1].upper = max(result[-1].upper, x.upper) 96 result[-1].prop += f" {x.prop}" 97 continue 98 result.append(x) 99 return result 100 101 102DATA_ARRAY_TEMPLATE = r""" 103/// The entries of the characters with an estimated width of 2. 104/// 105/// Contains the entries for [format.string.std]/12 106/// - Any code point with the East_Asian_Width="W" or East_Asian_Width="F" 107/// Derived Extracted Property as described by UAX #44 108/// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols) 109/// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs) 110/// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs) 111/// 112/// The data is generated from 113/// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt 114/// - The "overrides" in [format.string.std]/12 115/// 116/// The format of EastAsianWidth.txt is two fields separated by a semicolon. 117/// Field 0: Unicode code point value or range of code point values 118/// Field 1: East_Asian_Width property, consisting of one of the following values: 119/// "A", "F", "H", "N", "Na", "W" 120/// - All code points, assigned or unassigned, that are not listed 121/// explicitly are given the value "N". 122/// - The unassigned code points in the following blocks default to "W": 123/// CJK Unified Ideographs Extension A: U+3400..U+4DBF 124/// CJK Unified Ideographs: U+4E00..U+9FFF 125/// CJK Compatibility Ideographs: U+F900..U+FAFF 126/// - All undesignated code points in Planes 2 and 3, whether inside or 127/// outside of allocated blocks, default to "W": 128/// Plane 2: U+20000..U+2FFFD 129/// Plane 3: U+30000..U+3FFFD 130/// 131/// The table is similar to the table 132/// __extended_grapheme_custer_property_boundary::__entries 133/// which explains the details of these classes. The only difference is this 134/// table lacks a property, thus having more bits available for the size. 135/// 136/// The maximum code point that has an estimated width of 2 is U+3FFFD. This 137/// value can be encoded in 18 bits. Thus the upper 3 bits of the code point 138/// are always 0. These 3 bits are used to enlarge the offset range. This 139/// optimization reduces the table in Unicode 15 from 184 to 104 entries, 140/// saving 320 bytes. 141/// 142/// The data has 2 values: 143/// - bits [0, 13] The size of the range, allowing 16384 elements. 144/// - bits [14, 31] The lower bound code point of the range. The upper bound of 145/// the range is lower bound + size. 146_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{ 147{entries}}}; 148 149/// The upper bound entry of EastAsianWidth.txt. 150/// 151/// Values greater than this value may have more than 18 significant bits. 152/// They always have a width of 1. This property makes it possible to store 153/// the table in its compact form. 154inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x}; 155 156/// Returns the estimated width of a Unicode code point. 157/// 158/// \\pre The code point is a valid Unicode code point. 159[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{ 160 // Since __table_upper_bound contains the unshifted range do the 161 // comparison without shifting. 162 if (__code_point > __table_upper_bound) [[unlikely]] 163 return 1; 164 165 // When the code-point is less than the first element in the table 166 // the lookup is quite expensive. Since quite some scripts are in 167 // that range, it makes sense to validate that first. 168 // The std_format_spec_string_unicode benchmark gives a measurable 169 // improvement. 170 if (__code_point < (__entries[0] >> 14)) 171 return 1; 172 173 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries; 174 if (__i == 0) 175 return 1; 176 177 --__i; 178 uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu); 179 return 1 + (__code_point <= __upper_bound); 180}} 181""" 182 183TABLES_HPP_TEMPLATE = """ 184// -*- C++ -*- 185//===----------------------------------------------------------------------===// 186// 187// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 188// See https://llvm.org/LICENSE.txt for license information. 189// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 190// 191//===----------------------------------------------------------------------===// 192 193// WARNING, this entire header is generated by 194// utils/generate_width_estimation_table.py 195// DO NOT MODIFY! 196 197// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 198// 199// See Terms of Use <https://www.unicode.org/copyright.html> 200// for definitions of Unicode Inc.'s Data Files and Software. 201// 202// NOTICE TO USER: Carefully read the following legal agreement. 203// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 204// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 205// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 206// TERMS AND CONDITIONS OF THIS AGREEMENT. 207// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 208// THE DATA FILES OR SOFTWARE. 209// 210// COPYRIGHT AND PERMISSION NOTICE 211// 212// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. 213// Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 214// 215// Permission is hereby granted, free of charge, to any person obtaining 216// a copy of the Unicode data files and any associated documentation 217// (the "Data Files") or Unicode software and any associated documentation 218// (the "Software") to deal in the Data Files or Software 219// without restriction, including without limitation the rights to use, 220// copy, modify, merge, publish, distribute, and/or sell copies of 221// the Data Files or Software, and to permit persons to whom the Data Files 222// or Software are furnished to do so, provided that either 223// (a) this copyright and permission notice appear with all copies 224// of the Data Files or Software, or 225// (b) this copyright and permission notice appear in associated 226// Documentation. 227// 228// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 229// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 230// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 231// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 232// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 233// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 234// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 235// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 236// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 237// PERFORMANCE OF THE DATA FILES OR SOFTWARE. 238// 239// Except as contained in this notice, the name of a copyright holder 240// shall not be used in advertising or otherwise to promote the sale, 241// use or other dealings in these Data Files or Software without prior 242// written authorization of the copyright holder. 243 244#ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H 245#define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H 246 247#include <__algorithm/ranges_upper_bound.h> 248#include <__config> 249#include <__cstddef/ptrdiff_t.h> 250#include <cstdint> 251 252#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 253# pragma GCC system_header 254#endif 255 256_LIBCPP_BEGIN_NAMESPACE_STD 257 258#if _LIBCPP_STD_VER >= 20 259 260namespace __width_estimation_table {{ 261{content} 262}} // namespace __width_estimation_table 263 264#endif // _LIBCPP_STD_VER >= 20 265 266_LIBCPP_END_NAMESPACE_STD 267 268#endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H""" 269 270 271def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]: 272 # The maximum value that can be encoded in the available bits in the 273 # __entries table. 274 upper_bound = 0x3FFFF 275 # The maximum offset in an __entries entry. Larger offsets will be 276 # splitted and stored in multiple entries. 277 chunk = 16384 278 result = list[Entry]() 279 high = -1 280 for range in sorted(ranges, key=lambda x: x.lower): 281 # Validate overlapping ranges 282 assert range.lower > high 283 high = range.upper 284 assert high <= upper_bound 285 286 while True: 287 e = Entry(range.lower, range.upper - range.lower) 288 if e.offset < chunk: 289 result.append(e) 290 break 291 e.offset = chunk - 1 292 result.append(e) 293 range.lower += chunk 294 return result 295 296 297cpp_entrytemplate = " 0x{:08x} /* {:08x} - {:08x} [{:>5}] */" 298 299 300def generate_cpp_data(ranges: list[PropertyRange], upper_bound: int) -> str: 301 result = StringIO() 302 table = property_ranges_to_table(ranges) 303 result.write( 304 DATA_ARRAY_TEMPLATE.format( 305 size=len(table), 306 entries=", //\n".join( 307 [ 308 cpp_entrytemplate.format( 309 x.lower << 14 | x.offset, 310 x.lower, 311 x.lower + x.offset, 312 x.offset + 1, 313 ) 314 for x in table 315 ] 316 ), 317 upper_bound=upper_bound, 318 ) 319 ) 320 321 return result.getvalue() 322 323 324def generate_data_tables() -> str: 325 """ 326 Generate Unicode data for [format.string.std]/12 327 """ 328 east_asian_width_path = ( 329 Path(__file__).absolute().parent / "data" / "unicode" / "EastAsianWidth.txt" 330 ) 331 332 properties = list() 333 with east_asian_width_path.open(encoding="utf-8") as f: 334 properties.extend( 335 list( 336 filter( 337 filterProperty, 338 [x for line in f if (x := parsePropertyLine(line))], 339 ) 340 ) 341 ) 342 # The range U+4DC0 - U+4DFF is neutral and should not be in the table 343 # The range U+1F300 - U+1F5FF is partly in the range, for example 344 # 1F300..1F320;W # So [33] CYCLONE..SHOOTING STAR 345 # 1F321..1F32C;N # So [12] THERMOMETER..WIND BLOWING FACE 346 # 1F32D..1F335;W # So [9] HOT DOG..CACTUS 347 # The first and last ranges are present, but the second isn't 348 349 # Validate the hardcode ranges are present 350 351 # Yijing Hexagram Symbols 352 for i in range(0x4DC0, 0x4DFF + 1): 353 assert [x for x in properties if i >= x.lower and i <= x.upper] 354 355 # Miscellaneous Symbols and Pictographs 356 for i in range(0x1F300, 0x1F5FF + 1): 357 assert [x for x in properties if i >= x.lower and i <= x.upper] 358 359 # Miscellaneous Symbols and Pictographs 360 for i in range(0x1F900, 0x1F9FF + 1): 361 assert [x for x in properties if i >= x.lower and i <= x.upper] 362 363 data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower)) 364 365 return "\n".join([generate_cpp_data(data, data[-1].upper)]) 366 367 368if __name__ == "__main__": 369 if len(sys.argv) == 2: 370 sys.stdout = open(sys.argv[1], "w") 371 print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables())) 372