1#!/usr/bin/env python 2# ===----------------------------------------------------------------------===## 3# 4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5# See https://llvm.org/LICENSE.txt for license information. 6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7# 8# ===----------------------------------------------------------------------===## 9 10# The code is based on 11# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py 12# 13# Copyright (c) Microsoft Corporation. 14# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 15 16from io import StringIO 17from pathlib import Path 18from dataclasses import dataclass 19from typing import Optional 20import re 21import sys 22 23 24@dataclass 25class PropertyRange: 26 lower: int = -1 27 upper: int = -1 28 prop: str = None 29 30 31@dataclass 32class Entry: 33 lower: int = -1 34 offset: int = -1 35 prop: int = -1 36 37 38LINE_REGEX = re.compile( 39 r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)" 40) 41 42 43def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: 44 result = PropertyRange() 45 if m := LINE_REGEX.match(inputLine): 46 lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") 47 result.lower = int(lower_str, base=16) 48 result.upper = result.lower 49 if upper_str is not None: 50 result.upper = int(upper_str, base=16) 51 return result 52 53 else: 54 return None 55 56 57def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 58 """ 59 Merges consecutive ranges with the same property to one range. 60 61 Merging the ranges results in fewer ranges in the output table, 62 reducing binary and improving lookup performance. 63 """ 64 result = list() 65 for x in input: 66 if ( 67 len(result) 68 and result[-1].prop == x.prop 69 and result[-1].upper + 1 == x.lower 70 ): 71 result[-1].upper = x.upper 72 continue 73 result.append(x) 74 return result 75 76 77PROP_VALUE_ENUMERATOR_TEMPLATE = " __{}" 78PROP_VALUE_ENUM_TEMPLATE = """ 79enum class __property : uint8_t {{ 80 // Values generated from the data files. 81{enumerators}, 82 83 // The properies below aren't stored in the "database". 84 85 // Text position properties. 86 __sot, 87 __eot, 88 89 // The code unit has none of above properties. 90 __none 91}}; 92""" 93 94DATA_ARRAY_TEMPLATE = """ 95/// The entries of the extended grapheme cluster bondary property table. 96/// 97/// The data is generated from 98/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 99/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 100/// 101/// The data has 3 values 102/// - bits [0, 3] The property. One of the values generated from the datafiles 103/// of \\ref __property 104/// - bits [4, 10] The size of the range. 105/// - bits [11, 31] The lower bound code point of the range. The upper bound of 106/// the range is lower bound + size. 107/// 108/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges 109/// in the Unicode tables are larger. They are stored in multiple consecutive 110/// ranges in the data table. An alternative would be to store the sizes in a 111/// separate 16-bit value. The original MSVC STL code had such an approach, but 112/// this approach uses less space for the data and is about 4% faster in the 113/// following benchmark. 114/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp 115// clang-format off 116_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{ 117{entries}}}; 118// clang-format on 119 120/// Returns the extended grapheme cluster bondary property of a code point. 121[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{ 122 // The algorithm searches for the upper bound of the range and, when found, 123 // steps back one entry. This algorithm is used since the code point can be 124 // anywhere in the range. After a lower bound is found the next step is to 125 // compare whether the code unit is indeed in the range. 126 // 127 // Since the entry contains a code unit, size, and property the code point 128 // being sought needs to be adjusted. Just shifting the code point to the 129 // proper position doesn't work; suppose an entry has property 0, size 1, 130 // and lower bound 3. This results in the entry 0x1810. 131 // When searching for code point 3 it will search for 0x1800, find 0x1810 132 // and moves to the previous entry. Thus the lower bound value will never 133 // be found. 134 // The simple solution is to set the bits belonging to the property and 135 // size. Then the upper bound for code point 3 will return the entry after 136 // 0x1810. After moving to the previous entry the algorithm arrives at the 137 // correct entry. 138 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries; 139 if (__i == 0) 140 return __property::__none; 141 142 --__i; 143 uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f); 144 if (__code_point <= __upper_bound) 145 return static_cast<__property>(__entries[__i] & 0xf); 146 147 return __property::__none; 148}} 149""" 150 151MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """ 152// -*- C++ -*- 153//===----------------------------------------------------------------------===// 154// 155// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 156// See https://llvm.org/LICENSE.txt for license information. 157// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 158// 159//===----------------------------------------------------------------------===// 160 161// WARNING, this entire header is generated by 162// utils/generate_extended_grapheme_cluster_table.py 163// DO NOT MODIFY! 164 165// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 166// 167// See Terms of Use <https://www.unicode.org/copyright.html> 168// for definitions of Unicode Inc.'s Data Files and Software. 169// 170// NOTICE TO USER: Carefully read the following legal agreement. 171// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 172// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 173// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 174// TERMS AND CONDITIONS OF THIS AGREEMENT. 175// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 176// THE DATA FILES OR SOFTWARE. 177// 178// COPYRIGHT AND PERMISSION NOTICE 179// 180// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. 181// Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 182// 183// Permission is hereby granted, free of charge, to any person obtaining 184// a copy of the Unicode data files and any associated documentation 185// (the "Data Files") or Unicode software and any associated documentation 186// (the "Software") to deal in the Data Files or Software 187// without restriction, including without limitation the rights to use, 188// copy, modify, merge, publish, distribute, and/or sell copies of 189// the Data Files or Software, and to permit persons to whom the Data Files 190// or Software are furnished to do so, provided that either 191// (a) this copyright and permission notice appear with all copies 192// of the Data Files or Software, or 193// (b) this copyright and permission notice appear in associated 194// Documentation. 195// 196// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 197// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 198// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 199// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 200// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 201// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 202// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 203// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 204// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 205// PERFORMANCE OF THE DATA FILES OR SOFTWARE. 206// 207// Except as contained in this notice, the name of a copyright holder 208// shall not be used in advertising or otherwise to promote the sale, 209// use or other dealings in these Data Files or Software without prior 210// written authorization of the copyright holder. 211 212#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H 213#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H 214 215#include <__algorithm/ranges_upper_bound.h> 216#include <__config> 217#include <__cstddef/ptrdiff_t.h> 218#include <__iterator/access.h> 219#include <cstdint> 220 221#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 222# pragma GCC system_header 223#endif 224 225_LIBCPP_BEGIN_NAMESPACE_STD 226 227#if _LIBCPP_STD_VER >= 20 228 229namespace __extended_grapheme_custer_property_boundary {{ 230{content} 231}} // namespace __extended_grapheme_custer_property_boundary 232 233#endif // _LIBCPP_STD_VER >= 20 234 235_LIBCPP_END_NAMESPACE_STD 236 237#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H""" 238 239 240def property_ranges_to_table( 241 ranges: list[PropertyRange], props: list[str] 242) -> list[Entry]: 243 assert len(props) < 16 244 result = list[Entry]() 245 high = -1 246 for range in sorted(ranges, key=lambda x: x.lower): 247 # Validate overlapping ranges 248 assert range.lower > high 249 high = range.upper 250 251 while True: 252 e = Entry(range.lower, range.upper - range.lower, props.index(range.prop)) 253 if e.offset <= 127: 254 result.append(e) 255 break 256 e.offset = 127 257 result.append(e) 258 range.lower += 128 259 return result 260 261 262cpp_entrytemplate = " 0x{:08x}" 263 264 265def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str: 266 result = StringIO() 267 prop_values = sorted(set(x.prop for x in ranges)) 268 table = property_ranges_to_table(ranges, prop_values) 269 enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values] 270 result.write( 271 PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values)) 272 ) 273 result.write( 274 DATA_ARRAY_TEMPLATE.format( 275 prop_name=prop_name, 276 size=len(table), 277 entries=",\n".join( 278 [ 279 cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop) 280 for x in table 281 ] 282 ), 283 ) 284 ) 285 286 return result.getvalue() 287 288 289def generate_data_tables() -> str: 290 """ 291 Generate Unicode data for inclusion into <format> from 292 - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 293 - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 294 - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt 295 296 These files are expected to be stored in the same directory as this script. 297 """ 298 root = Path(__file__).absolute().parent / "data" / "unicode" 299 gbp_data_path = root / "GraphemeBreakProperty.txt" 300 emoji_data_path = root / "emoji-data.txt" 301 302 gbp_ranges = list() 303 emoji_ranges = list() 304 with gbp_data_path.open(encoding="utf-8") as f: 305 gbp_ranges = compactPropertyRanges( 306 [x for line in f if (x := parsePropertyLine(line))] 307 ) 308 with emoji_data_path.open(encoding="utf-8") as f: 309 emoji_ranges = compactPropertyRanges( 310 [x for line in f if (x := parsePropertyLine(line))] 311 ) 312 313 [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"] 314 gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges) 315 return "\n".join([gpb_cpp_data]) 316 317 318if __name__ == "__main__": 319 if len(sys.argv) == 2: 320 sys.stdout = open(sys.argv[1], "w") 321 print( 322 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format( 323 content=generate_data_tables() 324 ) 325 ) 326