1857a78c0SMark de Wever#!/usr/bin/env python 2857a78c0SMark de Wever# ===----------------------------------------------------------------------===## 3857a78c0SMark de Wever# 4857a78c0SMark de Wever# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5857a78c0SMark de Wever# See https://llvm.org/LICENSE.txt for license information. 6857a78c0SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7857a78c0SMark de Wever# 8857a78c0SMark de Wever# ===----------------------------------------------------------------------===## 9857a78c0SMark de Wever 10857a78c0SMark de Wever# The code is based on 11857a78c0SMark de Wever# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py 12857a78c0SMark de Wever# 13857a78c0SMark de Wever# Copyright (c) Microsoft Corporation. 14857a78c0SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 15857a78c0SMark de Wever 16857a78c0SMark de Weverfrom io import StringIO 17857a78c0SMark de Weverfrom pathlib import Path 180d3c40b8SStephan T. Lavavejfrom dataclasses import dataclass 19857a78c0SMark de Weverfrom typing import Optional 20857a78c0SMark de Weverimport re 21130b1816SMark de Weverimport sys 22857a78c0SMark de Wever 23857a78c0SMark de Wever 24857a78c0SMark de Wever@dataclass 25857a78c0SMark de Weverclass PropertyRange: 26857a78c0SMark de Wever lower: int = -1 27857a78c0SMark de Wever upper: int = -1 28857a78c0SMark de Wever prop: str = None 29857a78c0SMark de Wever 30857a78c0SMark de Wever 31857a78c0SMark de Wever@dataclass 32857a78c0SMark de Weverclass Entry: 33857a78c0SMark de Wever lower: int = -1 34857a78c0SMark de Wever offset: int = -1 35857a78c0SMark de Wever prop: int = -1 36857a78c0SMark de Wever 37857a78c0SMark de Wever 38857a78c0SMark de WeverLINE_REGEX = re.compile( 39857a78c0SMark de Wever r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)" 40857a78c0SMark de Wever) 41857a78c0SMark de Wever 42857a78c0SMark de Wever 43857a78c0SMark de Weverdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: 44857a78c0SMark de Wever result = PropertyRange() 45857a78c0SMark de Wever if m := LINE_REGEX.match(inputLine): 46857a78c0SMark de Wever lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") 47857a78c0SMark de Wever result.lower = int(lower_str, base=16) 48857a78c0SMark de Wever result.upper = result.lower 49857a78c0SMark de Wever if upper_str is not None: 50857a78c0SMark de Wever result.upper = int(upper_str, base=16) 51857a78c0SMark de Wever return result 52857a78c0SMark de Wever 53857a78c0SMark de Wever else: 54857a78c0SMark de Wever return None 55857a78c0SMark de Wever 56857a78c0SMark de Wever 57857a78c0SMark de Weverdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 58857a78c0SMark de Wever """ 59857a78c0SMark de Wever Merges consecutive ranges with the same property to one range. 60857a78c0SMark de Wever 61857a78c0SMark de Wever Merging the ranges results in fewer ranges in the output table, 62857a78c0SMark de Wever reducing binary and improving lookup performance. 63857a78c0SMark de Wever """ 64857a78c0SMark de Wever result = list() 65857a78c0SMark de Wever for x in input: 66857a78c0SMark de Wever if ( 67857a78c0SMark de Wever len(result) 68857a78c0SMark de Wever and result[-1].prop == x.prop 69857a78c0SMark de Wever and result[-1].upper + 1 == x.lower 70857a78c0SMark de Wever ): 71857a78c0SMark de Wever result[-1].upper = x.upper 72857a78c0SMark de Wever continue 73857a78c0SMark de Wever result.append(x) 74857a78c0SMark de Wever return result 75857a78c0SMark de Wever 76857a78c0SMark de Wever 77857a78c0SMark de WeverPROP_VALUE_ENUMERATOR_TEMPLATE = " __{}" 78857a78c0SMark de WeverPROP_VALUE_ENUM_TEMPLATE = """ 79857a78c0SMark de Weverenum class __property : uint8_t {{ 80857a78c0SMark de Wever // Values generated from the data files. 81857a78c0SMark de Wever{enumerators}, 82857a78c0SMark de Wever 83857a78c0SMark de Wever // The properies below aren't stored in the "database". 84857a78c0SMark de Wever 85857a78c0SMark de Wever // Text position properties. 86857a78c0SMark de Wever __sot, 87857a78c0SMark de Wever __eot, 88857a78c0SMark de Wever 89857a78c0SMark de Wever // The code unit has none of above properties. 90857a78c0SMark de Wever __none 91857a78c0SMark de Wever}}; 92857a78c0SMark de Wever""" 93857a78c0SMark de Wever 94857a78c0SMark de WeverDATA_ARRAY_TEMPLATE = """ 95857a78c0SMark de Wever/// The entries of the extended grapheme cluster bondary property table. 96857a78c0SMark de Wever/// 97857a78c0SMark de Wever/// The data is generated from 98857a78c0SMark de Wever/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 99857a78c0SMark de Wever/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 100857a78c0SMark de Wever/// 101857a78c0SMark de Wever/// The data has 3 values 102898b5c9fSPiotr Fusik/// - bits [0, 3] The property. One of the values generated from the datafiles 103857a78c0SMark de Wever/// of \\ref __property 104857a78c0SMark de Wever/// - bits [4, 10] The size of the range. 105857a78c0SMark de Wever/// - bits [11, 31] The lower bound code point of the range. The upper bound of 106857a78c0SMark de Wever/// the range is lower bound + size. 107857a78c0SMark de Wever/// 108857a78c0SMark de Wever/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges 109857a78c0SMark de Wever/// in the Unicode tables are larger. They are stored in multiple consecutive 110857a78c0SMark de Wever/// ranges in the data table. An alternative would be to store the sizes in a 111857a78c0SMark de Wever/// separate 16-bit value. The original MSVC STL code had such an approach, but 112857a78c0SMark de Wever/// this approach uses less space for the data and is about 4% faster in the 113857a78c0SMark de Wever/// following benchmark. 114857a78c0SMark de Wever/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp 115b18a46e3SLouis Dionne// clang-format off 116d179176fSMark de Wever_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{ 117da38bcfdSMark de Wever{entries}}}; 118b18a46e3SLouis Dionne// clang-format on 119857a78c0SMark de Wever 120857a78c0SMark de Wever/// Returns the extended grapheme cluster bondary property of a code point. 121857a78c0SMark de Wever[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{ 122857a78c0SMark de Wever // The algorithm searches for the upper bound of the range and, when found, 123857a78c0SMark de Wever // steps back one entry. This algorithm is used since the code point can be 124857a78c0SMark de Wever // anywhere in the range. After a lower bound is found the next step is to 125857a78c0SMark de Wever // compare whether the code unit is indeed in the range. 126857a78c0SMark de Wever // 127857a78c0SMark de Wever // Since the entry contains a code unit, size, and property the code point 128857a78c0SMark de Wever // being sought needs to be adjusted. Just shifting the code point to the 129857a78c0SMark de Wever // proper position doesn't work; suppose an entry has property 0, size 1, 130857a78c0SMark de Wever // and lower bound 3. This results in the entry 0x1810. 131857a78c0SMark de Wever // When searching for code point 3 it will search for 0x1800, find 0x1810 132857a78c0SMark de Wever // and moves to the previous entry. Thus the lower bound value will never 133857a78c0SMark de Wever // be found. 134857a78c0SMark de Wever // The simple solution is to set the bits belonging to the property and 135857a78c0SMark de Wever // size. Then the upper bound for code point 3 will return the entry after 136857a78c0SMark de Wever // 0x1810. After moving to the previous entry the algorithm arrives at the 137857a78c0SMark de Wever // correct entry. 1387c932cdbSMark de Wever ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries; 139857a78c0SMark de Wever if (__i == 0) 140857a78c0SMark de Wever return __property::__none; 141857a78c0SMark de Wever 142857a78c0SMark de Wever --__i; 143857a78c0SMark de Wever uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f); 144857a78c0SMark de Wever if (__code_point <= __upper_bound) 145857a78c0SMark de Wever return static_cast<__property>(__entries[__i] & 0xf); 146857a78c0SMark de Wever 147857a78c0SMark de Wever return __property::__none; 148857a78c0SMark de Wever}} 149857a78c0SMark de Wever""" 150857a78c0SMark de Wever 151857a78c0SMark de WeverMSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """ 152857a78c0SMark de Wever// -*- C++ -*- 153857a78c0SMark de Wever//===----------------------------------------------------------------------===// 154857a78c0SMark de Wever// 155857a78c0SMark de Wever// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 156857a78c0SMark de Wever// See https://llvm.org/LICENSE.txt for license information. 157857a78c0SMark de Wever// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 158857a78c0SMark de Wever// 159857a78c0SMark de Wever//===----------------------------------------------------------------------===// 160857a78c0SMark de Wever 161857a78c0SMark de Wever// WARNING, this entire header is generated by 162750ee8d5SMark de Wever// utils/generate_extended_grapheme_cluster_table.py 163857a78c0SMark de Wever// DO NOT MODIFY! 164857a78c0SMark de Wever 165857a78c0SMark de Wever// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 166857a78c0SMark de Wever// 167857a78c0SMark de Wever// See Terms of Use <https://www.unicode.org/copyright.html> 168857a78c0SMark de Wever// for definitions of Unicode Inc.'s Data Files and Software. 169857a78c0SMark de Wever// 170857a78c0SMark de Wever// NOTICE TO USER: Carefully read the following legal agreement. 171857a78c0SMark de Wever// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 172857a78c0SMark de Wever// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 173857a78c0SMark de Wever// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 174857a78c0SMark de Wever// TERMS AND CONDITIONS OF THIS AGREEMENT. 175857a78c0SMark de Wever// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 176857a78c0SMark de Wever// THE DATA FILES OR SOFTWARE. 177857a78c0SMark de Wever// 178857a78c0SMark de Wever// COPYRIGHT AND PERMISSION NOTICE 179857a78c0SMark de Wever// 180857a78c0SMark de Wever// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. 181857a78c0SMark de Wever// Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 182857a78c0SMark de Wever// 183857a78c0SMark de Wever// Permission is hereby granted, free of charge, to any person obtaining 184857a78c0SMark de Wever// a copy of the Unicode data files and any associated documentation 185857a78c0SMark de Wever// (the "Data Files") or Unicode software and any associated documentation 186857a78c0SMark de Wever// (the "Software") to deal in the Data Files or Software 187857a78c0SMark de Wever// without restriction, including without limitation the rights to use, 188857a78c0SMark de Wever// copy, modify, merge, publish, distribute, and/or sell copies of 189857a78c0SMark de Wever// the Data Files or Software, and to permit persons to whom the Data Files 190857a78c0SMark de Wever// or Software are furnished to do so, provided that either 191857a78c0SMark de Wever// (a) this copyright and permission notice appear with all copies 192857a78c0SMark de Wever// of the Data Files or Software, or 193857a78c0SMark de Wever// (b) this copyright and permission notice appear in associated 194857a78c0SMark de Wever// Documentation. 195857a78c0SMark de Wever// 196857a78c0SMark de Wever// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 197857a78c0SMark de Wever// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 198857a78c0SMark de Wever// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 199857a78c0SMark de Wever// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 200857a78c0SMark de Wever// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 201857a78c0SMark de Wever// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 202857a78c0SMark de Wever// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 203857a78c0SMark de Wever// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 204857a78c0SMark de Wever// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 205857a78c0SMark de Wever// PERFORMANCE OF THE DATA FILES OR SOFTWARE. 206857a78c0SMark de Wever// 207857a78c0SMark de Wever// Except as contained in this notice, the name of a copyright holder 208857a78c0SMark de Wever// shall not be used in advertising or otherwise to promote the sale, 209857a78c0SMark de Wever// use or other dealings in these Data Files or Software without prior 210857a78c0SMark de Wever// written authorization of the copyright holder. 211857a78c0SMark de Wever 212857a78c0SMark de Wever#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H 213857a78c0SMark de Wever#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H 214857a78c0SMark de Wever 2157c932cdbSMark de Wever#include <__algorithm/ranges_upper_bound.h> 216857a78c0SMark de Wever#include <__config> 217*e99c4906SNikolas Klauser#include <__cstddef/ptrdiff_t.h> 218857a78c0SMark de Wever#include <__iterator/access.h> 219857a78c0SMark de Wever#include <cstdint> 220857a78c0SMark de Wever 221857a78c0SMark de Wever#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 222857a78c0SMark de Wever# pragma GCC system_header 223857a78c0SMark de Wever#endif 224857a78c0SMark de Wever 225857a78c0SMark de Wever_LIBCPP_BEGIN_NAMESPACE_STD 226857a78c0SMark de Wever 2274f15267dSNikolas Klauser#if _LIBCPP_STD_VER >= 20 228857a78c0SMark de Wever 229857a78c0SMark de Wevernamespace __extended_grapheme_custer_property_boundary {{ 230857a78c0SMark de Wever{content} 231da38bcfdSMark de Wever}} // namespace __extended_grapheme_custer_property_boundary 232857a78c0SMark de Wever 2334f15267dSNikolas Klauser#endif // _LIBCPP_STD_VER >= 20 234857a78c0SMark de Wever 235857a78c0SMark de Wever_LIBCPP_END_NAMESPACE_STD 236857a78c0SMark de Wever 237da38bcfdSMark de Wever#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H""" 238857a78c0SMark de Wever 239857a78c0SMark de Wever 240857a78c0SMark de Weverdef property_ranges_to_table( 241857a78c0SMark de Wever ranges: list[PropertyRange], props: list[str] 242857a78c0SMark de Wever) -> list[Entry]: 243857a78c0SMark de Wever assert len(props) < 16 244857a78c0SMark de Wever result = list[Entry]() 245857a78c0SMark de Wever high = -1 246857a78c0SMark de Wever for range in sorted(ranges, key=lambda x: x.lower): 247857a78c0SMark de Wever # Validate overlapping ranges 248857a78c0SMark de Wever assert range.lower > high 249857a78c0SMark de Wever high = range.upper 250857a78c0SMark de Wever 251857a78c0SMark de Wever while True: 252857a78c0SMark de Wever e = Entry(range.lower, range.upper - range.lower, props.index(range.prop)) 253857a78c0SMark de Wever if e.offset <= 127: 254857a78c0SMark de Wever result.append(e) 255857a78c0SMark de Wever break 256857a78c0SMark de Wever e.offset = 127 257857a78c0SMark de Wever result.append(e) 258857a78c0SMark de Wever range.lower += 128 259857a78c0SMark de Wever return result 260857a78c0SMark de Wever 261857a78c0SMark de Wever 262857a78c0SMark de Wevercpp_entrytemplate = " 0x{:08x}" 263857a78c0SMark de Wever 264857a78c0SMark de Wever 265857a78c0SMark de Weverdef generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str: 266857a78c0SMark de Wever result = StringIO() 267857a78c0SMark de Wever prop_values = sorted(set(x.prop for x in ranges)) 268857a78c0SMark de Wever table = property_ranges_to_table(ranges, prop_values) 269857a78c0SMark de Wever enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values] 270857a78c0SMark de Wever result.write( 271da38bcfdSMark de Wever PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values)) 272857a78c0SMark de Wever ) 273857a78c0SMark de Wever result.write( 274857a78c0SMark de Wever DATA_ARRAY_TEMPLATE.format( 275857a78c0SMark de Wever prop_name=prop_name, 276857a78c0SMark de Wever size=len(table), 277da38bcfdSMark de Wever entries=",\n".join( 278857a78c0SMark de Wever [ 279857a78c0SMark de Wever cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop) 280857a78c0SMark de Wever for x in table 281857a78c0SMark de Wever ] 282857a78c0SMark de Wever ), 283857a78c0SMark de Wever ) 284857a78c0SMark de Wever ) 285857a78c0SMark de Wever 286857a78c0SMark de Wever return result.getvalue() 287857a78c0SMark de Wever 288857a78c0SMark de Wever 289857a78c0SMark de Weverdef generate_data_tables() -> str: 290857a78c0SMark de Wever """ 291857a78c0SMark de Wever Generate Unicode data for inclusion into <format> from 29259e66c51SMark de Wever - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 29359e66c51SMark de Wever - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 29459e66c51SMark de Wever - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt 295857a78c0SMark de Wever 29659e66c51SMark de Wever These files are expected to be stored in the same directory as this script. 297857a78c0SMark de Wever """ 29859e66c51SMark de Wever root = Path(__file__).absolute().parent / "data" / "unicode" 29959e66c51SMark de Wever gbp_data_path = root / "GraphemeBreakProperty.txt" 30059e66c51SMark de Wever emoji_data_path = root / "emoji-data.txt" 30159e66c51SMark de Wever 302857a78c0SMark de Wever gbp_ranges = list() 303857a78c0SMark de Wever emoji_ranges = list() 304857a78c0SMark de Wever with gbp_data_path.open(encoding="utf-8") as f: 305857a78c0SMark de Wever gbp_ranges = compactPropertyRanges( 306857a78c0SMark de Wever [x for line in f if (x := parsePropertyLine(line))] 307857a78c0SMark de Wever ) 308857a78c0SMark de Wever with emoji_data_path.open(encoding="utf-8") as f: 309857a78c0SMark de Wever emoji_ranges = compactPropertyRanges( 310857a78c0SMark de Wever [x for line in f if (x := parsePropertyLine(line))] 311857a78c0SMark de Wever ) 312857a78c0SMark de Wever 313857a78c0SMark de Wever [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"] 314857a78c0SMark de Wever gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges) 315857a78c0SMark de Wever return "\n".join([gpb_cpp_data]) 316857a78c0SMark de Wever 317857a78c0SMark de Wever 318857a78c0SMark de Weverif __name__ == "__main__": 319130b1816SMark de Wever if len(sys.argv) == 2: 320130b1816SMark de Wever sys.stdout = open(sys.argv[1], "w") 321857a78c0SMark de Wever print( 322857a78c0SMark de Wever MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format( 323857a78c0SMark de Wever content=generate_data_tables() 324857a78c0SMark de Wever ) 325857a78c0SMark de Wever ) 326