1*4bdff4beSrobert#!/usr/bin/env python 2*4bdff4beSrobert# ===----------------------------------------------------------------------===## 3*4bdff4beSrobert# 4*4bdff4beSrobert# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5*4bdff4beSrobert# See https://llvm.org/LICENSE.txt for license information. 6*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7*4bdff4beSrobert# 8*4bdff4beSrobert# ===----------------------------------------------------------------------===## 9*4bdff4beSrobert 10*4bdff4beSrobert# The code is based on 11*4bdff4beSrobert# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py 12*4bdff4beSrobert# 13*4bdff4beSrobert# Copyright (c) Microsoft Corporation. 14*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 15*4bdff4beSrobert 16*4bdff4beSrobertfrom io import StringIO 17*4bdff4beSrobertfrom pathlib import Path 18*4bdff4beSrobertfrom dataclasses import dataclass, field 19*4bdff4beSrobertfrom typing import Optional 20*4bdff4beSrobertimport re 21*4bdff4beSrobertimport sys 22*4bdff4beSrobert 23*4bdff4beSrobert 24*4bdff4beSrobert@dataclass 25*4bdff4beSrobertclass PropertyRange: 26*4bdff4beSrobert lower: int = -1 27*4bdff4beSrobert upper: int = -1 28*4bdff4beSrobert prop: str = None 29*4bdff4beSrobert 30*4bdff4beSrobert 31*4bdff4beSrobert@dataclass 32*4bdff4beSrobertclass Entry: 33*4bdff4beSrobert lower: int = -1 34*4bdff4beSrobert offset: int = -1 35*4bdff4beSrobert prop: int = -1 36*4bdff4beSrobert 37*4bdff4beSrobert 38*4bdff4beSrobertLINE_REGEX = re.compile( 39*4bdff4beSrobert r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)" 40*4bdff4beSrobert) 41*4bdff4beSrobert 42*4bdff4beSrobert 43*4bdff4beSrobertdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: 44*4bdff4beSrobert result = PropertyRange() 45*4bdff4beSrobert if m := LINE_REGEX.match(inputLine): 46*4bdff4beSrobert lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") 47*4bdff4beSrobert result.lower = int(lower_str, base=16) 48*4bdff4beSrobert result.upper = result.lower 49*4bdff4beSrobert if upper_str is not None: 50*4bdff4beSrobert result.upper = int(upper_str, base=16) 51*4bdff4beSrobert return result 52*4bdff4beSrobert 53*4bdff4beSrobert else: 54*4bdff4beSrobert return None 55*4bdff4beSrobert 56*4bdff4beSrobert 57*4bdff4beSrobertdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 58*4bdff4beSrobert """ 59*4bdff4beSrobert Merges consecutive ranges with the same property to one range. 60*4bdff4beSrobert 61*4bdff4beSrobert Merging the ranges results in fewer ranges in the output table, 62*4bdff4beSrobert reducing binary and improving lookup performance. 63*4bdff4beSrobert """ 64*4bdff4beSrobert result = list() 65*4bdff4beSrobert for x in input: 66*4bdff4beSrobert if ( 67*4bdff4beSrobert len(result) 68*4bdff4beSrobert and result[-1].prop == x.prop 69*4bdff4beSrobert and result[-1].upper + 1 == x.lower 70*4bdff4beSrobert ): 71*4bdff4beSrobert result[-1].upper = x.upper 72*4bdff4beSrobert continue 73*4bdff4beSrobert result.append(x) 74*4bdff4beSrobert return result 75*4bdff4beSrobert 76*4bdff4beSrobert 77*4bdff4beSrobertPROP_VALUE_ENUMERATOR_TEMPLATE = " __{}" 78*4bdff4beSrobertPROP_VALUE_ENUM_TEMPLATE = """ 79*4bdff4beSrobertenum class __property : uint8_t {{ 80*4bdff4beSrobert // Values generated from the data files. 81*4bdff4beSrobert{enumerators}, 82*4bdff4beSrobert 83*4bdff4beSrobert // The properies below aren't stored in the "database". 84*4bdff4beSrobert 85*4bdff4beSrobert // Text position properties. 86*4bdff4beSrobert __sot, 87*4bdff4beSrobert __eot, 88*4bdff4beSrobert 89*4bdff4beSrobert // The code unit has none of above properties. 90*4bdff4beSrobert __none 91*4bdff4beSrobert}}; 92*4bdff4beSrobert""" 93*4bdff4beSrobert 94*4bdff4beSrobertDATA_ARRAY_TEMPLATE = """ 95*4bdff4beSrobert/// The entries of the extended grapheme cluster bondary property table. 96*4bdff4beSrobert/// 97*4bdff4beSrobert/// The data is generated from 98*4bdff4beSrobert/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 99*4bdff4beSrobert/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 100*4bdff4beSrobert/// 101*4bdff4beSrobert/// The data has 3 values 102*4bdff4beSrobert/// - bits [0, 3] The property. One of the values generated from the datafiles 103*4bdff4beSrobert/// of \\ref __property 104*4bdff4beSrobert/// - bits [4, 10] The size of the range. 105*4bdff4beSrobert/// - bits [11, 31] The lower bound code point of the range. The upper bound of 106*4bdff4beSrobert/// the range is lower bound + size. 107*4bdff4beSrobert/// 108*4bdff4beSrobert/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges 109*4bdff4beSrobert/// in the Unicode tables are larger. They are stored in multiple consecutive 110*4bdff4beSrobert/// ranges in the data table. An alternative would be to store the sizes in a 111*4bdff4beSrobert/// separate 16-bit value. The original MSVC STL code had such an approach, but 112*4bdff4beSrobert/// this approach uses less space for the data and is about 4% faster in the 113*4bdff4beSrobert/// following benchmark. 114*4bdff4beSrobert/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp 115*4bdff4beSrobertinline constexpr uint32_t __entries[{size}] = {{ 116*4bdff4beSrobert{entries}}}; 117*4bdff4beSrobert 118*4bdff4beSrobert/// Returns the extended grapheme cluster bondary property of a code point. 119*4bdff4beSrobert[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{ 120*4bdff4beSrobert // The algorithm searches for the upper bound of the range and, when found, 121*4bdff4beSrobert // steps back one entry. This algorithm is used since the code point can be 122*4bdff4beSrobert // anywhere in the range. After a lower bound is found the next step is to 123*4bdff4beSrobert // compare whether the code unit is indeed in the range. 124*4bdff4beSrobert // 125*4bdff4beSrobert // Since the entry contains a code unit, size, and property the code point 126*4bdff4beSrobert // being sought needs to be adjusted. Just shifting the code point to the 127*4bdff4beSrobert // proper position doesn't work; suppose an entry has property 0, size 1, 128*4bdff4beSrobert // and lower bound 3. This results in the entry 0x1810. 129*4bdff4beSrobert // When searching for code point 3 it will search for 0x1800, find 0x1810 130*4bdff4beSrobert // and moves to the previous entry. Thus the lower bound value will never 131*4bdff4beSrobert // be found. 132*4bdff4beSrobert // The simple solution is to set the bits belonging to the property and 133*4bdff4beSrobert // size. Then the upper bound for code point 3 will return the entry after 134*4bdff4beSrobert // 0x1810. After moving to the previous entry the algorithm arrives at the 135*4bdff4beSrobert // correct entry. 136*4bdff4beSrobert ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries; 137*4bdff4beSrobert if (__i == 0) 138*4bdff4beSrobert return __property::__none; 139*4bdff4beSrobert 140*4bdff4beSrobert --__i; 141*4bdff4beSrobert uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f); 142*4bdff4beSrobert if (__code_point <= __upper_bound) 143*4bdff4beSrobert return static_cast<__property>(__entries[__i] & 0xf); 144*4bdff4beSrobert 145*4bdff4beSrobert return __property::__none; 146*4bdff4beSrobert}} 147*4bdff4beSrobert""" 148*4bdff4beSrobert 149*4bdff4beSrobertMSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """ 150*4bdff4beSrobert// -*- C++ -*- 151*4bdff4beSrobert//===----------------------------------------------------------------------===// 152*4bdff4beSrobert// 153*4bdff4beSrobert// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 154*4bdff4beSrobert// See https://llvm.org/LICENSE.txt for license information. 155*4bdff4beSrobert// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 156*4bdff4beSrobert// 157*4bdff4beSrobert//===----------------------------------------------------------------------===// 158*4bdff4beSrobert 159*4bdff4beSrobert// WARNING, this entire header is generated by 160*4bdff4beSrobert// utils/generate_extended_grapheme_cluster_table.py 161*4bdff4beSrobert// DO NOT MODIFY! 162*4bdff4beSrobert 163*4bdff4beSrobert// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 164*4bdff4beSrobert// 165*4bdff4beSrobert// See Terms of Use <https://www.unicode.org/copyright.html> 166*4bdff4beSrobert// for definitions of Unicode Inc.'s Data Files and Software. 167*4bdff4beSrobert// 168*4bdff4beSrobert// NOTICE TO USER: Carefully read the following legal agreement. 169*4bdff4beSrobert// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 170*4bdff4beSrobert// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 171*4bdff4beSrobert// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 172*4bdff4beSrobert// TERMS AND CONDITIONS OF THIS AGREEMENT. 173*4bdff4beSrobert// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 174*4bdff4beSrobert// THE DATA FILES OR SOFTWARE. 175*4bdff4beSrobert// 176*4bdff4beSrobert// COPYRIGHT AND PERMISSION NOTICE 177*4bdff4beSrobert// 178*4bdff4beSrobert// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. 179*4bdff4beSrobert// Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 180*4bdff4beSrobert// 181*4bdff4beSrobert// Permission is hereby granted, free of charge, to any person obtaining 182*4bdff4beSrobert// a copy of the Unicode data files and any associated documentation 183*4bdff4beSrobert// (the "Data Files") or Unicode software and any associated documentation 184*4bdff4beSrobert// (the "Software") to deal in the Data Files or Software 185*4bdff4beSrobert// without restriction, including without limitation the rights to use, 186*4bdff4beSrobert// copy, modify, merge, publish, distribute, and/or sell copies of 187*4bdff4beSrobert// the Data Files or Software, and to permit persons to whom the Data Files 188*4bdff4beSrobert// or Software are furnished to do so, provided that either 189*4bdff4beSrobert// (a) this copyright and permission notice appear with all copies 190*4bdff4beSrobert// of the Data Files or Software, or 191*4bdff4beSrobert// (b) this copyright and permission notice appear in associated 192*4bdff4beSrobert// Documentation. 193*4bdff4beSrobert// 194*4bdff4beSrobert// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 195*4bdff4beSrobert// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 196*4bdff4beSrobert// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 197*4bdff4beSrobert// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 198*4bdff4beSrobert// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 199*4bdff4beSrobert// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 200*4bdff4beSrobert// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 201*4bdff4beSrobert// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 202*4bdff4beSrobert// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 203*4bdff4beSrobert// PERFORMANCE OF THE DATA FILES OR SOFTWARE. 204*4bdff4beSrobert// 205*4bdff4beSrobert// Except as contained in this notice, the name of a copyright holder 206*4bdff4beSrobert// shall not be used in advertising or otherwise to promote the sale, 207*4bdff4beSrobert// use or other dealings in these Data Files or Software without prior 208*4bdff4beSrobert// written authorization of the copyright holder. 209*4bdff4beSrobert 210*4bdff4beSrobert#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H 211*4bdff4beSrobert#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H 212*4bdff4beSrobert 213*4bdff4beSrobert#include <__algorithm/ranges_upper_bound.h> 214*4bdff4beSrobert#include <__config> 215*4bdff4beSrobert#include <__iterator/access.h> 216*4bdff4beSrobert#include <cstddef> 217*4bdff4beSrobert#include <cstdint> 218*4bdff4beSrobert 219*4bdff4beSrobert#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 220*4bdff4beSrobert# pragma GCC system_header 221*4bdff4beSrobert#endif 222*4bdff4beSrobert 223*4bdff4beSrobert_LIBCPP_BEGIN_NAMESPACE_STD 224*4bdff4beSrobert 225*4bdff4beSrobert#if _LIBCPP_STD_VER > 17 226*4bdff4beSrobert 227*4bdff4beSrobertnamespace __extended_grapheme_custer_property_boundary {{ 228*4bdff4beSrobert{content} 229*4bdff4beSrobert}} // namespace __extended_grapheme_custer_property_boundary 230*4bdff4beSrobert 231*4bdff4beSrobert#endif //_LIBCPP_STD_VER > 17 232*4bdff4beSrobert 233*4bdff4beSrobert_LIBCPP_END_NAMESPACE_STD 234*4bdff4beSrobert 235*4bdff4beSrobert#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H""" 236*4bdff4beSrobert 237*4bdff4beSrobert 238*4bdff4beSrobertdef property_ranges_to_table( 239*4bdff4beSrobert ranges: list[PropertyRange], props: list[str] 240*4bdff4beSrobert) -> list[Entry]: 241*4bdff4beSrobert assert len(props) < 16 242*4bdff4beSrobert result = list[Entry]() 243*4bdff4beSrobert high = -1 244*4bdff4beSrobert for range in sorted(ranges, key=lambda x: x.lower): 245*4bdff4beSrobert # Validate overlapping ranges 246*4bdff4beSrobert assert range.lower > high 247*4bdff4beSrobert high = range.upper 248*4bdff4beSrobert 249*4bdff4beSrobert while True: 250*4bdff4beSrobert e = Entry(range.lower, range.upper - range.lower, props.index(range.prop)) 251*4bdff4beSrobert if e.offset <= 127: 252*4bdff4beSrobert result.append(e) 253*4bdff4beSrobert break 254*4bdff4beSrobert e.offset = 127 255*4bdff4beSrobert result.append(e) 256*4bdff4beSrobert range.lower += 128 257*4bdff4beSrobert return result 258*4bdff4beSrobert 259*4bdff4beSrobert 260*4bdff4beSrobertcpp_entrytemplate = " 0x{:08x}" 261*4bdff4beSrobert 262*4bdff4beSrobert 263*4bdff4beSrobertdef generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str: 264*4bdff4beSrobert result = StringIO() 265*4bdff4beSrobert prop_values = sorted(set(x.prop for x in ranges)) 266*4bdff4beSrobert table = property_ranges_to_table(ranges, prop_values) 267*4bdff4beSrobert enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values] 268*4bdff4beSrobert result.write( 269*4bdff4beSrobert PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values)) 270*4bdff4beSrobert ) 271*4bdff4beSrobert result.write( 272*4bdff4beSrobert DATA_ARRAY_TEMPLATE.format( 273*4bdff4beSrobert prop_name=prop_name, 274*4bdff4beSrobert size=len(table), 275*4bdff4beSrobert entries=",\n".join( 276*4bdff4beSrobert [ 277*4bdff4beSrobert cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop) 278*4bdff4beSrobert for x in table 279*4bdff4beSrobert ] 280*4bdff4beSrobert ), 281*4bdff4beSrobert ) 282*4bdff4beSrobert ) 283*4bdff4beSrobert 284*4bdff4beSrobert return result.getvalue() 285*4bdff4beSrobert 286*4bdff4beSrobert 287*4bdff4beSrobertdef generate_data_tables() -> str: 288*4bdff4beSrobert """ 289*4bdff4beSrobert Generate Unicode data for inclusion into <format> from 290*4bdff4beSrobert GraphemeBreakProperty.txt and emoji-data.txt. 291*4bdff4beSrobert 292*4bdff4beSrobert GraphemeBreakProperty.txt can be found at 293*4bdff4beSrobert https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 294*4bdff4beSrobert 295*4bdff4beSrobert emoji-data.txt can be found at 296*4bdff4beSrobert https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 297*4bdff4beSrobert 298*4bdff4beSrobert Both files are expected to be in the same directory as this script. 299*4bdff4beSrobert """ 300*4bdff4beSrobert gbp_data_path = ( 301*4bdff4beSrobert Path(__file__).absolute().parent 302*4bdff4beSrobert / "data" 303*4bdff4beSrobert / "unicode" 304*4bdff4beSrobert / "GraphemeBreakProperty.txt" 305*4bdff4beSrobert ) 306*4bdff4beSrobert emoji_data_path = ( 307*4bdff4beSrobert Path(__file__).absolute().parent / "data" / "unicode" / "emoji-data.txt" 308*4bdff4beSrobert ) 309*4bdff4beSrobert gbp_ranges = list() 310*4bdff4beSrobert emoji_ranges = list() 311*4bdff4beSrobert with gbp_data_path.open(encoding="utf-8") as f: 312*4bdff4beSrobert gbp_ranges = compactPropertyRanges( 313*4bdff4beSrobert [x for line in f if (x := parsePropertyLine(line))] 314*4bdff4beSrobert ) 315*4bdff4beSrobert with emoji_data_path.open(encoding="utf-8") as f: 316*4bdff4beSrobert emoji_ranges = compactPropertyRanges( 317*4bdff4beSrobert [x for line in f if (x := parsePropertyLine(line))] 318*4bdff4beSrobert ) 319*4bdff4beSrobert 320*4bdff4beSrobert [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"] 321*4bdff4beSrobert gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges) 322*4bdff4beSrobert return "\n".join([gpb_cpp_data]) 323*4bdff4beSrobert 324*4bdff4beSrobert 325*4bdff4beSrobertif __name__ == "__main__": 326*4bdff4beSrobert if len(sys.argv) == 2: 327*4bdff4beSrobert sys.stdout = open(sys.argv[1], "w") 328*4bdff4beSrobert print( 329*4bdff4beSrobert MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format( 330*4bdff4beSrobert content=generate_data_tables() 331*4bdff4beSrobert ) 332*4bdff4beSrobert ) 333