1*4bdff4beSrobert#!/usr/bin/env python 2*4bdff4beSrobert# ===----------------------------------------------------------------------===## 3*4bdff4beSrobert# 4*4bdff4beSrobert# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5*4bdff4beSrobert# See https://llvm.org/LICENSE.txt for license information. 6*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7*4bdff4beSrobert# 8*4bdff4beSrobert# ===----------------------------------------------------------------------===## 9*4bdff4beSrobert 10*4bdff4beSrobert# The code is based on 11*4bdff4beSrobert# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py 12*4bdff4beSrobert# 13*4bdff4beSrobert# Copyright (c) Microsoft Corporation. 14*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 15*4bdff4beSrobert 16*4bdff4beSrobertfrom io import StringIO 17*4bdff4beSrobertfrom pathlib import Path 18*4bdff4beSrobertfrom dataclasses import dataclass, field 19*4bdff4beSrobertfrom typing import Optional 20*4bdff4beSrobertimport re 21*4bdff4beSrobertimport sys 22*4bdff4beSrobert 23*4bdff4beSrobert 24*4bdff4beSrobert@dataclass 25*4bdff4beSrobertclass PropertyRange: 26*4bdff4beSrobert lower: int = -1 27*4bdff4beSrobert upper: int = -1 28*4bdff4beSrobert prop: str = None 29*4bdff4beSrobert 30*4bdff4beSrobert 31*4bdff4beSrobert@dataclass 32*4bdff4beSrobertclass Entry: 33*4bdff4beSrobert lower: int = -1 34*4bdff4beSrobert offset: int = -1 35*4bdff4beSrobert 36*4bdff4beSrobert 37*4bdff4beSrobertLINE_REGEX = re.compile( 38*4bdff4beSrobert r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)" 39*4bdff4beSrobert) 40*4bdff4beSrobert 41*4bdff4beSrobert 42*4bdff4beSrobertdef filterCoreProperty(element: PropertyRange) -> Optional[PropertyRange]: 43*4bdff4beSrobert if element.prop == "Grapheme_Extend": 44*4bdff4beSrobert return element 45*4bdff4beSrobert return None 46*4bdff4beSrobert 47*4bdff4beSrobert 48*4bdff4beSrobert# https://www.unicode.org/reports/tr44/#GC_Values_Table 49*4bdff4beSrobertdef filterGeneralProperty(element: PropertyRange) -> Optional[PropertyRange]: 50*4bdff4beSrobert if element.prop in ["Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn"]: 51*4bdff4beSrobert return element 52*4bdff4beSrobert return None 53*4bdff4beSrobert 54*4bdff4beSrobert 55*4bdff4beSrobertdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: 56*4bdff4beSrobert result = PropertyRange() 57*4bdff4beSrobert if m := LINE_REGEX.match(inputLine): 58*4bdff4beSrobert lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") 59*4bdff4beSrobert result.lower = int(lower_str, base=16) 60*4bdff4beSrobert result.upper = result.lower 61*4bdff4beSrobert if upper_str is not None: 62*4bdff4beSrobert result.upper = int(upper_str, base=16) 63*4bdff4beSrobert return result 64*4bdff4beSrobert 65*4bdff4beSrobert else: 66*4bdff4beSrobert return None 67*4bdff4beSrobert 68*4bdff4beSrobert 69*4bdff4beSrobertdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 70*4bdff4beSrobert """ 71*4bdff4beSrobert Merges overlapping and consecutive ranges to one range. 72*4bdff4beSrobert 73*4bdff4beSrobert Since the input properties are filtered the exact property isn't 74*4bdff4beSrobert interesting anymore. The properties in the output are merged to aid 75*4bdff4beSrobert debugging. 76*4bdff4beSrobert Merging the ranges results in fewer ranges in the output table, 77*4bdff4beSrobert reducing binary and improving lookup performance. 78*4bdff4beSrobert """ 79*4bdff4beSrobert result = list() 80*4bdff4beSrobert for x in input: 81*4bdff4beSrobert if ( 82*4bdff4beSrobert len(result) 83*4bdff4beSrobert and x.lower > result[-1].lower 84*4bdff4beSrobert and x.lower <= result[-1].upper + 1 85*4bdff4beSrobert ): 86*4bdff4beSrobert result[-1].upper = max(result[-1].upper, x.upper) 87*4bdff4beSrobert result[-1].prop += f" {x.prop}" 88*4bdff4beSrobert continue 89*4bdff4beSrobert result.append(x) 90*4bdff4beSrobert return result 91*4bdff4beSrobert 92*4bdff4beSrobert 93*4bdff4beSrobertDATA_ARRAY_TEMPLATE = """ 94*4bdff4beSrobert/// The entries of the characters to escape in format's debug string. 95*4bdff4beSrobert/// 96*4bdff4beSrobert/// Contains the entries for [format.string.escaped]/2.2.1.2.1 97*4bdff4beSrobert/// CE is a Unicode encoding and C corresponds to either a UCS scalar value 98*4bdff4beSrobert/// whose Unicode property General_Category has a value in the groups 99*4bdff4beSrobert/// Separator (Z) or Other (C) or to a UCS scalar value which has the Unicode 100*4bdff4beSrobert/// property Grapheme_Extend=Yes, as described by table 12 of UAX #44 101*4bdff4beSrobert/// 102*4bdff4beSrobert/// Separator (Z) consists of General_Category 103*4bdff4beSrobert/// - Space_Separator, 104*4bdff4beSrobert/// - Line_Separator, 105*4bdff4beSrobert/// - Paragraph_Separator. 106*4bdff4beSrobert/// 107*4bdff4beSrobert/// Other (C) consists of General_Category 108*4bdff4beSrobert/// - Control, 109*4bdff4beSrobert/// - Format, 110*4bdff4beSrobert/// - Surrogate, 111*4bdff4beSrobert/// - Private_Use, 112*4bdff4beSrobert/// - Unassigned. 113*4bdff4beSrobert/// 114*4bdff4beSrobert/// The data is generated from 115*4bdff4beSrobert/// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt 116*4bdff4beSrobert/// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt 117*4bdff4beSrobert/// 118*4bdff4beSrobert/// The table is similar to the table 119*4bdff4beSrobert/// __extended_grapheme_custer_property_boundary::__entries 120*4bdff4beSrobert/// which explains the details of these classes. The only difference is this 121*4bdff4beSrobert/// table lacks a property, thus having more bits available for the size. 122*4bdff4beSrobert/// 123*4bdff4beSrobert/// The data has 2 values: 124*4bdff4beSrobert/// - bits [0, 10] The size of the range, allowing 2048 elements. 125*4bdff4beSrobert/// - bits [11, 31] The lower bound code point of the range. The upper bound of 126*4bdff4beSrobert/// the range is lower bound + size. 127*4bdff4beSrobertinline constexpr uint32_t __entries[{size}] = {{ 128*4bdff4beSrobert{entries}}}; 129*4bdff4beSrobert 130*4bdff4beSrobert/// At the end of the valid Unicode code points space a lot of code points are 131*4bdff4beSrobert/// either reserved or a noncharacter. Adding all these entries to the 132*4bdff4beSrobert/// lookup table would add 446 entries to the table (in Unicode 14). 133*4bdff4beSrobert/// Instead the only the start of the region is stored, every code point in 134*4bdff4beSrobert/// this region needs to be escaped. 135*4bdff4beSrobertinline constexpr uint32_t __unallocated_region_lower_bound = 0x{unallocated:08x}; 136*4bdff4beSrobert 137*4bdff4beSrobert/// Returns whether the code unit needs to be escaped. 138*4bdff4beSrobert/// 139*4bdff4beSrobert/// \pre The code point is a valid Unicode code point. 140*4bdff4beSrobert[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {{ 141*4bdff4beSrobert // Since __unallocated_region_lower_bound contains the unshifted range do the 142*4bdff4beSrobert // comparison without shifting. 143*4bdff4beSrobert if (__code_point >= __unallocated_region_lower_bound) 144*4bdff4beSrobert return true; 145*4bdff4beSrobert 146*4bdff4beSrobert ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries; 147*4bdff4beSrobert if (__i == 0) 148*4bdff4beSrobert return false; 149*4bdff4beSrobert 150*4bdff4beSrobert --__i; 151*4bdff4beSrobert uint32_t __upper_bound = (__entries[__i] >> 11) + (__entries[__i] & 0x7ffu); 152*4bdff4beSrobert return __code_point <= __upper_bound; 153*4bdff4beSrobert}} 154*4bdff4beSrobert""" 155*4bdff4beSrobert 156*4bdff4beSrobertTABLES_HPP_TEMPLATE = """ 157*4bdff4beSrobert// -*- C++ -*- 158*4bdff4beSrobert//===----------------------------------------------------------------------===// 159*4bdff4beSrobert// 160*4bdff4beSrobert// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 161*4bdff4beSrobert// See https://llvm.org/LICENSE.txt for license information. 162*4bdff4beSrobert// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 163*4bdff4beSrobert// 164*4bdff4beSrobert//===----------------------------------------------------------------------===// 165*4bdff4beSrobert 166*4bdff4beSrobert// WARNING, this entire header is generated by 167*4bdff4beSrobert// utils/generate_escaped_output_table.py 168*4bdff4beSrobert// DO NOT MODIFY! 169*4bdff4beSrobert 170*4bdff4beSrobert// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 171*4bdff4beSrobert// 172*4bdff4beSrobert// See Terms of Use <https://www.unicode.org/copyright.html> 173*4bdff4beSrobert// for definitions of Unicode Inc.'s Data Files and Software. 174*4bdff4beSrobert// 175*4bdff4beSrobert// NOTICE TO USER: Carefully read the following legal agreement. 176*4bdff4beSrobert// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 177*4bdff4beSrobert// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 178*4bdff4beSrobert// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 179*4bdff4beSrobert// TERMS AND CONDITIONS OF THIS AGREEMENT. 180*4bdff4beSrobert// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 181*4bdff4beSrobert// THE DATA FILES OR SOFTWARE. 182*4bdff4beSrobert// 183*4bdff4beSrobert// COPYRIGHT AND PERMISSION NOTICE 184*4bdff4beSrobert// 185*4bdff4beSrobert// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. 186*4bdff4beSrobert// Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 187*4bdff4beSrobert// 188*4bdff4beSrobert// Permission is hereby granted, free of charge, to any person obtaining 189*4bdff4beSrobert// a copy of the Unicode data files and any associated documentation 190*4bdff4beSrobert// (the "Data Files") or Unicode software and any associated documentation 191*4bdff4beSrobert// (the "Software") to deal in the Data Files or Software 192*4bdff4beSrobert// without restriction, including without limitation the rights to use, 193*4bdff4beSrobert// copy, modify, merge, publish, distribute, and/or sell copies of 194*4bdff4beSrobert// the Data Files or Software, and to permit persons to whom the Data Files 195*4bdff4beSrobert// or Software are furnished to do so, provided that either 196*4bdff4beSrobert// (a) this copyright and permission notice appear with all copies 197*4bdff4beSrobert// of the Data Files or Software, or 198*4bdff4beSrobert// (b) this copyright and permission notice appear in associated 199*4bdff4beSrobert// Documentation. 200*4bdff4beSrobert// 201*4bdff4beSrobert// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 202*4bdff4beSrobert// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 203*4bdff4beSrobert// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 204*4bdff4beSrobert// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 205*4bdff4beSrobert// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 206*4bdff4beSrobert// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 207*4bdff4beSrobert// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 208*4bdff4beSrobert// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 209*4bdff4beSrobert// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 210*4bdff4beSrobert// PERFORMANCE OF THE DATA FILES OR SOFTWARE. 211*4bdff4beSrobert// 212*4bdff4beSrobert// Except as contained in this notice, the name of a copyright holder 213*4bdff4beSrobert// shall not be used in advertising or otherwise to promote the sale, 214*4bdff4beSrobert// use or other dealings in these Data Files or Software without prior 215*4bdff4beSrobert// written authorization of the copyright holder. 216*4bdff4beSrobert 217*4bdff4beSrobert#ifndef _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H 218*4bdff4beSrobert#define _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H 219*4bdff4beSrobert 220*4bdff4beSrobert#include <__algorithm/ranges_upper_bound.h> 221*4bdff4beSrobert#include <__config> 222*4bdff4beSrobert#include <cstddef> 223*4bdff4beSrobert#include <cstdint> 224*4bdff4beSrobert 225*4bdff4beSrobert#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 226*4bdff4beSrobert# pragma GCC system_header 227*4bdff4beSrobert#endif 228*4bdff4beSrobert 229*4bdff4beSrobert_LIBCPP_BEGIN_NAMESPACE_STD 230*4bdff4beSrobert 231*4bdff4beSrobert#if _LIBCPP_STD_VER > 20 232*4bdff4beSrobert 233*4bdff4beSrobertnamespace __escaped_output_table {{ 234*4bdff4beSrobert{content} 235*4bdff4beSrobert}} // namespace __escaped_output_table 236*4bdff4beSrobert 237*4bdff4beSrobert#endif //_LIBCPP_STD_VER > 20 238*4bdff4beSrobert 239*4bdff4beSrobert_LIBCPP_END_NAMESPACE_STD 240*4bdff4beSrobert 241*4bdff4beSrobert#endif // _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H""" 242*4bdff4beSrobert 243*4bdff4beSrobert 244*4bdff4beSrobertdef property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]: 245*4bdff4beSrobert result = list[Entry]() 246*4bdff4beSrobert high = -1 247*4bdff4beSrobert for range in sorted(ranges, key=lambda x: x.lower): 248*4bdff4beSrobert # Validate overlapping ranges 249*4bdff4beSrobert assert range.lower > high 250*4bdff4beSrobert high = range.upper 251*4bdff4beSrobert 252*4bdff4beSrobert while True: 253*4bdff4beSrobert e = Entry(range.lower, range.upper - range.lower) 254*4bdff4beSrobert if e.offset <= 2047: 255*4bdff4beSrobert result.append(e) 256*4bdff4beSrobert break 257*4bdff4beSrobert e.offset = 2047 258*4bdff4beSrobert result.append(e) 259*4bdff4beSrobert range.lower += 2048 260*4bdff4beSrobert return result 261*4bdff4beSrobert 262*4bdff4beSrobert 263*4bdff4beSrobertcpp_entrytemplate = " 0x{:08x}" 264*4bdff4beSrobert 265*4bdff4beSrobert 266*4bdff4beSrobertdef generate_cpp_data(ranges: list[PropertyRange], unallocated: int) -> str: 267*4bdff4beSrobert result = StringIO() 268*4bdff4beSrobert table = property_ranges_to_table(ranges) 269*4bdff4beSrobert result.write( 270*4bdff4beSrobert DATA_ARRAY_TEMPLATE.format( 271*4bdff4beSrobert size=len(table), 272*4bdff4beSrobert entries=",\n".join( 273*4bdff4beSrobert [cpp_entrytemplate.format(x.lower << 11 | x.offset) for x in table] 274*4bdff4beSrobert ), 275*4bdff4beSrobert unallocated=unallocated, 276*4bdff4beSrobert ) 277*4bdff4beSrobert ) 278*4bdff4beSrobert 279*4bdff4beSrobert return result.getvalue() 280*4bdff4beSrobert 281*4bdff4beSrobert 282*4bdff4beSrobertdef generate_data_tables() -> str: 283*4bdff4beSrobert """ 284*4bdff4beSrobert Generate Unicode data for [format.string.escaped]/2.2.1.2.1 285*4bdff4beSrobert """ 286*4bdff4beSrobert derived_general_catagory_path = ( 287*4bdff4beSrobert Path(__file__).absolute().parent 288*4bdff4beSrobert / "data" 289*4bdff4beSrobert / "unicode" 290*4bdff4beSrobert / "DerivedGeneralCategory.txt" 291*4bdff4beSrobert ) 292*4bdff4beSrobert derived_core_catagory_path = ( 293*4bdff4beSrobert Path(__file__).absolute().parent 294*4bdff4beSrobert / "data" 295*4bdff4beSrobert / "unicode" 296*4bdff4beSrobert / "DerivedCoreProperties.txt" 297*4bdff4beSrobert ) 298*4bdff4beSrobert 299*4bdff4beSrobert properties = list() 300*4bdff4beSrobert with derived_general_catagory_path.open(encoding="utf-8") as f: 301*4bdff4beSrobert properties.extend( 302*4bdff4beSrobert list( 303*4bdff4beSrobert filter( 304*4bdff4beSrobert filterGeneralProperty, 305*4bdff4beSrobert [x for line in f if (x := parsePropertyLine(line))], 306*4bdff4beSrobert ) 307*4bdff4beSrobert ) 308*4bdff4beSrobert ) 309*4bdff4beSrobert with derived_core_catagory_path.open(encoding="utf-8") as f: 310*4bdff4beSrobert properties.extend( 311*4bdff4beSrobert list( 312*4bdff4beSrobert filter( 313*4bdff4beSrobert filterCoreProperty, 314*4bdff4beSrobert [x for line in f if (x := parsePropertyLine(line))], 315*4bdff4beSrobert ) 316*4bdff4beSrobert ) 317*4bdff4beSrobert ) 318*4bdff4beSrobert 319*4bdff4beSrobert data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower)) 320*4bdff4beSrobert 321*4bdff4beSrobert # The last entry is large. In Unicode 14 it contains the entries 322*4bdff4beSrobert # 3134B..0FFFF 912564 elements 323*4bdff4beSrobert # This are 446 entries of 1325 entries in the table. 324*4bdff4beSrobert # Based on the nature of these entries it is expected they remain for the 325*4bdff4beSrobert # forseeable future. Therefore we only store the lower bound of this section. 326*4bdff4beSrobert # 327*4bdff4beSrobert # When this region becomes substantially smaller we need to investigate 328*4bdff4beSrobert # this design. 329*4bdff4beSrobert assert data[-1].upper == 0x10FFFF 330*4bdff4beSrobert assert data[-1].upper - data[-1].lower > 900000 331*4bdff4beSrobert 332*4bdff4beSrobert return "\n".join([generate_cpp_data(data[:-1], data[-1].lower)]) 333*4bdff4beSrobert 334*4bdff4beSrobert 335*4bdff4beSrobertif __name__ == "__main__": 336*4bdff4beSrobert if len(sys.argv) == 2: 337*4bdff4beSrobert sys.stdout = open(sys.argv[1], "w") 338*4bdff4beSrobert print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables())) 339