1a4800735SMark de Wever#!/usr/bin/env python 2a4800735SMark de Wever# ===----------------------------------------------------------------------===## 3a4800735SMark de Wever# 4a4800735SMark de Wever# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5a4800735SMark de Wever# See https://llvm.org/LICENSE.txt for license information. 6a4800735SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7a4800735SMark de Wever# 8a4800735SMark de Wever# ===----------------------------------------------------------------------===## 9a4800735SMark de Wever 10a4800735SMark de Wever# The code is based on 11a4800735SMark de Wever# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py 12a4800735SMark de Wever# 13a4800735SMark de Wever# Copyright (c) Microsoft Corporation. 14a4800735SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 15a4800735SMark de Wever 16a4800735SMark de Weverfrom io import StringIO 17a4800735SMark de Weverfrom pathlib import Path 180d3c40b8SStephan T. Lavavejfrom dataclasses import dataclass 19a4800735SMark de Weverfrom typing import Optional 20a4800735SMark de Weverimport re 21a4800735SMark de Weverimport sys 22a4800735SMark de Wever 23a4800735SMark de Wever 24a4800735SMark de Wever@dataclass 25a4800735SMark de Weverclass PropertyRange: 26a4800735SMark de Wever lower: int = -1 27a4800735SMark de Wever upper: int = -1 28a4800735SMark de Wever prop: str = None 29a4800735SMark de Wever 30a4800735SMark de Wever 31a4800735SMark de Wever@dataclass 32a4800735SMark de Weverclass Entry: 33a4800735SMark de Wever lower: int = -1 34a4800735SMark de Wever offset: int = -1 35a4800735SMark de Wever 36a4800735SMark de Wever 37a4800735SMark de WeverLINE_REGEX = re.compile( 38a4800735SMark de Wever r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)" 39a4800735SMark de Wever) 40a4800735SMark de Wever 41a4800735SMark de Wever 42a4800735SMark de Wever# https://www.unicode.org/reports/tr44/#GC_Values_Table 43a4800735SMark de Weverdef filterGeneralProperty(element: PropertyRange) -> Optional[PropertyRange]: 44a4800735SMark de Wever if element.prop in ["Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn"]: 45a4800735SMark de Wever return element 46a4800735SMark de Wever return None 47a4800735SMark de Wever 48a4800735SMark de Wever 49a4800735SMark de Weverdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]: 50a4800735SMark de Wever result = PropertyRange() 51a4800735SMark de Wever if m := LINE_REGEX.match(inputLine): 52a4800735SMark de Wever lower_str, upper_str, result.prop = m.group("lower", "upper", "prop") 53a4800735SMark de Wever result.lower = int(lower_str, base=16) 54a4800735SMark de Wever result.upper = result.lower 55a4800735SMark de Wever if upper_str is not None: 56a4800735SMark de Wever result.upper = int(upper_str, base=16) 57a4800735SMark de Wever return result 58a4800735SMark de Wever 59a4800735SMark de Wever else: 60a4800735SMark de Wever return None 61a4800735SMark de Wever 62a4800735SMark de Wever 63a4800735SMark de Weverdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 64a4800735SMark de Wever """ 65a4800735SMark de Wever Merges overlapping and consecutive ranges to one range. 66a4800735SMark de Wever 67a4800735SMark de Wever Since the input properties are filtered the exact property isn't 68a4800735SMark de Wever interesting anymore. The properties in the output are merged to aid 69a4800735SMark de Wever debugging. 70a4800735SMark de Wever Merging the ranges results in fewer ranges in the output table, 71a4800735SMark de Wever reducing binary and improving lookup performance. 72a4800735SMark de Wever """ 73a4800735SMark de Wever result = list() 74a4800735SMark de Wever for x in input: 75a4800735SMark de Wever if ( 76a4800735SMark de Wever len(result) 77a4800735SMark de Wever and x.lower > result[-1].lower 78a4800735SMark de Wever and x.lower <= result[-1].upper + 1 79a4800735SMark de Wever ): 80a4800735SMark de Wever result[-1].upper = max(result[-1].upper, x.upper) 81a4800735SMark de Wever result[-1].prop += f" {x.prop}" 82a4800735SMark de Wever continue 83a4800735SMark de Wever result.append(x) 84a4800735SMark de Wever return result 85a4800735SMark de Wever 86a4800735SMark de Wever 8788184e50SEisuke KawashimaDATA_ARRAY_TEMPLATE = r""" 88a4800735SMark de Wever/// The entries of the characters to escape in format's debug string. 89a4800735SMark de Wever/// 90a4800735SMark de Wever/// Contains the entries for [format.string.escaped]/2.2.1.2.1 91ad76a859SMark de Wever/// CE is a Unicode encoding and C corresponds to a UCS scalar value whose 92ad76a859SMark de Wever/// Unicode property General_Category has a value in the groups Separator (Z) 93ad76a859SMark de Wever/// or Other (C), as described by table 12 of UAX #44 94a4800735SMark de Wever/// 95a4800735SMark de Wever/// Separator (Z) consists of General_Category 96a4800735SMark de Wever/// - Space_Separator, 97a4800735SMark de Wever/// - Line_Separator, 98a4800735SMark de Wever/// - Paragraph_Separator. 99a4800735SMark de Wever/// 100a4800735SMark de Wever/// Other (C) consists of General_Category 101a4800735SMark de Wever/// - Control, 102a4800735SMark de Wever/// - Format, 103a4800735SMark de Wever/// - Surrogate, 104a4800735SMark de Wever/// - Private_Use, 105a4800735SMark de Wever/// - Unassigned. 106a4800735SMark de Wever/// 107a4800735SMark de Wever/// The data is generated from 108a4800735SMark de Wever/// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt 109a4800735SMark de Wever/// 110a4800735SMark de Wever/// The table is similar to the table 111a4800735SMark de Wever/// __extended_grapheme_custer_property_boundary::__entries 112a4800735SMark de Wever/// which explains the details of these classes. The only difference is this 113a4800735SMark de Wever/// table lacks a property, thus having more bits available for the size. 114a4800735SMark de Wever/// 115a4800735SMark de Wever/// The data has 2 values: 116e3dea5e3SMark de Wever/// - bits [0, 13] The size of the range, allowing 16384 elements. 117e3dea5e3SMark de Wever/// - bits [14, 31] The lower bound code point of the range. The upper bound of 118e3dea5e3SMark de Wever/// the range is lower bound + size. Note the code expects code units the fit 119e3dea5e3SMark de Wever/// into 18 bits, instead of the 21 bits needed for the full Unicode range. 120d179176fSMark de Wever_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{ 121a4800735SMark de Wever{entries}}}; 122a4800735SMark de Wever 123e3dea5e3SMark de Wever/// Returns whether the code unit needs to be escaped. 124e3dea5e3SMark de Wever/// 125a4800735SMark de Wever/// At the end of the valid Unicode code points space a lot of code points are 126a4800735SMark de Wever/// either reserved or a noncharacter. Adding all these entries to the 127e3dea5e3SMark de Wever/// lookup table would greatly increase the size of the table. Instead these 128e3dea5e3SMark de Wever/// entries are manually processed. In this large area of reserved code points, 129e3dea5e3SMark de Wever/// there is a small area of extended graphemes that should not be escaped 130e3dea5e3SMark de Wever/// unconditionally. This is also manually coded. See the generation script for 131e3dea5e3SMark de Wever/// more details. 132a4800735SMark de Wever 133a4800735SMark de Wever/// 134ae858b51SAngryLoki/// \\pre The code point is a valid Unicode code point. 135a4800735SMark de Wever[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {{ 136e3dea5e3SMark de Wever 137e3dea5e3SMark de Wever // The entries in the gap at the end. 138e3dea5e3SMark de Wever if(__code_point >= 0x{gap_lower:08x} && __code_point <= 0x{gap_upper:08x}) 139e3dea5e3SMark de Wever return false; 140e3dea5e3SMark de Wever 141e3dea5e3SMark de Wever // The entries at the end. 142e3dea5e3SMark de Wever if (__code_point >= 0x{unallocated:08x}) 143a4800735SMark de Wever return true; 144a4800735SMark de Wever 145e3dea5e3SMark de Wever ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries; 146a4800735SMark de Wever if (__i == 0) 147a4800735SMark de Wever return false; 148a4800735SMark de Wever 149a4800735SMark de Wever --__i; 150e3dea5e3SMark de Wever uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu); 151a4800735SMark de Wever return __code_point <= __upper_bound; 152a4800735SMark de Wever}} 153a4800735SMark de Wever""" 154a4800735SMark de Wever 155a4800735SMark de WeverTABLES_HPP_TEMPLATE = """ 156a4800735SMark de Wever// -*- C++ -*- 157a4800735SMark de Wever//===----------------------------------------------------------------------===// 158a4800735SMark de Wever// 159a4800735SMark de Wever// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 160a4800735SMark de Wever// See https://llvm.org/LICENSE.txt for license information. 161a4800735SMark de Wever// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 162a4800735SMark de Wever// 163a4800735SMark de Wever//===----------------------------------------------------------------------===// 164a4800735SMark de Wever 165a4800735SMark de Wever// WARNING, this entire header is generated by 166a4800735SMark de Wever// utils/generate_escaped_output_table.py 167a4800735SMark de Wever// DO NOT MODIFY! 168a4800735SMark de Wever 169a4800735SMark de Wever// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 170a4800735SMark de Wever// 171a4800735SMark de Wever// See Terms of Use <https://www.unicode.org/copyright.html> 172a4800735SMark de Wever// for definitions of Unicode Inc.'s Data Files and Software. 173a4800735SMark de Wever// 174a4800735SMark de Wever// NOTICE TO USER: Carefully read the following legal agreement. 175a4800735SMark de Wever// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 176a4800735SMark de Wever// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 177a4800735SMark de Wever// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 178a4800735SMark de Wever// TERMS AND CONDITIONS OF THIS AGREEMENT. 179a4800735SMark de Wever// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 180a4800735SMark de Wever// THE DATA FILES OR SOFTWARE. 181a4800735SMark de Wever// 182a4800735SMark de Wever// COPYRIGHT AND PERMISSION NOTICE 183a4800735SMark de Wever// 184a4800735SMark de Wever// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved. 185a4800735SMark de Wever// Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 186a4800735SMark de Wever// 187a4800735SMark de Wever// Permission is hereby granted, free of charge, to any person obtaining 188a4800735SMark de Wever// a copy of the Unicode data files and any associated documentation 189a4800735SMark de Wever// (the "Data Files") or Unicode software and any associated documentation 190a4800735SMark de Wever// (the "Software") to deal in the Data Files or Software 191a4800735SMark de Wever// without restriction, including without limitation the rights to use, 192a4800735SMark de Wever// copy, modify, merge, publish, distribute, and/or sell copies of 193a4800735SMark de Wever// the Data Files or Software, and to permit persons to whom the Data Files 194a4800735SMark de Wever// or Software are furnished to do so, provided that either 195a4800735SMark de Wever// (a) this copyright and permission notice appear with all copies 196a4800735SMark de Wever// of the Data Files or Software, or 197a4800735SMark de Wever// (b) this copyright and permission notice appear in associated 198a4800735SMark de Wever// Documentation. 199a4800735SMark de Wever// 200a4800735SMark de Wever// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 201a4800735SMark de Wever// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 202a4800735SMark de Wever// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 203a4800735SMark de Wever// NONINFRINGEMENT OF THIRD PARTY RIGHTS. 204a4800735SMark de Wever// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 205a4800735SMark de Wever// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 206a4800735SMark de Wever// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 207a4800735SMark de Wever// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 208a4800735SMark de Wever// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 209a4800735SMark de Wever// PERFORMANCE OF THE DATA FILES OR SOFTWARE. 210a4800735SMark de Wever// 211a4800735SMark de Wever// Except as contained in this notice, the name of a copyright holder 212a4800735SMark de Wever// shall not be used in advertising or otherwise to promote the sale, 213a4800735SMark de Wever// use or other dealings in these Data Files or Software without prior 214a4800735SMark de Wever// written authorization of the copyright holder. 215a4800735SMark de Wever 216a4800735SMark de Wever#ifndef _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H 217a4800735SMark de Wever#define _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H 218a4800735SMark de Wever 219a4800735SMark de Wever#include <__algorithm/ranges_upper_bound.h> 220a4800735SMark de Wever#include <__config> 221*e99c4906SNikolas Klauser#include <__cstddef/ptrdiff_t.h> 222a4800735SMark de Wever#include <cstdint> 223a4800735SMark de Wever 224a4800735SMark de Wever#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 225a4800735SMark de Wever# pragma GCC system_header 226a4800735SMark de Wever#endif 227a4800735SMark de Wever 228a4800735SMark de Wever_LIBCPP_BEGIN_NAMESPACE_STD 229a4800735SMark de Wever 2304f15267dSNikolas Klauser#if _LIBCPP_STD_VER >= 23 231a4800735SMark de Wever 232a4800735SMark de Wevernamespace __escaped_output_table {{ 233b18a46e3SLouis Dionne// clang-format off 234a4800735SMark de Wever{content} 235b18a46e3SLouis Dionne// clang-format on 236a4800735SMark de Wever}} // namespace __escaped_output_table 237a4800735SMark de Wever 2384f15267dSNikolas Klauser#endif // _LIBCPP_STD_VER >= 23 239a4800735SMark de Wever 240a4800735SMark de Wever_LIBCPP_END_NAMESPACE_STD 241a4800735SMark de Wever 242a4800735SMark de Wever#endif // _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H""" 243a4800735SMark de Wever 244a4800735SMark de Wever 245a4800735SMark de Weverdef property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]: 246a4800735SMark de Wever result = list[Entry]() 247a4800735SMark de Wever high = -1 248a4800735SMark de Wever for range in sorted(ranges, key=lambda x: x.lower): 249a4800735SMark de Wever # Validate overlapping ranges 250a4800735SMark de Wever assert range.lower > high 251a4800735SMark de Wever high = range.upper 252a4800735SMark de Wever 253a4800735SMark de Wever while True: 254a4800735SMark de Wever e = Entry(range.lower, range.upper - range.lower) 255e3dea5e3SMark de Wever if e.offset <= 16383: 256a4800735SMark de Wever result.append(e) 257a4800735SMark de Wever break 258e3dea5e3SMark de Wever e.offset = 16383 259a4800735SMark de Wever result.append(e) 260e3dea5e3SMark de Wever range.lower += 16384 261a4800735SMark de Wever return result 262a4800735SMark de Wever 263a4800735SMark de Wever 264ad76a859SMark de Wevercpp_entrytemplate = " 0x{:08x} /* {:08x} - {:08x} [{:>5}] */" 265a4800735SMark de Wever 266a4800735SMark de Wever 267e3dea5e3SMark de Weverdef generate_cpp_data( 268e3dea5e3SMark de Wever ranges: list[PropertyRange], unallocated: int, gap_lower: int, gap_upper: int 269e3dea5e3SMark de Wever) -> str: 270a4800735SMark de Wever result = StringIO() 271a4800735SMark de Wever table = property_ranges_to_table(ranges) 272e3dea5e3SMark de Wever # Validates all entries fit in 18 bits. 273e3dea5e3SMark de Wever for x in table: 274e3dea5e3SMark de Wever assert x.lower + x.offset < 0x3FFFF 275a4800735SMark de Wever result.write( 276a4800735SMark de Wever DATA_ARRAY_TEMPLATE.format( 277a4800735SMark de Wever size=len(table), 278a4800735SMark de Wever entries=",\n".join( 279ad76a859SMark de Wever [ 280ad76a859SMark de Wever cpp_entrytemplate.format( 281e3dea5e3SMark de Wever x.lower << 14 | x.offset, 282ad76a859SMark de Wever x.lower, 283ad76a859SMark de Wever x.lower + x.offset, 284ad76a859SMark de Wever x.offset + 1, 285ad76a859SMark de Wever ) 286ad76a859SMark de Wever for x in table 287ad76a859SMark de Wever ] 288a4800735SMark de Wever ), 289a4800735SMark de Wever unallocated=unallocated, 290e3dea5e3SMark de Wever gap_lower=gap_lower, 291e3dea5e3SMark de Wever gap_upper=gap_upper, 292a4800735SMark de Wever ) 293a4800735SMark de Wever ) 294a4800735SMark de Wever 295a4800735SMark de Wever return result.getvalue() 296a4800735SMark de Wever 297a4800735SMark de Wever 298a4800735SMark de Weverdef generate_data_tables() -> str: 299a4800735SMark de Wever """ 300a4800735SMark de Wever Generate Unicode data for [format.string.escaped]/2.2.1.2.1 301a4800735SMark de Wever """ 302a4800735SMark de Wever derived_general_catagory_path = ( 303a4800735SMark de Wever Path(__file__).absolute().parent 304a4800735SMark de Wever / "data" 305a4800735SMark de Wever / "unicode" 306a4800735SMark de Wever / "DerivedGeneralCategory.txt" 307a4800735SMark de Wever ) 308a4800735SMark de Wever 309a4800735SMark de Wever properties = list() 310a4800735SMark de Wever with derived_general_catagory_path.open(encoding="utf-8") as f: 311a4800735SMark de Wever properties.extend( 312a4800735SMark de Wever list( 313a4800735SMark de Wever filter( 314a4800735SMark de Wever filterGeneralProperty, 315a4800735SMark de Wever [x for line in f if (x := parsePropertyLine(line))], 316a4800735SMark de Wever ) 317a4800735SMark de Wever ) 318a4800735SMark de Wever ) 319a4800735SMark de Wever 320a4800735SMark de Wever data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower)) 321a4800735SMark de Wever 322e3dea5e3SMark de Wever # The output table has two large entries at the end, with a small "gap" 323ad76a859SMark de Wever # E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 324e3dea5e3SMark de Wever # Based on Unicode 15.1.0: 325e3dea5e3SMark de Wever # - Encoding all these entries in the table requires 1173 entries. 326e3dea5e3SMark de Wever # - Manually handling these last two blocks reduces the size to 729 entries. 327e3dea5e3SMark de Wever # This not only reduces the binary size, but also improves the performance 328e3dea5e3SMark de Wever # by having fewer elements to search. 329e3dea5e3SMark de Wever # The exact entries may differ between Unicode versions. When these numbers 330e3dea5e3SMark de Wever # change the test needs to be updated too. 331e3dea5e3SMark de Wever # libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp 332e3dea5e3SMark de Wever assert (data[-2].lower) == 0x323B0 333e3dea5e3SMark de Wever assert (data[-2].upper) == 0xE00FF 334e3dea5e3SMark de Wever assert (data[-1].lower) == 0xE01F0 335e3dea5e3SMark de Wever assert (data[-1].upper) == 0x10FFFF 336a4800735SMark de Wever 337e3dea5e3SMark de Wever return "\n".join( 338e3dea5e3SMark de Wever [ 339e3dea5e3SMark de Wever generate_cpp_data( 340e3dea5e3SMark de Wever data[:-2], data[-2].lower, data[-2].upper + 1, data[-1].lower - 1 341e3dea5e3SMark de Wever ) 342e3dea5e3SMark de Wever ] 343e3dea5e3SMark de Wever ) 344a4800735SMark de Wever 345a4800735SMark de Wever 346a4800735SMark de Weverif __name__ == "__main__": 347a4800735SMark de Wever if len(sys.argv) == 2: 348a4800735SMark de Wever sys.stdout = open(sys.argv[1], "w") 349a4800735SMark de Wever print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables())) 350