xref: /llvm-project/libcxx/utils/generate_escaped_output_table.py (revision e99c4906e44ae3f921fa05356909d006cda8d954)
1a4800735SMark de Wever#!/usr/bin/env python
2a4800735SMark de Wever# ===----------------------------------------------------------------------===##
3a4800735SMark de Wever#
4a4800735SMark de Wever# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5a4800735SMark de Wever# See https://llvm.org/LICENSE.txt for license information.
6a4800735SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7a4800735SMark de Wever#
8a4800735SMark de Wever# ===----------------------------------------------------------------------===##
9a4800735SMark de Wever
10a4800735SMark de Wever# The code is based on
11a4800735SMark de Wever# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
12a4800735SMark de Wever#
13a4800735SMark de Wever# Copyright (c) Microsoft Corporation.
14a4800735SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15a4800735SMark de Wever
16a4800735SMark de Weverfrom io import StringIO
17a4800735SMark de Weverfrom pathlib import Path
180d3c40b8SStephan T. Lavavejfrom dataclasses import dataclass
19a4800735SMark de Weverfrom typing import Optional
20a4800735SMark de Weverimport re
21a4800735SMark de Weverimport sys
22a4800735SMark de Wever
23a4800735SMark de Wever
24a4800735SMark de Wever@dataclass
25a4800735SMark de Weverclass PropertyRange:
26a4800735SMark de Wever    lower: int = -1
27a4800735SMark de Wever    upper: int = -1
28a4800735SMark de Wever    prop: str = None
29a4800735SMark de Wever
30a4800735SMark de Wever
31a4800735SMark de Wever@dataclass
32a4800735SMark de Weverclass Entry:
33a4800735SMark de Wever    lower: int = -1
34a4800735SMark de Wever    offset: int = -1
35a4800735SMark de Wever
36a4800735SMark de Wever
37a4800735SMark de WeverLINE_REGEX = re.compile(
38a4800735SMark de Wever    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
39a4800735SMark de Wever)
40a4800735SMark de Wever
41a4800735SMark de Wever
42a4800735SMark de Wever# https://www.unicode.org/reports/tr44/#GC_Values_Table
43a4800735SMark de Weverdef filterGeneralProperty(element: PropertyRange) -> Optional[PropertyRange]:
44a4800735SMark de Wever    if element.prop in ["Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn"]:
45a4800735SMark de Wever        return element
46a4800735SMark de Wever    return None
47a4800735SMark de Wever
48a4800735SMark de Wever
49a4800735SMark de Weverdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
50a4800735SMark de Wever    result = PropertyRange()
51a4800735SMark de Wever    if m := LINE_REGEX.match(inputLine):
52a4800735SMark de Wever        lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
53a4800735SMark de Wever        result.lower = int(lower_str, base=16)
54a4800735SMark de Wever        result.upper = result.lower
55a4800735SMark de Wever        if upper_str is not None:
56a4800735SMark de Wever            result.upper = int(upper_str, base=16)
57a4800735SMark de Wever        return result
58a4800735SMark de Wever
59a4800735SMark de Wever    else:
60a4800735SMark de Wever        return None
61a4800735SMark de Wever
62a4800735SMark de Wever
63a4800735SMark de Weverdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
64a4800735SMark de Wever    """
65a4800735SMark de Wever    Merges overlapping and consecutive ranges to one range.
66a4800735SMark de Wever
67a4800735SMark de Wever    Since the input properties are filtered the exact property isn't
68a4800735SMark de Wever    interesting anymore. The properties in the output are merged to aid
69a4800735SMark de Wever    debugging.
70a4800735SMark de Wever    Merging the ranges results in fewer ranges in the output table,
71a4800735SMark de Wever    reducing binary and improving lookup performance.
72a4800735SMark de Wever    """
73a4800735SMark de Wever    result = list()
74a4800735SMark de Wever    for x in input:
75a4800735SMark de Wever        if (
76a4800735SMark de Wever            len(result)
77a4800735SMark de Wever            and x.lower > result[-1].lower
78a4800735SMark de Wever            and x.lower <= result[-1].upper + 1
79a4800735SMark de Wever        ):
80a4800735SMark de Wever            result[-1].upper = max(result[-1].upper, x.upper)
81a4800735SMark de Wever            result[-1].prop += f" {x.prop}"
82a4800735SMark de Wever            continue
83a4800735SMark de Wever        result.append(x)
84a4800735SMark de Wever    return result
85a4800735SMark de Wever
86a4800735SMark de Wever
8788184e50SEisuke KawashimaDATA_ARRAY_TEMPLATE = r"""
88a4800735SMark de Wever/// The entries of the characters to escape in format's debug string.
89a4800735SMark de Wever///
90a4800735SMark de Wever/// Contains the entries for [format.string.escaped]/2.2.1.2.1
91ad76a859SMark de Wever///   CE is a Unicode encoding and C corresponds to a UCS scalar value whose
92ad76a859SMark de Wever///   Unicode property General_Category has a value in the groups Separator (Z)
93ad76a859SMark de Wever///   or Other (C), as described by table 12 of UAX #44
94a4800735SMark de Wever///
95a4800735SMark de Wever/// Separator (Z) consists of General_Category
96a4800735SMark de Wever/// - Space_Separator,
97a4800735SMark de Wever/// - Line_Separator,
98a4800735SMark de Wever/// - Paragraph_Separator.
99a4800735SMark de Wever///
100a4800735SMark de Wever/// Other (C) consists of General_Category
101a4800735SMark de Wever/// - Control,
102a4800735SMark de Wever/// - Format,
103a4800735SMark de Wever/// - Surrogate,
104a4800735SMark de Wever/// - Private_Use,
105a4800735SMark de Wever/// - Unassigned.
106a4800735SMark de Wever///
107a4800735SMark de Wever/// The data is generated from
108a4800735SMark de Wever/// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
109a4800735SMark de Wever///
110a4800735SMark de Wever/// The table is similar to the table
111a4800735SMark de Wever///  __extended_grapheme_custer_property_boundary::__entries
112a4800735SMark de Wever/// which explains the details of these classes. The only difference is this
113a4800735SMark de Wever/// table lacks a property, thus having more bits available for the size.
114a4800735SMark de Wever///
115a4800735SMark de Wever/// The data has 2 values:
116e3dea5e3SMark de Wever/// - bits [0, 13] The size of the range, allowing 16384 elements.
117e3dea5e3SMark de Wever/// - bits [14, 31] The lower bound code point of the range. The upper bound of
118e3dea5e3SMark de Wever///   the range is lower bound + size. Note the code expects code units the fit
119e3dea5e3SMark de Wever///   into 18 bits, instead of the 21 bits needed for the full Unicode range.
120d179176fSMark de Wever_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
121a4800735SMark de Wever{entries}}};
122a4800735SMark de Wever
123e3dea5e3SMark de Wever/// Returns whether the code unit needs to be escaped.
124e3dea5e3SMark de Wever///
125a4800735SMark de Wever/// At the end of the valid Unicode code points space a lot of code points are
126a4800735SMark de Wever/// either reserved or a noncharacter. Adding all these entries to the
127e3dea5e3SMark de Wever/// lookup table would greatly increase the size of the table. Instead these
128e3dea5e3SMark de Wever/// entries are manually processed. In this large area of reserved code points,
129e3dea5e3SMark de Wever/// there is a small area of extended graphemes that should not be escaped
130e3dea5e3SMark de Wever/// unconditionally. This is also manually coded. See the generation script for
131e3dea5e3SMark de Wever/// more details.
132a4800735SMark de Wever
133a4800735SMark de Wever///
134ae858b51SAngryLoki/// \\pre The code point is a valid Unicode code point.
135a4800735SMark de Wever[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {{
136e3dea5e3SMark de Wever
137e3dea5e3SMark de Wever  // The entries in the gap at the end.
138e3dea5e3SMark de Wever  if(__code_point >= 0x{gap_lower:08x} && __code_point <= 0x{gap_upper:08x})
139e3dea5e3SMark de Wever     return false;
140e3dea5e3SMark de Wever
141e3dea5e3SMark de Wever  // The entries at the end.
142e3dea5e3SMark de Wever  if (__code_point >= 0x{unallocated:08x})
143a4800735SMark de Wever    return true;
144a4800735SMark de Wever
145e3dea5e3SMark de Wever  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
146a4800735SMark de Wever  if (__i == 0)
147a4800735SMark de Wever    return false;
148a4800735SMark de Wever
149a4800735SMark de Wever  --__i;
150e3dea5e3SMark de Wever  uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
151a4800735SMark de Wever  return __code_point <= __upper_bound;
152a4800735SMark de Wever}}
153a4800735SMark de Wever"""
154a4800735SMark de Wever
155a4800735SMark de WeverTABLES_HPP_TEMPLATE = """
156a4800735SMark de Wever// -*- C++ -*-
157a4800735SMark de Wever//===----------------------------------------------------------------------===//
158a4800735SMark de Wever//
159a4800735SMark de Wever// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
160a4800735SMark de Wever// See https://llvm.org/LICENSE.txt for license information.
161a4800735SMark de Wever// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
162a4800735SMark de Wever//
163a4800735SMark de Wever//===----------------------------------------------------------------------===//
164a4800735SMark de Wever
165a4800735SMark de Wever// WARNING, this entire header is generated by
166a4800735SMark de Wever// utils/generate_escaped_output_table.py
167a4800735SMark de Wever// DO NOT MODIFY!
168a4800735SMark de Wever
169a4800735SMark de Wever// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
170a4800735SMark de Wever//
171a4800735SMark de Wever// See Terms of Use <https://www.unicode.org/copyright.html>
172a4800735SMark de Wever// for definitions of Unicode Inc.'s Data Files and Software.
173a4800735SMark de Wever//
174a4800735SMark de Wever// NOTICE TO USER: Carefully read the following legal agreement.
175a4800735SMark de Wever// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
176a4800735SMark de Wever// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
177a4800735SMark de Wever// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
178a4800735SMark de Wever// TERMS AND CONDITIONS OF THIS AGREEMENT.
179a4800735SMark de Wever// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
180a4800735SMark de Wever// THE DATA FILES OR SOFTWARE.
181a4800735SMark de Wever//
182a4800735SMark de Wever// COPYRIGHT AND PERMISSION NOTICE
183a4800735SMark de Wever//
184a4800735SMark de Wever// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
185a4800735SMark de Wever// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
186a4800735SMark de Wever//
187a4800735SMark de Wever// Permission is hereby granted, free of charge, to any person obtaining
188a4800735SMark de Wever// a copy of the Unicode data files and any associated documentation
189a4800735SMark de Wever// (the "Data Files") or Unicode software and any associated documentation
190a4800735SMark de Wever// (the "Software") to deal in the Data Files or Software
191a4800735SMark de Wever// without restriction, including without limitation the rights to use,
192a4800735SMark de Wever// copy, modify, merge, publish, distribute, and/or sell copies of
193a4800735SMark de Wever// the Data Files or Software, and to permit persons to whom the Data Files
194a4800735SMark de Wever// or Software are furnished to do so, provided that either
195a4800735SMark de Wever// (a) this copyright and permission notice appear with all copies
196a4800735SMark de Wever// of the Data Files or Software, or
197a4800735SMark de Wever// (b) this copyright and permission notice appear in associated
198a4800735SMark de Wever// Documentation.
199a4800735SMark de Wever//
200a4800735SMark de Wever// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
201a4800735SMark de Wever// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
202a4800735SMark de Wever// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
203a4800735SMark de Wever// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
204a4800735SMark de Wever// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
205a4800735SMark de Wever// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
206a4800735SMark de Wever// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
207a4800735SMark de Wever// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
208a4800735SMark de Wever// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
209a4800735SMark de Wever// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
210a4800735SMark de Wever//
211a4800735SMark de Wever// Except as contained in this notice, the name of a copyright holder
212a4800735SMark de Wever// shall not be used in advertising or otherwise to promote the sale,
213a4800735SMark de Wever// use or other dealings in these Data Files or Software without prior
214a4800735SMark de Wever// written authorization of the copyright holder.
215a4800735SMark de Wever
216a4800735SMark de Wever#ifndef _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H
217a4800735SMark de Wever#define _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H
218a4800735SMark de Wever
219a4800735SMark de Wever#include <__algorithm/ranges_upper_bound.h>
220a4800735SMark de Wever#include <__config>
221*e99c4906SNikolas Klauser#include <__cstddef/ptrdiff_t.h>
222a4800735SMark de Wever#include <cstdint>
223a4800735SMark de Wever
224a4800735SMark de Wever#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
225a4800735SMark de Wever#  pragma GCC system_header
226a4800735SMark de Wever#endif
227a4800735SMark de Wever
228a4800735SMark de Wever_LIBCPP_BEGIN_NAMESPACE_STD
229a4800735SMark de Wever
2304f15267dSNikolas Klauser#if _LIBCPP_STD_VER >= 23
231a4800735SMark de Wever
232a4800735SMark de Wevernamespace __escaped_output_table {{
233b18a46e3SLouis Dionne// clang-format off
234a4800735SMark de Wever{content}
235b18a46e3SLouis Dionne// clang-format on
236a4800735SMark de Wever}} // namespace __escaped_output_table
237a4800735SMark de Wever
2384f15267dSNikolas Klauser#endif // _LIBCPP_STD_VER >= 23
239a4800735SMark de Wever
240a4800735SMark de Wever_LIBCPP_END_NAMESPACE_STD
241a4800735SMark de Wever
242a4800735SMark de Wever#endif // _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H"""
243a4800735SMark de Wever
244a4800735SMark de Wever
245a4800735SMark de Weverdef property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
246a4800735SMark de Wever    result = list[Entry]()
247a4800735SMark de Wever    high = -1
248a4800735SMark de Wever    for range in sorted(ranges, key=lambda x: x.lower):
249a4800735SMark de Wever        # Validate overlapping ranges
250a4800735SMark de Wever        assert range.lower > high
251a4800735SMark de Wever        high = range.upper
252a4800735SMark de Wever
253a4800735SMark de Wever        while True:
254a4800735SMark de Wever            e = Entry(range.lower, range.upper - range.lower)
255e3dea5e3SMark de Wever            if e.offset <= 16383:
256a4800735SMark de Wever                result.append(e)
257a4800735SMark de Wever                break
258e3dea5e3SMark de Wever            e.offset = 16383
259a4800735SMark de Wever            result.append(e)
260e3dea5e3SMark de Wever            range.lower += 16384
261a4800735SMark de Wever    return result
262a4800735SMark de Wever
263a4800735SMark de Wever
264ad76a859SMark de Wevercpp_entrytemplate = "    0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
265a4800735SMark de Wever
266a4800735SMark de Wever
267e3dea5e3SMark de Weverdef generate_cpp_data(
268e3dea5e3SMark de Wever    ranges: list[PropertyRange], unallocated: int, gap_lower: int, gap_upper: int
269e3dea5e3SMark de Wever) -> str:
270a4800735SMark de Wever    result = StringIO()
271a4800735SMark de Wever    table = property_ranges_to_table(ranges)
272e3dea5e3SMark de Wever    # Validates all entries fit in 18 bits.
273e3dea5e3SMark de Wever    for x in table:
274e3dea5e3SMark de Wever        assert x.lower + x.offset < 0x3FFFF
275a4800735SMark de Wever    result.write(
276a4800735SMark de Wever        DATA_ARRAY_TEMPLATE.format(
277a4800735SMark de Wever            size=len(table),
278a4800735SMark de Wever            entries=",\n".join(
279ad76a859SMark de Wever                [
280ad76a859SMark de Wever                    cpp_entrytemplate.format(
281e3dea5e3SMark de Wever                        x.lower << 14 | x.offset,
282ad76a859SMark de Wever                        x.lower,
283ad76a859SMark de Wever                        x.lower + x.offset,
284ad76a859SMark de Wever                        x.offset + 1,
285ad76a859SMark de Wever                    )
286ad76a859SMark de Wever                    for x in table
287ad76a859SMark de Wever                ]
288a4800735SMark de Wever            ),
289a4800735SMark de Wever            unallocated=unallocated,
290e3dea5e3SMark de Wever            gap_lower=gap_lower,
291e3dea5e3SMark de Wever            gap_upper=gap_upper,
292a4800735SMark de Wever        )
293a4800735SMark de Wever    )
294a4800735SMark de Wever
295a4800735SMark de Wever    return result.getvalue()
296a4800735SMark de Wever
297a4800735SMark de Wever
298a4800735SMark de Weverdef generate_data_tables() -> str:
299a4800735SMark de Wever    """
300a4800735SMark de Wever    Generate Unicode data for [format.string.escaped]/2.2.1.2.1
301a4800735SMark de Wever    """
302a4800735SMark de Wever    derived_general_catagory_path = (
303a4800735SMark de Wever        Path(__file__).absolute().parent
304a4800735SMark de Wever        / "data"
305a4800735SMark de Wever        / "unicode"
306a4800735SMark de Wever        / "DerivedGeneralCategory.txt"
307a4800735SMark de Wever    )
308a4800735SMark de Wever
309a4800735SMark de Wever    properties = list()
310a4800735SMark de Wever    with derived_general_catagory_path.open(encoding="utf-8") as f:
311a4800735SMark de Wever        properties.extend(
312a4800735SMark de Wever            list(
313a4800735SMark de Wever                filter(
314a4800735SMark de Wever                    filterGeneralProperty,
315a4800735SMark de Wever                    [x for line in f if (x := parsePropertyLine(line))],
316a4800735SMark de Wever                )
317a4800735SMark de Wever            )
318a4800735SMark de Wever        )
319a4800735SMark de Wever
320a4800735SMark de Wever    data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))
321a4800735SMark de Wever
322e3dea5e3SMark de Wever    # The output table has two large entries at the end, with a small "gap"
323ad76a859SMark de Wever    #   E0100..E01EF  ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
324e3dea5e3SMark de Wever    # Based on Unicode 15.1.0:
325e3dea5e3SMark de Wever    # - Encoding all these entries in the table requires 1173 entries.
326e3dea5e3SMark de Wever    # - Manually handling these last two blocks reduces the size to 729 entries.
327e3dea5e3SMark de Wever    # This not only reduces the binary size, but also improves the performance
328e3dea5e3SMark de Wever    # by having fewer elements to search.
329e3dea5e3SMark de Wever    # The exact entries may differ between Unicode versions. When these numbers
330e3dea5e3SMark de Wever    # change the test needs to be updated too.
331e3dea5e3SMark de Wever    #   libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
332e3dea5e3SMark de Wever    assert (data[-2].lower) == 0x323B0
333e3dea5e3SMark de Wever    assert (data[-2].upper) == 0xE00FF
334e3dea5e3SMark de Wever    assert (data[-1].lower) == 0xE01F0
335e3dea5e3SMark de Wever    assert (data[-1].upper) == 0x10FFFF
336a4800735SMark de Wever
337e3dea5e3SMark de Wever    return "\n".join(
338e3dea5e3SMark de Wever        [
339e3dea5e3SMark de Wever            generate_cpp_data(
340e3dea5e3SMark de Wever                data[:-2], data[-2].lower, data[-2].upper + 1, data[-1].lower - 1
341e3dea5e3SMark de Wever            )
342e3dea5e3SMark de Wever        ]
343e3dea5e3SMark de Wever    )
344a4800735SMark de Wever
345a4800735SMark de Wever
346a4800735SMark de Weverif __name__ == "__main__":
347a4800735SMark de Wever    if len(sys.argv) == 2:
348a4800735SMark de Wever        sys.stdout = open(sys.argv[1], "w")
349a4800735SMark de Wever    print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables()))
350