xref: /openbsd-src/gnu/llvm/libcxx/utils/generate_extended_grapheme_cluster_table.py (revision 4bdff4bed0e3d54e55670334c7d0077db4170f86)
1*4bdff4beSrobert#!/usr/bin/env python
2*4bdff4beSrobert# ===----------------------------------------------------------------------===##
3*4bdff4beSrobert#
4*4bdff4beSrobert# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*4bdff4beSrobert# See https://llvm.org/LICENSE.txt for license information.
6*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*4bdff4beSrobert#
8*4bdff4beSrobert# ===----------------------------------------------------------------------===##
9*4bdff4beSrobert
10*4bdff4beSrobert# The code is based on
11*4bdff4beSrobert# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
12*4bdff4beSrobert#
13*4bdff4beSrobert# Copyright (c) Microsoft Corporation.
14*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15*4bdff4beSrobert
16*4bdff4beSrobertfrom io import StringIO
17*4bdff4beSrobertfrom pathlib import Path
18*4bdff4beSrobertfrom dataclasses import dataclass, field
19*4bdff4beSrobertfrom typing import Optional
20*4bdff4beSrobertimport re
21*4bdff4beSrobertimport sys
22*4bdff4beSrobert
23*4bdff4beSrobert
24*4bdff4beSrobert@dataclass
25*4bdff4beSrobertclass PropertyRange:
26*4bdff4beSrobert    lower: int = -1
27*4bdff4beSrobert    upper: int = -1
28*4bdff4beSrobert    prop: str = None
29*4bdff4beSrobert
30*4bdff4beSrobert
31*4bdff4beSrobert@dataclass
32*4bdff4beSrobertclass Entry:
33*4bdff4beSrobert    lower: int = -1
34*4bdff4beSrobert    offset: int = -1
35*4bdff4beSrobert    prop: int = -1
36*4bdff4beSrobert
37*4bdff4beSrobert
38*4bdff4beSrobertLINE_REGEX = re.compile(
39*4bdff4beSrobert    r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)"
40*4bdff4beSrobert)
41*4bdff4beSrobert
42*4bdff4beSrobert
43*4bdff4beSrobertdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
44*4bdff4beSrobert    result = PropertyRange()
45*4bdff4beSrobert    if m := LINE_REGEX.match(inputLine):
46*4bdff4beSrobert        lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
47*4bdff4beSrobert        result.lower = int(lower_str, base=16)
48*4bdff4beSrobert        result.upper = result.lower
49*4bdff4beSrobert        if upper_str is not None:
50*4bdff4beSrobert            result.upper = int(upper_str, base=16)
51*4bdff4beSrobert        return result
52*4bdff4beSrobert
53*4bdff4beSrobert    else:
54*4bdff4beSrobert        return None
55*4bdff4beSrobert
56*4bdff4beSrobert
57*4bdff4beSrobertdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
58*4bdff4beSrobert    """
59*4bdff4beSrobert    Merges consecutive ranges with the same property to one range.
60*4bdff4beSrobert
61*4bdff4beSrobert    Merging the ranges results in fewer ranges in the output table,
62*4bdff4beSrobert    reducing binary and improving lookup performance.
63*4bdff4beSrobert    """
64*4bdff4beSrobert    result = list()
65*4bdff4beSrobert    for x in input:
66*4bdff4beSrobert        if (
67*4bdff4beSrobert            len(result)
68*4bdff4beSrobert            and result[-1].prop == x.prop
69*4bdff4beSrobert            and result[-1].upper + 1 == x.lower
70*4bdff4beSrobert        ):
71*4bdff4beSrobert            result[-1].upper = x.upper
72*4bdff4beSrobert            continue
73*4bdff4beSrobert        result.append(x)
74*4bdff4beSrobert    return result
75*4bdff4beSrobert
76*4bdff4beSrobert
77*4bdff4beSrobertPROP_VALUE_ENUMERATOR_TEMPLATE = "  __{}"
78*4bdff4beSrobertPROP_VALUE_ENUM_TEMPLATE = """
79*4bdff4beSrobertenum class __property : uint8_t {{
80*4bdff4beSrobert  // Values generated from the data files.
81*4bdff4beSrobert{enumerators},
82*4bdff4beSrobert
83*4bdff4beSrobert  // The properies below aren't stored in the "database".
84*4bdff4beSrobert
85*4bdff4beSrobert  // Text position properties.
86*4bdff4beSrobert  __sot,
87*4bdff4beSrobert  __eot,
88*4bdff4beSrobert
89*4bdff4beSrobert  // The code unit has none of above properties.
90*4bdff4beSrobert  __none
91*4bdff4beSrobert}};
92*4bdff4beSrobert"""
93*4bdff4beSrobert
94*4bdff4beSrobertDATA_ARRAY_TEMPLATE = """
95*4bdff4beSrobert/// The entries of the extended grapheme cluster bondary property table.
96*4bdff4beSrobert///
97*4bdff4beSrobert/// The data is generated from
98*4bdff4beSrobert/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
99*4bdff4beSrobert/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
100*4bdff4beSrobert///
101*4bdff4beSrobert/// The data has 3 values
102*4bdff4beSrobert/// - bits [0, 3] The property. One of the values generated from the datafiles
103*4bdff4beSrobert///   of \\ref __property
104*4bdff4beSrobert/// - bits [4, 10] The size of the range.
105*4bdff4beSrobert/// - bits [11, 31] The lower bound code point of the range. The upper bound of
106*4bdff4beSrobert///   the range is lower bound + size.
107*4bdff4beSrobert///
108*4bdff4beSrobert/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
109*4bdff4beSrobert/// in the Unicode tables are larger. They are stored in multiple consecutive
110*4bdff4beSrobert/// ranges in the data table. An alternative would be to store the sizes in a
111*4bdff4beSrobert/// separate 16-bit value. The original MSVC STL code had such an approach, but
112*4bdff4beSrobert/// this approach uses less space for the data and is about 4% faster in the
113*4bdff4beSrobert/// following benchmark.
114*4bdff4beSrobert/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
115*4bdff4beSrobertinline constexpr uint32_t __entries[{size}] = {{
116*4bdff4beSrobert{entries}}};
117*4bdff4beSrobert
118*4bdff4beSrobert/// Returns the extended grapheme cluster bondary property of a code point.
119*4bdff4beSrobert[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
120*4bdff4beSrobert  // The algorithm searches for the upper bound of the range and, when found,
121*4bdff4beSrobert  // steps back one entry. This algorithm is used since the code point can be
122*4bdff4beSrobert  // anywhere in the range. After a lower bound is found the next step is to
123*4bdff4beSrobert  // compare whether the code unit is indeed in the range.
124*4bdff4beSrobert  //
125*4bdff4beSrobert  // Since the entry contains a code unit, size, and property the code point
126*4bdff4beSrobert  // being sought needs to be adjusted. Just shifting the code point to the
127*4bdff4beSrobert  // proper position doesn't work; suppose an entry has property 0, size 1,
128*4bdff4beSrobert  // and lower bound 3. This results in the entry 0x1810.
129*4bdff4beSrobert  // When searching for code point 3 it will search for 0x1800, find 0x1810
130*4bdff4beSrobert  // and moves to the previous entry. Thus the lower bound value will never
131*4bdff4beSrobert  // be found.
132*4bdff4beSrobert  // The simple solution is to set the bits belonging to the property and
133*4bdff4beSrobert  // size. Then the upper bound for code point 3 will return the entry after
134*4bdff4beSrobert  // 0x1810. After moving to the previous entry the algorithm arrives at the
135*4bdff4beSrobert  // correct entry.
136*4bdff4beSrobert  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
137*4bdff4beSrobert  if (__i == 0)
138*4bdff4beSrobert    return __property::__none;
139*4bdff4beSrobert
140*4bdff4beSrobert  --__i;
141*4bdff4beSrobert  uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
142*4bdff4beSrobert  if (__code_point <= __upper_bound)
143*4bdff4beSrobert    return static_cast<__property>(__entries[__i] & 0xf);
144*4bdff4beSrobert
145*4bdff4beSrobert  return __property::__none;
146*4bdff4beSrobert}}
147*4bdff4beSrobert"""
148*4bdff4beSrobert
149*4bdff4beSrobertMSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
150*4bdff4beSrobert// -*- C++ -*-
151*4bdff4beSrobert//===----------------------------------------------------------------------===//
152*4bdff4beSrobert//
153*4bdff4beSrobert// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
154*4bdff4beSrobert// See https://llvm.org/LICENSE.txt for license information.
155*4bdff4beSrobert// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
156*4bdff4beSrobert//
157*4bdff4beSrobert//===----------------------------------------------------------------------===//
158*4bdff4beSrobert
159*4bdff4beSrobert// WARNING, this entire header is generated by
160*4bdff4beSrobert// utils/generate_extended_grapheme_cluster_table.py
161*4bdff4beSrobert// DO NOT MODIFY!
162*4bdff4beSrobert
163*4bdff4beSrobert// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
164*4bdff4beSrobert//
165*4bdff4beSrobert// See Terms of Use <https://www.unicode.org/copyright.html>
166*4bdff4beSrobert// for definitions of Unicode Inc.'s Data Files and Software.
167*4bdff4beSrobert//
168*4bdff4beSrobert// NOTICE TO USER: Carefully read the following legal agreement.
169*4bdff4beSrobert// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
170*4bdff4beSrobert// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
171*4bdff4beSrobert// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
172*4bdff4beSrobert// TERMS AND CONDITIONS OF THIS AGREEMENT.
173*4bdff4beSrobert// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
174*4bdff4beSrobert// THE DATA FILES OR SOFTWARE.
175*4bdff4beSrobert//
176*4bdff4beSrobert// COPYRIGHT AND PERMISSION NOTICE
177*4bdff4beSrobert//
178*4bdff4beSrobert// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
179*4bdff4beSrobert// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
180*4bdff4beSrobert//
181*4bdff4beSrobert// Permission is hereby granted, free of charge, to any person obtaining
182*4bdff4beSrobert// a copy of the Unicode data files and any associated documentation
183*4bdff4beSrobert// (the "Data Files") or Unicode software and any associated documentation
184*4bdff4beSrobert// (the "Software") to deal in the Data Files or Software
185*4bdff4beSrobert// without restriction, including without limitation the rights to use,
186*4bdff4beSrobert// copy, modify, merge, publish, distribute, and/or sell copies of
187*4bdff4beSrobert// the Data Files or Software, and to permit persons to whom the Data Files
188*4bdff4beSrobert// or Software are furnished to do so, provided that either
189*4bdff4beSrobert// (a) this copyright and permission notice appear with all copies
190*4bdff4beSrobert// of the Data Files or Software, or
191*4bdff4beSrobert// (b) this copyright and permission notice appear in associated
192*4bdff4beSrobert// Documentation.
193*4bdff4beSrobert//
194*4bdff4beSrobert// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
195*4bdff4beSrobert// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
196*4bdff4beSrobert// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
197*4bdff4beSrobert// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
198*4bdff4beSrobert// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
199*4bdff4beSrobert// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
200*4bdff4beSrobert// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
201*4bdff4beSrobert// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
202*4bdff4beSrobert// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
203*4bdff4beSrobert// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
204*4bdff4beSrobert//
205*4bdff4beSrobert// Except as contained in this notice, the name of a copyright holder
206*4bdff4beSrobert// shall not be used in advertising or otherwise to promote the sale,
207*4bdff4beSrobert// use or other dealings in these Data Files or Software without prior
208*4bdff4beSrobert// written authorization of the copyright holder.
209*4bdff4beSrobert
210*4bdff4beSrobert#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
211*4bdff4beSrobert#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
212*4bdff4beSrobert
213*4bdff4beSrobert#include <__algorithm/ranges_upper_bound.h>
214*4bdff4beSrobert#include <__config>
215*4bdff4beSrobert#include <__iterator/access.h>
216*4bdff4beSrobert#include <cstddef>
217*4bdff4beSrobert#include <cstdint>
218*4bdff4beSrobert
219*4bdff4beSrobert#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
220*4bdff4beSrobert#  pragma GCC system_header
221*4bdff4beSrobert#endif
222*4bdff4beSrobert
223*4bdff4beSrobert_LIBCPP_BEGIN_NAMESPACE_STD
224*4bdff4beSrobert
225*4bdff4beSrobert#if _LIBCPP_STD_VER > 17
226*4bdff4beSrobert
227*4bdff4beSrobertnamespace __extended_grapheme_custer_property_boundary {{
228*4bdff4beSrobert{content}
229*4bdff4beSrobert}} // namespace __extended_grapheme_custer_property_boundary
230*4bdff4beSrobert
231*4bdff4beSrobert#endif //_LIBCPP_STD_VER > 17
232*4bdff4beSrobert
233*4bdff4beSrobert_LIBCPP_END_NAMESPACE_STD
234*4bdff4beSrobert
235*4bdff4beSrobert#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
236*4bdff4beSrobert
237*4bdff4beSrobert
238*4bdff4beSrobertdef property_ranges_to_table(
239*4bdff4beSrobert    ranges: list[PropertyRange], props: list[str]
240*4bdff4beSrobert) -> list[Entry]:
241*4bdff4beSrobert    assert len(props) < 16
242*4bdff4beSrobert    result = list[Entry]()
243*4bdff4beSrobert    high = -1
244*4bdff4beSrobert    for range in sorted(ranges, key=lambda x: x.lower):
245*4bdff4beSrobert        # Validate overlapping ranges
246*4bdff4beSrobert        assert range.lower > high
247*4bdff4beSrobert        high = range.upper
248*4bdff4beSrobert
249*4bdff4beSrobert        while True:
250*4bdff4beSrobert            e = Entry(range.lower, range.upper - range.lower, props.index(range.prop))
251*4bdff4beSrobert            if e.offset <= 127:
252*4bdff4beSrobert                result.append(e)
253*4bdff4beSrobert                break
254*4bdff4beSrobert            e.offset = 127
255*4bdff4beSrobert            result.append(e)
256*4bdff4beSrobert            range.lower += 128
257*4bdff4beSrobert    return result
258*4bdff4beSrobert
259*4bdff4beSrobert
260*4bdff4beSrobertcpp_entrytemplate = "    0x{:08x}"
261*4bdff4beSrobert
262*4bdff4beSrobert
263*4bdff4beSrobertdef generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
264*4bdff4beSrobert    result = StringIO()
265*4bdff4beSrobert    prop_values = sorted(set(x.prop for x in ranges))
266*4bdff4beSrobert    table = property_ranges_to_table(ranges, prop_values)
267*4bdff4beSrobert    enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values]
268*4bdff4beSrobert    result.write(
269*4bdff4beSrobert        PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values))
270*4bdff4beSrobert    )
271*4bdff4beSrobert    result.write(
272*4bdff4beSrobert        DATA_ARRAY_TEMPLATE.format(
273*4bdff4beSrobert            prop_name=prop_name,
274*4bdff4beSrobert            size=len(table),
275*4bdff4beSrobert            entries=",\n".join(
276*4bdff4beSrobert                [
277*4bdff4beSrobert                    cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop)
278*4bdff4beSrobert                    for x in table
279*4bdff4beSrobert                ]
280*4bdff4beSrobert            ),
281*4bdff4beSrobert        )
282*4bdff4beSrobert    )
283*4bdff4beSrobert
284*4bdff4beSrobert    return result.getvalue()
285*4bdff4beSrobert
286*4bdff4beSrobert
287*4bdff4beSrobertdef generate_data_tables() -> str:
288*4bdff4beSrobert    """
289*4bdff4beSrobert    Generate Unicode data for inclusion into <format> from
290*4bdff4beSrobert    GraphemeBreakProperty.txt and emoji-data.txt.
291*4bdff4beSrobert
292*4bdff4beSrobert    GraphemeBreakProperty.txt can be found at
293*4bdff4beSrobert    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
294*4bdff4beSrobert
295*4bdff4beSrobert    emoji-data.txt can be found at
296*4bdff4beSrobert    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
297*4bdff4beSrobert
298*4bdff4beSrobert    Both files are expected to be in the same directory as this script.
299*4bdff4beSrobert    """
300*4bdff4beSrobert    gbp_data_path = (
301*4bdff4beSrobert        Path(__file__).absolute().parent
302*4bdff4beSrobert        / "data"
303*4bdff4beSrobert        / "unicode"
304*4bdff4beSrobert        / "GraphemeBreakProperty.txt"
305*4bdff4beSrobert    )
306*4bdff4beSrobert    emoji_data_path = (
307*4bdff4beSrobert        Path(__file__).absolute().parent / "data" / "unicode" / "emoji-data.txt"
308*4bdff4beSrobert    )
309*4bdff4beSrobert    gbp_ranges = list()
310*4bdff4beSrobert    emoji_ranges = list()
311*4bdff4beSrobert    with gbp_data_path.open(encoding="utf-8") as f:
312*4bdff4beSrobert        gbp_ranges = compactPropertyRanges(
313*4bdff4beSrobert            [x for line in f if (x := parsePropertyLine(line))]
314*4bdff4beSrobert        )
315*4bdff4beSrobert    with emoji_data_path.open(encoding="utf-8") as f:
316*4bdff4beSrobert        emoji_ranges = compactPropertyRanges(
317*4bdff4beSrobert            [x for line in f if (x := parsePropertyLine(line))]
318*4bdff4beSrobert        )
319*4bdff4beSrobert
320*4bdff4beSrobert    [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"]
321*4bdff4beSrobert    gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges)
322*4bdff4beSrobert    return "\n".join([gpb_cpp_data])
323*4bdff4beSrobert
324*4bdff4beSrobert
325*4bdff4beSrobertif __name__ == "__main__":
326*4bdff4beSrobert    if len(sys.argv) == 2:
327*4bdff4beSrobert        sys.stdout = open(sys.argv[1], "w")
328*4bdff4beSrobert    print(
329*4bdff4beSrobert        MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
330*4bdff4beSrobert            content=generate_data_tables()
331*4bdff4beSrobert        )
332*4bdff4beSrobert    )
333