xref: /llvm-project/libcxx/utils/generate_extended_grapheme_cluster_table.py (revision e99c4906e44ae3f921fa05356909d006cda8d954)
1857a78c0SMark de Wever#!/usr/bin/env python
2857a78c0SMark de Wever# ===----------------------------------------------------------------------===##
3857a78c0SMark de Wever#
4857a78c0SMark de Wever# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5857a78c0SMark de Wever# See https://llvm.org/LICENSE.txt for license information.
6857a78c0SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7857a78c0SMark de Wever#
8857a78c0SMark de Wever# ===----------------------------------------------------------------------===##
9857a78c0SMark de Wever
10857a78c0SMark de Wever# The code is based on
11857a78c0SMark de Wever# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
12857a78c0SMark de Wever#
13857a78c0SMark de Wever# Copyright (c) Microsoft Corporation.
14857a78c0SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15857a78c0SMark de Wever
16857a78c0SMark de Weverfrom io import StringIO
17857a78c0SMark de Weverfrom pathlib import Path
180d3c40b8SStephan T. Lavavejfrom dataclasses import dataclass
19857a78c0SMark de Weverfrom typing import Optional
20857a78c0SMark de Weverimport re
21130b1816SMark de Weverimport sys
22857a78c0SMark de Wever
23857a78c0SMark de Wever
24857a78c0SMark de Wever@dataclass
25857a78c0SMark de Weverclass PropertyRange:
26857a78c0SMark de Wever    lower: int = -1
27857a78c0SMark de Wever    upper: int = -1
28857a78c0SMark de Wever    prop: str = None
29857a78c0SMark de Wever
30857a78c0SMark de Wever
31857a78c0SMark de Wever@dataclass
32857a78c0SMark de Weverclass Entry:
33857a78c0SMark de Wever    lower: int = -1
34857a78c0SMark de Wever    offset: int = -1
35857a78c0SMark de Wever    prop: int = -1
36857a78c0SMark de Wever
37857a78c0SMark de Wever
38857a78c0SMark de WeverLINE_REGEX = re.compile(
39857a78c0SMark de Wever    r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)"
40857a78c0SMark de Wever)
41857a78c0SMark de Wever
42857a78c0SMark de Wever
43857a78c0SMark de Weverdef parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
44857a78c0SMark de Wever    result = PropertyRange()
45857a78c0SMark de Wever    if m := LINE_REGEX.match(inputLine):
46857a78c0SMark de Wever        lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
47857a78c0SMark de Wever        result.lower = int(lower_str, base=16)
48857a78c0SMark de Wever        result.upper = result.lower
49857a78c0SMark de Wever        if upper_str is not None:
50857a78c0SMark de Wever            result.upper = int(upper_str, base=16)
51857a78c0SMark de Wever        return result
52857a78c0SMark de Wever
53857a78c0SMark de Wever    else:
54857a78c0SMark de Wever        return None
55857a78c0SMark de Wever
56857a78c0SMark de Wever
57857a78c0SMark de Weverdef compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
58857a78c0SMark de Wever    """
59857a78c0SMark de Wever    Merges consecutive ranges with the same property to one range.
60857a78c0SMark de Wever
61857a78c0SMark de Wever    Merging the ranges results in fewer ranges in the output table,
62857a78c0SMark de Wever    reducing binary and improving lookup performance.
63857a78c0SMark de Wever    """
64857a78c0SMark de Wever    result = list()
65857a78c0SMark de Wever    for x in input:
66857a78c0SMark de Wever        if (
67857a78c0SMark de Wever            len(result)
68857a78c0SMark de Wever            and result[-1].prop == x.prop
69857a78c0SMark de Wever            and result[-1].upper + 1 == x.lower
70857a78c0SMark de Wever        ):
71857a78c0SMark de Wever            result[-1].upper = x.upper
72857a78c0SMark de Wever            continue
73857a78c0SMark de Wever        result.append(x)
74857a78c0SMark de Wever    return result
75857a78c0SMark de Wever
76857a78c0SMark de Wever
77857a78c0SMark de WeverPROP_VALUE_ENUMERATOR_TEMPLATE = "  __{}"
78857a78c0SMark de WeverPROP_VALUE_ENUM_TEMPLATE = """
79857a78c0SMark de Weverenum class __property : uint8_t {{
80857a78c0SMark de Wever  // Values generated from the data files.
81857a78c0SMark de Wever{enumerators},
82857a78c0SMark de Wever
83857a78c0SMark de Wever  // The properies below aren't stored in the "database".
84857a78c0SMark de Wever
85857a78c0SMark de Wever  // Text position properties.
86857a78c0SMark de Wever  __sot,
87857a78c0SMark de Wever  __eot,
88857a78c0SMark de Wever
89857a78c0SMark de Wever  // The code unit has none of above properties.
90857a78c0SMark de Wever  __none
91857a78c0SMark de Wever}};
92857a78c0SMark de Wever"""
93857a78c0SMark de Wever
94857a78c0SMark de WeverDATA_ARRAY_TEMPLATE = """
95857a78c0SMark de Wever/// The entries of the extended grapheme cluster bondary property table.
96857a78c0SMark de Wever///
97857a78c0SMark de Wever/// The data is generated from
98857a78c0SMark de Wever/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
99857a78c0SMark de Wever/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
100857a78c0SMark de Wever///
101857a78c0SMark de Wever/// The data has 3 values
102898b5c9fSPiotr Fusik/// - bits [0, 3] The property. One of the values generated from the datafiles
103857a78c0SMark de Wever///   of \\ref __property
104857a78c0SMark de Wever/// - bits [4, 10] The size of the range.
105857a78c0SMark de Wever/// - bits [11, 31] The lower bound code point of the range. The upper bound of
106857a78c0SMark de Wever///   the range is lower bound + size.
107857a78c0SMark de Wever///
108857a78c0SMark de Wever/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
109857a78c0SMark de Wever/// in the Unicode tables are larger. They are stored in multiple consecutive
110857a78c0SMark de Wever/// ranges in the data table. An alternative would be to store the sizes in a
111857a78c0SMark de Wever/// separate 16-bit value. The original MSVC STL code had such an approach, but
112857a78c0SMark de Wever/// this approach uses less space for the data and is about 4% faster in the
113857a78c0SMark de Wever/// following benchmark.
114857a78c0SMark de Wever/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
115b18a46e3SLouis Dionne// clang-format off
116d179176fSMark de Wever_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
117da38bcfdSMark de Wever{entries}}};
118b18a46e3SLouis Dionne// clang-format on
119857a78c0SMark de Wever
120857a78c0SMark de Wever/// Returns the extended grapheme cluster bondary property of a code point.
121857a78c0SMark de Wever[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
122857a78c0SMark de Wever  // The algorithm searches for the upper bound of the range and, when found,
123857a78c0SMark de Wever  // steps back one entry. This algorithm is used since the code point can be
124857a78c0SMark de Wever  // anywhere in the range. After a lower bound is found the next step is to
125857a78c0SMark de Wever  // compare whether the code unit is indeed in the range.
126857a78c0SMark de Wever  //
127857a78c0SMark de Wever  // Since the entry contains a code unit, size, and property the code point
128857a78c0SMark de Wever  // being sought needs to be adjusted. Just shifting the code point to the
129857a78c0SMark de Wever  // proper position doesn't work; suppose an entry has property 0, size 1,
130857a78c0SMark de Wever  // and lower bound 3. This results in the entry 0x1810.
131857a78c0SMark de Wever  // When searching for code point 3 it will search for 0x1800, find 0x1810
132857a78c0SMark de Wever  // and moves to the previous entry. Thus the lower bound value will never
133857a78c0SMark de Wever  // be found.
134857a78c0SMark de Wever  // The simple solution is to set the bits belonging to the property and
135857a78c0SMark de Wever  // size. Then the upper bound for code point 3 will return the entry after
136857a78c0SMark de Wever  // 0x1810. After moving to the previous entry the algorithm arrives at the
137857a78c0SMark de Wever  // correct entry.
1387c932cdbSMark de Wever  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
139857a78c0SMark de Wever  if (__i == 0)
140857a78c0SMark de Wever    return __property::__none;
141857a78c0SMark de Wever
142857a78c0SMark de Wever  --__i;
143857a78c0SMark de Wever  uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
144857a78c0SMark de Wever  if (__code_point <= __upper_bound)
145857a78c0SMark de Wever    return static_cast<__property>(__entries[__i] & 0xf);
146857a78c0SMark de Wever
147857a78c0SMark de Wever  return __property::__none;
148857a78c0SMark de Wever}}
149857a78c0SMark de Wever"""
150857a78c0SMark de Wever
151857a78c0SMark de WeverMSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
152857a78c0SMark de Wever// -*- C++ -*-
153857a78c0SMark de Wever//===----------------------------------------------------------------------===//
154857a78c0SMark de Wever//
155857a78c0SMark de Wever// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
156857a78c0SMark de Wever// See https://llvm.org/LICENSE.txt for license information.
157857a78c0SMark de Wever// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
158857a78c0SMark de Wever//
159857a78c0SMark de Wever//===----------------------------------------------------------------------===//
160857a78c0SMark de Wever
161857a78c0SMark de Wever// WARNING, this entire header is generated by
162750ee8d5SMark de Wever// utils/generate_extended_grapheme_cluster_table.py
163857a78c0SMark de Wever// DO NOT MODIFY!
164857a78c0SMark de Wever
165857a78c0SMark de Wever// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
166857a78c0SMark de Wever//
167857a78c0SMark de Wever// See Terms of Use <https://www.unicode.org/copyright.html>
168857a78c0SMark de Wever// for definitions of Unicode Inc.'s Data Files and Software.
169857a78c0SMark de Wever//
170857a78c0SMark de Wever// NOTICE TO USER: Carefully read the following legal agreement.
171857a78c0SMark de Wever// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
172857a78c0SMark de Wever// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
173857a78c0SMark de Wever// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
174857a78c0SMark de Wever// TERMS AND CONDITIONS OF THIS AGREEMENT.
175857a78c0SMark de Wever// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
176857a78c0SMark de Wever// THE DATA FILES OR SOFTWARE.
177857a78c0SMark de Wever//
178857a78c0SMark de Wever// COPYRIGHT AND PERMISSION NOTICE
179857a78c0SMark de Wever//
180857a78c0SMark de Wever// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
181857a78c0SMark de Wever// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
182857a78c0SMark de Wever//
183857a78c0SMark de Wever// Permission is hereby granted, free of charge, to any person obtaining
184857a78c0SMark de Wever// a copy of the Unicode data files and any associated documentation
185857a78c0SMark de Wever// (the "Data Files") or Unicode software and any associated documentation
186857a78c0SMark de Wever// (the "Software") to deal in the Data Files or Software
187857a78c0SMark de Wever// without restriction, including without limitation the rights to use,
188857a78c0SMark de Wever// copy, modify, merge, publish, distribute, and/or sell copies of
189857a78c0SMark de Wever// the Data Files or Software, and to permit persons to whom the Data Files
190857a78c0SMark de Wever// or Software are furnished to do so, provided that either
191857a78c0SMark de Wever// (a) this copyright and permission notice appear with all copies
192857a78c0SMark de Wever// of the Data Files or Software, or
193857a78c0SMark de Wever// (b) this copyright and permission notice appear in associated
194857a78c0SMark de Wever// Documentation.
195857a78c0SMark de Wever//
196857a78c0SMark de Wever// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
197857a78c0SMark de Wever// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
198857a78c0SMark de Wever// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
199857a78c0SMark de Wever// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
200857a78c0SMark de Wever// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
201857a78c0SMark de Wever// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
202857a78c0SMark de Wever// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
203857a78c0SMark de Wever// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
204857a78c0SMark de Wever// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
205857a78c0SMark de Wever// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
206857a78c0SMark de Wever//
207857a78c0SMark de Wever// Except as contained in this notice, the name of a copyright holder
208857a78c0SMark de Wever// shall not be used in advertising or otherwise to promote the sale,
209857a78c0SMark de Wever// use or other dealings in these Data Files or Software without prior
210857a78c0SMark de Wever// written authorization of the copyright holder.
211857a78c0SMark de Wever
212857a78c0SMark de Wever#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
213857a78c0SMark de Wever#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
214857a78c0SMark de Wever
2157c932cdbSMark de Wever#include <__algorithm/ranges_upper_bound.h>
216857a78c0SMark de Wever#include <__config>
217*e99c4906SNikolas Klauser#include <__cstddef/ptrdiff_t.h>
218857a78c0SMark de Wever#include <__iterator/access.h>
219857a78c0SMark de Wever#include <cstdint>
220857a78c0SMark de Wever
221857a78c0SMark de Wever#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
222857a78c0SMark de Wever#  pragma GCC system_header
223857a78c0SMark de Wever#endif
224857a78c0SMark de Wever
225857a78c0SMark de Wever_LIBCPP_BEGIN_NAMESPACE_STD
226857a78c0SMark de Wever
2274f15267dSNikolas Klauser#if _LIBCPP_STD_VER >= 20
228857a78c0SMark de Wever
229857a78c0SMark de Wevernamespace __extended_grapheme_custer_property_boundary {{
230857a78c0SMark de Wever{content}
231da38bcfdSMark de Wever}} // namespace __extended_grapheme_custer_property_boundary
232857a78c0SMark de Wever
2334f15267dSNikolas Klauser#endif // _LIBCPP_STD_VER >= 20
234857a78c0SMark de Wever
235857a78c0SMark de Wever_LIBCPP_END_NAMESPACE_STD
236857a78c0SMark de Wever
237da38bcfdSMark de Wever#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
238857a78c0SMark de Wever
239857a78c0SMark de Wever
240857a78c0SMark de Weverdef property_ranges_to_table(
241857a78c0SMark de Wever    ranges: list[PropertyRange], props: list[str]
242857a78c0SMark de Wever) -> list[Entry]:
243857a78c0SMark de Wever    assert len(props) < 16
244857a78c0SMark de Wever    result = list[Entry]()
245857a78c0SMark de Wever    high = -1
246857a78c0SMark de Wever    for range in sorted(ranges, key=lambda x: x.lower):
247857a78c0SMark de Wever        # Validate overlapping ranges
248857a78c0SMark de Wever        assert range.lower > high
249857a78c0SMark de Wever        high = range.upper
250857a78c0SMark de Wever
251857a78c0SMark de Wever        while True:
252857a78c0SMark de Wever            e = Entry(range.lower, range.upper - range.lower, props.index(range.prop))
253857a78c0SMark de Wever            if e.offset <= 127:
254857a78c0SMark de Wever                result.append(e)
255857a78c0SMark de Wever                break
256857a78c0SMark de Wever            e.offset = 127
257857a78c0SMark de Wever            result.append(e)
258857a78c0SMark de Wever            range.lower += 128
259857a78c0SMark de Wever    return result
260857a78c0SMark de Wever
261857a78c0SMark de Wever
262857a78c0SMark de Wevercpp_entrytemplate = "    0x{:08x}"
263857a78c0SMark de Wever
264857a78c0SMark de Wever
265857a78c0SMark de Weverdef generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
266857a78c0SMark de Wever    result = StringIO()
267857a78c0SMark de Wever    prop_values = sorted(set(x.prop for x in ranges))
268857a78c0SMark de Wever    table = property_ranges_to_table(ranges, prop_values)
269857a78c0SMark de Wever    enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values]
270857a78c0SMark de Wever    result.write(
271da38bcfdSMark de Wever        PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values))
272857a78c0SMark de Wever    )
273857a78c0SMark de Wever    result.write(
274857a78c0SMark de Wever        DATA_ARRAY_TEMPLATE.format(
275857a78c0SMark de Wever            prop_name=prop_name,
276857a78c0SMark de Wever            size=len(table),
277da38bcfdSMark de Wever            entries=",\n".join(
278857a78c0SMark de Wever                [
279857a78c0SMark de Wever                    cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop)
280857a78c0SMark de Wever                    for x in table
281857a78c0SMark de Wever                ]
282857a78c0SMark de Wever            ),
283857a78c0SMark de Wever        )
284857a78c0SMark de Wever    )
285857a78c0SMark de Wever
286857a78c0SMark de Wever    return result.getvalue()
287857a78c0SMark de Wever
288857a78c0SMark de Wever
289857a78c0SMark de Weverdef generate_data_tables() -> str:
290857a78c0SMark de Wever    """
291857a78c0SMark de Wever    Generate Unicode data for inclusion into <format> from
29259e66c51SMark de Wever    - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
29359e66c51SMark de Wever    - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
29459e66c51SMark de Wever    - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
295857a78c0SMark de Wever
29659e66c51SMark de Wever    These files are expected to be stored in the same directory as this script.
297857a78c0SMark de Wever    """
29859e66c51SMark de Wever    root = Path(__file__).absolute().parent / "data" / "unicode"
29959e66c51SMark de Wever    gbp_data_path = root / "GraphemeBreakProperty.txt"
30059e66c51SMark de Wever    emoji_data_path = root / "emoji-data.txt"
30159e66c51SMark de Wever
302857a78c0SMark de Wever    gbp_ranges = list()
303857a78c0SMark de Wever    emoji_ranges = list()
304857a78c0SMark de Wever    with gbp_data_path.open(encoding="utf-8") as f:
305857a78c0SMark de Wever        gbp_ranges = compactPropertyRanges(
306857a78c0SMark de Wever            [x for line in f if (x := parsePropertyLine(line))]
307857a78c0SMark de Wever        )
308857a78c0SMark de Wever    with emoji_data_path.open(encoding="utf-8") as f:
309857a78c0SMark de Wever        emoji_ranges = compactPropertyRanges(
310857a78c0SMark de Wever            [x for line in f if (x := parsePropertyLine(line))]
311857a78c0SMark de Wever        )
312857a78c0SMark de Wever
313857a78c0SMark de Wever    [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"]
314857a78c0SMark de Wever    gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges)
315857a78c0SMark de Wever    return "\n".join([gpb_cpp_data])
316857a78c0SMark de Wever
317857a78c0SMark de Wever
318857a78c0SMark de Weverif __name__ == "__main__":
319130b1816SMark de Wever    if len(sys.argv) == 2:
320130b1816SMark de Wever        sys.stdout = open(sys.argv[1], "w")
321857a78c0SMark de Wever    print(
322857a78c0SMark de Wever        MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
323857a78c0SMark de Wever            content=generate_data_tables()
324857a78c0SMark de Wever        )
325857a78c0SMark de Wever    )
326