xref: /llvm-project/libcxx/utils/generate_extended_grapheme_cluster_table.py (revision e99c4906e44ae3f921fa05356909d006cda8d954)
1#!/usr/bin/env python
2# ===----------------------------------------------------------------------===##
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===----------------------------------------------------------------------===##
9
10# The code is based on
11# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
12#
13# Copyright (c) Microsoft Corporation.
14# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15
16from io import StringIO
17from pathlib import Path
18from dataclasses import dataclass
19from typing import Optional
20import re
21import sys
22
23
24@dataclass
25class PropertyRange:
26    lower: int = -1
27    upper: int = -1
28    prop: str = None
29
30
31@dataclass
32class Entry:
33    lower: int = -1
34    offset: int = -1
35    prop: int = -1
36
37
38LINE_REGEX = re.compile(
39    r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)"
40)
41
42
43def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
44    result = PropertyRange()
45    if m := LINE_REGEX.match(inputLine):
46        lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
47        result.lower = int(lower_str, base=16)
48        result.upper = result.lower
49        if upper_str is not None:
50            result.upper = int(upper_str, base=16)
51        return result
52
53    else:
54        return None
55
56
57def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
58    """
59    Merges consecutive ranges with the same property to one range.
60
61    Merging the ranges results in fewer ranges in the output table,
62    reducing binary and improving lookup performance.
63    """
64    result = list()
65    for x in input:
66        if (
67            len(result)
68            and result[-1].prop == x.prop
69            and result[-1].upper + 1 == x.lower
70        ):
71            result[-1].upper = x.upper
72            continue
73        result.append(x)
74    return result
75
76
77PROP_VALUE_ENUMERATOR_TEMPLATE = "  __{}"
78PROP_VALUE_ENUM_TEMPLATE = """
79enum class __property : uint8_t {{
80  // Values generated from the data files.
81{enumerators},
82
83  // The properies below aren't stored in the "database".
84
85  // Text position properties.
86  __sot,
87  __eot,
88
89  // The code unit has none of above properties.
90  __none
91}};
92"""
93
94DATA_ARRAY_TEMPLATE = """
95/// The entries of the extended grapheme cluster bondary property table.
96///
97/// The data is generated from
98/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
99/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
100///
101/// The data has 3 values
102/// - bits [0, 3] The property. One of the values generated from the datafiles
103///   of \\ref __property
104/// - bits [4, 10] The size of the range.
105/// - bits [11, 31] The lower bound code point of the range. The upper bound of
106///   the range is lower bound + size.
107///
108/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
109/// in the Unicode tables are larger. They are stored in multiple consecutive
110/// ranges in the data table. An alternative would be to store the sizes in a
111/// separate 16-bit value. The original MSVC STL code had such an approach, but
112/// this approach uses less space for the data and is about 4% faster in the
113/// following benchmark.
114/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
115// clang-format off
116_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
117{entries}}};
118// clang-format on
119
120/// Returns the extended grapheme cluster bondary property of a code point.
121[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
122  // The algorithm searches for the upper bound of the range and, when found,
123  // steps back one entry. This algorithm is used since the code point can be
124  // anywhere in the range. After a lower bound is found the next step is to
125  // compare whether the code unit is indeed in the range.
126  //
127  // Since the entry contains a code unit, size, and property the code point
128  // being sought needs to be adjusted. Just shifting the code point to the
129  // proper position doesn't work; suppose an entry has property 0, size 1,
130  // and lower bound 3. This results in the entry 0x1810.
131  // When searching for code point 3 it will search for 0x1800, find 0x1810
132  // and moves to the previous entry. Thus the lower bound value will never
133  // be found.
134  // The simple solution is to set the bits belonging to the property and
135  // size. Then the upper bound for code point 3 will return the entry after
136  // 0x1810. After moving to the previous entry the algorithm arrives at the
137  // correct entry.
138  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
139  if (__i == 0)
140    return __property::__none;
141
142  --__i;
143  uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
144  if (__code_point <= __upper_bound)
145    return static_cast<__property>(__entries[__i] & 0xf);
146
147  return __property::__none;
148}}
149"""
150
151MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
152// -*- C++ -*-
153//===----------------------------------------------------------------------===//
154//
155// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
156// See https://llvm.org/LICENSE.txt for license information.
157// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
158//
159//===----------------------------------------------------------------------===//
160
161// WARNING, this entire header is generated by
162// utils/generate_extended_grapheme_cluster_table.py
163// DO NOT MODIFY!
164
165// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
166//
167// See Terms of Use <https://www.unicode.org/copyright.html>
168// for definitions of Unicode Inc.'s Data Files and Software.
169//
170// NOTICE TO USER: Carefully read the following legal agreement.
171// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
172// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
173// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
174// TERMS AND CONDITIONS OF THIS AGREEMENT.
175// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
176// THE DATA FILES OR SOFTWARE.
177//
178// COPYRIGHT AND PERMISSION NOTICE
179//
180// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
181// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
182//
183// Permission is hereby granted, free of charge, to any person obtaining
184// a copy of the Unicode data files and any associated documentation
185// (the "Data Files") or Unicode software and any associated documentation
186// (the "Software") to deal in the Data Files or Software
187// without restriction, including without limitation the rights to use,
188// copy, modify, merge, publish, distribute, and/or sell copies of
189// the Data Files or Software, and to permit persons to whom the Data Files
190// or Software are furnished to do so, provided that either
191// (a) this copyright and permission notice appear with all copies
192// of the Data Files or Software, or
193// (b) this copyright and permission notice appear in associated
194// Documentation.
195//
196// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
197// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
198// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
199// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
200// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
201// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
202// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
203// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
204// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
205// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
206//
207// Except as contained in this notice, the name of a copyright holder
208// shall not be used in advertising or otherwise to promote the sale,
209// use or other dealings in these Data Files or Software without prior
210// written authorization of the copyright holder.
211
212#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
213#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
214
215#include <__algorithm/ranges_upper_bound.h>
216#include <__config>
217#include <__cstddef/ptrdiff_t.h>
218#include <__iterator/access.h>
219#include <cstdint>
220
221#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
222#  pragma GCC system_header
223#endif
224
225_LIBCPP_BEGIN_NAMESPACE_STD
226
227#if _LIBCPP_STD_VER >= 20
228
229namespace __extended_grapheme_custer_property_boundary {{
230{content}
231}} // namespace __extended_grapheme_custer_property_boundary
232
233#endif // _LIBCPP_STD_VER >= 20
234
235_LIBCPP_END_NAMESPACE_STD
236
237#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
238
239
240def property_ranges_to_table(
241    ranges: list[PropertyRange], props: list[str]
242) -> list[Entry]:
243    assert len(props) < 16
244    result = list[Entry]()
245    high = -1
246    for range in sorted(ranges, key=lambda x: x.lower):
247        # Validate overlapping ranges
248        assert range.lower > high
249        high = range.upper
250
251        while True:
252            e = Entry(range.lower, range.upper - range.lower, props.index(range.prop))
253            if e.offset <= 127:
254                result.append(e)
255                break
256            e.offset = 127
257            result.append(e)
258            range.lower += 128
259    return result
260
261
262cpp_entrytemplate = "    0x{:08x}"
263
264
265def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
266    result = StringIO()
267    prop_values = sorted(set(x.prop for x in ranges))
268    table = property_ranges_to_table(ranges, prop_values)
269    enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values]
270    result.write(
271        PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values))
272    )
273    result.write(
274        DATA_ARRAY_TEMPLATE.format(
275            prop_name=prop_name,
276            size=len(table),
277            entries=",\n".join(
278                [
279                    cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop)
280                    for x in table
281                ]
282            ),
283        )
284    )
285
286    return result.getvalue()
287
288
289def generate_data_tables() -> str:
290    """
291    Generate Unicode data for inclusion into <format> from
292    - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
293    - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
294    - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
295
296    These files are expected to be stored in the same directory as this script.
297    """
298    root = Path(__file__).absolute().parent / "data" / "unicode"
299    gbp_data_path = root / "GraphemeBreakProperty.txt"
300    emoji_data_path = root / "emoji-data.txt"
301
302    gbp_ranges = list()
303    emoji_ranges = list()
304    with gbp_data_path.open(encoding="utf-8") as f:
305        gbp_ranges = compactPropertyRanges(
306            [x for line in f if (x := parsePropertyLine(line))]
307        )
308    with emoji_data_path.open(encoding="utf-8") as f:
309        emoji_ranges = compactPropertyRanges(
310            [x for line in f if (x := parsePropertyLine(line))]
311        )
312
313    [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"]
314    gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges)
315    return "\n".join([gpb_cpp_data])
316
317
318if __name__ == "__main__":
319    if len(sys.argv) == 2:
320        sys.stdout = open(sys.argv[1], "w")
321    print(
322        MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
323            content=generate_data_tables()
324        )
325    )
326