xref: /llvm-project/libcxx/utils/generate_width_estimation_table.py (revision e99c4906e44ae3f921fa05356909d006cda8d954)
1#!/usr/bin/env python
2# ===----------------------------------------------------------------------===##
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===----------------------------------------------------------------------===##
9
10# The code is based on
11# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
12#
13# Copyright (c) Microsoft Corporation.
14# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15
16from io import StringIO
17from pathlib import Path
18from dataclasses import dataclass
19from typing import Optional
20import re
21import sys
22
23
24@dataclass
25class PropertyRange:
26    lower: int = -1
27    upper: int = -1
28    prop: str = None
29
30
31@dataclass
32class Entry:
33    lower: int = -1
34    offset: int = -1
35
36
37LINE_REGEX = re.compile(
38    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
39)
40
41
42def filterProperty(element: PropertyRange) -> Optional[PropertyRange]:
43    ### Matches property predicate?
44    if element.prop in ["W", "F"]:
45        return element
46
47    ### Matches hardcode ranges predicate?
48
49    # Yijing Hexagram Symbols
50    if element.lower >= 0x4DC0 and element.upper <= 0x4DFF:
51        return element
52
53    # Miscellaneous Symbols and Pictographs
54    if element.lower >= 0x1F300 and element.upper <= 0x1F5FF:
55        return element
56
57    # Supplemental Symbols and Pictographs
58    if element.lower >= 0x1F900 and element.upper <= 0x1F9FF:
59        return element
60
61    return None
62
63
64def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
65    result = PropertyRange()
66    if m := LINE_REGEX.match(inputLine):
67        lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
68        result.lower = int(lower_str, base=16)
69        result.upper = result.lower
70        if upper_str is not None:
71            result.upper = int(upper_str, base=16)
72        return result
73
74    else:
75        return None
76
77
78def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
79    """
80    Merges overlapping and consecutive ranges to one range.
81
82    Since the input properties are filtered the exact property isn't
83    interesting anymore. The properties in the output are merged to aid
84    debugging.
85    Merging the ranges results in fewer ranges in the output table,
86    reducing binary and improving lookup performance.
87    """
88    result = list()
89    for x in input:
90        if (
91            len(result)
92            and x.lower > result[-1].lower
93            and x.lower <= result[-1].upper + 1
94        ):
95            result[-1].upper = max(result[-1].upper, x.upper)
96            result[-1].prop += f" {x.prop}"
97            continue
98        result.append(x)
99    return result
100
101
102DATA_ARRAY_TEMPLATE = r"""
103/// The entries of the characters with an estimated width of 2.
104///
105/// Contains the entries for [format.string.std]/12
106///  -  Any code point with the East_Asian_Width="W" or East_Asian_Width="F"
107///     Derived Extracted Property as described by UAX #44
108/// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols)
109/// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs)
110/// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs)
111///
112/// The data is generated from
113/// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
114/// - The "overrides" in [format.string.std]/12
115///
116/// The format of EastAsianWidth.txt is two fields separated by a semicolon.
117/// Field 0: Unicode code point value or range of code point values
118/// Field 1: East_Asian_Width property, consisting of one of the following values:
119///         "A", "F", "H", "N", "Na", "W"
120///  - All code points, assigned or unassigned, that are not listed
121///      explicitly are given the value "N".
122///  - The unassigned code points in the following blocks default to "W":
123///         CJK Unified Ideographs Extension A: U+3400..U+4DBF
124///         CJK Unified Ideographs:             U+4E00..U+9FFF
125///         CJK Compatibility Ideographs:       U+F900..U+FAFF
126///  - All undesignated code points in Planes 2 and 3, whether inside or
127///      outside of allocated blocks, default to "W":
128///         Plane 2:                            U+20000..U+2FFFD
129///         Plane 3:                            U+30000..U+3FFFD
130///
131/// The table is similar to the table
132///  __extended_grapheme_custer_property_boundary::__entries
133/// which explains the details of these classes. The only difference is this
134/// table lacks a property, thus having more bits available for the size.
135///
136/// The maximum code point that has an estimated width of 2 is U+3FFFD. This
137/// value can be encoded in 18 bits. Thus the upper 3 bits of the code point
138/// are always 0. These 3 bits are used to enlarge the offset range. This
139/// optimization reduces the table in Unicode 15 from 184 to 104 entries,
140/// saving 320 bytes.
141///
142/// The data has 2 values:
143/// - bits [0, 13] The size of the range, allowing 16384 elements.
144/// - bits [14, 31] The lower bound code point of the range. The upper bound of
145///   the range is lower bound + size.
146_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
147{entries}}};
148
149/// The upper bound entry of EastAsianWidth.txt.
150///
151/// Values greater than this value may have more than 18 significant bits.
152/// They always have a width of 1. This property makes it possible to store
153/// the table in its compact form.
154inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x};
155
156/// Returns the estimated width of a Unicode code point.
157///
158/// \\pre The code point is a valid Unicode code point.
159[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{
160  // Since __table_upper_bound contains the unshifted range do the
161  // comparison without shifting.
162  if (__code_point > __table_upper_bound) [[unlikely]]
163    return 1;
164
165  // When the code-point is less than the first element in the table
166  // the lookup is quite expensive. Since quite some scripts are in
167  // that range, it makes sense to validate that first.
168  // The std_format_spec_string_unicode benchmark gives a measurable
169  // improvement.
170  if (__code_point < (__entries[0] >> 14))
171    return 1;
172
173  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
174  if (__i == 0)
175    return 1;
176
177  --__i;
178  uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
179  return 1 + (__code_point <= __upper_bound);
180}}
181"""
182
183TABLES_HPP_TEMPLATE = """
184// -*- C++ -*-
185//===----------------------------------------------------------------------===//
186//
187// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
188// See https://llvm.org/LICENSE.txt for license information.
189// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
190//
191//===----------------------------------------------------------------------===//
192
193// WARNING, this entire header is generated by
194// utils/generate_width_estimation_table.py
195// DO NOT MODIFY!
196
197// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
198//
199// See Terms of Use <https://www.unicode.org/copyright.html>
200// for definitions of Unicode Inc.'s Data Files and Software.
201//
202// NOTICE TO USER: Carefully read the following legal agreement.
203// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
204// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
205// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
206// TERMS AND CONDITIONS OF THIS AGREEMENT.
207// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
208// THE DATA FILES OR SOFTWARE.
209//
210// COPYRIGHT AND PERMISSION NOTICE
211//
212// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
213// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
214//
215// Permission is hereby granted, free of charge, to any person obtaining
216// a copy of the Unicode data files and any associated documentation
217// (the "Data Files") or Unicode software and any associated documentation
218// (the "Software") to deal in the Data Files or Software
219// without restriction, including without limitation the rights to use,
220// copy, modify, merge, publish, distribute, and/or sell copies of
221// the Data Files or Software, and to permit persons to whom the Data Files
222// or Software are furnished to do so, provided that either
223// (a) this copyright and permission notice appear with all copies
224// of the Data Files or Software, or
225// (b) this copyright and permission notice appear in associated
226// Documentation.
227//
228// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
229// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
230// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
231// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
232// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
233// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
234// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
235// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
236// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
237// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
238//
239// Except as contained in this notice, the name of a copyright holder
240// shall not be used in advertising or otherwise to promote the sale,
241// use or other dealings in these Data Files or Software without prior
242// written authorization of the copyright holder.
243
244#ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
245#define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
246
247#include <__algorithm/ranges_upper_bound.h>
248#include <__config>
249#include <__cstddef/ptrdiff_t.h>
250#include <cstdint>
251
252#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
253#  pragma GCC system_header
254#endif
255
256_LIBCPP_BEGIN_NAMESPACE_STD
257
258#if _LIBCPP_STD_VER >= 20
259
260namespace __width_estimation_table {{
261{content}
262}} // namespace __width_estimation_table
263
264#endif // _LIBCPP_STD_VER >= 20
265
266_LIBCPP_END_NAMESPACE_STD
267
268#endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H"""
269
270
271def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
272    # The maximum value that can be encoded in the available bits in the
273    # __entries table.
274    upper_bound = 0x3FFFF
275    # The maximum offset in an __entries entry. Larger offsets will be
276    # splitted and stored in multiple entries.
277    chunk = 16384
278    result = list[Entry]()
279    high = -1
280    for range in sorted(ranges, key=lambda x: x.lower):
281        # Validate overlapping ranges
282        assert range.lower > high
283        high = range.upper
284        assert high <= upper_bound
285
286        while True:
287            e = Entry(range.lower, range.upper - range.lower)
288            if e.offset < chunk:
289                result.append(e)
290                break
291            e.offset = chunk - 1
292            result.append(e)
293            range.lower += chunk
294    return result
295
296
297cpp_entrytemplate = "    0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
298
299
300def generate_cpp_data(ranges: list[PropertyRange], upper_bound: int) -> str:
301    result = StringIO()
302    table = property_ranges_to_table(ranges)
303    result.write(
304        DATA_ARRAY_TEMPLATE.format(
305            size=len(table),
306            entries=", //\n".join(
307                [
308                    cpp_entrytemplate.format(
309                        x.lower << 14 | x.offset,
310                        x.lower,
311                        x.lower + x.offset,
312                        x.offset + 1,
313                    )
314                    for x in table
315                ]
316            ),
317            upper_bound=upper_bound,
318        )
319    )
320
321    return result.getvalue()
322
323
324def generate_data_tables() -> str:
325    """
326    Generate Unicode data for [format.string.std]/12
327    """
328    east_asian_width_path = (
329        Path(__file__).absolute().parent / "data" / "unicode" / "EastAsianWidth.txt"
330    )
331
332    properties = list()
333    with east_asian_width_path.open(encoding="utf-8") as f:
334        properties.extend(
335            list(
336                filter(
337                    filterProperty,
338                    [x for line in f if (x := parsePropertyLine(line))],
339                )
340            )
341        )
342    # The range U+4DC0 - U+4DFF is neutral and should not be in the table
343    # The range U+1F300 - U+1F5FF is partly in the range, for example
344    #   1F300..1F320;W   # So    [33] CYCLONE..SHOOTING STAR
345    #   1F321..1F32C;N   # So    [12] THERMOMETER..WIND BLOWING FACE
346    #   1F32D..1F335;W   # So     [9] HOT DOG..CACTUS
347    # The first and last ranges are present, but the second isn't
348
349    # Validate the hardcode ranges are present
350
351    # Yijing Hexagram Symbols
352    for i in range(0x4DC0, 0x4DFF + 1):
353        assert [x for x in properties if i >= x.lower and i <= x.upper]
354
355    # Miscellaneous Symbols and Pictographs
356    for i in range(0x1F300, 0x1F5FF + 1):
357        assert [x for x in properties if i >= x.lower and i <= x.upper]
358
359    # Miscellaneous Symbols and Pictographs
360    for i in range(0x1F900, 0x1F9FF + 1):
361        assert [x for x in properties if i >= x.lower and i <= x.upper]
362
363    data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))
364
365    return "\n".join([generate_cpp_data(data, data[-1].upper)])
366
367
368if __name__ == "__main__":
369    if len(sys.argv) == 2:
370        sys.stdout = open(sys.argv[1], "w")
371    print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables()))
372