xref: /llvm-project/libcxx/utils/generate_width_estimation_table.py (revision e99c4906e44ae3f921fa05356909d006cda8d954)
168c3d66aSMark de Wever#!/usr/bin/env python
268c3d66aSMark de Wever# ===----------------------------------------------------------------------===##
368c3d66aSMark de Wever#
468c3d66aSMark de Wever# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
568c3d66aSMark de Wever# See https://llvm.org/LICENSE.txt for license information.
668c3d66aSMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
768c3d66aSMark de Wever#
868c3d66aSMark de Wever# ===----------------------------------------------------------------------===##
968c3d66aSMark de Wever
1068c3d66aSMark de Wever# The code is based on
1168c3d66aSMark de Wever# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
1268c3d66aSMark de Wever#
1368c3d66aSMark de Wever# Copyright (c) Microsoft Corporation.
1468c3d66aSMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
1568c3d66aSMark de Wever
1668c3d66aSMark de Weverfrom io import StringIO
1768c3d66aSMark de Weverfrom pathlib import Path
180d3c40b8SStephan T. Lavavejfrom dataclasses import dataclass
1968c3d66aSMark de Weverfrom typing import Optional
2068c3d66aSMark de Weverimport re
2168c3d66aSMark de Weverimport sys
2268c3d66aSMark de Wever
2368c3d66aSMark de Wever
@dataclass
class PropertyRange:
    """An inclusive range of code points sharing one property value."""

    # First code point of the range; -1 until a line has been parsed.
    lower: int = -1
    # Last code point of the range (inclusive); -1 until a line has been parsed.
    upper: int = -1
    # Property name(s) of the range. compactPropertyRanges joins the names of
    # merged ranges with spaces, so this may hold several names.
    prop: Optional[str] = None
2968c3d66aSMark de Wever
3068c3d66aSMark de Wever
@dataclass
class Entry:
    """One row of the generated __entries table."""

    # First code point covered by the entry.
    lower: int = -1
    # Distance from `lower` to the last covered code point (upper - lower).
    offset: int = -1
3568c3d66aSMark de Wever
3668c3d66aSMark de Wever
# Matches the start of a data line: a code point of 4 to 6 upper-case hex
# digits, optionally followed by ".." and an upper code point, then a
# semicolon (with optional surrounding whitespace) and the property name.
# Lines that do not start this way do not match.
LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
)
4068c3d66aSMark de Wever
4168c3d66aSMark de Wever
def filterProperty(element: PropertyRange) -> Optional[PropertyRange]:
    """
    Returns element when it belongs in the width 2 table, None otherwise.

    A range is kept when its property is "W" or "F", or when it lies
    completely inside one of the hardcoded blocks listed below.
    """
    if element.prop == "W" or element.prop == "F":
        return element

    # Inclusive (first, last) code points of the hardcoded blocks.
    hardcoded_blocks = (
        (0x4DC0, 0x4DFF),  # Yijing Hexagram Symbols
        (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
        (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    )
    inside_block = any(
        first <= element.lower and element.upper <= last
        for first, last in hardcoded_blocks
    )
    return element if inside_block else None
6268c3d66aSMark de Wever
6368c3d66aSMark de Wever
def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
    """
    Parses one line of the data file.

    Returns the PropertyRange described by the line, or None when the line
    does not match LINE_REGEX. A line with a single code point yields a
    range whose lower and upper bound are equal.
    """
    match = LINE_REGEX.match(inputLine)
    if match is None:
        return None

    first = int(match.group("lower"), base=16)
    upper_text = match.group("upper")
    last = first if upper_text is None else int(upper_text, base=16)
    return PropertyRange(first, last, match.group("prop"))
7668c3d66aSMark de Wever
7768c3d66aSMark de Wever
def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Merges overlapping and consecutive ranges to one range.

    The input must be sorted by its lower bound. A range is merged into the
    previous one when it starts at or after the previous lower bound and no
    later than one past the previous upper bound; this includes a range
    starting at the same code point as the previous one.

    Since the input properties are filtered the exact property isn't
    interesting anymore. The properties in the output are merged to aid
    debugging.
    Merging the ranges results in fewer ranges in the output table,
    reducing binary size and improving lookup performance.

    Note the merge is done in place: the returned list holds objects of the
    input list, and a merged range updates the upper bound and the property
    of the first range of its group.
    """
    result: list[PropertyRange] = []
    for x in input:
        if result and result[-1].lower <= x.lower <= result[-1].upper + 1:
            # max() keeps the upper bound when x is fully contained in the
            # previous range.
            result[-1].upper = max(result[-1].upper, x.upper)
            result[-1].prop += f" {x.prop}"
            continue
        result.append(x)
    return result
10068c3d66aSMark de Wever
10168c3d66aSMark de Wever
# C++ source of the table and its lookup function, used with str.format().
# The placeholders are {size}, {entries} and {upper_bound}; literal C++ braces
# are written as {{ and }}.
# This is a raw string, so a backslash is emitted as written: the Doxygen
# command below must be spelled with a single backslash.
DATA_ARRAY_TEMPLATE = r"""
/// The entries of the characters with an estimated width of 2.
///
/// Contains the entries for [format.string.std]/12
///  -  Any code point with the East_Asian_Width="W" or East_Asian_Width="F"
///     Derived Extracted Property as described by UAX #44
/// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols)
/// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs)
/// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs)
///
/// The data is generated from
/// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
/// - The "overrides" in [format.string.std]/12
///
/// The format of EastAsianWidth.txt is two fields separated by a semicolon.
/// Field 0: Unicode code point value or range of code point values
/// Field 1: East_Asian_Width property, consisting of one of the following values:
///         "A", "F", "H", "N", "Na", "W"
///  - All code points, assigned or unassigned, that are not listed
///      explicitly are given the value "N".
///  - The unassigned code points in the following blocks default to "W":
///         CJK Unified Ideographs Extension A: U+3400..U+4DBF
///         CJK Unified Ideographs:             U+4E00..U+9FFF
///         CJK Compatibility Ideographs:       U+F900..U+FAFF
///  - All undesignated code points in Planes 2 and 3, whether inside or
///      outside of allocated blocks, default to "W":
///         Plane 2:                            U+20000..U+2FFFD
///         Plane 3:                            U+30000..U+3FFFD
///
/// The table is similar to the table
///  __extended_grapheme_custer_property_boundary::__entries
/// which explains the details of these classes. The only difference is this
/// table lacks a property, thus having more bits available for the size.
///
/// The maximum code point that has an estimated width of 2 is U+3FFFD. This
/// value can be encoded in 18 bits. Thus the upper 3 bits of the code point
/// are always 0. These 3 bits are used to enlarge the offset range. This
/// optimization reduces the table in Unicode 15 from 184 to 104 entries,
/// saving 320 bytes.
///
/// The data has 2 values:
/// - bits [0, 13] The size of the range, allowing 16384 elements.
/// - bits [14, 31] The lower bound code point of the range. The upper bound of
///   the range is lower bound + size.
_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
{entries}}};

/// The upper bound entry of EastAsianWidth.txt.
///
/// Values greater than this value may have more than 18 significant bits.
/// They always have a width of 1. This property makes it possible to store
/// the table in its compact form.
inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x};

/// Returns the estimated width of a Unicode code point.
///
/// \pre The code point is a valid Unicode code point.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{
  // Since __table_upper_bound contains the unshifted range do the
  // comparison without shifting.
  if (__code_point > __table_upper_bound) [[unlikely]]
    return 1;

  // When the code-point is less than the first element in the table
  // the lookup is quite expensive. Since quite some scripts are in
  // that range, it makes sense to validate that first.
  // The std_format_spec_string_unicode benchmark gives a measurable
  // improvement.
  if (__code_point < (__entries[0] >> 14))
    return 1;

  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
  if (__i == 0)
    return 1;

  --__i;
  uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
  return 1 + (__code_point <= __upper_bound);
}}
"""
18268c3d66aSMark de Wever
# Skeleton of the generated C++ header, used with str.format(). The only
# placeholder is {content}; literal C++ braces are written as {{ and }}.
# The text starts with a newline, which the caller strips before formatting.
TABLES_HPP_TEMPLATE = """
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_width_estimation_table.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
#define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H

#include <__algorithm/ranges_upper_bound.h>
#include <__config>
#include <__cstddef/ptrdiff_t.h>
#include <cstdint>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#  pragma GCC system_header
#endif

_LIBCPP_BEGIN_NAMESPACE_STD

#if _LIBCPP_STD_VER >= 20

namespace __width_estimation_table {{
{content}
}} // namespace __width_estimation_table

#endif // _LIBCPP_STD_VER >= 20

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H"""
26968c3d66aSMark de Wever
27068c3d66aSMark de Wever
def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
    """
    Converts the ranges to the entries of the __entries table.

    The ranges are processed in order of their lower bound and must not
    overlap. A range that spans more code points than fit in one entry is
    split over several consecutive entries. The input ranges are not
    modified.
    """
    # The maximum value that can be encoded in the available bits in the
    # __entries table.
    upper_bound = 0x3FFFF
    # The maximum number of code points in an __entries entry. Larger ranges
    # will be split and stored in multiple entries.
    chunk = 16384
    result: list[Entry] = []
    high = -1
    for prop_range in sorted(ranges, key=lambda x: x.lower):
        # Validate the ranges do not overlap.
        assert prop_range.lower > high, "the ranges overlap"
        high = prop_range.upper
        assert high <= upper_bound, "the range does not fit in an entry"

        # Walk a local cursor over the range so the caller's object keeps
        # its lower bound.
        lower = prop_range.lower
        while prop_range.upper - lower >= chunk:
            result.append(Entry(lower, chunk - 1))
            lower += chunk
        result.append(Entry(lower, prop_range.upper - lower))
    return result
29568c3d66aSMark de Wever
29668c3d66aSMark de Wever
# Format of one row of the generated table: the encoded entry as 8 hex digits,
# followed by a C comment with the lower bound, the upper bound and, between
# brackets, the number of code points of the entry.
cpp_entrytemplate = "    0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
29868c3d66aSMark de Wever
29968c3d66aSMark de Wever
def generate_cpp_data(ranges: list[PropertyRange], upper_bound: int) -> str:
    """
    Returns the C++ source of the table for the given ranges.

    ranges is converted to table entries; every entry is written as the
    value `lower << 14 | offset` with a comment describing the range.
    upper_bound is written as the value of __table_upper_bound.
    """
    table = property_ranges_to_table(ranges)
    entries = ", //\n".join(
        cpp_entrytemplate.format(
            x.lower << 14 | x.offset,
            x.lower,
            x.lower + x.offset,
            x.offset + 1,
        )
        for x in table
    )
    return DATA_ARRAY_TEMPLATE.format(
        size=len(table),
        entries=entries,
        upper_bound=upper_bound,
    )
32268c3d66aSMark de Wever
32368c3d66aSMark de Wever
def generate_data_tables() -> str:
    """
    Generate Unicode data for [format.string.std]/12

    Reads data/unicode/EastAsianWidth.txt next to this script, keeps the
    ranges accepted by filterProperty, validates that the hardcoded blocks
    are completely covered, and returns the C++ source of the table.
    """
    east_asian_width_path = (
        Path(__file__).absolute().parent / "data" / "unicode" / "EastAsianWidth.txt"
    )

    with east_asian_width_path.open(encoding="utf-8") as f:
        parsed = [x for line in f if (x := parsePropertyLine(line))]
    properties = [x for x in parsed if filterProperty(x)]

    # The hardcoded blocks are not (completely) selected by their property:
    # The range U+4DC0 - U+4DFF is neutral, so its property alone would not
    # put it in the table.
    # The range U+1F300 - U+1F5FF is only partly wide, for example
    #   1F300..1F320;W   # So    [33] CYCLONE..SHOOTING STAR
    #   1F321..1F32C;N   # So    [12] THERMOMETER..WIND BLOWING FACE
    #   1F32D..1F335;W   # So     [9] HOT DOG..CACTUS
    # The property selects the first and last range, but not the second.

    # Validate every code point of the hardcoded ranges is present.
    hardcoded_blocks = (
        ("Yijing Hexagram Symbols", 0x4DC0, 0x4DFF),
        ("Miscellaneous Symbols and Pictographs", 0x1F300, 0x1F5FF),
        ("Supplemental Symbols and Pictographs", 0x1F900, 0x1F9FF),
    )
    for name, first, last in hardcoded_blocks:
        for i in range(first, last + 1):
            assert any(
                x.lower <= i <= x.upper for x in properties
            ), f"U+{i:04X} ({name}) is not in the filtered ranges"

    data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))

    return generate_cpp_data(data, data[-1].upper)
36668c3d66aSMark de Wever
36768c3d66aSMark de Wever
if __name__ == "__main__":
    # Generate the header before opening the output file, so a failure does
    # not leave an empty or truncated file behind.
    header = TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables())
    if len(sys.argv) == 2:
        # Write to the file named by the only argument; the with statement
        # closes it again.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)
372