xref: /llvm-project/libcxx/utils/generate_extended_grapheme_cluster_test.py (revision 0d3c40b82b3b667a4f7f69f45658c184615fd54f)
1857a78c0SMark de Wever#!/usr/bin/env python
2857a78c0SMark de Wever# ===----------------------------------------------------------------------===##
3857a78c0SMark de Wever#
4857a78c0SMark de Wever# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5857a78c0SMark de Wever# See https://llvm.org/LICENSE.txt for license information.
6857a78c0SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7857a78c0SMark de Wever#
8857a78c0SMark de Wever# ===----------------------------------------------------------------------===##
9857a78c0SMark de Wever
10857a78c0SMark de Wever# The code is based on
11857a78c0SMark de Wever# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
12857a78c0SMark de Wever#
13857a78c0SMark de Wever# Copyright (c) Microsoft Corporation.
14857a78c0SMark de Wever# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15857a78c0SMark de Wever
16857a78c0SMark de Weverfrom pathlib import Path
17857a78c0SMark de Weverfrom dataclasses import dataclass, field
18857a78c0SMark de Weverfrom typing import Optional, TextIO
19130b1816SMark de Weverimport sys
20857a78c0SMark de Wever
21857a78c0SMark de Wever
@dataclass
class BreakTestItem:
    """The parsed contents of one data line of GraphemeBreakTest.txt."""

    # First code point of every extended grapheme cluster in the line.
    code_points: list[int] = field(default_factory=list)
    # The line's code points as concatenated 8-digit C++ universal character
    # names, ready to be pasted into a C++ string literal.
    encoded: str = ""
    # Running number of UTF-8 code units at the end of each cluster.
    breaks_utf8: list[int] = field(default_factory=list)
    # Running number of UTF-16 code units at the end of each cluster.
    breaks_utf16: list[int] = field(default_factory=list)
    # Running number of UTF-32 code units at the end of each cluster.
    breaks_utf32: list[int] = field(default_factory=list)
29857a78c0SMark de Wever
30857a78c0SMark de Wever
class CommentLine:
    """Empty marker type for a comment line.

    NOTE(review): nothing in the visible source references this class; it
    looks like a leftover from the script this file was based on - confirm
    before removing.
    """
33857a78c0SMark de Wever
34857a78c0SMark de Wever
class EOF:
    """Empty marker type for the end of the input.

    NOTE(review): nothing in the visible source references this class; the
    parser signals the end of the input by returning None instead - confirm
    before removing.
    """
37857a78c0SMark de Wever
38857a78c0SMark de Wever
def _readSeparator(input: TextIO) -> str:
    """Consume and return the whitespace character that must follow a token.

    Raises:
        ValueError: if the next character is not whitespace (this includes
            reaching the end of the input).
    """
    separator = input.read(1)
    if not separator.isspace():
        raise ValueError(
            f"expected whitespace after a break marker, got {separator!r}"
        )
    return separator


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of GraphemeBreakTest.txt.

    A data line is a sequence of hexadecimal code points separated by a
    division sign (a cluster break) or a multiplication sign (no break),
    optionally followed by a "#" comment.

    Args:
        input: Text stream positioned at the start of a line. The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        The parsed item. Its ``encoded`` field is empty for blank and
        comment-only lines. Returns None once the end of the input is reached
        and nothing was parsed.

    Raises:
        ValueError: if the line contains an unexpected character or a token
            that is not a hexadecimal number.
    """
    result = BreakTestItem()
    # First code point of the cluster being read, -1 when no cluster is open.
    code_point = -1
    # Running code-unit counts of everything encoded so far on this line.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            # The read must not live inside an assert: asserts are removed
            # under "python -O", which would change what the parser consumes.
            if _readSeparator(input) == "\n":
                return result
            continue
        if c == "\N{MULTIPLICATION SIGN}":
            if _readSeparator(input) == "\n":
                return result
            continue
        if c.isalnum():
            digits = c
            terminator = ""
            while following := input.read(1):
                if not following.isalnum():
                    if not following.isspace():
                        raise ValueError(
                            f"expected whitespace after code point {digits!r}, "
                            f"got {following!r}"
                        )
                    terminator = following
                    break
                digits += following
            i = int(digits, base=16)
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            character = chr(i)
            utf8 += len(character.encode())
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(character.encode("utf-16-le")) // 2
            utf32 += len(character.encode("utf-32-le")) // 4
            # A newline used as separator also terminates the line; do not
            # run on into the next line.
            if terminator == "\n":
                return result
            continue
        if c == "#":
            # The rest of the line is a comment.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            # End of input: keep a final line that lacks a trailing newline.
            return result if result.encoded else None
        raise ValueError(f"unexpected character {c!r} in the break test data")
89857a78c0SMark de Wever
90857a78c0SMark de Wever
# Template of the generated C++ header, filled in with str.format():
#   {0} - number of test cases, used as the size of each std::array
#   {1} - initializer lines of the UTF-8 test data
#   {2} - initializer lines of the UTF-16 test data
#   {3} - initializer lines of the UTF-32 test data
# Literal C++ braces are doubled to escape them from str.format().
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<std::size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# Template of one test case: the string literal, the list of code points and
# the list of breaks, each wrapped in a C++ braced initializer.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"
199857a78c0SMark de Wever
200857a78c0SMark de Wever
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Format one test case as a C++ initializer for the UTF-8 data.

    The input becomes a narrow string literal, followed by the code points
    and the UTF-8 breaks as comma-separated decimal numbers.
    """
    return cpp_test_data_line_template.format(
        f'"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, line.breaks_utf8)),
    )
207857a78c0SMark de Wever
208857a78c0SMark de Wever
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Format one test case as a C++ initializer for the UTF-16 data.

    The input becomes a wide (L-prefixed) string literal, followed by the
    code points and the UTF-16 breaks as comma-separated decimal numbers.
    """
    return cpp_test_data_line_template.format(
        f'L"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, line.breaks_utf16)),
    )
215857a78c0SMark de Wever
216857a78c0SMark de Wever
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Format one test case as a C++ initializer for the UTF-32 data.

    The input becomes a wide (L-prefixed) string literal, followed by the
    code points and the UTF-32 breaks as comma-separated decimal numbers.
    """
    return cpp_test_data_line_template.format(
        f'L"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, line.breaks_utf32)),
    )
223857a78c0SMark de Wever
224857a78c0SMark de Wever
"""
Generate test data from "GraphemeBreakTest.txt".
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the "data/unicode" directory
located next to this script.
"""
231857a78c0SMark de Wever
232857a78c0SMark de Wever
def generate_all() -> str:
    """Build the complete generated C++ test header.

    Reads data/unicode/GraphemeBreakTest.txt, located relative to the
    directory of this script, and keeps every line that contains at least
    one code point.

    Returns:
        The text of the header, with the UTF-8, UTF-16 and UTF-32 tables
        filled in.
    """
    test_data_path = (
        Path(__file__).absolute().parent
        / "data"
        / "unicode"
        / "GraphemeBreakTest.txt"
    )
    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        # The parser returns None once the end of the file is reached.
        while (line := parseBreakTestLine(file)) is not None:
            # Blank and comment-only lines produce an item without data.
            if line.encoded:
                lines.append(line)
    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )
250857a78c0SMark de Wever
251857a78c0SMark de Wever
if __name__ == "__main__":
    # Generate before touching the output file, so a failure while reading
    # the test data does not leave a truncated file behind.
    generated = generate_all()
    if len(sys.argv) == 2:
        # Exactly one argument: write the header to that file and close it.
        with open(sys.argv[1], "w") as output:
            print(generated, file=output)
    else:
        # Any other argument count: write the header to standard output.
        print(generated)
256