xref: /openbsd-src/gnu/llvm/libcxx/utils/generate_extended_grapheme_cluster_test.py (revision 4bdff4bed0e3d54e55670334c7d0077db4170f86)
1#!/usr/bin/env python
2# ===----------------------------------------------------------------------===##
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===----------------------------------------------------------------------===##
9
10# The code is based on
11# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
12#
13# Copyright (c) Microsoft Corporation.
14# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15
16from pathlib import Path
17from dataclasses import dataclass, field
18from typing import Optional, TextIO
19from array import array
20import sys
21
22
@dataclass
class BreakTestItem:
    """The data collected from one line of the grapheme break test file.

    The three ``breaks_*`` lists run parallel to ``code_points``: the parser
    in this script appends one entry to each of them every time it records
    the first code point of a cluster.
    """

    # First code point of every extended grapheme cluster on the line.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line, spelled as C++ \UXXXXXXXX escapes.
    encoded: str = ""
    # Running number of UTF-8 code units seen when each cluster was closed.
    breaks_utf8: list[int] = field(default_factory=list)
    # Running number of UTF-16 code units seen when each cluster was closed.
    breaks_utf16: list[int] = field(default_factory=list)
    # Running number of UTF-32 code units seen when each cluster was closed.
    breaks_utf32: list[int] = field(default_factory=list)
30
31
class CommentLine:
    """Empty marker type.

    NOTE(review): not referenced by the code visible in this script;
    presumably kept from the script this one is based on - confirm before
    removing.
    """
34
35
class EOF:
    """Empty marker type.

    NOTE(review): not referenced by the code visible in this script;
    presumably kept from the script this one is based on - confirm before
    removing.
    """
38
39
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse one line of the grapheme break test file.

    The stream is consumed one character at a time. A line is made of
    hexadecimal code points separated by a division sign (a break is allowed
    at that position) or a multiplication sign (no break), optionally followed
    by a ``#`` comment.

    Args:
        input: Text stream positioned at the start of a line.

    Returns:
        The parsed item. Comment-only and blank lines give an item whose
        ``encoded`` string is empty. ``None`` is returned once the end of the
        input is reached and no code point is pending.

    Raises:
        ValueError: If a break marker or code point is followed by an
            unexpected character, if an unknown character starts a token, or
            if a code point is not valid hexadecimal.
    """

    def skip_separator() -> None:
        # Each marker is followed by one whitespace character that has to be
        # consumed. This must not live inside an ``assert``: asserts are
        # removed under ``python -O``, which would also remove the read.
        # Reaching the end of the input here is left to the main loop.
        separator = input.read(1)
        if separator and not separator.isspace():
            raise ValueError(
                f"expected whitespace after a break marker, got {separator!r}"
            )

    result = BreakTestItem()
    code_point = -1  # First code point of the current cluster, -1 if none.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # A break position closes the cluster collected so far. The line
            # starts with a division sign; nothing is pending then, so
            # nothing is added to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            skip_separator()
            continue
        if c == "\N{MULTIPLICATION SIGN}":
            # No break here: the next code point extends the current cluster.
            skip_separator()
            continue
        if c.isalnum():
            # Collect the remaining digits of the code point; the whitespace
            # that terminates it is consumed as well.
            while following := input.read(1):
                if following.isalnum():
                    c += following
                elif following.isspace():
                    break
                else:
                    raise ValueError(
                        f"expected whitespace after code point {c!r}, "
                        f"got {following!r}"
                    )
            i = int(c, base=16)
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            ch = chr(i)
            utf8 += len(ch.encode())
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(ch.encode("utf-16-le")) // 2
            utf32 += len(ch.encode("utf-32-le")) // 4
            continue
        if c == "#":
            # The rest of the line is a comment.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            # End of input. Hand back a final line that lacks a trailing
            # newline instead of dropping it; the next call then finds
            # nothing pending and returns None.
            return result if result.encoded else None
        raise ValueError(f"unexpected character {c!r} in break test line")
90
91
# Template of the generated C++ header, filled in with str.format():
#   {0} - the number of test entries, used as the size of every std::array
#   {1} - the initializer rows of the UTF-8 array
#   {2} - the initializer rows of the UTF-16 array
#   {3} - the initializer rows of the UTF-32 array
# Literal braces of the C++ code are doubled to escape them for str.format().
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point of all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One initializer row of a data array. The three positional fields are the
# string literal, the comma separated code points and the comma separated
# break offsets; the result has the shape `     {literal, {points}, {breaks}}`.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"
200
201
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the initializer row of ``line`` for the UTF-8 data array.

    The row holds the encoded input as a narrow string literal, the first
    code point of every cluster and the UTF-8 break offsets.
    """
    literal = f'"{line.encoded}"'
    first_code_points = ", ".join(map(str, line.code_points))
    break_offsets = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(
        literal, first_code_points, break_offsets
    )
208
209
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the initializer row of ``line`` for the UTF-16 data array.

    The row holds the encoded input as a wide (``L``-prefixed) string literal,
    the first code point of every cluster and the UTF-16 break offsets.
    """
    literal = f'L"{line.encoded}"'
    first_code_points = ", ".join(map(str, line.code_points))
    break_offsets = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(
        literal, first_code_points, break_offsets
    )
216
217
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the initializer row of ``line`` for the UTF-32 data array.

    The row holds the encoded input as a wide (``L``-prefixed) string literal,
    the first code point of every cluster and the UTF-32 break offsets.
    """
    literal = f'L"{line.encoded}"'
    first_code_points = ", ".join(map(str, line.code_points))
    break_offsets = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(
        literal, first_code_points, break_offsets
    )
224
225
"""
Generate test data from "GraphemeBreakTest.txt"
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the "data/unicode" directory
located next to this script
"""
232
233
def generate_all() -> str:
    """Return the complete text of the generated C++ test header.

    The test data is read from ``data/unicode/GraphemeBreakTest.txt``
    relative to the directory of this script. Parsed lines without any code
    point (comments and blank lines) are left out.
    """
    script_dir = Path(__file__).absolute().parent
    test_data_path = script_dir / "data" / "unicode" / "GraphemeBreakTest.txt"

    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while True:
            item = parseBreakTestLine(file)
            if not item:
                # The parser signals the end of the input.
                break
            if item.encoded:
                lines.append(item)

    utf8_rows = ",\n".join(lineToCppDataLineUtf8(item) for item in lines)
    utf16_rows = ",\n".join(lineToCppDataLineUtf16(item) for item in lines)
    utf32_rows = ",\n".join(lineToCppDataLineUtf32(item) for item in lines)
    return cpp_template.format(len(lines), utf8_rows, utf16_rows, utf32_rows)
251
252
if __name__ == "__main__":
    # Generate first, so a failure does not leave a truncated output file.
    header = generate_all()
    if len(sys.argv) == 2:
        # Exactly one argument: write the header to that path. The with-block
        # closes the file, and sys.stdout is no longer rebound.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)
257