xref: /openbsd-src/gnu/llvm/libcxx/utils/generate_extended_grapheme_cluster_test.py (revision 4bdff4bed0e3d54e55670334c7d0077db4170f86)
1*4bdff4beSrobert#!/usr/bin/env python
2*4bdff4beSrobert# ===----------------------------------------------------------------------===##
3*4bdff4beSrobert#
4*4bdff4beSrobert# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*4bdff4beSrobert# See https://llvm.org/LICENSE.txt for license information.
6*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*4bdff4beSrobert#
8*4bdff4beSrobert# ===----------------------------------------------------------------------===##
9*4bdff4beSrobert
10*4bdff4beSrobert# The code is based on
11*4bdff4beSrobert# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
12*4bdff4beSrobert#
13*4bdff4beSrobert# Copyright (c) Microsoft Corporation.
14*4bdff4beSrobert# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15*4bdff4beSrobert
16*4bdff4beSrobertfrom pathlib import Path
17*4bdff4beSrobertfrom dataclasses import dataclass, field
18*4bdff4beSrobertfrom typing import Optional, TextIO
19*4bdff4beSrobertfrom array import array
20*4bdff4beSrobertimport sys
21*4bdff4beSrobert
22*4bdff4beSrobert
@dataclass
class BreakTestItem:
    """The data collected by parseBreakTestLine for one line of test input.

    Every field has an empty default, so ``BreakTestItem()`` is the value
    produced for a line that contains no code points.
    """

    # The first code point of every cluster that was closed by a break.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line, each written as a \UXXXXXXXX escape.
    encoded: str = field(default="")
    # Running code unit count at every recorded break, per encoding. Each list
    # grows in lockstep with code_points.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
30*4bdff4beSrobert
31*4bdff4beSrobert
class CommentLine:
    """Empty marker type.

    NOTE(review): nothing in the visible part of this script references this
    class; presumably it is a leftover from the script this file is based
    on. Confirm before removing.
    """
34*4bdff4beSrobert
35*4bdff4beSrobert
class EOF:
    """Empty marker type.

    NOTE(review): nothing in the visible part of this script references this
    class (parseBreakTestLine signals the end of input with None);
    presumably it is a leftover from the script this file is based on.
    Confirm before removing.
    """
38*4bdff4beSrobert
39*4bdff4beSrobert
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of a grapheme break test file.

    A line is a whitespace separated sequence of tokens, optionally followed
    by a ``#`` comment that runs to the end of the line:

    * DIVISION SIGN marks a break and closes the cluster collected so far.
    * MULTIPLICATION SIGN marks a position without a break and is skipped.
    * Every other token is a code point written as a hexadecimal number.

    Args:
        input: Text stream positioned at the start of a line. Exactly one
            line is consumed per call. (The name shadows the builtin but is
            kept because it is part of the signature.)

    Returns:
        The parsed item, which is empty for blank and comment-only lines, or
        None when the stream is exhausted.

    Raises:
        ValueError: If a token is neither one of the two signs nor a
            hexadecimal number, or names a value ``chr`` rejects.
    """
    # Work on whole lines: the line end always terminates the item, also when
    # the line has no trailing comment or the file has no final newline.
    text = input.readline()
    if not text:
        return None

    result = BreakTestItem()
    # First code point of the cluster being collected, -1 when there is none.
    code_point = -1
    # Running number of code units seen on this line, per encoding.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    # Everything from the first '#' onwards is a comment.
    for token in text.split("#", 1)[0].split():
        if token == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
            continue
        if token == "\N{MULTIPLICATION SIGN}":
            continue

        # Raise instead of assert so the check survives "python -O".
        if not token.isalnum():
            raise ValueError(f"unexpected token {token!r} in break test line")
        value = int(token, base=16)
        if code_point == -1:
            code_point = value

        result.encoded += f"\\U{value:08x}"
        char = chr(value)
        utf8 += len(char.encode())
        # Since we only care about the number of code units the byte order
        # doesn't matter. The byte order is specified to avoid the BOM.
        utf16 += len(char.encode("utf-16-le")) // 2
        utf32 += len(char.encode("utf-32-le")) // 4

    # Code points after the last division sign are part of result.encoded but
    # do not add an entry to code_points or the break lists.
    return result
90*4bdff4beSrobert
91*4bdff4beSrobert
# Skeleton of the generated C++ header, filled in with str.format:
#   {0} the number of test cases, used as the size of every std::array
#   {1} the initializers of data_utf8
#   {2} the initializers of data_utf16
#   {3} the initializers of data_utf32
# Braces that must appear literally in the C++ output are doubled.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point of all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One element of the generated arrays. The three positional fields are the
# string literal, the code point list and the break list; the result has the
# shape `{<literal>, {<code points>}, {<breaks>}}` behind five spaces.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"
200*4bdff4beSrobert
201*4bdff4beSrobert
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Format *line* as one entry of the generated UTF-8 data array.

    The entry holds the escaped input as a narrow string literal, the code
    points and the UTF-8 break values of *line*.
    """
    literal = '"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
208*4bdff4beSrobert
209*4bdff4beSrobert
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Format *line* as one entry of the generated UTF-16 data array.

    The entry holds the escaped input as a wide (``L"..."``) string literal,
    the code points and the UTF-16 break values of *line*.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
216*4bdff4beSrobert
217*4bdff4beSrobert
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Format *line* as one entry of the generated UTF-32 data array.

    The entry holds the escaped input as a wide (``L"..."``) string literal,
    the code points and the UTF-32 break values of *line*.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
224*4bdff4beSrobert
225*4bdff4beSrobert
def generate_all() -> str:
    """Generate the test data header from "GraphemeBreakTest.txt".

    The data file can be downloaded from:
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
    The script looks for it in the "data/unicode" directory next to this
    script.

    Returns:
        The contents of the C++ header: cpp_template filled in with the
        number of test cases and their UTF-8, UTF-16 and UTF-32 data lines.

    Raises:
        OSError: If the data file cannot be opened.
    """
    test_data_path = (
        Path(__file__).absolute().parent
        / "data"
        / "unicode"
        / "GraphemeBreakTest.txt"
    )
    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        # None marks the end of the file. Compare explicitly instead of
        # relying on the truthiness of the parsed item.
        while (line := parseBreakTestLine(file)) is not None:
            # Blank and comment-only lines give an item without code points.
            if line.encoded:
                lines.append(line)
    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )
251*4bdff4beSrobert
252*4bdff4beSrobert
if __name__ == "__main__":
    # With exactly one argument the header is written to that path, otherwise
    # it is printed to the standard output.
    if len(sys.argv) == 2:
        # Write through a context manager so the file is flushed and closed,
        # and sys.stdout is left untouched.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(generate_all(), file=output)
    else:
        print(generate_all())
257