xref: /llvm-project/libcxx/utils/generate_extended_grapheme_cluster_test.py (revision 0d3c40b82b3b667a4f7f69f45658c184615fd54f)
1#!/usr/bin/env python
2# ===----------------------------------------------------------------------===##
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===----------------------------------------------------------------------===##
9
10# The code is based on
11# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
12#
13# Copyright (c) Microsoft Corporation.
14# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
15
16from pathlib import Path
17from dataclasses import dataclass, field
18from typing import Optional, TextIO
19import sys
20
21
@dataclass
class BreakTestItem:
    """The parsed contents of one data line of ``GraphemeBreakTest.txt``.

    All list fields have one entry per extended grapheme cluster on the line.
    """

    # First code point of every extended grapheme cluster.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line as a sequence of ``\UXXXXXXXX`` escapes.
    encoded: str = ""
    # Running code unit count at each cluster break, per encoding.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
29
30
class CommentLine:
    """Empty marker type.

    NOTE(review): not referenced anywhere in the visible part of this file;
    presumably a leftover from the script this one is based on.
    """
33
34
class EOF:
    """Empty marker type.

    NOTE(review): not referenced anywhere in the visible part of this file;
    the parser signals end of input by returning ``None`` instead.
    """
37
38
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of ``GraphemeBreakTest.txt`` read from *input*.

    A data line consists of hexadecimal code points separated by a division
    sign (a cluster break) or a multiplication sign (no break), optionally
    followed by a ``#`` comment.

    Args:
        input: Text stream positioned at the start of a line. The name shadows
            the builtin but is kept because it is part of the signature.

    Returns:
        The parsed item; its ``encoded`` field is empty for comment-only and
        blank lines. ``None`` once the end of the input is reached and no
        data is pending.

    Raises:
        ValueError: If the line contains an unexpected character or a token
            that is not a hexadecimal number.
    """
    result = BreakTestItem()
    # First code point of the cluster being read; -1 while no cluster is open.
    code_point = -1
    # Running code unit counts since the start of the line.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            # The read is done outside of any assert so the separator is
            # always consumed, also when running with ``python -O``.
            separator = input.read(1)
            # A line may end directly after the final break, without a
            # trailing comment. Don't run into the next line in that case.
            if separator == "\n" or (separator == "" and result.code_points):
                return result
            if not separator.isspace():
                raise ValueError(
                    f"expected whitespace after a division sign, got {separator!r}"
                )
            continue
        if c == "\N{MULTIPLICATION SIGN}":
            separator = input.read(1)
            if not separator.isspace():
                raise ValueError(
                    f"expected whitespace after a multiplication sign, got {separator!r}"
                )
            continue
        if c.isalnum():
            # Collect the remaining digits of the hexadecimal code point.
            token = c
            while following := input.read(1):
                if not following.isalnum():
                    if not following.isspace():
                        raise ValueError(
                            f"expected whitespace after {token!r}, got {following!r}"
                        )
                    break
                token += following
            i = int(token, base=16)
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            character = chr(i)
            utf8 += len(character.encode())
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(character.encode("utf-16-le")) // 2
            utf32 += len(character.encode("utf-32-le")) // 4
            continue
        if c == "#":
            # The rest of the line is a comment.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            # End of input. Hand out a pending item first; the next call then
            # returns None.
            return result if result.code_points else None
        raise ValueError(f"unexpected character {c!r} in the grapheme break test data")
89
90
# The text of the generated C++ header, used with str.format(). Literal braces
# are doubled. The placeholders are:
#   {0} the number of entries in every table
#   {1} the rows of the UTF-8 table
#   {2} the rows of the UTF-16 table
#   {3} the rows of the UTF-32 table
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point of all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<std::size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One row of a table, used with str.format(). The positional arguments are the
# string literal, the comma separated code points, and the comma separated
# break offsets.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"
199
200
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Render *line* as one row of the UTF-8 table.

    The row holds a narrow string literal, the code points, and the UTF-8
    break offsets of *line*.
    """
    literal = '"' + line.encoded + '"'
    first_code_points = ", ".join(map(str, line.code_points))
    break_offsets = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(
        literal, first_code_points, break_offsets
    )
207
208
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Render *line* as one row of the UTF-16 table.

    The row holds a wide (``L`` prefixed) string literal, the code points, and
    the UTF-16 break offsets of *line*.
    """
    literal = 'L"' + line.encoded + '"'
    first_code_points = ", ".join(map(str, line.code_points))
    break_offsets = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(
        literal, first_code_points, break_offsets
    )
215
216
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Render *line* as one row of the UTF-32 table.

    The row holds a wide (``L`` prefixed) string literal, the code points, and
    the UTF-32 break offsets of *line*.
    """
    literal = 'L"' + line.encoded + '"'
    first_code_points = ", ".join(map(str, line.code_points))
    break_offsets = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(
        literal, first_code_points, break_offsets
    )
223
224
225"""
Generate test data from "GraphemeBreakTest.txt".
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the "data/unicode" directory
next to this script.
230"""
231
232
def generate_all() -> str:
    """Build the text of the generated C++ test header.

    The test data is read from ``data/unicode/GraphemeBreakTest.txt`` in the
    directory containing this script. Parsed lines without encoded code
    points, such as comment lines, are not part of the output.

    Returns:
        The formatted header text.
    """
    script_dir = Path(__file__).absolute().parent
    data_file = script_dir / "data" / "unicode" / "GraphemeBreakTest.txt"

    items = []
    with open(data_file, mode="rt", encoding="utf-8") as stream:
        while True:
            item = parseBreakTestLine(stream)
            if not item:
                break
            if item.encoded:
                items.append(item)

    def rows(render) -> str:
        # Every table row goes on its own line.
        return ",\n".join(render(item) for item in items)

    return cpp_template.format(
        len(items),
        rows(lineToCppDataLineUtf8),
        rows(lineToCppDataLineUtf16),
        rows(lineToCppDataLineUtf32),
    )
250
251
if __name__ == "__main__":
    # Usage: an optional single argument names the output file; without it
    # the header is written to the standard output.
    #
    # Generate before opening the output so a failure doesn't leave behind a
    # truncated file.
    header = generate_all()
    if len(sys.argv) == 2:
        # Write to the file directly instead of rebinding sys.stdout, so the
        # file is closed when done.
        with open(sys.argv[1], "w") as output:
            print(header, file=output)
    else:
        print(header)
256