#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, TextIO
from array import array
import sys


@dataclass
class BreakTestItem:
    """The data extracted from one line of the break test file."""

    # The first code point that follows each break opportunity.
    code_points: list[int] = field(default_factory=list)
    # Every code point of the line as a C++ ``\UXXXXXXXX`` escape sequence.
    encoded: str = ""
    # Running total of code units consumed when each break is reached, for
    # the UTF-8, UTF-16 and UTF-32 encodings respectively.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)


class CommentLine:
    """Marker type; not referenced by the rest of this script."""


class EOF:
    """Marker type; not referenced by the rest of this script."""


def _readSeparator(stream: TextIO, token: str) -> str:
    """Consume and return the character that follows a break sign.

    The character must be whitespace; an empty string (end of input) is
    accepted as well. Anything else raises ValueError.
    """
    separator = stream.read(1)
    if separator and not separator.isspace():
        raise ValueError(
            f"expected whitespace after {token!r}, found {separator!r}"
        )
    return separator


def _readHexNumber(stream: TextIO, first: str) -> tuple[str, str]:
    """Read the remaining digits of a number whose first digit is *first*.

    Returns the digits and the character that terminated them. The terminator
    is whitespace, or an empty string at end of input; anything else raises
    ValueError.
    """
    digits = first
    separator = ""
    while ch := stream.read(1):
        if not ch.isalnum():
            separator = ch
            break
        digits += ch
    if separator and not separator.isspace():
        raise ValueError(
            f"expected whitespace after {digits!r}, found {separator!r}"
        )
    return digits, separator


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of a break test file.

    A data line is a sequence of hexadecimal code points separated by DIVISION
    SIGN (a break is allowed) or MULTIPLICATION SIGN (no break), optionally
    followed by a ``#`` comment.

    Returns the parsed item; for a blank or comment-only line the item is
    empty (``encoded == ""``). Returns None once the input is exhausted.

    Raises ValueError when the line is malformed. Explicit exceptions are used
    instead of ``assert`` since asserts, including any read performed inside
    them, are removed when Python runs with ``-O``.
    """
    result = BreakTestItem()
    code_point = -1
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
            separator = _readSeparator(input, c)
        elif c == "\N{MULTIPLICATION SIGN}":
            separator = _readSeparator(input, c)
        elif c.isalnum():
            digits, separator = _readHexNumber(input, c)
            i = int(digits, base=16)
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            char = chr(i)
            utf8 += len(char.encode())
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(char.encode("utf-16-le")) // 2
            utf32 += len(char.encode("utf-32-le")) // 4
        elif c == "#":
            # The comment runs to the end of the line.
            input.readline()
            return result
        elif c == "\n":
            return result
        elif c == "":
            # End of input: hand back a final line that lacks a newline
            # before reporting that the input is exhausted.
            return result if result.encoded else None
        else:
            raise ValueError(f"unexpected character {c!r} in break test data")

        if separator == "\n":
            # The token was the last one on a line without a comment; stop
            # here so the next line is not merged into this item.
            return result


# NOTE(review): the whitespace inside the two literals below (the indentation
# of the struct members and the leading indent of the row template) was
# reconstructed - confirm it against the checked-in generated header.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"


def _formatCppDataLine(literal_prefix: str, line: BreakTestItem, breaks: list[int]) -> str:
    """Render one initializer row: the string literal, code points and breaks."""
    return cpp_test_data_line_template.format(
        f'{literal_prefix}"{line.encoded}"',
        ", ".join(str(x) for x in line.code_points),
        ", ".join(str(x) for x in breaks),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the C++ initializer row of *line* with its UTF-8 breaks."""
    return _formatCppDataLine("", line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the C++ initializer row of *line* (wide literal) with its UTF-16 breaks."""
    return _formatCppDataLine("L", line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the C++ initializer row of *line* (wide literal) with its UTF-32 breaks."""
    return _formatCppDataLine("L", line, line.breaks_utf32)


def generate_all() -> str:
    """Generate the C++ test header from "GraphemeBreakTest.txt".

    This file can be downloaded from:
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
    The script reads it from data/unicode/GraphemeBreakTest.txt, relative to
    the directory that contains this script.

    Returns the complete text of the header. Lines of the data file that hold
    no code points (comments, blank lines) produce no entry.
    """
    test_data_path = (
        Path(__file__).absolute().parent
        / "data"
        / "unicode"
        / "GraphemeBreakTest.txt"
    )
    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while line := parseBreakTestLine(file):
            if line.encoded:
                lines.append(line)
    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )


if __name__ == "__main__":
    # Generate before opening the output so that a failure cannot leave a
    # truncated header behind.
    generated = generate_all()
    if len(sys.argv) == 2:
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(generated, file=output)
    else:
        print(generated)