#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, TextIO
import sys


@dataclass
class BreakTestItem:
    """The parsed contents of one line of the grapheme break test data."""

    # First code point of every extended grapheme cluster on the line.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line as a sequence of C++ "\\UXXXXXXXX" escapes.
    encoded: str = ""
    # Running code unit count at every cluster break, per encoding.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)


class CommentLine:
    """Empty marker type; nothing in this script references it."""


class EOF:
    """Empty marker type; nothing in this script references it."""


# Tokens used in the test data: a break is allowed after a division sign and
# not allowed after a multiplication sign.
_BREAK = "\N{DIVISION SIGN}"
_NO_BREAK = "\N{MULTIPLICATION SIGN}"


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of the break test data.

    Exactly one line is consumed from ``input`` per call. (The parameter name
    shadows the builtin; it is kept so keyword callers keep working.)

    Returns:
        A ``BreakTestItem`` for the line. The item is empty (``encoded == ""``)
        for blank lines and lines that only contain a comment. ``None`` is
        returned once the input is exhausted.

    Raises:
        ValueError: the line contains a token that is neither a break marker
            nor a hexadecimal code point.
    """
    line = input.readline()
    if not line:
        return None

    result = BreakTestItem()
    cluster_start = -1
    utf8 = 0
    utf16 = 0
    utf32 = 0

    # Everything from the first '#' on is a human readable explanation.
    data = line.split("#", 1)[0]
    for token in data.split():
        if token == _BREAK:
            # The line starts with a division sign, there is no cluster to
            # close at that point so nothing is added to the output.
            if cluster_start != -1:
                result.code_points.append(cluster_start)
                cluster_start = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
            continue
        if token == _NO_BREAK:
            continue
        if not token.isalnum():
            raise ValueError(
                f"unexpected token {token!r} in break test line {line!r}"
            )

        value = int(token, base=16)
        if cluster_start == -1:
            cluster_start = value

        result.encoded += f"\\U{value:08x}"
        character = chr(value)
        utf8 += len(character.encode())
        # Since we only care about the number of code units the byte order
        # doesn't matter. The byte order is specified to avoid the BOM.
        utf16 += len(character.encode("utf-16-le")) // 2
        utf32 += len(character.encode("utf-32-le")) // 4

    return result


cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<std::size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"


def _format_data_line(literal: str, line: BreakTestItem, breaks: list[int]) -> str:
    """Render one ``data`` initializer from a string literal and break offsets."""
    return cpp_test_data_line_template.format(
        literal,
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, breaks)),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the C++ initializer of ``line`` for the ``char`` test data."""
    return _format_data_line(f'"{line.encoded}"', line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the C++ initializer of ``line`` for the UTF-16 ``wchar_t`` data."""
    return _format_data_line(f'L"{line.encoded}"', line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the C++ initializer of ``line`` for the UTF-32 ``wchar_t`` data."""
    return _format_data_line(f'L"{line.encoded}"', line, line.breaks_utf32)


def generate_all(test_data_path: Optional[Path] = None) -> str:
    """Generate the C++ test header from "GraphemeBreakTest.txt".

    The data file can be downloaded from:
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt

    Args:
        test_data_path: Location of the data file. When omitted the file is
            read from ``data/unicode/GraphemeBreakTest.txt`` relative to the
            directory containing this script.

    Returns:
        The complete contents of the generated header, without a trailing
        newline.
    """
    if test_data_path is None:
        script_directory = Path(__file__).absolute().parent
        test_data_path = (
            script_directory / "data" / "unicode" / "GraphemeBreakTest.txt"
        )

    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while (line := parseBreakTestLine(file)) is not None:
            # Blank and comment-only lines yield an item without code points.
            if line.encoded:
                lines.append(line)

    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )


def main(argv: list[str]) -> None:
    """Write the generated header to ``argv[1]`` when given, else to stdout."""
    # Generate before opening the output so that a failure does not leave a
    # truncated file behind.
    header = generate_all()
    if len(argv) == 2:
        with open(argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)


if __name__ == "__main__":
    main(sys.argv)