#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, TextIO
from array import array


@dataclass
class BreakTestItem:
    """One parsed line of the grapheme break test data."""

    # First code point of every cluster that was closed by a division sign.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line as C++ "\UXXXXXXXX" escape sequences.
    encoded: str = ""
    # Running code-unit count at each closing division sign, one list per
    # encoding. Each list gets one entry per entry in code_points.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)


class CommentLine:
    """Marker type; not referenced anywhere in this script."""


class EOF:
    """Marker type; not referenced anywhere in this script."""


def _expect_whitespace(stream: TextIO, after: str) -> None:
    """Consume one character from *stream* and require it to be whitespace.

    The character is always consumed, so the parser position does not depend
    on whether Python runs with assertions enabled.

    Raises:
        ValueError: if the consumed character is not whitespace (this
            includes reaching the end of the input).
    """
    following = stream.read(1)
    if not following.isspace():
        raise ValueError(
            f"expected whitespace after {after!r}, found {following!r}"
        )


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of break test data from *input*.

    A line is a sequence of hexadecimal code points separated by either a
    DIVISION SIGN (a break) or a MULTIPLICATION SIGN (no break), optionally
    followed by a ``#`` comment that runs to the end of the line.

    Args:
        input: text stream positioned at the start of a line.

    Returns:
        The parsed item. Comment-only and blank lines give an item whose
        ``encoded`` is empty. ``None`` is returned once the input is
        exhausted and no data is pending.

    Raises:
        ValueError: if the line contains an unexpected character or a code
            point that is not valid hexadecimal.
    """
    result = BreakTestItem()
    # First code point of the cluster being read; -1 while none is open.
    code_point = -1
    # Code units seen so far on this line, per encoding.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            _expect_whitespace(input, c)
            continue
        if c == "\N{MULTIPLICATION SIGN}":
            _expect_whitespace(input, c)
            continue
        if c.isalnum():
            # Collect the rest of the hexadecimal token; the whitespace that
            # terminates it is consumed as well.
            while following := input.read(1):
                if following.isalnum():
                    c += following
                elif following.isspace():
                    break
                else:
                    raise ValueError(
                        f"unexpected character {following!r} after code point {c!r}"
                    )
            i = int(c, base=16)
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            c = chr(i)
            utf8 += len(c.encode())
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(c.encode("utf-16-le")) // 2
            utf32 += len(c.encode("utf-32-le")) // 4
            continue
        if c == "#":
            # Discard the comment up to and including the newline.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            # End of input terminates the line like a newline would, so a
            # final record without a trailing newline is not dropped.
            return result if result.encoded else None
        raise ValueError(f"unexpected character {c!r} in break test data")


cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"


def _format_cpp_data_line(literal_prefix: str, line: BreakTestItem, breaks: list[int]) -> str:
    """Render one C++ aggregate initializer: string literal, code points, breaks."""
    return cpp_test_data_line_template.format(
        f'{literal_prefix}"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, breaks)),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the C++ initializer for *line* using a narrow string literal and the UTF-8 breaks."""
    return _format_cpp_data_line("", line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the C++ initializer for *line* using a wide string literal and the UTF-16 breaks."""
    return _format_cpp_data_line("L", line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the C++ initializer for *line* using a wide string literal and the UTF-32 breaks."""
    return _format_cpp_data_line("L", line, line.breaks_utf32)


def generate_all() -> str:
    """Generate the C++ test header from "GraphemeBreakTest.txt".

    The data file can be downloaded from:
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
    This function looks for GraphemeBreakTest.txt in the same directory as
    this script.

    Returns:
        The complete text of the generated header.

    Raises:
        OSError: if the data file cannot be opened.
        ValueError: if the data file contains a malformed line.
    """
    test_data_path = Path(__file__).absolute().with_name("GraphemeBreakTest.txt")
    lines: list[BreakTestItem] = []
    with test_data_path.open(mode="rt", encoding="utf-8") as file:
        while (line := parseBreakTestLine(file)) is not None:
            # Comment-only and blank lines parse to an item without data.
            if line.encoded:
                lines.append(line)
    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )


if __name__ == "__main__":
    print(generate_all())