#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Generate C++ test data from "GraphemeBreakTest.txt".

The data file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt

The script reads ``data/unicode/GraphemeBreakTest.txt`` relative to the
directory that contains this script. The generated header is written to the
file named by the single command line argument, or to standard output when no
argument is given.
"""

from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, TextIO
from array import array
import sys


@dataclass
class BreakTestItem:
    """The parsed contents of one line of the break test file."""

    # First code point of every cluster that was closed by a division sign.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line, written as C++ "\UXXXXXXXX" escapes.
    encoded: str = ""
    # Running total of code units consumed at the end of every cluster, one
    # list per encoding. Each list is parallel to code_points.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)


class CommentLine:
    """Marker type; not referenced by the code in this script."""


class EOF:
    """Marker type; not referenced by the code in this script."""


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of a break test file.

    Exactly one line is consumed from ``input``. Everything from the first
    ``#`` on is a comment and is ignored. The remaining text is a whitespace
    separated sequence of division signs (a break), multiplication signs (no
    break) and hexadecimal code points.

    Args:
        input: Text stream positioned at the start of a line.

    Returns:
        The parsed item, or ``None`` when the stream is exhausted. Blank lines
        and comment-only lines yield an item whose ``encoded`` is empty. A
        final line that lacks a trailing newline is still returned.

    Raises:
        ValueError: A token is neither a break marker nor a hexadecimal code
            point that can be encoded.
    """
    line = input.readline()
    if not line:
        return None

    result = BreakTestItem()
    # First code point of the cluster being read; None until one is seen.
    cluster_start: Optional[int] = None
    utf8 = 0
    utf16 = 0
    utf32 = 0

    data = line.partition("#")[0]
    for token in data.split():
        if token == "\N{DIVISION SIGN}":
            # The line starts with a division sign, nothing precedes that
            # one, so only a division sign that follows a code point closes
            # a cluster.
            if cluster_start is not None:
                result.code_points.append(cluster_start)
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
                cluster_start = None
            continue
        if token == "\N{MULTIPLICATION SIGN}":
            continue

        # Raise explicitly instead of using assert: validation has to keep
        # working when Python runs with -O.
        try:
            value = int(token, base=16)
            char = chr(value)
        except (ValueError, OverflowError):
            raise ValueError(
                f"unexpected token {token!r} in break test line {line!r}"
            ) from None

        if cluster_start is None:
            cluster_start = value

        result.encoded += f"\\U{value:08x}"
        utf8 += len(char.encode("utf-8"))
        # Since we only care about the number of code units the byte order
        # doesn't matter. The byte order is specified to avoid the BOM.
        utf16 += len(char.encode("utf-16-le")) // 2
        utf32 += len(char.encode("utf-32-le")) // 4

    return result


# NOTE(review): the template text is emitted verbatim into the generated
# header. Two slips in it ("utiles/" in the generator path and the second
# "The data for UTF-8." heading, which sits above data_utf32) are kept as they
# are so that regenerating the header does not change its contents.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utiles/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-8.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"


def _format_data_line(
    literal_prefix: str, line: BreakTestItem, breaks: list[int]
) -> str:
    """Render one initializer of the C++ data array.

    ``literal_prefix`` is placed directly in front of the opening quote of the
    string literal ("" for a narrow literal, "L" for a wide one).
    """
    return cpp_test_data_line_template.format(
        f'{literal_prefix}"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, breaks)),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the initializer for ``line`` using the UTF-8 break offsets."""
    return _format_data_line("", line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the wide-literal initializer using the UTF-16 break offsets."""
    return _format_data_line("L", line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the wide-literal initializer using the UTF-32 break offsets."""
    return _format_data_line("L", line, line.breaks_utf32)


def generate_all() -> str:
    """Return the complete text of the generated C++ header.

    Reads ``data/unicode/GraphemeBreakTest.txt`` relative to the directory of
    this script. Lines that contain no code points (comments, blank lines) are
    left out of the generated data.
    """
    test_data_path = (
        Path(__file__).absolute().parent
        / "data"
        / "unicode"
        / "GraphemeBreakTest.txt"
    )
    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        # None marks the end of the file; comment lines give an empty item.
        while (line := parseBreakTestLine(file)) is not None:
            if line.encoded:
                lines.append(line)
    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )


if __name__ == "__main__":
    # Generate before opening the output so that a failure does not leave a
    # truncated header behind.
    header = generate_all()
    if len(sys.argv) == 2:
        # The generated text is plain ASCII; the encoding is spelled out so
        # the result does not depend on the locale.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)