#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, TextIO
import sys


@dataclass
class BreakTestItem:
    """The parsed contents of one line of the grapheme break test data."""

    # First code point of every extended grapheme cluster on the line.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line as concatenated C++ "\UXXXXXXXX" escapes.
    encoded: str = ""
    # Running code-unit count at the end of every cluster, per encoding.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)


class CommentLine:
    """Marker type; not referenced by the code in this file."""


class EOF:
    """Marker type; not referenced by the code in this file."""


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of a grapheme break test file.

    A data line is a whitespace separated sequence of DIVISION SIGN (a break
    is allowed here), MULTIPLICATION SIGN (no break here) and hexadecimal code
    points. Everything from a "#" to the end of the line is a comment.

    Args:
        input: Text stream positioned at the start of a line. Exactly one line
            is consumed per call.

    Returns:
        The parsed line, or None when the stream is exhausted. Blank and
        comment-only lines give an item whose ``encoded`` is empty.

    Raises:
        ValueError: A token is neither a break marker nor a valid hexadecimal
            code point.
    """
    raw_line = input.readline()
    if not raw_line:
        return None

    result = BreakTestItem()
    # First code point of the cluster being accumulated, -1 when none is open.
    code_point = -1
    # Code units seen so far on this line, per encoding.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    for token in raw_line.partition("#")[0].split():
        if token == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
            continue
        if token == "\N{MULTIPLICATION SIGN}":
            continue

        try:
            value = int(token, base=16)
        except ValueError:
            raise ValueError(
                f"unexpected token {token!r} in break test line {raw_line!r}"
            ) from None
        if code_point == -1:
            code_point = value

        result.encoded += f"\\U{value:08x}"
        char = chr(value)
        utf8 += len(char.encode("utf-8"))
        # Since we only care about the number of code units the byte order
        # doesn't matter. The byte order is specified to avoid the BOM.
        utf16 += len(char.encode("utf-16-le")) // 2
        utf32 += len(char.encode("utf-32-le")) // 4

    return result


# Template of the generated C++ header. It is filled in with str.format():
# {0} is the number of test entries and {1}, {2} and {3} are the UTF-8, UTF-16
# and UTF-32 data lines. Literal braces are therefore doubled.
#
# NOTE(review): the comment emitted above data_utf32 says "UTF-8", presumably
# a copy-paste slip for "UTF-32". It is left untouched here since changing it
# alters the generated header.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<std::size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-8.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One entry of a data array: {input literal, {code points}, {breaks}}.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"


def _format_cpp_data_line(
    literal_prefix: str, line: BreakTestItem, breaks: list[int]
) -> str:
    """Render one test entry using the given string-literal prefix and breaks."""
    return cpp_test_data_line_template.format(
        f'{literal_prefix}"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, breaks)),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the C++ initializer of `line` for the UTF-8 (char) data array."""
    return _format_cpp_data_line("", line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the C++ initializer of `line` for the UTF-16 (wchar_t) data array."""
    return _format_cpp_data_line("L", line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the C++ initializer of `line` for the UTF-32 (wchar_t) data array."""
    return _format_cpp_data_line("L", line, line.breaks_utf32)


def generate_all() -> str:
    """Generate the C++ test header from "GraphemeBreakTest.txt".

    This file can be downloaded from:
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
    This script looks for it at data/unicode/GraphemeBreakTest.txt, relative
    to the directory containing this script.

    Returns:
        The complete text of the generated header.
    """
    test_data_path = (
        Path(__file__).absolute().parent / "data" / "unicode" / "GraphemeBreakTest.txt"
    )
    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while (line := parseBreakTestLine(file)) is not None:
            # Blank and comment-only lines carry no test data.
            if line.encoded:
                lines.append(line)
    return cpp_template.format(
        len(lines),
        ",\n".join(map(lineToCppDataLineUtf8, lines)),
        ",\n".join(map(lineToCppDataLineUtf16, lines)),
        ",\n".join(map(lineToCppDataLineUtf32, lines)),
    )


if __name__ == "__main__":
    # Generate before opening the output so that a failure does not leave a
    # truncated file behind.
    generated = generate_all()
    if len(sys.argv) == 2:
        # The generated text is pure ASCII, so the encoding does not change
        # the bytes written.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(generated, file=output)
    else:
        print(generated)