// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_indic_conjunct_break_table.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.
60 61 #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H 62 #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H 63 64 #include <__algorithm/ranges_upper_bound.h> 65 #include <__config> 66 #include <__iterator/access.h> 67 #include <cstddef> 68 #include <cstdint> 69 70 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 71 # pragma GCC system_header 72 #endif 73 74 _LIBCPP_BEGIN_NAMESPACE_STD 75 76 #if _LIBCPP_STD_VER >= 20 77 78 namespace __indic_conjunct_break { 79 80 enum class __property : uint8_t { 81 // Values generated from the data files. 82 __Consonant, 83 __Extend, 84 __Linker, 85 86 // The code unit has none of above properties. 87 __none 88 }; 89 90 /// The entries of the indic conjunct break property table. 91 /// 92 /// The data is generated from 93 /// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt 94 /// 95 /// The data has 3 values 96 /// - bits [0, 1] The property. One of the values generated from the datafiles 97 /// of \ref __property 98 /// - bits [2, 10] The size of the range. 99 /// - bits [11, 31] The lower bound code point of the range. The upper bound of 100 /// the range is lower bound + size. 101 /// 102 /// The 9 bits for the size allow a maximum range of 512 elements. Some ranges 103 /// in the Unicode tables are larger. They are stored in multiple consecutive 104 /// ranges in the data table. An alternative would be to store the sizes in a 105 /// separate 16-bit value. The original MSVC STL code had such an approach, but 106 /// this approach uses less space for the data and is about 4% faster in the 107 /// following benchmark. 
108 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp 109 // clang-format off 110 _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[201] = { 111 0x00180139, 112 0x001a807d, 113 0x00241811, 114 0x002c88b1, 115 0x002df801, 116 0x002e0805, 117 0x002e2005, 118 0x002e3801, 119 0x00308029, 120 0x00325851, 121 0x00338001, 122 0x0036b019, 123 0x0036f815, 124 0x00373805, 125 0x0037500d, 126 0x00388801, 127 0x00398069, 128 0x003f5821, 129 0x003fe801, 130 0x0040b00d, 131 0x0040d821, 132 0x00412809, 133 0x00414811, 134 0x0042c809, 135 0x0044c01d, 136 0x0046505d, 137 0x00471871, 138 0x0048a890, 139 0x0049e001, 140 0x004a6802, 141 0x004a880d, 142 0x004ac01c, 143 0x004bc01c, 144 0x004ca84c, 145 0x004d5018, 146 0x004d9000, 147 0x004db00c, 148 0x004de001, 149 0x004e6802, 150 0x004ee004, 151 0x004ef800, 152 0x004f8004, 153 0x004ff001, 154 0x0051e001, 155 0x0054a84c, 156 0x00555018, 157 0x00559004, 158 0x0055a810, 159 0x0055e001, 160 0x00566802, 161 0x0057c800, 162 0x0058a84c, 163 0x00595018, 164 0x00599004, 165 0x0059a810, 166 0x0059e001, 167 0x005a6802, 168 0x005ae004, 169 0x005af800, 170 0x005b8800, 171 0x0060a84c, 172 0x0061503c, 173 0x0061e001, 174 0x00626802, 175 0x0062a805, 176 0x0062c008, 177 0x0065e001, 178 0x0068a894, 179 0x0069d805, 180 0x006a6802, 181 0x0071c009, 182 0x0072400d, 183 0x0075c009, 184 0x0076400d, 185 0x0078c005, 186 0x0079a801, 187 0x0079b801, 188 0x0079c801, 189 0x007b8805, 190 0x007ba001, 191 0x007bd00d, 192 0x007c0001, 193 0x007c1009, 194 0x007c3005, 195 0x007e3001, 196 0x0081b801, 197 0x0081c805, 198 0x00846801, 199 0x009ae809, 200 0x00b8a001, 201 0x00be9001, 202 0x00bee801, 203 0x00c54801, 204 0x00c9c809, 205 0x00d0b805, 206 0x00d30001, 207 0x00d3a81d, 208 0x00d3f801, 209 0x00d58035, 210 0x00d5f83d, 211 0x00d9a001, 212 0x00db5821, 213 0x00dd5801, 214 0x00df3001, 215 0x00e1b801, 216 0x00e68009, 217 0x00e6a031, 218 0x00e71019, 219 0x00e76801, 220 0x00e7a001, 221 0x00e7c005, 222 0x00ee00fd, 223 0x01006801, 224 0x01068031, 225 
0x01070801, 226 0x0107282d, 227 0x01677809, 228 0x016bf801, 229 0x016f007d, 230 0x01815015, 231 0x0184c805, 232 0x05337801, 233 0x0533a025, 234 0x0534f005, 235 0x05378005, 236 0x05416001, 237 0x05470045, 238 0x05495809, 239 0x054d9801, 240 0x05558001, 241 0x05559009, 242 0x0555b805, 243 0x0555f005, 244 0x05560801, 245 0x0557b001, 246 0x055f6801, 247 0x07d8f001, 248 0x07f1003d, 249 0x080fe801, 250 0x08170001, 251 0x081bb011, 252 0x08506801, 253 0x08507801, 254 0x0851c009, 255 0x0851f801, 256 0x08572805, 257 0x0869200d, 258 0x08755805, 259 0x0877e809, 260 0x087a3029, 261 0x087c100d, 262 0x08838001, 263 0x0883f801, 264 0x0885d001, 265 0x08880009, 266 0x08899805, 267 0x088b9801, 268 0x088e5001, 269 0x0891b001, 270 0x08974805, 271 0x0899d805, 272 0x089b3019, 273 0x089b8011, 274 0x08a23001, 275 0x08a2f001, 276 0x08a61801, 277 0x08ae0001, 278 0x08b5b801, 279 0x08b95801, 280 0x08c1d001, 281 0x08c9f001, 282 0x08ca1801, 283 0x08d1a001, 284 0x08d23801, 285 0x08d4c801, 286 0x08ea1001, 287 0x08ea2005, 288 0x08ecb801, 289 0x08fa1001, 290 0x0b578011, 291 0x0b598019, 292 0x0de4f001, 293 0x0e8b2801, 294 0x0e8b3809, 295 0x0e8b7011, 296 0x0e8bd81d, 297 0x0e8c2819, 298 0x0e8d500d, 299 0x0e921009, 300 0x0f000019, 301 0x0f004041, 302 0x0f00d819, 303 0x0f011805, 304 0x0f013011, 305 0x0f047801, 306 0x0f098019, 307 0x0f157001, 308 0x0f17600d, 309 0x0f27600d, 310 0x0f468019, 311 0x0f4a2019}; 312 // clang-format on 313 314 /// Returns the indic conjuct break property of a code point. 315 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept { 316 // The algorithm searches for the upper bound of the range and, when found, 317 // steps back one entry. This algorithm is used since the code point can be 318 // anywhere in the range. After a lower bound is found the next step is to 319 // compare whether the code unit is indeed in the range. 
320 // 321 // Since the entry contains a code unit, size, and property the code point 322 // being sought needs to be adjusted. Just shifting the code point to the 323 // proper position doesn't work; suppose an entry has property 0, size 1, 324 // and lower bound 3. This results in the entry 0x1810. 325 // When searching for code point 3 it will search for 0x1800, find 0x1810 326 // and moves to the previous entry. Thus the lower bound value will never 327 // be found. 328 // The simple solution is to set the bits belonging to the property and 329 // size. Then the upper bound for code point 3 will return the entry after 330 // 0x1810. After moving to the previous entry the algorithm arrives at the 331 // correct entry. 332 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries; 333 if (__i == 0) 334 return __property::__none; 335 336 --__i; 337 uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 2) & 0b1'1111'1111); 338 if (__code_point <= __upper_bound) 339 return static_cast<__property>(__entries[__i] & 0b11); 340 341 return __property::__none; 342 } 343 344 } // namespace __indic_conjunct_break 345 346 #endif //_LIBCPP_STD_VER >= 20 347 348 _LIBCPP_END_NAMESPACE_STD 349 350 #endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H 351