1 // Written in the D programming language.
2
3 /++
4 $(P The `std.uni` module provides an implementation
5 of fundamental Unicode algorithms and data structures.
6 This doesn't include UTF encoding and decoding primitives,
7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8 for this functionality. )
9
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15 $(LREF byCodePoint)
16 $(LREF byGrapheme)
17 $(LREF decodeGrapheme)
18 $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21 $(LREF icmp)
22 $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25 $(LREF isAlpha)
26 $(LREF isAlphaNum)
27 $(LREF isCodepointSet)
28 $(LREF isControl)
29 $(LREF isFormat)
30 $(LREF isGraphical)
31 $(LREF isIntegralPair)
32 $(LREF isMark)
33 $(LREF isNonCharacter)
34 $(LREF isNumber)
35 $(LREF isPrivateUse)
36 $(LREF isPunctuation)
37 $(LREF isSpace)
38 $(LREF isSurrogate)
39 $(LREF isSurrogateHi)
40 $(LREF isSurrogateLo)
41 $(LREF isSymbol)
42 $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45 $(LREF NFC)
46 $(LREF NFD)
47 $(LREF NFKD)
48 $(LREF NormalizationForm)
49 $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52 $(LREF decompose)
53 $(LREF decomposeHangul)
54 $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57 $(LREF compose)
58 $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61 $(LREF CodepointInterval)
62 $(LREF CodepointSet)
63 $(LREF InversionList)
64 $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67 $(LREF codepointSetTrie)
68 $(LREF CodepointSetTrie)
69 $(LREF codepointTrie)
70 $(LREF CodepointTrie)
71 $(LREF toTrie)
72 $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75 $(LREF asCapitalized)
76 $(LREF asLowerCase)
77 $(LREF asUpperCase)
78 $(LREF isLower)
79 $(LREF isUpper)
80 $(LREF toLower)
81 $(LREF toLowerInPlace)
82 $(LREF toUpper)
83 $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86 $(LREF isUtfMatcher)
87 $(LREF MatcherConcept)
88 $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91 $(LREF lineSep)
92 $(LREF nelSep)
93 $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96 $(LREF allowedIn)
97 $(LREF combiningClass)
98 $(LREF Grapheme)
99 ))
100 ))
101
102 $(P All primitives listed operate on Unicode characters and
103 sets of characters. For functions which operate on ASCII characters
104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106 used throughout this module see the $(S_LINK Terminology, terminology) section
107 below.
108 )
109 $(P The focus of this module is the core needs of developing Unicode-aware
110 applications. To that effect it provides the following optimized primitives:
111 )
112 $(UL
113 $(LI Character classification by category and common properties:
114 $(LREF isAlpha), $(LREF isWhite) and others.
115 )
116 $(LI
117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118 )
119 $(LI
120 Converting text to any of the four normalization forms via $(LREF normalize).
121 )
122 $(LI
123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124 by user-perceived characters, that is by $(LREF Grapheme) clusters.
125 )
126 $(LI
127 Decomposing and composing of individual character(s) according to canonical
128 or compatibility rules, see $(LREF compose) and $(LREF decompose),
129 including the specific version for Hangul syllables $(LREF composeJamo)
130 and $(LREF decomposeHangul).
131 )
132 )
133 $(P It's recognized that an application may need further enhancements
134 and extensions, such as less commonly known algorithms,
135 or tailoring existing ones for region specific needs. To help users
136 with building any extra functionality beyond the core primitives,
137 the module provides:
138 )
139 $(UL
140 $(LI
141 $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142 Besides the typical set algebra it provides an unusual feature:
143 a D source code generator for detection of $(CODEPOINTS) in this set.
144 This is a boon for meta-programming parser frameworks,
145 and is used internally to power classification in small
146 sets like $(LREF isWhite).
147 )
148 $(LI
149 A way to construct optimal packed multi-stage tables also known as a
150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152 construct custom tries that map dchar to value.
153 The end result is a fast and predictable $(BIGOH 1) lookup that powers
154 functions like $(LREF isAlpha) and $(LREF combiningClass),
155 but for user-defined data sets.
156 )
157 $(LI
158 A useful technique for Unicode-aware parsers that perform
159 character classification of encoded $(CODEPOINTS)
is to avoid unnecessary decoding at all costs.
161 $(LREF utfMatcher) provides an improvement over the usual workflow
162 of decode-classify-process, combining the decoding and classification
163 steps. By extracting necessary bits directly from encoded
164 $(S_LINK Code unit, code units) matchers achieve
165 significant performance improvements. See $(LREF MatcherConcept) for
166 the common interface of UTF matchers.
167 )
168 $(LI
169 Generally useful building blocks for customized normalization:
170 $(LREF combiningClass) for querying combining class
171 and $(LREF allowedIn) for testing the Quick_Check
172 property of a given normalization form.
173 )
174 $(LI
175 Access to a large selection of commonly used sets of $(CODEPOINTS).
176 $(S_LINK Unicode properties, Supported sets) include Script,
177 Block and General Category. The exact contents of a set can be
178 observed in the CLDR utility, on the
179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180 of the Unicode website.
181 See $(LREF unicode) for easy and (optionally) compile-time checked set
182 queries.
183 )
184 )
185 $(SECTION Synopsis)
186 ---
187 import std.uni;
188 void main()
189 {
190 // initialize code point sets using script/block or property name
191 // now 'set' contains code points from both scripts.
192 auto set = unicode("Cyrillic") | unicode("Armenian");
193 // same thing but simpler and checked at compile-time
194 auto ascii = unicode.ASCII;
195 auto currency = unicode.Currency_Symbol;
196
197 // easy set ops
198 auto a = set & ascii;
199 assert(a.empty); // as it has no intersection with ascii
200 a = set | ascii;
201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202
203 // some properties of code point sets
204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205 // testing presence of a code point in a set
206 // is just fine, it is O(logN)
207 assert(!b['$']);
208 assert(!b['\u058F']); // Armenian dram sign
209 assert(b['¥']);
210
211 // building fast lookup tables, these guarantee O(1) complexity
212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213 auto oneTrie = toTrie!1(b);
214 // 2-level far more compact but typically slightly slower
215 auto twoTrie = toTrie!2(b);
216 // 3-level even smaller, and a bit slower yet
217 auto threeTrie = toTrie!3(b);
218 assert(oneTrie['£']);
219 assert(twoTrie['£']);
220 assert(threeTrie['£']);
221
222 // build the trie with the most sensible trie level
223 // and bind it as a functor
224 auto cyrillicOrArmenian = toDelegate(set);
225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226 assert(balance == "ընկեր!");
227 // compatible with bool delegate(dchar)
228 bool delegate(dchar) bindIt = cyrillicOrArmenian;
229
230 // Normalization
231 string s = "Plain ascii (and not only), is always normalized!";
232 assert(s is normalize(s));// is the same string
233
234 string nonS = "A\u0308ffin"; // A ligature
235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236 assert(nS == "Äffin");
237 assert(nS != nonS);
238 string composed = "Äffin";
239
240 assert(normalize!NFD(composed) == "A\u0308ffin");
241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242 assert(normalize!NFKD("2¹⁰") == "210");
243 }
244 ---
245 $(SECTION Terminology)
246 $(P The following is a list of important Unicode notions
247 and definitions. Any conventions used specifically in this
248 module alone are marked as such. The descriptions are based on the formal
249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250 chapter three of The Unicode Standard Core Specification.)
251 )
252 $(P $(DEF Abstract character) A unit of information used for the organization,
253 control, or representation of textual data.
254 Note that:
255 $(UL
256 $(LI When representing data, the nature of that data
257 is generally symbolic as opposed to some other
258 kind of data (for example, visual).
259 )
260 $(LI An abstract character has no concrete form
261 and should not be confused with a $(S_LINK Glyph, glyph).
262 )
263 $(LI An abstract character does not necessarily
264 correspond to what a user thinks of as a “character”
265 and should not be confused with a $(LREF Grapheme).
266 )
267 $(LI The abstract characters encoded (see Encoded character)
268 are known as Unicode abstract characters.
269 )
270 $(LI Abstract characters not directly
271 encoded by the Unicode Standard can often be
272 represented by the use of combining character sequences.
273 )
274 )
275 )
276 $(P $(DEF Canonical decomposition)
277 The decomposition of a character or character sequence
278 that results from recursively applying the canonical
279 mappings found in the Unicode Character Database
and those described in Conjoining Jamo Behavior
281 (section 12 of
282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283 )
284 $(P $(DEF Canonical composition)
285 The precise definition of the Canonical composition
286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287 Unicode Conformance) section 11.
288 Informally it's the process that does the reverse of the canonical
289 decomposition with the addition of certain rules
290 that e.g. prevent legacy characters from appearing in the composed result.
291 )
292 $(P $(DEF Canonical equivalent)
293 Two character sequences are said to be canonical equivalents if
294 their full canonical decompositions are identical.
295 )
296 $(P $(DEF Character) Typically differs by context.
297 For the purpose of this documentation the term $(I character)
298 implies $(I encoded character), that is, a code point having
299 an assigned abstract character (a symbolic meaning).
300 )
301 $(P $(DEF Code point) Any value in the Unicode codespace;
302 that is, the range of integers from 0 to 10FFFF (hex).
303 Not all code points are assigned to encoded characters.
304 )
305 $(P $(DEF Code unit) The minimal bit combination that can represent
306 a unit of encoded text for processing or interchange.
307 Depending on the encoding this could be:
308 8-bit code units in the UTF-8 (`char`),
309 16-bit code units in the UTF-16 (`wchar`),
310 and 32-bit code units in the UTF-32 (`dchar`).
311 $(I Note that in UTF-32, a code unit is a code point
312 and is represented by the D `dchar` type.)
313 )
314 $(P $(DEF Combining character) A character with the General Category
315 of Combining Mark(M).
316 $(UL
317 $(LI All characters with non-zero canonical combining class
318 are combining characters, but the reverse is not the case:
319 there are combining characters with a zero combining class.
320 )
321 $(LI These characters are not normally used in isolation
322 unless they are being described. They include such characters
323 as accents, diacritics, Hebrew points, Arabic vowel signs,
324 and Indic matras.
325 )
326 )
327 )
328 $(P $(DEF Combining class)
329 A numerical value used by the Unicode Canonical Ordering Algorithm
330 to determine which sequences of combining marks are to be
331 considered canonically equivalent and which are not.
332 )
333 $(P $(DEF Compatibility decomposition)
334 The decomposition of a character or character sequence that results
335 from recursively applying both the compatibility mappings and
336 the canonical mappings found in the Unicode Character Database, and those
described in Conjoining Jamo Behavior, until no characters
338 can be further decomposed.
339 )
340 $(P $(DEF Compatibility equivalent)
341 Two character sequences are said to be compatibility
342 equivalents if their full compatibility decompositions are identical.
343 )
344 $(P $(DEF Encoded character) An association (or mapping)
345 between an abstract character and a code point.
346 )
347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348 having been rasterized or otherwise imaged onto some display surface.
349 )
350 $(P $(DEF Grapheme base) A character with the property
351 Grapheme_Base, or any standard Korean syllable block.
352 )
353 $(P $(DEF Grapheme cluster) Defined as the text between
354 grapheme boundaries as specified by Unicode Standard Annex #29,
355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356 Important general properties of a grapheme:
357 $(UL
358 $(LI The grapheme cluster represents a horizontally segmentable
359 unit of text, consisting of some grapheme base (which may
360 consist of a Korean syllable) together with any number of
361 nonspacing marks applied to it.
362 )
363 $(LI A grapheme cluster typically starts with a grapheme base
364 and then extends across any subsequent sequence of nonspacing marks.
365 A grapheme cluster is most directly relevant to text rendering and
366 processes such as cursor placement and text selection in editing,
367 but may also be relevant to comparison and searching.
368 )
369 $(LI For many processes, a grapheme cluster behaves as if it was a
370 single character with the same properties as its grapheme base.
371 Effectively, nonspacing marks apply $(I graphically) to the base,
372 but do not change its properties.
373 )
374 )
375 $(P This module defines a number of primitives that work with graphemes:
376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377 All of them are using $(I extended grapheme) boundaries
378 as defined in the aforementioned standard annex.
379 )
380 )
381 $(P $(DEF Nonspacing mark) A combining character with the
382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383 )
384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385 )
386 $(SECTION Normalization)
387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388 or $(S_LINK Compatibility equivalent, compatibility equivalent)
389 characters in the Unicode Standard make it necessary to have a full, formal
390 definition of equivalence for Unicode strings.
391 String equivalence is determined by a process called normalization,
392 whereby strings are converted into forms which are compared
393 directly for identity. This is the primary goal of the normalization process,
394 see the function $(LREF normalize) to convert into any of
395 the four defined forms.
396 )
397 $(P A very important attribute of the Unicode Normalization Forms
398 is that they must remain stable between versions of the Unicode Standard.
399 A Unicode string normalized to a particular Unicode Normalization Form
400 in one version of the standard is guaranteed to remain in that Normalization
401 Form for implementations of future versions of the standard.
402 )
403 $(P The Unicode Standard specifies four normalization forms.
404 Informally, two of these forms are defined by maximal decomposition
405 of equivalent sequences, and two of these forms are defined
406 by maximal $(I composition) of equivalent sequences.
407 $(UL
408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409 canonical decomposition) of a character sequence.)
410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411 compatibility decomposition) of a character sequence.)
412 $(LI Normalization Form C (NFC): The canonical composition of the
413 $(S_LINK Canonical decomposition, canonical decomposition)
414 of a coded character sequence.)
415 $(LI Normalization Form KC (NFKC): The canonical composition
416 of the $(S_LINK Compatibility decomposition,
compatibility decomposition) of a character sequence.)
418 )
419 )
420 $(P The choice of the normalization form depends on the particular use case.
421 NFC is the best form for general text, since it's more compatible with
422 strings converted from legacy encodings. NFKC is the preferred form for
423 identifiers, especially where there are security concerns. NFD and NFKD
424 are the most useful for internal processing.
425 )
426 $(SECTION Construction of lookup tables)
427 $(P The Unicode standard describes a set of algorithms that
428 depend on having the ability to quickly look up various properties
of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430 it is not a trivial task to provide a space-efficient solution for
431 the multitude of properties.
432 )
433 $(P Common approaches such as hash-tables or binary search over
434 sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435 Hash-tables have enormous memory footprint and binary search
436 over intervals is not fast enough for some heavy-duty algorithms.
437 )
438 $(P The recommended solution (see Unicode Implementation Guidelines)
439 is using multi-stage tables that are an implementation of the
440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441 keys and a fixed number of stages. For the remainder of the section
442 this will be called a fixed trie. The following describes a particular
443 implementation that is aimed for the speed of access at the expense
444 of ideal size savings.
445 )
446 $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447 Split the number of bits in a key (code point, 21 bits) into 2 components
(e.g. 13 and 8). The first is the number of bits in the index of the trie
and the other is the number of bits in each page of the trie.
The layout of the trie is then an array of size 2^^bits-of-index followed
by an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452 )
$(P The number of pages is variable (but not less than 1)
454 unlike the number of entries in the index. The slots of the index
455 all have to contain a number of a page that is present. The lookup is then
456 just a couple of operations - slice the upper bits,
457 lookup an index for these, take a page at this index and use
458 the lower bits as an offset within this page.
459
Assuming that pages are laid out consecutively
461 in one array at `pages`, the pseudo-code is:
462 )
463 ---
464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466 ---
467 $(P Where if `elemsPerPage` is a power of 2 the whole process is
468 a handful of simple instructions and 2 array reads. Subsequent levels
469 of the trie are introduced by recursing on this notion - the index array
470 is treated as values. The number of bits in index is then again
471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472 )
473
474 $(P For completeness a level 1 trie is simply an array.
475 The current implementation takes advantage of bit-packing values
476 when the range is known to be limited in advance (such as `bool`).
477 See also $(LREF BitPacked) for enforcing it manually.
478 The major size advantage however comes from the fact
479 that multiple $(B identical pages on every level are merged) by construction.
480 )
481 $(P The process of constructing a trie is more involved and is hidden from
the user in the form of the convenience functions $(LREF codepointTrie),
483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484 In general a set or built-in AA with `dchar` type
485 can be turned into a trie. The trie object in this module
486 is read-only (immutable); it's effectively frozen after construction.
487 )
488 $(SECTION Unicode properties)
489 $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490 with specific helpers per category nested within. Consult the
491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492 when in doubt about the contents of a particular set.
493 )
494 $(P General category sets listed below are only accessible with the
495 $(LREF unicode) shorthand accessor.)
496 $(BOOKTABLE $(B General category ),
497 $(TR $(TH Abb.) $(TH Long form)
498 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499 $(TR $(TD L) $(TD Letter)
500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation))
501 $(TR $(TD Ll) $(TD Lowercase_Letter)
502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503 $(TR $(TD Lm) $(TD Modifier_Letter)
504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol))
505 $(TR $(TD Lo) $(TD Other_Letter)
506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol))
507 $(TR $(TD Lt) $(TD Titlecase_Letter)
508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol))
509 $(TR $(TD Lu) $(TD Uppercase_Letter)
510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol))
511 $(TR $(TD M) $(TD Mark)
512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol))
513 $(TR $(TD Mc) $(TD Spacing_Mark)
514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515 $(TR $(TD Me) $(TD Enclosing_Mark)
516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator))
517 $(TR $(TD Mn) $(TD Nonspacing_Mark)
518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator))
519 $(TR $(TD C) $(TD Other)
520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521 $(TR $(TD Cc) $(TD Control) $(TD Pf)
522 $(TD Final_Punctuation) $(TD -) $(TD Any))
523 $(TR $(TD Cf) $(TD Format)
524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525 )
526 $(P Sets for other commonly useful properties that are
527 accessible with $(LREF unicode):)
528 $(BOOKTABLE $(B Common binary properties),
529 $(TR $(TH Name) $(TH Name) $(TH Name))
530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase))
531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space))
533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark))
534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical))
535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm))
537 $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation))
538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase))
540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector))
541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space))
542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue))
543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start))
544 $(TR $(TD Hyphen) $(TD Other_Lowercase) )
545 $(TR $(TD ID_Continue) $(TD Other_Math) )
546 )
547 $(P Below is the table with block names accepted by $(LREF unicode.block).
548 Note that the shorthand version $(LREF unicode) requires "In"
549 to be prepended to the names of blocks so as to disambiguate
550 scripts and blocks.
551 )
552 $(BOOKTABLE $(B Blocks),
553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian))
554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols))
555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar))
556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A))
557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue))
558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo))
559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms))
560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham))
561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki))
562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic))
563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian))
564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian))
565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic))
566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567 $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya))
568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya))
569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa))
570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc))
571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician))
572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions))
573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement))
574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards))
575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area))
576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang))
577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols))
578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic))
579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan))
580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra))
581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada))
582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian))
583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala))
584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants))
585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng))
586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters))
587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials))
588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese))
589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement))
590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts))
591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A))
592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B))
593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation))
595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A))
596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B))
597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri))
598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac))
599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog))
600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa))
601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags))
602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le))
603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet))
605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols))
606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri))
607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil))
608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu))
609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana))
610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan))
612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh))
613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic))
615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics))
616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended))
617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai))
618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors))
619 $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement))
620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions))
621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms))
622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals))
624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) )
626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) )
627 )
628 $(P Below is the table with script names accepted by $(LREF unicode.script)
629 and by the shorthand version $(LREF unicode):)
630 $(BOOKTABLE $(B Scripts),
631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic))
632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian))
633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian))
634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic))
635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya))
636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya))
637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa))
638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician))
639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang))
640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan))
642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra))
643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada))
644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian))
645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala))
646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng))
647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese))
648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri))
649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac))
650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog))
651 $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa))
652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le))
653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham))
654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet))
655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri))
656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil))
657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu))
658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana))
659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai))
660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan))
661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh))
662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic))
663 $(TR $(TD Han) $(TD Ogham) $(TD Vai))
664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi))
665 )
666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667 $(BOOKTABLE $(B Hangul syllable type),
668 $(TR $(TH Abb.) $(TH Long form))
669 $(TR $(TD L) $(TD Leading_Jamo))
670 $(TR $(TD LV) $(TD LV_Syllable))
671 $(TR $(TD LVT) $(TD LVT_Syllable) )
672 $(TR $(TD T) $(TD Trailing_Jamo))
673 $(TR $(TD V) $(TD Vowel_Jamo))
674 )
675 References:
676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678 $(HTTP www.unicode.org, The Unicode Consortium),
679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
    $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
    $(HTTP www.unicode.org/uni2book/ch05.pdf,
        Unicode Implementation Guidelines),
    $(HTTP www.unicode.org/uni2book/ch03.pdf,
        Unicode Conformance)
685 Trademarks:
686 Unicode(tm) is a trademark of Unicode, Inc.
687
688 Copyright: Copyright 2013 -
689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690 Authors: Dmitry Olshansky
691 Source: $(PHOBOSSRC std/uni/package.d)
692 Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693
694 Macros:
695
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709 front, hasLength, hasSlicing, isForwardRange, isInputRange,
710 isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712 isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714
715 debug(std_uni) import std.stdio; // writefln, writeln
716
717 private:
718
719
// Copies `src` into `dest` element by element, visiting indices from the
// last one down to zero. Both slices must have the same length (asserted).
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
726
// Copies `src` into `dest` element by element, visiting indices from zero
// upwards. Both slices must have the same length (asserted).
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
733
// Compile-time flag: true for the targets listed below, false for every
// other target. PackedPtrImpl checks it before indexing its storage through
// ushort/uint pointers.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
743
// Public constants for three separator code points; they are explicitly
// marked `public` because the surrounding section of the module is `private:`.
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
747
// test the intro example
// (walks through set construction, set algebra, trie building,
// delegate binding and normalization in that order)
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    // 'A' followed by U+0308 (combining diaeresis), i.e. not yet composed
    string nonS = "A\u0308ffin";
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    // NFD splits the composed letter back into base + combining mark
    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
806
// 0x10FFFF - the last (highest) code point of the Unicode code space
enum lastDchar = 0x10FFFF;
808
// Checked conversion to an integral type T (when T differs from F):
// asserts that `from` lies within [T.min, T.max], then casts it to T.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from <= T.max && from >= T.min);
    return cast(T) from;
}

// Checked conversion to a BitPacked type T (when T differs from F):
// asserts that `from` does not exceed 2^^bitSizeOf!T-1, then constructs T
// from the value cast to TypeOfBitPacked!T.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(TypeOfBitPacked!T) from);
}

// Identity overload: source and target types are the same, nothing to check.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}
828
// Repeats `times` times the bit-pattern in `val`, assuming its length is
// exactly `bits` bits; the copies are laid out back to back starting at
// bit 0 of the result.
//
// Params:
//     times = (compile-time) number of copies of the pattern
//     bits = (compile-time) width of the pattern in bits
//     val = the pattern itself, expected to fit into `bits` bits
// Returns: a word holding `times` adjacent copies of `val`.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit replicated over the whole word is all-ones or zero;
        // handled separately because shifting by the full word width is invalid
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t, not an int literal: `1 << times` is limited
            // to 31 bits and breaks for 32 <= times < 64 on 64-bit targets
            return val ? (size_t(1) << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: replicate (times-1) copies, then append one more
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern and halve the number of copies
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
846
// Checks replicateBits for 1 to 10 copies of a 3-bit and a 2-bit pattern.
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 form 3*i consecutive one bits, i.e. 2^^(3*i)-1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
859
// multiple arrays squashed into one memory block
//
// Each of `Types` gets its own level: a packed array that starts at word
// `offsets[i]` of the shared `storage` and holds `sz[i]` items. Levels are
// laid out one after another in the order of `Types`.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for all levels;
    // `sizes[i]` is the number of items of level i (one size per type).
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            // a level starts right after the words of the previous one
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid-out data without copying it: takes per-level word
    // offsets, per-level item counts and the raw words themselves.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view (see packedArrayView) over the items of level n.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer (no length information) to the start of level n.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    // Number of items of level n; assigning to it resizes that level in place.
    template length(size_t n)
    {
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level n to `new_size` items, moving the data of all
        // following levels and adjusting their offsets accordingly.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                // from here on delta is measured in words, not items
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length += delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    // words vacated by the move now belong to level n
                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // first word of this level (raw_ptr treats level 0 as offset 0)
                static if (n == 0)
                    enum size_t first = 0;
                else
                    immutable size_t first = offsets[n];
                // words currently reserved for this level; measured rather than
                // derived from sz[n] because extending may over-allocate
                static if (n != dim-1)
                    immutable size_t reserved = offsets[n+1] - first;
                else
                    immutable size_t reserved = storage.length - first;
                immutable size_t needed = spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // release only the words no remaining item lives in
                immutable size_t delta = reserved > needed ? reserved - needed : 0;
                // move all data past this array towards the front,
                // forward direction as source and target may overlap
                static if (n != dim-1)
                {
                    immutable size_t tail = offsets[n+1];
                    copyForward(storage[tail .. $], storage[tail-delta .. $-delta]);

                    // adjust offsets last, they affect raw_slice
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the words reserved
    // for level n.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw storage words to `sink` as three
    // comma-separated bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first storage word of level n.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
987
// Exercises MultiArray resizing both at compile time (CTFE) and at run time:
// levels are resized one by one, refilled and the other levels re-checked.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, .., n
        // NOTE(review): the failure message prints the item index `i` after
        // "level:", not the level number `k`
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, .., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // fills level k with 1, 2, .., n
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // fills level k with n, n-1, .., 1
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // growing an earlier level must keep the later level intact
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // growing the last level keeps its first 17 items
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink the first, grow the second
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // evaluated during compilation
    auto rt = dg(); // evaluated at run time
}
1060
// Five-level bit-packed MultiArray: levels are sized, filled with their own
// indices and cross-checked so that writes to one level do not leak into others.
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // extractors for the bit ranges [13,16), [9,13), [6,9) and [0,6) of an index
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6, 9);
    alias fn4 = sliceBits!( 0, 6);
    // expects every item of level lvl to equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores each index into the item at that index of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // sizes are set from the last level to the first
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    // re-check the previously filled levels after the later writes
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1125
// Number of machine words (size_t) required to hold `new_len` items of
// `_bits` bits each, where the item width is first rounded the same way
// PackedArrayView rounds it.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum wordBits = 8*size_t.sizeof;
    enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView
    static if (bits > wordBits)
    {
        // every item spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
    else
    {
        // several items share one word; a partially filled word counts as whole
        enum perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord;
    }
}
1141
// True for BitPacked types, integral types, bool and character types.
template isBitPackableType(T)
{
    static if (isBitPacked!T)
        enum isBitPackableType = true;
    else
        enum isBitPackableType = isIntegral!T || is(T == bool) || isSomeChar!T;
}
1147
//============================================================================
// Selects the PackedArrayViewImpl instance for T: the item width is
// bitSizeOf!T, passed on as nextPow2(bits - 1) when wider than one bit
// and as 1 otherwise.
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1157
//unsafe and fast access to a chunk of RAM as if it contains packed values
// Selects the PackedPtrImpl instance for T using the same width rule as
// PackedArrayView: nextPow2(bits - 1) for widths above one bit, 1 otherwise.
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1167
// Pointer-like accessor to items of type T stored `bits` bits apart inside
// an array of machine words. It performs no bounds checking and does not
// own the memory it points to.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    // Wraps `ptr` as the first word of the packed data.
    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: picks word n / factor, shifts item n % factor down
    // to bit 0 and masks off the neighbouring items.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clears the slot of item n inside its word and ORs in
    // the new value; for integral T the value must fit the mask (contract).
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: items are exactly as wide as a built-in unsigned type U,
    // so the storage can be indexed directly through a U pointer.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Reads item idx; direct U-typed access is used only at run time on
        // little-endian targets, otherwise the shift-and-mask read is used.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Forwards a BitPacked value to the overload taking its raw type.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Writes item idx; same choice between direct and generic access
        // as in opIndex.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Generic path: always shift-and-mask.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Forwards a BitPacked value to the overload taking its raw type.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - `bits` low bits set, selects a single item once shifted down
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord = size_t.sizeof;
    size_t* origin; // first word of the packed data
}
1274
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// The view covers `limit` items starting `ofs` items past `ptr`; all indices
// accepted by its members are relative to the start of the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` items that starts `offset` items past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns: true if every item in [s, e) is zero.
    // Whole words in the middle of the range are tested in one comparison.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // the range does not reach past the first word boundary: test items
        // one by one so that nothing at or beyond `e` is ever inspected
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // leading items up to the first word boundary
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // trailing items past the last word boundary
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads the item at `idx`.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        // Forwards a BitPacked value to the overload taking its raw type.
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes `val` to the item at `idx`.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        // Forwards a BitPacked value to the overload taking its raw type.
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Assigns `val` to every item in [start, end); whole words in the
    // middle of the range are written at once with a replicated pattern.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for the offset of the view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Returns: a view of the items [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` is an item count relative to `ofs`, so compare `to`
        // directly; adding `ofs` would reject valid slices of offset views
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    // Returns: a view of all items.
    auto opSlice(){ return opSlice(0, length); }

    // Item-wise equality with another view; compares whole words when both
    // views start on a word boundary and cover a whole number of words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    // Number of items in the view.
    @property size_t length()const{ return limit; }

private:
    // round an item index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1432
1433
// Range over the index interval [from, to) of an indexable object that is
// referenced through a pointer; every access is forwarded to (*arr)[...].
private struct SliceOverIndexed(T)
{
    // whether T supports `t[i] = item` and `t[a .. b] = item` respectively
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));
    // Reads the item `idx` positions past the start of the slice.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    do
    {
        return (*arr)[from+idx];
    }

    // Writes the item `idx` positions past the start of the slice.
    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < to - from);
    }
    do
    {
       (*arr)[from+idx] = val;
    }

    // Sub-slice [a, b), both relative to the start of this slice.
    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // Assigns `val` to the underlying slice [start+from, end+from).
    // static if (assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    // Copy of the whole slice.
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const { return to-from;}

    alias opDollar = length;

    // range primitives
    @property bool empty()const { return from == to; }

    @property auto front()const { return (*arr)[from]; }

    static if (assignableIndex)
    @property void front(Item val) { (*arr)[from] = val; }

    @property auto back()const { return (*arr)[to-1]; }

    static if (assignableIndex)
    @property void back(Item val) { (*arr)[to-1] = val; }

    @property auto save() inout { return this; }

    void popFront() {  from++; }

    void popBack() {    to--; }

    // Item-wise comparison with anything that has `length` and indexing.
    bool opEquals(T)(auto ref T arr) const
    {
        if (arr.length != length)
            return false;
        for (size_t i=0; i <length; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to; // bounds of the slice inside *arr, `to` is exclusive
    T* arr;          // the object being sliced; not owned
}
1511
// SliceOverIndexed over a plain array must satisfy the random access range API.
@safe pure nothrow @nogc unittest
{
    static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
}
1516
// Creates a read-only SliceOverIndexed covering [a, b) of the object `x`
// points to.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!(const(T))(a, b, x);
}

// Mutable counterpart of the overload above.
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!T(a, b, x);
}
1530
// Walks SliceOverIndexed through its range primitives, slicing,
// comparison and slice assignment on top of an int[].
@safe unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // the slice now covers [3, 5, 8]
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // equality is by content, independent of the underlying array
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    // slice assignment writes through to the underlying array
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
    assert(nullSlice.empty);
}
1566
// Convenience constructor: a PackedArrayView!T of `items` items that starts
// at the very first item `ptr` points to (offset 0).
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    return inout(PackedArrayView!T)(ptr, 0, items);
}
1571
1572
//============================================================================
// Partially unrolled binary search using Shar's method
//============================================================================

// Generates the source of an unrolled binary search meant to be mixed into
// a function that has `range`, `needle`, `pred`, `idx` and the current step
// `m` in scope (see switchUniformLowerBound).
//
// The generated switch jumps to the case matching the current step and then
// falls through all smaller steps down to the final single-item test.
//
// Params:
//     size = the largest supported step times two; must be a power of two
// Returns: D code as a string.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr is undefined for zero, and the step `m` is zero when the range has
    // a single item - route that case explicitly to `case 0`
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = bsr(size);
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        // one case per step v; "m" and "pow" are placeholders substituted
        // with the step and its case label
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1611
// True when `sz` has at most one bit set, i.e. it is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // clearing the lowest set bit leaves nothing behind in exactly those cases
    immutable rest = sz & (sz-1);
    return rest == 0;
}
1617
// Binary search with uniformly halving steps over a range whose length is
// a power of two (asserted). Returns the index reached after advancing past
// every probed item for which `pred(item, needle)` holds.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t pos = 0;
    for (size_t step = range.length/2; step != 0; step /= 2)
    {
        if (pred(range[pos+step], needle))
            pos += step;
    }
    // final probe of the item the search stopped at
    if (pred(range[pos], needle))
        pos += 1;
    return pos;
}
1633
// Same search as uniformLowerBound, but only steps of `max` and above are
// handled by the loop; the remaining steps run through the unrolled switch
// produced by genUnrolledSwitchSearch. The range length must be a power of
// two (asserted).
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    // the mixed-in code reads `m`, `range`, `needle`, `pred` and updates `idx`
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1649
// Extends a lower-bound search that only works on power-of-two lengths
// (`uniLowerBound`) to ranges of arbitrary length by searching one of two
// power-of-two sized windows of the range.
template sharMethod(alias uniLowerBound)
{
    // Params:
    //     _pred = binary predicate (string or callable), "a<b" by default
    //     range = random access range to search in
    //     needle = value to look for
    // Returns: the index computed by `uniLowerBound`, translated back to
    //     `range` when the tail window was searched; 0 for an empty range.
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        // n - length of the power-of-two sized head window
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            size_t k = nextPow2(range.length - n + 1);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}
1672
// The two flavours of the search: plain loop and partially unrolled switch.
alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1675
// Both Shar's method searches must agree with std.range's lowerBound for
// every needle around a sorted array of multiples of 5, and for an empty array.
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference implementation
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
}
1699 //============================================================================
1700
1701 @safe
1702 {
// hope to see similar stuff in public interface... once Allocators are out
1704 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1705
// Replaces dest[from .. to] with the items of `stuff`, resizing `dest`
// as required.
//
// Params:
//     Policy = `void` to resize through `dest.length`, otherwise a policy
//              whose `realloc` is used to resize
//     dest = container to modify in place
//     from = start of the replaced interval
//     to = end (exclusive) of the replaced interval
//     stuff = replacement items
// Returns: the index just past the inserted items (from + stuff.length).
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t delta = to - from;
    size_t stuff_end = from+stuff.length;
    if (stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old by delta
        static if (is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        // shift the items after `to` towards the end to open a gap,
        // last item first
        copyBackwards(dest[to .. dest.length-delta],
            dest[to+delta .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (stuff.length == delta)
    {
        // same length: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        copy(stuff, dest[from .. stuff_end]);
        // close the gap by shifting the items after `to` towards the front
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length-delta]);
        static if (is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}
1740
1741
// Simple storage manipulation policy
// All operations use built-in array primitives (`dup`, `new`, `length`, `~=`).
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Returns a copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // Returns a new array of `size` items.
    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    // Returns `arr` resized to `sz` items.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with `stuff` in place.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value, converted to T via `force`.
    static void append(T, V)(ref T[] arr, V value)
    if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends all items of an input range.
    static void append(T, V)(ref T[] arr, V value)
    if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Drops the reference to a mutable array; in debug builds the items are
    // first overwritten with a marker value.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
    if (isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }

    // Drops the reference to a qualified (e.g. const) array.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
    if (isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }
}
1796
// Storage manipulation policy with the same set of operations as GcPolicy,
// implemented on top of enforceMalloc / enforceRealloc / pureFree.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a copy of `arr` in freshly allocated memory.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates room for `size` items; the byte count is checked for
    // overflow (assert(0) on overflow).
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` items and returns the resized array;
    // a size of zero frees the memory and returns null.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with `stuff`, resizing through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value, converted to T via `force`.
    static void append(T, V)(ref T[] arr, V value)
    if (!isInputRange!V)
    {
        // the new length must be representable
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all items of an input range with a known length.
    static void append(T, V)(ref T[] arr, V value)
    if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        // fill the newly added tail
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the memory of `arr` (if any) and resets it to null.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1893
1894 //build hack
1895 alias _RealArray = CowArray!ReallocPolicy;
1896
// Exercises ReallocPolicy.replaceImpl: insertion at the front, removal,
// whole-array replacement and replacement in the middle.
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Runs replaceImpl on `orig`, reports whether the outcome equals
        // `result`, and releases `orig` on the way out.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
            string file = __FILE__, size_t line = __LINE__)
        {
            replaceImpl(orig, from, to, toReplace);
            scope(exit) destroy(orig);
            return equal(orig, result);
        }
        // Builds a policy-allocated array from the given values.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1926
1927 /**
    Tests if T is some kind of set of code points. Intended for template constraints.
1929 */
public template isCodepointSet(T)
{
    // True exactly when T is an instantiation of InversionList,
    // whatever its template arguments are.
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1937
1938 /**
1939 Tests if `T` is a pair of integers that implicitly convert to `V`.
1940 The following code must compile for any pair `T`:
1941 ---
1942 (T x){ V a = x[0]; V b = x[1];}
1943 ---
1944 The following must not compile:
1945 ---
1946 (T x){ V c = x[2];}
1947 ---
1948 */
public template isIntegralPair(T, V=uint)
{
    // x[0] and x[1] must both initialize a V ...
    private enum hasTwoSlots = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // ... while x[2] must not be usable to initialize a V.
    private enum hasThirdSlot = is(typeof((T x){ V c = x[2]; }));

    enum isIntegralPair = hasTwoSlots && !hasThirdSlot;
}
1954
1955
1956 /**
1957 The recommended default type for set of $(CODEPOINTS).
1958 For details, see the current implementation: $(LREF InversionList).
1959 */
1960 public alias CodepointSet = InversionList!GcPolicy;
1961
1962
1963 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1964 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1965 // hence below doesn't seem to work
1966 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1967
1968 /**
1969 The recommended type of $(REF Tuple, std,_typecons)
1970 to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
1971 Any interval type should pass $(LREF isIntegralPair) trait.
1972 */
public struct CodepointInterval
{
pure:
    // Storage for the pair: index 0 is the lower bound, index 1 the upper.
    uint[2] _tuple;
    // Lets an interval be indexed like the underlying two-element array.
    alias _tuple this;

@safe pure nothrow @nogc:

    // Builds the interval from its lower and upper bound.
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }

    // Equal to anything indexable whose first two elements match the bounds.
    bool opEquals(T)(T val) const
    {
        if (!(this[0] == val[0]))
            return false;
        return this[1] == val[1];
    }

    // Lower bound, by reference.
    @property ref inout(uint) a() return inout
    {
        return _tuple[0];
    }

    // Upper bound, by reference.
    @property ref inout(uint) b() return inout
    {
        return _tuple[1];
    }
}
1993
1994 /**
1995 $(P
1996 `InversionList` is a set of $(CODEPOINTS)
1997 represented as an array of open-right [a, b$(RPAREN)
1998 intervals (see $(LREF CodepointInterval) above).
1999 The name comes from the way the representation reads left to right.
2000 For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2001 plus a singular value 60 looks like this:
2002 )
2003 ---
2004 10, 50, 60, 61, 80, 90
2005 ---
2006 $(P
2007 The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2009 the contrary). Then switch positive/negative after each
2010 number passed from left to right.
2011 )
2012 $(P This way negative spans until 10, then positive until 50,
2013 then negative until 60, then positive until 61, and so on.
2014 As seen this provides a space-efficient storage of highly redundant data
2015 that comes in long runs. A description which Unicode $(CHARACTER)
2016 properties fit nicely. The technique itself could be seen as a variation
2017 on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2018 )
2019
2020 $(P Sets are value types (just like `int` is) thus they
2021 are never aliased.
2022 )
2023 Example:
2024 ---
2025 auto a = CodepointSet('a', 'z'+1);
2026 auto b = CodepointSet('A', 'Z'+1);
2027 auto c = a;
2028 a = a | b;
2029 assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2030 assert(a != c);
2031 ---
2032 $(P See also $(LREF unicode) for simpler construction of sets
2033 from predefined ones.
2034 )
2035
2036 $(P Memory usage is 8 bytes per each contiguous interval in a set.
2037 The value semantics are achieved by using the
2038 $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2039 and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2040 )
2041
2042 Note:
2043 $(P It's not recommended to rely on the template parameters
2044 or the exact type of a current $(CODEPOINT) set in `std.uni`.
2045 The type and parameters may change when the standard
2046 allocators design is finalized.
2047 Use $(LREF isCodepointSet) with templates or just stick with the default
2048 alias $(LREF CodepointSet) throughout the whole code base.
2049 )
2050 */
2051 public struct InversionList(SP=GcPolicy)
2052 {
2053 import std.range : assumeSorted;
2054
2055 /**
2056 Construct from another code point set of any type.
2057 */
this(Set)(Set set) pure
if (isCodepointSet!Set)
{
    // Flatten the intervals of `set` into a boundary array a0, b0, a1, b1, ...
    // The source is already a code point set, so no sanitize() pass is run.
    uint[] arr;
    foreach (v; set.byInterval)
    {
        // Grow the array through the storage policy SP, exactly like the
        // range constructor does, so that the buffer handed to
        // CowArray!(SP) was allocated by the policy that will own it
        // (plain `~=` always allocates from the GC heap).
        SP.append(arr, v.a);
        SP.append(arr, v.b);
    }
    data = CowArray!(SP).reuse(arr);
}
2069
2070 /**
2071 Construct a set from a forward range of code point intervals.
2072 */
this(Range)(Range intervals) pure
if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
{
    // Flatten the pairs into a boundary array a0, b0, a1, b1, ...
    uint[] arr;
    foreach (v; intervals)
    {
        // Read the bounds by index: isIntegralPair only guarantees that
        // v[0] and v[1] exist, not that the element has `.a`/`.b` members
        // (e.g. uint[2] passes the constraint but has neither).
        SP.append(arr, v[0]);
        SP.append(arr, v[1]);
    }
    data = CowArray!(SP).reuse(arr);
    sanitize(); //enforce invariant: sort intervals etc.
}
2085
2086 //helper function that avoids sanity check to be CTFE-friendly
private static fromIntervals(Range)(Range intervals) pure
{
    import std.algorithm.iteration : map;
    import std.range : roundRobin;

    // Two passes over saved copies of the input: one yielding the first
    // element of each pair, one yielding the second.
    auto lows = intervals.save.map!"a[0]"();
    auto highs = intervals.save.map!"a[1]"();

    // roundRobin alternates between them: a0, b0, a1, b1, ...
    // No sanitize() call here, which keeps this usable in CTFE.
    InversionList result;
    result.data = CowArray!(SP)(roundRobin(lows, highs));
    return result;
}
//ditto until sort is CTFE-able
private static fromIntervals()(uint[] intervals...) pure
in
{
    import std.conv : text;
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    // size_t index so the counter has the same type as intervals.length
    for (size_t i = 0; i < intervals.length; i += 2)
    {
        auto a = intervals[i], b = intervals[i+1];
        // The check also rejects empty intervals (a == b), hence ">=".
        assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
    }
}
do
{
    // Adopts the bounds as given; unlike the public constructor there is
    // no sanitize() pass, so the input is stored without sorting or merging.
    InversionList set;
    set.data = CowArray!(SP)(intervals);
    return set;
}
2115
2116 /**
2117 Construct a set from plain values of code point intervals.
2118 */
this()(uint[] intervals...)
in
{
    import std.conv : text;
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    // size_t index so the counter has the same type as intervals.length
    for (size_t i = 0; i < intervals.length; i += 2)
    {
        auto a = intervals[i], b = intervals[i+1];
        // The check also rejects empty intervals (a == b), hence ">=".
        assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
    }
}
do
{
    // Store the raw bounds a0, b0, a1, b1, ... then sort and merge them.
    data = CowArray!(SP)(intervals);
    sanitize(); //enforce invariant: sort intervals etc.
}
2135
2136 ///
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    // Latin lowercase plus Cyrillic lowercase, given as [a, b) bounds
    auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
    foreach (v; 'a'..'z'+1)
        assert(set[v]);
    // Cyrillic lowercase interval
    foreach (v; 'а'..'я'+1)
        assert(set[v]);
    //specific order is not required, intervals may intersect
    auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
    //the same end result
    assert(set2.byInterval.equal(set.byInterval));
    // test constructor this(Range)(Range intervals)
    auto chessPiecesWhite = CodepointInterval(9812, 9818);
    auto chessPiecesBlack = CodepointInterval(9818, 9824);
    auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
    foreach (v; '♔'..'♟'+1)
        assert(set3[v]);
}
2158
2159 /**
2160 Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2161 */
@property auto byInterval() scope
{
    // TODO: change this to data[] once the -dip1000 errors have been fixed
    // see e.g. https://github.com/dlang/phobos/pull/6638
    import std.array : array;
    // `data.array` is evaluated once here (the use inside typeof is not
    // executed); presumably it yields a separate array copy of the bounds,
    // per the TODO above - confirm against CowArray.
    return Intervals!(typeof(data.array))(data.array);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

    // intervals are open on the right, so 'D'+1 == 'E' is the upper bound
    assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
}
2179
// Returns all intervals of this set as a newly built array of
// CodepointInterval (the Intervals range over `data[]` is materialized
// with std.array.array).
package(std) @property const(CodepointInterval)[] intervals() const
{
    import std.array : array;
    return Intervals!(typeof(data[]))(data[]).array;
}
2185
2186 /**
2187 Tests the presence of code point `val` in this set.
2188 */
bool opIndex(uint val) const
{
    // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1
    // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
    // Only the lowest bit of the lower-bound position is used: an odd
    // position means `val` falls inside an interval.
    return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
}

///
pure @safe unittest
{
    auto gothic = unicode.Gothic;
    // Gothic letter ahsa
    assert(gothic['\U00010330']);
    // no ascii in Gothic obviously
    assert(!gothic['$']);
}
2205
2206
2207 // Linear scan for `ch`. Useful only for small sets.
2208 // TODO:
2209 // used internally in std.regex
2210 // should be properly exposed in a public API ?
package(std) auto scanFor()(dchar ch) const
{
    immutable boundCount = data.length;
    // Find the first boundary greater than `ch`; its index is odd exactly
    // when `ch` lies inside an interval.
    foreach (size_t idx; 0 .. boundCount)
    {
        if (ch < data[idx])
            return idx & 1;
    }
    // `ch` is not below any boundary: not in the set.
    return 0;
}
2219
2220 /// Number of $(CODEPOINTS) in this set
@property size_t length()
{
    // Add up the widths (b - a) of all [a, b) intervals.
    size_t total = 0;
    for (auto ivs = byInterval; !ivs.empty; ivs.popFront())
    {
        auto iv = ivs.front;
        total += iv.b - iv.a;
    }
    return total;
}
2230
2231 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2232 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2233 //============================================================================
2234 public:
2235 /**
2236 $(P Sets support natural syntax for set algebra, namely: )
2237 $(BOOKTABLE ,
2238 $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2239 $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2240 $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2241 $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2242 $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2243 )
2244 */
This opBinary(string op, U)(U rhs)
if (isCodepointSet!U || is(U:dchar))
{
    static if (op == "&" || op == "|" || op == "~")
    {
        // Symmetric operators: the operands may be swapped, so when `rhs`
        // is an unqualified set (already a private copy) it is updated in
        // place and returned.
        static if (!is(U:dchar) && is(Unqual!U == U))
        {
            mixin("rhs "~op~"= this;");
            return rhs;
        }
        else
        {
            // `rhs` is a single code point or a qualified set:
            // work on a copy of this set instead.
            auto tmp = this;
            mixin("tmp "~op~"= rhs;");
            return tmp;
        }
    }
    else static if (op == "-")
    {
        // Anti-symmetric: operands must keep their order.
        auto tmp = this;
        tmp -= rhs;
        return tmp;
    }
    else
        static assert(0, "no operator "~op~" defined for Set");
}
2281
2282 ///
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto lower = unicode.LowerCase;
    auto upper = unicode.UpperCase;
    auto ascii = unicode.ASCII;

    assert((lower & upper).empty); // no intersection
    auto lowerASCII = lower & ascii;
    assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
    // throw away all of the lowercase ASCII
    assert((ascii - lower).length == 128 - 26);

    // symmetric difference: in exactly one of the two sets
    auto onlyOneOf = lower ~ ascii;
    assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
    assert(onlyOneOf['$']); // ASCII and not lowercase
    assert(!onlyOneOf['a']); // ASCII and lowercase
    assert(onlyOneOf['я']); // not ASCII but lowercase

    // throw away all cased letters from ASCII
    auto noLetters = ascii - (lower | upper);
    assert(noLetters.length == 128 - 26*2);
}
2308
2309 /// The 'op=' versions of the above overloaded operators.
ref This opOpAssign(string op, U)(U rhs)
if (isCodepointSet!U || is(U:dchar))
{
    // Every branch updates this set in place; the set itself is returned
    // by reference at the end.
    static if (op == "|") // union
    {
        static if (is(U:dchar))
            this.addInterval(rhs, rhs+1); // single code point as [rhs, rhs+1)
        else
            this.add(rhs);
    }
    else static if (op == "&") // intersection
    {
        this.intersect(rhs); // overloaded
    }
    else static if (op == "-") // set difference
    {
        this.sub(rhs); // overloaded
    }
    else static if (op == "~") // symmetric set difference
    {
        // (this | rhs) minus what both have in common
        auto common = this & rhs;
        this |= rhs;
        this -= common;
    }
    else
        static assert(0, "no operator "~op~" defined for Set");

    return this;
}
2337
2338 /**
2339 Tests the presence of codepoint `ch` in this set,
2340 the same as $(LREF opIndex).
2341 */
bool opBinaryRight(string op: "in", U)(U ch) const
if (is(U : dchar))
{
    // `ch in set` is just another spelling of `set[ch]`
    return this[ch];
}

///
pure @safe unittest
{
    assert('я' in unicode.Cyrillic);
    assert(!('z' in unicode.Cyrillic));
}
2354
2355
2356
2357 /**
2358 * Obtains a set that is the inversion of this set.
2359 *
2360 * See_Also: $(LREF inverted)
2361 */
auto opUnary(string op: "!")()
{
    // `!set` forwards to the `inverted` property
    return this.inverted;
}
2366
2367 /**
2368 A range that spans each $(CODEPOINT) in this set.
2369 */
@property auto byCodepoint()
{
    // Input range that walks the intervals of a set and yields each code
    // point they contain, one interval after another.
    static struct CodepointRange
    {
        this(This set)
        {
            r = set.byInterval;
            if (r.empty)
                return;
            cur = r.front.a;
        }

        @property dchar front() const
        {
            return cast(dchar) cur;
        }

        @property bool empty() const
        {
            return r.empty;
        }

        void popFront()
        {
            ++cur;
            // When `cur` has run past the current interval, continue at
            // the start of the next one; stop when no intervals are left.
            for (;;)
            {
                if (cur < r.front.b)
                    return;
                r.popFront();
                if (r.empty)
                    return;
                cur = r.front.a;
            }
        }
    private:
        uint cur; // current code point
        typeof(This.init.byInterval) r; // remaining intervals
    }

    return CodepointRange(this);
}
2409
2410 ///
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto set = unicode.ASCII;
    // The comparison result must be asserted; a bare `equal(...)` call
    // computes the answer and throws it away, so the test checks nothing.
    assert(set.byCodepoint.equal(iota(0, 0x80)));
}
2419
2420 /**
    $(P Obtain textual representation of this set in the form of
2422 open-right intervals and feed it to `sink`.
2423 )
2424 $(P Used by various standard formatting facilities such as
2425 $(REF formattedWrite, std,format), $(REF write, std,stdio),
2426 $(REF writef, std,stdio), $(REF to, std,conv) and others.
2427 )
2428 Example:
2429 ---
2430 import std.conv;
2431 assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2432 ---
2433 */
2434
2435 private import std.format.spec : FormatSpec;
2436
2437 /***************************************
2438 * Obtain a textual representation of this InversionList
2439 * in form of open-right intervals.
2440 *
2441 * The formatting flag is applied individually to each value, for example:
2442 * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2443 * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2444 * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2445 */
void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
{
    import std.format.write : formatValue;

    // Emit "[a..b)" for every interval, separated by single spaces.
    // An empty set produces no output at all.
    bool needSeparator = false;
    for (auto ivs = byInterval; !ivs.empty; ivs.popFront())
    {
        if (needSeparator)
            put(sink, " ");
        needSeparator = true;

        auto iv = ivs.front;
        put(sink, "[");
        formatValue(sink, iv.a, fmt); // `fmt` is applied to each bound separately
        put(sink, "..");
        formatValue(sink, iv.b, fmt);
        put(sink, ")");
    }
}
2467
2468 ///
pure @safe unittest
{
    import std.conv : to;
    import std.format : format;
    import std.uni : unicode;

    assert(unicode.Cyrillic.to!string ==
        "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");

    // The specs '%s' and '%d' are equivalent to the to!string call above.
    assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);

    // '#' adds the 0x / 0X prefix to every bound
    assert(format("%#x", unicode.Cyrillic) ==
        "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
        ~"[0xa640..0xa698) [0xa69f..0xa6a0)");

    assert(format("%#X", unicode.Cyrillic) ==
        "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
        ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
}

pure @safe unittest
{
    import std.exception : assertThrown;
    import std.format : format, FormatException;
    // an unknown format character is rejected with a FormatException
    assertThrown!FormatException(format("%z", unicode.ASCII));
}
2496
2497
2498 /**
2499 Add an interval [a, b$(RPAREN) to this set.
2500 */
ref add()(uint a, uint b)
{
    addInterval(a, b);
    // returning this set by reference allows chained calls
    return this;
}

///
pure @safe unittest
{
    CodepointSet someSet;
    someSet.add('0', '5').add('A','Z'+1);
    // [0, 5) followed by [5, 9]: together they cover all digits
    someSet.add('5', '9'+1);
    assert(someSet['0']);
    assert(someSet['5']);
    assert(someSet['9']);
    assert(someSet['Z']);
}
2518
2519 private:
2520
package(std) // used from: std.regex.internal.parser
ref intersect(U)(U rhs)
if (isCodepointSet!U)
{
    // In-place intersection with another set. For every interval [a, b)
    // of `rhs`: dropUpTo(a) then skipUpTo(b), each continuing from the
    // marker returned by the previous call.
    Marker mark;
    foreach ( i; rhs.byInterval)
    {
        mark = this.dropUpTo(i.a, mark);
        mark = this.skipUpTo(i.b, mark);
    }
    // one final drop after the last interval of `rhs`, up to uint.max
    this.dropUpTo(uint.max, mark);
    return this;
}
2534
// In-place intersection with a single code point: the result is either
// the one-element set [ch, ch+1) or the empty set.
ref intersect()(dchar ch)
{
    foreach (i; byInterval)
        if (i.a <= ch && ch < i.b)
            return this = This.init.add(ch, ch+1);
    // `ch` is in none of the intervals
    this = This.init;
    return this;
}

pure @safe unittest
{
    assert(unicode.Cyrillic.intersect('-').byInterval.empty);
}
2548
// Single code point overload of `sub`; forwards to subChar.
ref sub()(dchar ch)
{
    return subChar(ch);
}
2553
2554 // same as the above except that skip & drop parts are swapped
package(std) // used from: std.regex.internal.parser
ref sub(U)(U rhs)
if (isCodepointSet!U)
{
    // In-place set difference. For every interval [a, b) of `rhs`:
    // skipUpTo(a) then dropUpTo(b), each continuing from the marker
    // returned by the previous call.
    Marker mark;
    foreach (i; rhs.byInterval)
    {
        mark = this.skipUpTo(i.a, mark);
        mark = this.dropUpTo(i.b, mark);
    }
    return this;
}
2567
package(std) // used from: std.regex.internal.parser
ref add(U)(U rhs)
if (isCodepointSet!U)
{
    // In-place union: add the intervals of `rhs` one by one, passing the
    // marker returned by each addInterval call as the hint for the next.
    Marker start;
    foreach (i; rhs.byInterval)
    {
        start = addInterval(i.a, i.b, start);
    }
    return this;
}
2579
2580 // end of mixin-able part
2581 //============================================================================
2582 public:
2583 /**
2584 Obtains a set that is the inversion of this set.
2585
2586 See the '!' $(LREF opUnary) for the same but using operators.
2587 */
@property auto inverted()
{
    InversionList inversion = this;
    // The inverse of the empty set is the full range [0, lastDchar+1).
    if (inversion.data.length == 0)
    {
        inversion.addInterval(0, lastDchar+1);
        return inversion;
    }
    // Toggle the boundary at 0: insert it at the front if it is absent,
    // remove it if it is present.
    if (inversion.data[0] != 0)
        genericReplace(inversion.data, 0, 0, [0]);
    else
        genericReplace(inversion.data, 0, 1, cast(uint[]) null);
    // Same toggle for the boundary lastDchar+1 at the back.
    // NOTE(review): the test reads this set's own `data`, not
    // `inversion.data`; presumably the last element is still the same
    // because only the front was edited above - confirm.
    if (data[data.length-1] != lastDchar+1)
        genericReplace(inversion.data,
            inversion.data.length, inversion.data.length, [lastDchar+1]);
    else
        genericReplace(inversion.data,
            inversion.data.length-1, inversion.data.length, cast(uint[]) null);

    return inversion;
}

///
pure @safe unittest
{
    auto set = unicode.ASCII;
    // union with the inverse gets all of the code points in the Unicode
    assert((set | set.inverted).length == 0x110000);
    // no intersection with the inverse
    assert((set & set.inverted).empty);
}
2619
// Generates D source text for a function `bool funcName(dchar ch)` that
// tests membership of `ch` in the given intervals. An empty `funcName`
// is replaced by the literal name "function".
package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
{
    import std.algorithm.searching : countUntil;
    import std.format : format;
    // below this many intervals a linear chain of tests is emitted
    enum maxBinary = 3;
    // Emits one block of sequential `if` tests over `ivals`.
    static string linearScope(R)(R ivals, string indent)
    {
        string result = indent~"{\n";
        string deeper = indent~" ";
        foreach (ival; ivals)
        {
            immutable span = ival[1] - ival[0];
            assert(span != 0);
            if (span == 1)
            {
                result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
            }
            else if (span == 2)
            {
                result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                    deeper, ival[0], ival[0]+1);
            }
            else
            {
                if (ival[0] != 0) // dchar is unsigned and < 0 is useless
                    result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
            }
        }
        result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
        return result;
    }

    // Picks between the linear chain (few intervals) and a further
    // bisection around the middle interval.
    static string binaryScope(R)(R ivals, string indent) @safe
    {
        // time to do unrolled comparisons?
        if (ivals.length < maxBinary)
            return linearScope(ivals, indent);
        else
            return bisect(ivals, ivals.length/2, indent);
    }

    // not used yet: if/else binary search is far better with DMD as of 2.061
    // and GDC is doing fine job either way
    static string switchScope(R)(R ivals, string indent)
    {
        string result = indent~"switch (ch){\n";
        string deeper = indent~" ";
        foreach (ival; ivals)
        {
            if (ival[0]+1 == ival[1])
            {
                result ~= format("%scase %s: return true;\n",
                    deeper, ival[0]);
            }
            else
            {
                result ~= format("%scase %s: .. case %s: return true;\n",
                    deeper, ival[0], ival[1]-1);
            }
        }
        result ~= deeper~"default: return false;\n"~indent~"}\n";
        return result;
    }

    // Emits a three-way split around interval `range[idx]`: below it,
    // inside it, and above it. The two outer parts recurse via binaryScope.
    static string bisect(R)(R range, size_t idx, string indent)
    {
        string deeper = indent ~ " ";
        // bisect on one [a, b) interval at idx
        string result = indent~"{\n";
        // less branch, < a
        result ~= format("%sif (ch < %s)\n%s",
            deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
        // middle point, >= a && < b
        result ~= format("%selse if (ch < %s) return true;\n",
            deeper, range[idx][1]);
        // greater or equal branch, >= b
        result ~= format("%selse\n%s",
            deeper, binaryScope(range[idx+1..$], deeper));
        return result~indent~"}\n";
    }

    string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
        funcName.empty ? "function" : funcName);
    // special case first bisection to be on ASCII vs beyond
    // (index of the first interval whose lower bound exceeds 0x80)
    auto tillAscii = countUntil!"a[0] > 0x80"(range);
    if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
        code ~= binaryScope(range, "");
    else
        code ~= bisect(range, tillAscii, "");
    return code;
}
2712
2713 /**
2714 Generates string with D source code of unary function with name of
2715 `funcName` taking a single `dchar` argument. If `funcName` is empty
2716 the code is adjusted to be a lambda function.
2717
2718 The function generated tests if the $(CODEPOINT) passed
2719 belongs to this set or not. The result is to be used with string mixin.
2720 The intended usage area is aggressive optimization via meta programming
2721 in parser generators and the like.
2722
2723 Note: Use with care for relatively small or regular sets. It
    could end up being slower than just using multi-staged tables.
2725
2726 Example:
2727 ---
2728 import std.stdio;
2729
    // construct set directly from [a, b$(RPAREN) intervals
2731 auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2732 writeln(set);
2733 writeln(set.toSourceCode("func"));
2734 ---
2735
2736 The above outputs something along the lines of:
2737 ---
2738 bool func(dchar ch) @safe pure nothrow @nogc
2739 {
2740 if (ch < 45)
2741 {
2742 if (ch == 10 || ch == 11) return true;
2743 return false;
2744 }
2745 else if (ch < 65) return true;
2746 else
2747 {
2748 if (ch < 100) return false;
2749 if (ch < 200) return true;
2750 return false;
2751 }
2752 }
2753 ---
2754 */
string toSourceCode(string funcName="")
{
    import std.array : array;
    // materialize the intervals and hand them to the static overload
    auto range = byInterval.array();
    return toSourceCode(range, funcName);
}
2761
2762 /**
2763 True if this set doesn't contain any $(CODEPOINTS).
2764 */
@property bool empty() const
{
    // no stored boundaries means no intervals
    return data.length == 0;
}

///
pure @safe unittest
{
    CodepointSet emptySet;
    assert(emptySet.length == 0);
    assert(emptySet.empty);
}
2777
2778 private:
2779 alias This = typeof(this);
2780 alias Marker = size_t;
2781
2782 // a random-access range of integral pairs
static struct Intervals(Range)
{
    import std.range.primitives : hasAssignableElements;

    // Views the flat boundary range `sp` (a0, b0, a1, b1, ...) as a range
    // of CodepointInterval. `start` and `end` delimit the active window
    // and are indices into `slice`, i.e. they advance in steps of two.

    // Window over the whole of `sp`.
    this(Range sp) scope
    {
        slice = sp;
        start = 0;
        end = sp.length;
    }

    // Window over boundary indices [s, e) of `sp`.
    this(Range sp, size_t s, size_t e) scope
    {
        slice = sp;
        start = s;
        end = e;
    }

    @property auto front()const
    {
        immutable a = slice[start];
        immutable b = slice[start+1];
        return CodepointInterval(a, b);
    }

    //may break sorted property - but we need std.sort to access it
    //hence package(std) protection attribute
    static if (hasAssignableElements!Range)
    package(std) @property void front(CodepointInterval val)
    {
        slice[start] = val.a;
        slice[start+1] = val.b;
    }

    @property auto back()const
    {
        immutable a = slice[end-2];
        immutable b = slice[end-1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    static if (hasAssignableElements!Range)
    package(std) @property void back(CodepointInterval val)
    {
        slice[end-2] = val.a;
        slice[end-1] = val.b;
    }

    void popFront()
    {
        start += 2;
    }

    void popBack()
    {
        end -= 2;
    }

    // `idx` counts intervals relative to the front of the window.
    auto opIndex(size_t idx) const
    {
        immutable a = slice[start+idx*2];
        immutable b = slice[start+idx*2+1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    static if (hasAssignableElements!Range)
    package(std) void opIndexAssign(CodepointInterval val, size_t idx)
    {
        slice[start+idx*2] = val.a;
        slice[start+idx*2+1] = val.b;
    }

    // Sub-range of intervals [s, e): shares `slice`, narrows the window.
    auto opSlice(size_t s, size_t e)
    {
        return Intervals(slice, s*2+start, e*2+start);
    }

    // Number of intervals in the active window. It is derived from
    // start/end, like front, back and empty, so it shrinks with
    // popFront/popBack and is correct for ranges produced by opSlice
    // (which share the full `slice`).
    @property size_t length()const { return (end - start)/2; }

    @property bool empty()const { return start == end; }

    @property auto save(){ return this; }
private:
    size_t start, end;
    Range slice;
}
2871
2872 // called after construction from intervals
2873 // to make sure invariants hold
// Sorts the stored intervals by lower bound, merges those that overlap
// or touch, and truncates `data` to the merged result.
void sanitize()
{
    import std.algorithm.comparison : max;
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    if (data.length == 0)
        return;
    alias Ival = CodepointInterval;
    //intervals wrapper for a _range_ over packed array
    auto ivals = Intervals!(typeof(data[]))(data[]);
    //@@@BUG@@@ can't use "a.a < b.a" see
    // https://issues.dlang.org/show_bug.cgi?id=12265
    sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
    // what follows is a variation on stable remove
    // differences:
    // - predicate is binary, and is tested against
    // the last kept element (at 'i').
    // - predicate mutates lhs (merges rhs into lhs)
    size_t len = ivals.length;
    size_t i = 0; // index of the last kept interval
    size_t j = 1; // index of the next candidate
    while (j < len)
    {
        // candidate starts at or before the end of the kept interval:
        // extend the kept interval to cover both
        if (ivals[i].b >= ivals[j].a)
        {
            ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
            j++;
        }
        else //unmergable
        {
            // check if there is a hole after merges
            // (in the best case we do 0 writes to ivals)
            if (j != i+1)
                ivals[i+1] = ivals[j]; //copy over
            i++;
            j++;
        }
    }
    len = i + 1;
    // Sanity check of the result.
    // NOTE(review): the loop stops at k + 1 < len, so `a < b` is never
    // checked for the last interval - confirm whether that is intended.
    for (size_t k=0; k + 1 < len; k++)
    {
        assert(ivals[k].a < ivals[k].b);
        assert(ivals[k].b < ivals[k+1].a);
    }
    // two boundaries per kept interval
    data.length = len * 2;
}
2920
2921 // special case for normal InversionList
// Removes the single code point `ch` from this set.
ref subChar(dchar ch)
{
    auto mark = skipUpTo(ch);
    // NOTE(review): `data[mark-1]` is only evaluated after
    // `data[mark] == ch` holds; whether `mark` can be 0 at that point
    // depends on skipUpTo - confirm.
    if (mark != data.length
        && data[mark] == ch && data[mark-1] == ch)
    {
        // it has split, meaning that ch happens to be in one of intervals
        data[mark] = data[mark]+1;
    }
    return this;
}
2933
2934 //
// Merges the interval [a, b) into this set.
//
// `hint` is an index into `data` from which the search for `a` starts.
// The returned Marker is meant to be passed as `hint` to a following
// call (see `add`).
//
// The position of each bound is classified by the parity of its index in
// `data`: an odd index means the bound falls inside an existing interval
// ("positive"), an even index means it falls between intervals ("negative").
Marker addInterval(int a, int b, Marker hint=Marker.init) scope
in
{
    assert(a <= b);
}
do
{
    import std.range : assumeSorted, SearchPolicy;
    auto range = assumeSorted(data[]);
    size_t pos;
    size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
    if (a_idx == range.length)
    {
        // [---+++----++++----++++++]
        // [ a b]
        data.append(a, b);
        return data.length-1;
    }
    size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
    // replacement boundaries: at most three are ever needed
    uint[3] buf = void;
    uint to_insert;
    debug(std_uni)
    {
        writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
    }
    if (b_idx == range.length)
    {
        // [-------++++++++----++++++-]
        // [ s a b]
        if (a_idx & 1)// a in positive
        {
            buf[0] = b;
            to_insert = 1;
        }
        else// a in negative
        {
            buf[0] = a;
            buf[1] = b;
            to_insert = 2;
        }
        pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
        return pos - 1;
    }

    uint top = data[b_idx];

    debug(std_uni)
    {
        writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
        writefln("a=%s; b=%s; top=%s;", a, b, top);
    }
    if (a_idx & 1)
    {   // a in positive
        if (b_idx & 1)// b in positive
        {
            // [-------++++++++----++++++-]
            // [ s a b ]
            buf[0] = top;
            to_insert = 1;
        }
        else // b in negative
        {
            // [-------++++++++----++++++-]
            // [ s a b ]
            if (top == b)
            {
                // b touches the start of the next interval: merge with it
                assert(b_idx+1 < data.length);
                buf[0] = data[b_idx+1];
                pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
                return pos - 1;
            }
            buf[0] = b;
            buf[1] = top;
            to_insert = 2;
        }
    }
    else
    {   // a in negative
        if (b_idx & 1) // b in positive
        {
            // [----------+++++----++++++-]
            // [ a b ]
            buf[0] = a;
            buf[1] = top;
            to_insert = 2;
        }
        else// b in negative
        {
            // [----------+++++----++++++-]
            // [ a s b ]
            if (top == b)
            {
                // b touches the start of the next interval: merge with it
                assert(b_idx+1 < data.length);
                buf[0] = a;
                buf[1] = data[b_idx+1];
                pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
                return pos - 1;
            }
            buf[0] = a;
            buf[1] = b;
            buf[2] = top;
            to_insert = 3;
        }
    }
    pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
    debug(std_uni)
    {
        // Two specifiers, two arguments. `data[pos]` is deliberately not
        // read here: it was printed under the "length" label and `pos`
        // is not known to be a valid index into `data`.
        writefln("marker idx: %d; length=%d", pos, data.length);
        writeln("inserting ", buf[0 .. to_insert]);
    }
    return pos - 1;
}
3047
// Removes every boundary in [pos, idx) where idx is the index of the first
// boundary greater than `a`, searching from `pos`. If `a` falls inside an
// interval, that interval is kept but now starts at `a`.
//
// Params:
//     a = value up to which content is dropped
//     pos = even index (start of an interval) where dropping begins
// Returns: `pos`, or whatever `genericReplace` reports when everything
//     from `pos` to the end was removed
Marker dropUpTo(uint a, Marker pos=Marker.init)
in
{
    assert((pos & 1) == 0); // at start of interval
}
do
{
    auto tail = assumeSorted!"a <= b"(data[pos .. data.length]);
    if (tail.empty)
        return pos;
    // index of the first boundary that is greater than `a`
    immutable size_t cut = pos + tail.lowerBound(a).length;

    debug(std_uni)
    {
        writeln("dropUpTo full length=", data.length);
        writeln(pos,"~~~", cut);
    }
    if (cut == data.length)
        return genericReplace(data, pos, cut, cast(uint[])[]);
    // odd index: `a` is inside an interval, which must now begin at `a`;
    // even index: `a` is in a gap, nothing has to be put back
    genericReplace(data, pos, cut, (cut & 1) ? [a] : cast(uint[])[]);
    return pos;
}
3083
// Finds the position of `a` searching from `pos` and returns the index of an
// interval start. If `a` falls strictly inside an interval, that interval
// is split in two at `a` first.
//
// Params:
//     a = value to skip up to
//     pos = index in `data` where the search begins
// Returns: an even index into `data`, or `data.length` if `a` is past
//     the last boundary
Marker skipUpTo(uint a, Marker pos=Marker.init)
out(result)
{
    // always start of interval (may be 0-width after-split)
    assert((result & 1) == 0);
}
do
{
    assert(data.length % 2 == 0);
    auto tail = assumeSorted!"a <= b"(data[pos .. data.length]);
    // index of the first boundary that is greater than `a`
    immutable size_t at = pos + tail.lowerBound(a).length;

    // could have Marker point to recently removed stuff
    if (at >= data.length)
        return data.length;

    // even index: already at the start of an interval
    if ((at & 1) == 0)
        return at;

    // odd index: inside of interval, check for split
    immutable upper = data[at];
    if (upper == a) // no need to split, it's end
        return at + 1;
    if (data[at-1] == a) // `a` is the interval's own start
        return at - 1;
    // split it up into [start, a) and [a, upper)
    genericReplace(data, at, at+1, [a, a, upper]);
    return at + 1; // avoid odd index
}
3115
3116 CowArray!SP data;
3117 }
3118
pure @safe unittest
{
    // the ASCII set must be rendered as a single half-open interval
    import std.conv : to;
    immutable rendered = to!string(unicode.ASCII);
    assert(rendered == "[0..128)");
}
3124
// Byte-by-byte read of the 24-bit element number `idx` from a packed array
// at `ptr` (pedantic version for ctfe, and aligned-access only architectures).
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // every element occupies three consecutive bytes
    immutable base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}
3136
// Byte-by-byte store of the low 24 bits of `val` into element number `idx`
// of a packed array at `ptr` (pedantic version for ctfe, and aligned-access
// only architectures).
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    // every element occupies three consecutive bytes
    immutable base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte middle = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}
3154
// Reads the 24-bit element number `idx` with a single (possibly unaligned)
// 32-bit load, x86-like architectures only. The load covers one byte beyond
// the element, which is then discarded.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(uint*)(ptr + idx*3);
    version (LittleEndian)
        return word & 0xFF_FFFF; // the extra byte is the most significant one
    else
        return word >> 8;        // the extra byte is the least significant one
}
3164
// Stores the low 24 bits of `val` into element number `idx` with a single
// (possibly unaligned) 32-bit read-modify-write, x86-like architectures only.
// The fourth byte covered by the access belongs to the neighbouring
// element and is written back unchanged.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // mask `val` so that bits above 24 cannot leak into the neighbour's
        // byte; this matches safeWrite24 and the big-endian branch, both of
        // which truncate
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
        *dest = (val << 8) | (*dest & 0xFF);
}
3174
// Reads the 24-bit element number `idx`: the unaligned fast path is used at
// run time when `hasUnalignedReads` is set, the byte-wise version otherwise
// (and always during CTFE).
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            return safeRead24(ptr, idx);
        return unalignedRead24(ptr, idx);
    }
    else
    {
        return safeRead24(ptr, idx);
    }
}
3182
// Writes `val` into the 24-bit element number `idx`: the unaligned fast path
// is used at run time when `hasUnalignedReads` is set, the byte-wise version
// otherwise (and always during CTFE).
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            safeWrite24(ptr, val, idx);
        else
            unalignedWrite24(ptr, val, idx);
    }
    else
    {
        safeWrite24(ptr, val, idx);
    }
}
3190
/*
    Reference-counted array of `uint` with copy-on-write behaviour.

    All elements live in one `uint[]` obtained from the storage policy `SP`.
    The last slot of that array is not payload: it holds the reference count
    shared by all copies. An empty array owns no storage and has no counter.

    Params:
        SP = storage policy; the struct calls `SP.alloc!uint`, `SP.realloc`,
             `SP.append` and `SP.destroy` on it
*/
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Builds a CowArray on top of `arr`; the ref-count slot (value 1)
    // is added through `SP.append`.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Copies the elements of a range with a known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        copy(range, data[0..$-1]);
    }

    // Copies the elements of a forward range, walking it once for the length.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        copy(range, data[0..$-1]);
    }

    // A copy shares the storage and bumps the shared counter.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // The last owner destroys the storage, any other one just decrements.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size (the ref-count slot is hidden)
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    // Resizes to `len` elements (+ an extra slot for ref-count).
    // Shared storage is never resized in place: this reference gets
    // a fresh chunk and the old one just loses a reference.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access, no un-sharing required.
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access: shared storage is duplicated first.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice: the caller may write through it, so shared
    // storage is duplicated before handing it out.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto, read-only
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally, like in every other member that uses `copy`
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        length = length + val.length;
        // nothing to store; on an empty array `$-1` would underflow
        if (val.length != 0)
            data[$-val.length-1 .. $-1] = val[];
    }

    // Two arrays are equal when both are empty or their payloads
    // (everything except the ref-count slot) are equal.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Detaches this instance from its storage, destroying the storage
    // if this was the last reference.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Gives this instance a private copy of shared storage;
    // `count` is the current (shared) reference count.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    // payload followed by the ref-count slot; empty when nothing is owned
    uint[] data;
}
3403
// Exercises CowArray with both GcPolicy and ReallocPolicy: indexing,
// independence of copies after a write, resizing, append and equality.
3404 pure @safe unittest// Uint24 tests
3405 {
3406 import std.algorithm.comparison : equal;
3407 import std.algorithm.mutation : copy;
3408 import std.conv : text;
3409 import std.range : iota, chain;
3410 import std.range.primitives : isBidirectionalRange, isOutputRange;
// mutates the array passed by reference while a copy (u24_c) is alive;
// the copy must keep seeing the old contents
3411 void funcRef(T)(ref T u24)
3412 {
3413 u24.length = 2;
3414 u24[1] = 1024;
3415 T u24_c = u24;
3416 assert(u24[1] == 1024);
3417 u24.length = 0;
3418 assert(u24.empty);
3419 u24.append([1, 2]);
3420 assert(equal(u24[], [1, 2]));
3421 u24.append(111);
3422 assert(equal(u24[], [1, 2, 111]));
3423 assert(!u24_c.empty && u24_c[1] == 1024);
3424 u24.length = 3;
3425 copy(iota(0, 3), u24[]);
3426 assert(equal(u24[], iota(0, 3)));
3427 assert(u24_c[1] == 1024);
3428 }
3429
// takes the array by value, makes further copies and checks that
// changes made through funcRef stay local to u24_3
3430 void func2(T)(T u24)
3431 {
3432 T u24_2 = u24;
3433 T u24_3;
3434 u24_3 = u24_2;
3435 assert(u24_2 == u24_3);
3436 assert(equal(u24[], u24_2[]));
3437 assert(equal(u24_2[], u24_3[]));
3438 funcRef(u24_3);
3439
3440 assert(equal(u24_3[], iota(0, 3)));
3441 assert(!equal(u24_2[], u24_3[]));
3442 assert(equal(u24_2[], u24[]));
3443 u24_2 = u24_3;
3444 assert(equal(u24_2[], iota(0, 3)));
3445 // to test that passed arg is intact outside
3446 // plus try out opEquals
3447 u24 = u24_3;
3448 u24 = T.init;
3449 u24_3 = T.init;
3450 assert(u24.empty);
3451 assert(u24 == u24_3);
3452 assert(u24 != u24_2);
3453 }
3454
3455 static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
3456 {{
3457 alias Range = typeof(CowArray!Policy.init[]);
3458 alias U24A = CowArray!Policy;
// the full slice must satisfy these range concepts
3459 static assert(isForwardRange!Range);
3460 static assert(isBidirectionalRange!Range);
3461 static assert(isOutputRange!(Range, uint));
3462 static assert(isRandomAccessRange!(Range));
3463
3464 auto arr = U24A([42u, 36, 100]);
3465 assert(arr[0] == 42);
3466 assert(arr[1] == 36);
3467 arr[0] = 72;
3468 arr[1] = 0xFE_FEFE;
3469 assert(arr[0] == 72);
3470 assert(arr[1] == 0xFE_FEFE);
3471 assert(arr[2] == 100);
3472 U24A arr2 = arr;
3473 assert(arr2[0] == 72);
3474 arr2[0] = 11;
3475 // test COW-ness
3476 assert(arr[0] == 72);
3477 assert(arr2[0] == 11);
3478 // set this to about 100M to stress-test COW memory management
3479 foreach (v; 0 .. 10_000)
3480 func2(arr);
// func2 received copies only, so arr must be unchanged
3481 assert(equal(arr[], [72, 0xFE_FEFE, 100]));
3482
// construction from a range and writing through a sub-slice
3483 auto r2 = U24A(iota(0, 100));
3484 assert(equal(r2[], iota(0, 100)), text(r2[]));
3485 copy(iota(10, 170, 2), r2[10 .. 90]);
3486 assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
3487 , text(r2[]));
3488 }}
3489 }
3490
// Checks `add` on InversionList for both storage policies (merging,
// extending and inserting intervals) and the skipUpTo/dropUpTo helpers.
3491 pure @safe unittest// core set primitives test
3492 {
3493 import std.conv : text;
3494 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
3495 foreach (CodeList; AllSets)
3496 {
3497 CodeList a;
3498 //"plug a hole" test
3499 a.add(10, 20).add(25, 30).add(15, 27);
3500 assert(a == CodeList(10, 30), text(a));
3501
3502 auto x = CodeList.init;
3503 x.add(10, 20).add(30, 40).add(50, 60);
3504
// each case below starts again from a copy of x
3505 a = x;
3506 a.add(20, 49);//[10, 49) [50, 60)
3507 assert(a == CodeList(10, 49, 50 ,60));
3508
3509 a = x;
3510 a.add(20, 50);
3511 assert(a == CodeList(10, 60), text(a));
3512
3513 // simple unions, mostly edge effects
3514 x = CodeList.init;
3515 x.add(10, 20).add(40, 60);
3516
3517 a = x;
3518 a.add(10, 25); //[10, 25) [40, 60)
3519 assert(a == CodeList(10, 25, 40, 60));
3520
3521 a = x;
3522 a.add(5, 15); //[5, 20) [40, 60)
3523 assert(a == CodeList(5, 20, 40, 60));
3524
3525 a = x;
3526 a.add(0, 10); // [0, 20) [40, 60)
3527 assert(a == CodeList(0, 20, 40, 60));
3528
3529 a = x;
3530 a.add(0, 5); // prepend
3531 assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));
3532
3533 a = x;
3534 a.add(5, 20);
3535 assert(a == CodeList(5, 20, 40, 60));
3536
3537 a = x;
3538 a.add(3, 37);
3539 assert(a == CodeList(3, 37, 40, 60));
3540
3541 a = x;
3542 a.add(37, 65);
3543 assert(a == CodeList(10, 20, 37, 65));
3544
3545 // some tests on helpers for set intersection
3546 x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
3547 a = x;
3548
// keep everything up to 60, then drop from there up to 110
3549 auto m = a.skipUpTo(60);
3550 a.dropUpTo(110, m);
3551 assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));
3552
3553 a = x;
3554 a.dropUpTo(100);
3555 assert(a == CodeList(100, 120), text(a.data[]));
3556
// skipUpTo(50) splits [40, 60); dropping to 140 removes all that follows
3557 a = x;
3558 m = a.skipUpTo(50);
3559 a.dropUpTo(140, m);
3560 assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
3561 a = x;
3562 a.dropUpTo(60);
3563 assert(a == CodeList(100, 120), text(a.data[]));
3564 }
3565 }
3566
3567
3568 //test constructor to work with any order of intervals
// The CodepointSet constructor takes interval boundaries as pairs; these
// checks feed it unsorted, adjacent and overlapping pairs and compare the
// normalized result through byInterval.
3569 pure @safe unittest
3570 {
3571 import std.algorithm.comparison : equal;
3572 import std.conv : text, to;
3573 import std.range : chain, iota;
3574 import std.typecons : tuple;
3575 //ensure constructor handles bad ordering and overlap
3576 auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
3577 foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
3578 assert(ch in c1, to!string(ch));
3579
3580 //contiguous
3581 assert(CodepointSet(1000, 1006, 1006, 1009)
3582 .byInterval.equal([tuple(1000, 1009)]));
3583 //contains
3584 assert(CodepointSet(900, 1200, 1000, 1100)
3585 .byInterval.equal([tuple(900, 1200)]));
3586 //intersect left
3587 assert(CodepointSet(900, 1100, 1000, 1200)
3588 .byInterval.equal([tuple(900, 1200)]));
3589 //intersect right
3590 assert(CodepointSet(1000, 1200, 900, 1100)
3591 .byInterval.equal([tuple(900, 1200)]));
3592
3593 //ditto with extra items at end
3594 assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
3595 .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
3596 assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
3597 .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
3598
3599 //"plug a hole" test
// the second row of pairs fills every gap left by the first one
3600 auto c2 = CodepointSet(20, 40,
3601 60, 80, 100, 140, 150, 200,
3602 40, 60, 80, 100, 140, 150
3603 );
3604 assert(c2.byInterval.equal([tuple(20, 200)]));
3605
3606 auto c3 = CodepointSet(
3607 20, 40, 60, 80, 100, 140, 150, 200,
3608 0, 10, 15, 100, 10, 20, 200, 220);
3609 assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
3610 }
3611
3612
// Binary set operators on InversionList for both storage policies:
// `|` (union), `&` (intersection), `-` (difference) and `~` (symmetric
// difference, judging by the expected result of the last case). The
// commutative operators are evaluated in both operand orders.
3613 pure @safe unittest
3614 { // full set operations
3615 import std.conv : text;
3616 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
3617 foreach (CodeList; AllSets)
3618 {
3619 CodeList a, b, c, d;
3620
3621 //"plug a hole"
3622 a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3623 b.add(40, 60).add(80, 100).add(140, 150);
3624 c = a | b;
3625 d = b | a;
3626 assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
3627 assert(c == d, text(c," vs ", d));
3628
3629 b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
3630 c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
3631 d = b | a;
3632 assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
3633 assert(c == d, text(c," vs ", d));
3634
3635 b = CodeList.init.add(10, 20).add(30,100).add(145,200);
3636 c = a | b;//[10, 140) [145, 200)
3637 d = b | a;
3638 assert(c == CodeList(10, 140, 145, 200));
3639 assert(c == d, text(c," vs ", d));
3640
3641 b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
3642 c = a | b;//[0, 140) [150, 220)
3643 d = b | a;
3644 assert(c == CodeList(0, 140, 150, 220));
3645 assert(c == d, text(c," vs ", d));
3646
3647
// intersection: b nested inside the intervals of a
3648 a = CodeList.init.add(20, 40).add(60, 80);
3649 b = CodeList.init.add(25, 35).add(65, 75);
3650 c = a & b;
3651 d = b & a;
3652 assert(c == CodeList(25, 35, 65, 75), text(c));
3653 assert(c == d, text(c," vs ", d));
3654
3655 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3656 b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
3657 c = a & b;
3658 d = b & a;
3659 assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
3660 assert(c == d, text(c," vs ", d));
3661
3662 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3663 b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
3664 c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
3665 d = b & a;
3666
3667 assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
3668 assert(c == d, text(c, " vs ",d));
// intersecting the result with either operand again changes nothing
3669 assert((c & a) == c);
3670 assert((d & b) == d);
3671 assert((c & d) == d);
3672
3673 b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
3674 c = a & b;
3675 d = b & a;
3676 assert(c == CodeList(150, 200), text(c));
3677 assert(c == d, text(c, " vs ",d));
3678 assert((c & a) == c);
3679 assert((d & b) == d);
3680 assert((c & d) == d);
3681
3682 assert((a & a) == a);
3683 assert((b & b) == b);
3684
// difference is not commutative, both directions are checked separately
3685 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3686 b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
3687 c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
3688 d = b - a;// [40, 60) [80, 100) [200, 300)
3689 assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
3690 assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
3691 assert(c - d == c, text(c-d, " vs ", c));
3692 assert(d - c == d, text(d-c, " vs ", d));
3693 assert(c - c == CodeList.init);
3694 assert(d - d == CodeList.init);
3695
3696 a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
3697 b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
3698 c = a - b;// [160, 190)
3699 d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
3700 assert(c == CodeList(160, 190), text(c));
3701 assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
3702 assert(c - d == c, text(c-d, " vs ", c));
3703 assert(d - c == d, text(d-c, " vs ", d));
3704 assert(c - c == CodeList.init);
3705 assert(d - d == CodeList.init);
3706
3707 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3708 b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
3709 c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
3710 d = b ~ a;
3711 assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
3712 text(c));
3713 assert(c == d, text(c, " vs ", d));
3714 }
3715 }
3716
3717 }
3718
pure @safe unittest // set operators with a single dchar operand
{
    import std.conv : text;
    auto base = CodepointSet(10, 100, 120, 200);

    // removing 'A' (65) punches a one-code-point hole into [10, 100)
    auto withoutA = base - 'A';
    assert(withoutA == CodepointSet(10, 65, 66, 100, 120, 200), text(withoutA));

    // intersecting with 'B' (66) leaves just that code point
    auto onlyB = base & 'B';
    assert(onlyB == CodepointSet(66, 67));
}
3726
// byInterval / byCodepoint iteration and membership through opIndex,
// instantiated for InversionList!(ReallocPolicy) only.
3727 pure @safe unittest// iteration & opIndex
3728 {
3729 import std.algorithm.comparison : equal;
3730 import std.conv : text;
3731 import std.typecons : tuple, Tuple;
3732
3733 static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
3734 {{
3735 auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
3736 auto a = CodeList('A','N','a', 'n');
3737 assert(equal(a.byInterval,
3738 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
3739 ), text(a.byInterval));
3740
3741 // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
// only compiled when version identifier bug8949 is set
3742 version (bug8949)
3743 {
3744 import std.range : retro;
3745 assert(equal(retro(a.byInterval),
3746 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
3747 ), text(retro(a.byInterval)));
3748 }
// byCodepoint must enumerate exactly the characters of arr, in order
3749 auto achr = a.byCodepoint;
3750 assert(equal(achr, arr), text(a.byCodepoint));
3751 foreach (ch; a.byCodepoint)
3752 assert(a[ch]);
3753 auto x = CodeList(100, 500, 600, 900, 1200, 1500);
3754 assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
3755 foreach (ch; x.byCodepoint)
3756 assert(x[ch]);
// round trip through the interval-range constructor, checked only
// when CodeList is the same type as CodepointSet
3757 static if (is(CodeList == CodepointSet))
3758 {
3759 auto y = CodeList(x.byInterval);
3760 assert(equal(x.byInterval, y.byInterval));
3761 }
// an empty set yields empty ranges
3762 assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
3763 assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
3764 }}
3765 }
3766
3767 //============================================================================
3768 // Generic Trie template and various ways to build it
3769 //============================================================================
3770
// Debug helper: textual dump of `x`, shortened to the first and the last
// 16 elements (joined by "~...~") when `x` has more than 32 elements.
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16;
    const count = x.length;
    return count > 2 * edge
        ? text(x[0 .. edge], "~...~", x[count - edge .. count])
        : text(x);
}
3782
/**
    Translates a `Key` into an integer index of type `size_t`.

    Every predicate in `Prefix` contributes `bitSize` bits; the predicates are
    applied in order and their results are packed one after another, so the
    first (leftmost) predicate ends up in the most significant bits of the
    index and the last one in the least significant bits.
*/
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t packed;
        packed |= Prefix[0](key);
        // make room for each following predicate, then merge its bits in
        static foreach (level; 1 .. Prefix.length)
        {
            packed <<= Prefix[level].bitSize;
            packed |= Prefix[level](key);
        }
        return packed;
    }
}
3807
3808 /*
3809 `TrieBuilder` is a type used for incremental construction
3810 of $(LREF Trie)s.
3811
3812 See $(LREF buildTrie) for generic helpers built on top of it.
3813 */
// Incremental builder of a multi-level lookup table: values are appended in
// increasing index order to the last level, and every completed page of a
// level is recorded (by page number) in the level above it.
3814 @trusted private struct TrieBuilder(Value, Key, Args...)
3815 if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
3816 {
3817 import std.exception : enforce;
3818
3819 private:
3820 // last index is not stored in table, it is used as an offset to values in a block.
3821 static if (is(Value == bool))// always pack bool
3822 alias V = BitPacked!(Value, 1);
3823 else
3824 alias V = Value;
// product of 2^^bitSize over all predicates
3825 static auto deduceMaxIndex(Preds...)()
3826 {
3827 size_t idx = 1;
3828 foreach (v; Preds)
3829 idx *= 2^^v.bitSize;
3830 return idx;
3831 }
3832
3833 static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
3834 {
3835 alias Prefix = Args[1..$];
3836 enum lastPageSize = 2^^Prefix[$-1].bitSize;
3837 enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
// translatedMaxIndex rounded up to a whole number of last-level pages
3838 enum roughedMaxIndex =
3839 (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
3840 // check wrap around - if wrapped, use the default deduction rule
3841 enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
3842 deduceMaxIndex!(Prefix)() : roughedMaxIndex;
3843 }
3844 else
3845 {
3846 alias Prefix = Args;
3847 enum maxIndex = deduceMaxIndex!(Prefix)();
3848 }
3849
3850 alias getIndex = mapTrieIndex!(Prefix);
3851
3852 enum lastLevel = Prefix.length-1;
// per-level bookkeeping; size_t.max means "no such page known".
// NOTE(review): idx_ones is initialised but not otherwise used in this struct
3853 struct ConstructState
3854 {
3855 size_t idx_zeros, idx_ones;
3856 }
3857 // iteration over levels of Trie, each indexes its own level and thus a shortened domain
3858 size_t[Prefix.length] indices;
3859 // default filler value to use
3860 Value defValue;
3861 // this is a full-width index of next item
3862 size_t curIndex;
3863 // all-zeros page index, all-ones page index (+ indicator if there is such a page)
3864 ConstructState[Prefix.length] state;
3865 // the table being constructed
3866 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;
3867
3868 @disable this();
3869
3870 //shortcut for index variable at level 'level'
3871 @property ref idx(size_t level)(){ return indices[level]; }
3872
3873 // this function assumes no holes in the input so
3874 // indices are going one by one
// Writes `numVals` copies of `val` at the current position of `level`
// and advances that position, spilling whenever a page gets full.
3875 void addValue(size_t level, T)(T val, size_t numVals)
3876 {
3877 alias j = idx!level;
3878 enum pageSize = 1 << Prefix[level].bitSize;
3879 if (numVals == 0)
3880 return;
3881 auto ptr = table.slice!(level);
3882 if (numVals == 1)
3883 {
3884 static if (level == Prefix.length-1)
3885 ptr[j] = val;
3886 else
3887 {// can incur narrowing conversion
3888 assert(j < ptr.length);
3889 ptr[j] = force!(typeof(ptr[j]))(val);
3890 }
3891 j++;
3892 if (j % pageSize == 0)
3893 spillToNextPage!level(ptr);
3894 return;
3895 }
3896 // longer row of values
3897 // get to the next page boundary
// pageSize is a power of two, so the mask rounds down to a page start
3898 immutable nextPB = (j + pageSize) & ~(pageSize-1);
3899 immutable n = nextPB - j;// can fill right in this page
3900 if (numVals < n) //fits in current page
3901 {
3902 ptr[j .. j+numVals] = val;
3903 j += numVals;
3904 return;
3905 }
// NOTE(review): for level 0 nothing is written when numVals >= n;
// this relies on the assumption stated in the comment below
3906 static if (level != 0)//on the first level it always fits
3907 {
3908 numVals -= n;
3909 //write till the end of current page
3910 ptr[j .. j+n] = val;
3911 j += n;
3912 //spill to the next page
3913 spillToNextPage!level(ptr);
3914 // page at once loop
// an all-zeros page is already known and val is T.init: instead of filling
// whole pages, add numVals/pageSize references to that page one level up
3915 if (state[level].idx_zeros != size_t.max && val == T.init)
3916 {
3917 alias NextIdx = typeof(table.slice!(level-1)[0]);
3918 addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
3919 numVals/pageSize);
3920 ptr = table.slice!level; //table structure might have changed
3921 numVals %= pageSize;
3922 }
3923 else
3924 {
3925 while (numVals >= pageSize)
3926 {
3927 numVals -= pageSize;
3928 ptr[j .. j+pageSize] = val;
3929 j += pageSize;
3930 spillToNextPage!level(ptr);
3931 }
3932 }
3933 if (numVals)
3934 {
3935 // the leftovers, an incomplete page
3936 ptr[j .. j+numVals] = val;
3937 j += numVals;
3938 }
3939 }
3940 }
3941
3942 void spillToNextPage(size_t level, Slice)(ref Slice ptr)
3943 {
3944 // last level (i.e. topmost) has 1 "page"
3945 // thus it need not to add a new page on upper level
3946 static if (level != 0)
3947 spillToNextPageImpl!(level)(ptr);
3948 }
3949
3950 // this can re-use the current page if duplicate or allocate a new one
3951 // it also makes sure that previous levels point to the correct page in this level
3952 void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
3953 {
3954 alias NextIdx = typeof(table.slice!(level-1)[0]);
3955 NextIdx next_lvl_index;
3956 enum pageSize = 1 << Prefix[level].bitSize;
3957 assert(idx!level % pageSize == 0);
// start of the page that has just been completed
3958 immutable last = idx!level-pageSize;
3959 const slice = ptr[idx!level - pageSize .. idx!level];
3960 size_t j;
// linear scan over all earlier pages of this level for an identical one
3961 for (j=0; j<last; j+=pageSize)
3962 {
3963 if (ptr[j .. j+pageSize] == slice)
3964 {
3965 // get index to it, reuse ptr space for the next block
3966 next_lvl_index = force!NextIdx(j/pageSize);
3967 version (none)
3968 {
3969 import std.stdio : writefln, writeln;
3970 writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]"
3971 ,level
3972 ,indices[level-1], pageSize, j, j+pageSize);
3973 writeln("LEVEL(", level
3974 , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
3975 writeln("LEVEL(", level
3976 , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
3977 }
3978 idx!level -= pageSize; // reuse this page, it is duplicate
3979 break;
3980 }
3981 }
// the loop ran to completion: no duplicate found, keep the page
3982 if (j == last)
3983 {
// NOTE(review): no goto in this struct targets this label
3984 L_allocate_page:
3985 next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
// remember the first all-zeros page of this level for later reuse
3986 if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
3987 {
3988 state[level].idx_zeros = next_lvl_index;
3989 }
3990 // allocate next page
3991 version (none)
3992 {
3993 import std.stdio : writefln;
3994 writefln("LEVEL(%s) page allocated: %s"
3995 , level, arrayRepr(slice[0 .. pageSize]));
3996 writefln("LEVEL(%s) index: %s ; page at this index %s"
3997 , level
3998 , next_lvl_index
3999 , arrayRepr(
4000 table.slice!(level)
4001 [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
4002 ));
4003 }
4004 table.length!level = table.length!level + pageSize;
4005 }
// NOTE(review): no goto in this struct targets this label
4006 L_know_index:
4007 // for the previous level, values are indices to the pages in the current level
4008 addValue!(level-1)(next_lvl_index, 1);
4009 ptr = table.slice!level; //re-load the slice after moves
4010 }
4011
4012 // idx - full-width index to fill with v (full-width index != key)
4013 // fills everything in the range of [curIndex, idx) with filler
4014 void putAt(size_t idx, Value v)
4015 {
4016 assert(idx >= curIndex);
4017 immutable numFillers = idx - curIndex;
4018 addValue!lastLevel(defValue, numFillers);
4019 addValue!lastLevel(v, 1);
4020 curIndex = idx + 1;
4021 }
4022
4023 // ditto, but sets the range of [idxA, idxB) to v
4024 void putRangeAt(size_t idxA, size_t idxB, Value v)
4025 {
4026 assert(idxA >= curIndex);
4027 assert(idxB >= idxA);
4028 size_t numFillers = idxA - curIndex;
4029 addValue!lastLevel(defValue, numFillers);
4030 addValue!lastLevel(v, idxB - idxA);
4031 curIndex = idxB; // open-right
4032 }
4033
// message of the exception thrown by putRange / putValue
4034 enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
4035 "duplicate key->value mapping";
4036
4037 public:
4038 /**
4039 Construct a builder, where `filler` is a value
4040 to indicate empty slots (or "not found" condition).
4041 */
4042 this(Value filler)
4043 {
4044 curIndex = 0;
4045 defValue = filler;
4046 // zeros-page index, ones-page index
4047 foreach (ref v; state)
4048 v = ConstructState(size_t.max, size_t.max);
4049 table = typeof(table)(indices);
4050 // one page per level is a bootstrap minimum
4051 foreach (i, Pred; Prefix)
4052 table.length!i = (1 << Pred.bitSize);
4053 }
4054
4055 /**
4056 Put a value `v` into interval as
4057 mapped by keys from `a` to `b`.
4058 All slots prior to `a` are filled with
4059 the default filler.
4060 */
4061 void putRange(Key a, Key b, Value v)
4062 {
4063 auto idxA = getIndex(a), idxB = getIndex(b);
4064 // indexes of key should always grow
4065 enforce(idxB >= idxA && idxA >= curIndex, errMsg);
4066 putRangeAt(idxA, idxB, v);
4067 }
4068
4069 /**
4070 Put a value `v` into slot mapped by `key`.
4071 All slots prior to `key` are filled with the
4072 default filler.
4073 */
4074 void putValue(Key key, Value v)
4075 {
4076 auto idx = getIndex(key);
4077 enforce(idx >= curIndex, errMsg);
4078 putAt(idx, v);
4079 }
4080
4081 /// Finishes construction of Trie, yielding an immutable Trie instance.
4082 auto build()
4083 {
4084 static if (maxIndex != 0) // doesn't cover full range of size_t
4085 {
4086 assert(curIndex <= maxIndex);
// pad the remainder of the domain with the filler
4087 addValue!lastLevel(defValue, maxIndex - curIndex);
4088 }
4089 else
4090 {
4091 if (curIndex != 0 // couldn't wrap around
4092 || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
4093 {
// size_t.max - curIndex fillers plus one more cover the rest of the
// size_t range without computing a count that does not fit in size_t
4094 addValue!lastLevel(defValue, size_t.max - curIndex);
4095 addValue!lastLevel(defValue, 1);
4096 }
4097 // else curIndex already completed the full range of size_t by wrapping around
4098 }
4099 return Trie!(V, Key, maxIndex, Prefix)(table);
4100 }
4101 }
4102
/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
    To construct one use a special builder,
    see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
        && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;

    // A leading argument convertible to size_t is the upper bound on the
    // mapped index; everything after it is the list of stage predicates.
    private enum hasBoundsCheck = is(typeof(Args[0]) : size_t);
    static if (hasBoundsCheck)
    {
        private enum maxIndex = Args[0];
        private alias Prefix = Args[1..$];
    }
    else
    {
        private alias Prefix = Args;
    }

    // wraps an already populated multi-stage table
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        alias stage = Prefix;
        // the first stage is addressed by the first predicate alone
        size_t pos = cast(size_t) stage[0](key);
        // an entry of stage `level` selects a page of the next stage,
        // the next predicate selects the slot inside that page
        foreach (level, unused; stage[0..$-1])
            pos = cast(size_t)((_table.ptr!level[pos]<<stage[level+1].bitSize)
                + stage[level+1](key));
        return _table.ptr!(stage.length-1)[pos];
    }

    ///
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    ///
    @property size_t pages(size_t n)() const
    {
        // byte count of stage `n` divided by 2^^bitSize, with half of the
        // divisor added first
        return (bytes!n+2^^(Prefix[n].bitSize-1))
            /2^^Prefix[n].bitSize;
    }

    ///
    void store(OutRange)(scope OutRange sink) const
    if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // the storage of all stages: index arrays first, the values last
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4193
// Produces a sequence of `sliceBits` predicates that cut the lowest `top` bits
// into consecutive pieces of the widths listed in `sizes`,
// going left-to-right, i.e. the most significant piece comes first.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!();
    else
    {
        // lowest bit of the current piece, and the top of everything that follows
        enum bottom = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(bottom, top),
                GetBitSlicing!(bottom, sizes[1..$]));
    }
}
4205
// `callableWith!T` yields a predicate template that tells whether `Pred`
// can be called with a `T` and returns a result of a bit-packable type.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        // `Result` is bound to the type of the call only if the call compiles
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false;
    }
}
4219
/*
    Check if `Prefix` is a valid set of predicates
    for `Trie` template having `Key` as the type of keys.
    Every predicate has to pass `callableWith!Key`, i.e. be callable
    with a single argument of type `Key` and return a bit-packable value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
4231
/*
    Check if `Args` is a set of maximum key value followed by valid predicates
    for `Trie` template having `Key` as the type of keys.
    A list that consists of predicates only (no leading maximum) is accepted too.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        // either all arguments are predicates, or the first one is the bound
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if (Args.length == 1)
    {
        // A lone argument can only be a predicate and must be validated
        // against `Key`. Passing `Args` alone would bind the predicate itself
        // to the `Key` parameter and check an empty list of predicates.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
        enum isValidArgsForTrie = false; // no predicates: nothing to index with
}
4246
// Adds up all values of the compile-time sequence `ints`;
// an empty sequence yields 0.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    foreach (term; ints)
    {
        total += term;
    }
    return total;
}
4254
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of `sizes` must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        // bool-valued builder over all code points, `false` being the filler
        alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes));
        auto builder = Builder(false);
        // every interval of the set is marked as `true`
        foreach (interval; set.byInterval)
        {
            builder.putRange(interval[0], interval[1], true);
        }
        return builder.build();
    }
}
4294
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // the very builder `codepointSetTrie` uses; only the type of its result is taken
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, Prefix);
    alias CodepointSetTrie = typeof(Builder(false).build());
}
4302
/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically unlike `codepointSetTrie` it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
        if (isCodepointSet!Set)
        {
            // `codepointSetTrie` is only instantiable with an explicit bit
            // layout (its constraint requires the sizes to sum up to 21),
            // so the layout of this template has to be forwarded.
            return codepointSetTrie!(sizes)(set);
        }
    }

    ///
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    ///
    auto codepointTrie(R)(R range, T defValue=T.init)
    if (isInputRange!R
        && is(typeof(ElementType!R.init[0]) : T)
        && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4347
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // the characters of the Greek script serve as the key set
    auto greek = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that is worth caching in a Trie
    static uint luckFactor(dchar ch)
    {
        // A character counts as lucky if the 6 hex digits of its code point
        // contain many identical ones; the result is the highest number of
        // occurrences of any digit,
        // e.g. U+1F11 is 0x001F11 with three '1' digits, so its factor is 3
        ubyte[6] digits; // 6 4-bit chunks of code point
        uint rest = ch;
        foreach (ref digit; digits)
        {
            digit = rest & 0xF;
            rest >>= 4;
        }
        uint best;
        foreach (digit; digits)
            best = cast(uint) max(best, count(digits[], digit));
        return best;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // collect the values in a temporary associative array (AA)
    LuckFactor[dchar] table;
    foreach (ch; greek.byCodepoint)
        table[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(table);

    // from now on the AA is not needed
    foreach (ch; greek.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4399
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // builder over all code points with `T.init` as the filler;
    // only the type of what it builds is of interest here
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    alias CodepointTrie = typeof(Builder(T.init).build());
}
4407
// "Less than" for (value, key) tuples that looks at the key only:
// the keys are passed through `Pred` and the results are compared.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto left = Pred(a[1]);
        auto right = Pred(b[1]);
        return left < right;
    }
}
4417
/**
    The most general utility for construction of `Trie`s
    short of using `TrieBuilder` directly.

    Provides a number of convenience overloads.
    `Args` is tuple of maximum key value followed by
    predicates to construct index from key.

    Alternatively if the first argument is not a value convertible to `Key`
    then the whole tuple of `Args` is treated as predicates
    and the maximum Key is deduced from predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
        alias Prefix = Args[1..$];
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // for multi-sort: one key comparator per predicate, in the order of the predicates
    template GetComparators(size_t n)
    {
        static if (n == 0)
            alias GetComparators = AliasSeq!();
        else
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
    }

    /*
        Build `Trie` from a range of Value-Key pairs (value at index 0,
        key at index 1), assuming it is sorted by Key as defined by
        the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps `Key` to an integer.

        See_Also: $(REF sort, std,_algorithm),
            $(REF SortedRange, std,range),
            $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
        && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (pair; range)
        {
            // pair[1] is the key, pair[0] is the value stored for it
            trieBuilder.putValue(pair[1], pair[0]);
        }
        return trieBuilder.build();
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` from a range of open-right intervals of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !`filler` i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (is(TypeOfBitPacked!Value == bool)
        && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
        && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (interval; range)
        {
            trieBuilder.putRange(interval[0], interval[1], !filler);
        }
        return trieBuilder.build();
    }

    /*
        Same as the Value-Key pair overload, except that `range` is first
        sorted in place with `multiSort` (using one comparator per predicate)
        when `unsorted` is true.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
    if (isInputRange!Range
        && is(typeof(Range.init.front[0]) : Value)
        && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
        {
            multiSort!(Comps)(range);
        }
        return buildTrie(range, filler);
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` simply from an input range of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !`filler` i.e. the opposite of filler.
        If no filler provided keys map to true, and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (is(TypeOfBitPacked!Value == bool)
        && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (key; range)
        {
            trieBuilder.putValue(key, !filler);
        }
        return trieBuilder.build();
    }

    /*
        If `Key` is unsigned integer `Trie` could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
    if (isUnsigned!Key)
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (position, item; array)
        {
            trieBuilder.putValue(position, item);
        }
        return trieBuilder.build();
    }

    /*
        Builds `Trie` from associative array.
        The entries are copied into an array of (value, key) pairs
        which is then handed to the sorting overload.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto pairs = zip(map.values, map.keys).array;
        return buildTrie(pairs, filler, true); // sort it
    }
}
4553
// Helper used in place of assumeSize to
// reduce mangled name & help DMD inline Trie functors.
// Passes its argument through as a size_t index and advertises
// `bits` as the width of that index.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4561
// Same idea as `clamp`, but for indexable keys: yields element `idx`
// of the argument as a size_t index and advertises `bits` as its width.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4567
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at the front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }
    ///
    pure @safe unittest
    {
        string text = "2² = 4";
        auto numbers = utfMatcher!char(unicode.Number);
        assert(numbers.match(text)); // '2' is a number all right
        assert(text == "² = 4"); // skips on match
        assert(numbers.match(text)); // so is the superscript '2'
        assert(!numbers.match(text)); // space is not a number
        assert(text == " = 4"); // unaffected on no match
        assert(!numbers.skip(text)); // same test ...
        assert(text == "= 4"); // but skips a codepoint regardless
        assert(!numbers.test(text)); // '=' is not a number
        assert(text == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on
        a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(false);
        return this; // never reached, present for the deduction of the return type
    }

    pure @safe unittest
    {
        auto numbers = utfMatcher!char(unicode.Number);
        string squared = "2²";
        // about sub-matchers
        assert(!numbers.subMatcher!(2,3,4).test(squared)); // ASCII is not covered
        assert(numbers.subMatcher!1.match(squared)); // ASCII-only, works
        assert(!numbers.subMatcher!1.test(squared)); // unicode '²'
        assert(numbers.subMatcher!(2,3,4).match(squared)); //
        assert(squared == "");
        wstring wsquared = "2²";
        auto numbers16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (numbers16) must be kept alive
        auto bmpOnly = numbers16.subMatcher!1;
        assert(bmpOnly.match(wsquared)); // Okay, in basic multilingual plane
        assert(bmpOnly.match(wsquared)); // And '²' too
    }
}
4669
/**
    Test if `M` is an UTF Matcher for ranges of code units of type `C`.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    // NOTE(review): the checks below are ordinary `assert`s inside a lambda
    // that is never run; what is verified is that the lambda compiles.
    C[] units;
    auto dec = units.decoder;
    M matcher;
    // interface over the decoder range
    assert(is(typeof(matcher.match(dec)) == bool));
    assert(is(typeof(matcher.test(dec)) == bool));
    // interface over a plain slice of code units
    assert(is(typeof(matcher.match(units)) == bool));
    assert(is(typeof(matcher.test(units)) == bool));
    // `skip` is optional, it is only looked at when present
    static if (is(typeof(matcher.skip(dec))))
    {
        assert(is(typeof(matcher.skip(dec)) == bool));
        assert(is(typeof(matcher.skip(units)) == bool));
    }
});
4687
pure @safe unittest
{
    // matchers produced by utfMatcher qualify for both mutable
    // and immutable code units of the matching width
    alias Matcher8 = typeof(utfMatcher!char(CodepointSet.init));
    alias Matcher16 = typeof(utfMatcher!wchar(CodepointSet.init));
    static foreach (Unit; AliasSeq!(char, immutable(char)))
        static assert(isUtfMatcher!(Matcher8, Unit));
    static foreach (Unit; AliasSeq!(wchar, immutable(wchar)))
        static assert(isUtfMatcher!(Matcher16, Unit));
}
4697
// What a matcher lookup does with the input range after the test.
enum Mode
{
    // advance past the tested code point whatever the outcome
    alwaysSkip,
    // leave the range untouched
    neverSkip,
    // advance only if the code point is found in the set
    skipOnMatch
}
4703
// Adds `fwdStr`, which routes the string overloads of a matcher to the
// range-based member function named by `fn`.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        alias CodeUnits = typeof(byCodeUnit(str));
        // `str` is reinterpreted in place (no copy is made), so whatever the
        // callee does to the range happens to the caller's slice.
        // NOTE(review): relies on the byCodeUnit wrapper being layout-compatible
        // with the slice it wraps - confirm against std.utf.
        auto view = cast(CodeUnits*) &str;
        return mixin(fn ~ "(*view)");
    }
}
4713
/*
    Building blocks of the UTF-8 matcher: one Trie per encoded length
    (1 to 4 code units), a `build` function that fills them from a set,
    and the `match`/`skip`/`test` front-ends (`DefMatcher`) that select
    a Trie by looking at the leading code unit.
*/
template Utf8Matcher()
{
    // lengths of 1 to 4 code units are the only ones accepted
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // the common failure path: throws UTFException
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    //(5 bits from the leading code unit, 6 from the trailing one)
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    // Trie types for the encoded lengths 1, 2, 3 and 4, in that order
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    // Trie type for the encoded length `size` (1-based)
    alias Table(int size) = Tables[size-1];

    // mask of the lowest (7 - size) bits, i.e. the payload bits of a leading code unit
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // the top `size` bits of a byte set: the marker expected above the payload
    // of the leading code unit of a `size`-unit sequence
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Strips the marker of a trailing code unit: accepts the units
    // 0x80 .. 0xBF and returns their low 6 bits, anything else is a bad encoding.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Encodes `ch` with std.utf.encode and removes the marker bits from
    // every code unit, which leaves a key for the Trie of `sz`-unit sequences.
    static auto encode(size_t sz)(dchar ch)
    if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by the number of code units its code points take in
    // UTF-8, builds one Trie per length and packs them into a full matcher.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        // whether single code unit (ASCII) sequences are handled by this matcher
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        // the handled lengths other than 1
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts:
        //an if-else chain with one branch per multi-unit length, each of them
        //checking the marker bits of the leading code unit `ch`
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        // mixed into match/skip/test below; expects `ch`, `mode` and `inp` in scope
        enum dispatch = genDispatch();

        // Tests the code point at the front of `inp`
        // and advances past it only if it is in the set.
        public bool match(Range)(ref Range inp) const
        if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
            !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Tests the code point at the front of `inp`
            // and advances past it whatever the result.
            public bool skip(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // Tests the code point at the front of `inp` without advancing.
        public bool test(Range)(ref Range inp) const
        if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
            !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // overloads for plain arrays of characters, forwarded to the range versions
        bool match(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
        if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // The matcher proper: owns one Trie per length listed in `Sizes`.
    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // a matcher limited to `SizesToPick`; it keeps a pointer to this object
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Validates a sequence of `size` code units at the front of `inp`,
        // looks it up in the Trie for that length and advances `inp` as
        // requested by `mode`. Calls badEncoding on truncated input,
        // bad trailing code units and overlong forms.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            // the key: payload bits of every code unit, markers removed
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[1]
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x10000-0x10FFFF
                //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);
                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // A matcher for a subset of lengths that borrows the tables of a parent
    // matcher `I` through a pointer.
    struct CherryPick(I, Sizes...)
    {
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        // the parent matcher, not owned
        I* m;
        @property auto tab(int i)() const { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}
4999
5000 template Utf16Matcher()
5001 {
5002 enum validSize(int sz) = sz >= 1 && sz <= 2;
5003
5004 void badEncoding() pure @safe
5005 {
5006 import std.utf : UTFException;
5007 throw new UTFException("Invalid UTF-16 sequence");
5008 }
5009
5010 // 1-stage ASCII
5011 alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
5012 //2-stage BMP
5013 alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
5014 //4-stage - full Unicode
5015 //assume that 0xD800 & 0xDC00 bits are cleared
5016 //thus leaving 10 bit per wchar to worry about
5017 alias UniSpec = AliasSeq!(bool, wchar[2],
5018 assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
5019 assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
5020 );
5021 alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
5022 alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
5023 alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5024
5025 auto encode2(dchar ch)
5026 {
5027 ch -= 0x1_0000;
5028 assert(ch <= 0xF_FFFF);
5029 wchar[2] ret;
5030 //do not put surrogate bits, they are sliced off
5031 ret[0] = cast(wchar)(ch >> 10);
5032 ret[1] = (ch & 0xFFF);
5033 return ret;
5034 }
5035
5036 auto build(Set)(Set set)
5037 {
5038 import std.algorithm.iteration : map;
5039 auto ascii = set & unicode.ASCII;
5040 auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5041 - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5042 auto other = set - (bmp | ascii);
5043 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5044 auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5045 auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5046 alias Ret = Impl!(1,2);
5047 return Ret(asciiT, bmpT, otherT);
5048 }
5049
5050 //bootstrap full UTF-16 matcher interace from
5051 //sizeFlags, lookupUni and ascii
5052 mixin template DefMatcher()
5053 {
5054 public bool match(Range)(ref Range inp) const
5055 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5056 !isDynamicArray!Range)
5057 {
5058 enum mode = Mode.skipOnMatch;
5059 assert(!inp.empty);
5060 immutable ch = inp[0];
5061 static if (sizeFlags & 1)
5062 {
5063 if (ch < 0x80)
5064 {
5065 if (ascii[ch])
5066 {
5067 inp.popFront();
5068 return true;
5069 }
5070 else
5071 return false;
5072 }
5073 return lookupUni!mode(inp);
5074 }
5075 else
5076 return lookupUni!mode(inp);
5077 }
5078
5079 static if (Sizes.length == 2)
5080 {
5081 public bool skip(Range)(ref Range inp) const
5082 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5083 !isDynamicArray!Range)
5084 {
5085 enum mode = Mode.alwaysSkip;
5086 assert(!inp.empty);
5087 immutable ch = inp[0];
5088 static if (sizeFlags & 1)
5089 {
5090 if (ch < 0x80)
5091 {
5092 inp.popFront();
5093 return ascii[ch];
5094 }
5095 else
5096 return lookupUni!mode(inp);
5097 }
5098 else
5099 return lookupUni!mode(inp);
5100 }
5101 }
5102
5103 public bool test(Range)(ref Range inp) const
5104 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5105 !isDynamicArray!Range)
5106 {
5107 enum mode = Mode.neverSkip;
5108 assert(!inp.empty);
5109 auto ch = inp[0];
5110 static if (sizeFlags & 1)
5111 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
5112 else
5113 return lookupUni!mode(inp);
5114 }
5115
5116 bool match(C)(ref C[] str) const
5117 if (isSomeChar!C)
5118 {
5119 return fwdStr!"match"(str);
5120 }
5121
5122 bool skip(C)(ref C[] str) const
5123 if (isSomeChar!C)
5124 {
5125 return fwdStr!"skip"(str);
5126 }
5127
5128 bool test(C)(ref C[] str) const
5129 if (isSomeChar!C)
5130 {
5131 return fwdStr!"test"(str);
5132 }
5133
5134 mixin ForwardStrings; //dispatch strings to range versions
5135 }
5136
// UTF-16 matcher implementation. `Sizes` lists the encoded lengths
// (1 and/or 2 code units) this instance is expected to handle; every
// entry must satisfy `validSize`.
struct Impl(Sizes...)
if (Sizes.length >= 1 && Sizes.length <= 2)
{
private:
    import std.meta : allSatisfy;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1 and 2 code units are possible in UTF-16");
    // Bit mask built by OR-ing the requested sizes together:
    // value 1 enables the single code unit tables, value 2 the pair table.
    static if (Sizes.length > 1)
        enum sizeFlags = Sizes[0] | Sizes[1];
    else
        enum sizeFlags = Sizes[0];

    // Tables consulted for input that occupies one code unit.
    static if (sizeFlags & 1)
    {
        Ascii ascii;
        Bmp bmp;
    }
    // Table consulted for surrogate pairs (two code units).
    static if (sizeFlags & 2)
    {
        Uni uni;
    }
    mixin DefMatcher;

    // Wraps a pointer to this matcher in a `CherryPick` parameterized
    // with `SizesToPick`.
    package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
    {
        return CherryPick!(Impl, SizesToPick)(&this);
    }

    // Classifies the code point at the start of `inp`.
    // `mode` selects when `inp` is advanced:
    //   Mode.alwaysSkip  - always pop the decoded code units,
    //   Mode.skipOnMatch - pop them only when the lookup succeeds,
    //   any other mode   - never pop.
    // Malformed surrogate usage is reported through `badEncoding()`.
    bool lookupUni(Mode mode, Range)(ref Range inp) const
    {
        // Offset from the start of the surrogate area; wraps around for
        // code units below 0xD800, which therefore compare > 0x3FF as well.
        wchar x = cast(wchar)(inp[0] - 0xD800);
        //not a high surrogate
        if (x > 0x3FF)
        {
            //low surrogate
            if (x <= 0x7FF) badEncoding();
            static if (sizeFlags & 1)
            {
                // Remember the code unit before a possible popFront below.
                auto ch = inp[0];
                static if (mode == Mode.alwaysSkip)
                    inp.popFront();
                static if (mode == Mode.skipOnMatch)
                {
                    if (bmp[ch])
                    {
                        inp.popFront();
                        return true;
                    }
                    else
                        return false;
                }
                else
                    return bmp[ch];
            }
            else //skip is not available for sub-matchers, so just false
                return false;
        }
        else
        {
            import std.range : popFrontN;
            static if (sizeFlags & 2)
            {
                // A high surrogate must be followed by one more code unit.
                if (inp.length < 2)
                    badEncoding();
                wchar y = cast(wchar)(inp[1] - 0xDC00);
                //not a low surrogate
                if (y > 0x3FF)
                    badEncoding();
                // Lookup key: the low 10 bits of each surrogate.
                wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                static if (mode == Mode.alwaysSkip)
                    inp.popFrontN(2);
                static if (mode == Mode.skipOnMatch)
                {
                    if (uni[needle])
                    {
                        inp.popFrontN(2);
                        return true;
                    }
                    else
                        return false;
                }
                else
                    return uni[needle];
            }
            else //ditto
                return false;
        }
    }
}
5226
// Lightweight view over a full matcher `I`, produced by `subMatcher`.
// It stores only a pointer to the parent and forwards lookups to it.
struct CherryPick(I, Sizes...)
if (Sizes.length >= 1 && Sizes.length <= 2)
{
private:
    import std.meta : allSatisfy;
    // The matcher this view forwards to.
    I* m;
    // NOTE(review): the flags are copied from the parent `I` rather than
    // computed from `Sizes`; `Sizes` is only validated below - confirm
    // that this is intended.
    enum sizeFlags = I.sizeFlags;

    static if (sizeFlags & 1)
    {
        // Exposes the parent's `ascii` member.
        @property auto ascii()() const { return m.ascii; }
    }

    // Delegates to the parent's `lookupUni` with the same `mode`.
    bool lookupUni(Mode mode, Range)(ref Range inp) const
    {
        return m.lookupUni!mode(inp);
    }
    mixin DefMatcher;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1 and 2 code units are possible in UTF-16");
}
5248 }
5249
// Builds a UTF-8 matcher for `set` using the default `Utf8Matcher` instantiation.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}
5254
// Builds a UTF-16 matcher for `set` using the default `Utf16Matcher` instantiation.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}
5259
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    `Char` has to be implicitly convertible to `char` (UTF-8 matcher)
    or to `wchar` (UTF-16 matcher); any other type, including `dchar`,
    is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // keep the message on one line: a line break inside the literal
        // would end up verbatim in the compiler diagnostic
        static assert(false,
            "UTF-32 needs no decoding, and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5280
5281
// A range of code units, packed with an index to speed up forward iteration.
// Params:
//   s      = the code units to iterate over
//   offset = index into `s` at which iteration starts
// Returns: a random-access range whose front is `s[offset]`.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units; only shrinks from the back
        size_t idx; // position of the current front inside `str`
        @property C front(){ return str[idx]; }
        @property C back(){ return str[$-1]; }
        // Advancing moves the index instead of re-slicing `str`.
        void popFront(){ idx++; }
        void popBack(){ str = str[0..$-1]; }
        void popFrontN(size_t n){ idx += n; }
        @property bool empty(){ return idx == str.length; }
        @property auto save(){ return this; }
        // Indexing and length are relative to the current front.
        auto opIndex(size_t i){ return str[idx+i]; }
        @property size_t length(){ return str.length - idx; }
        alias opDollar = length;
        // The slice keeps the start of `str` and positions `idx` at `a`.
        auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5307
// Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher for
// letters and with two of its sub-matchers, checking `test`, `match` and
// `skip` as well as the resulting index after each step.
pure @safe unittest
{
    // Layout: "hi" (2 ASCII letters), "! " (2 non-letters),
    // 7 non-ASCII letters, then a space.
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 = utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);
    // 'h'
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // 'i'
    assert(!uni.match(codec));
    assert(asc.test(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);
    assert(!asc.match(codec));

    // '!' - not a letter; skip reports false
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    // ' ' - not a letter; the following test sees the first non-ASCII letter
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(utf8.test(codec));
    // the 7 non-ASCII letters
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match leaves the index untouched
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5362
// Encodes a sample of letters to UTF-8 and UTF-16 and checks that the full
// matchers and their sub-matchers agree across `test`, `match` and `skip`.
pure @safe unittest
{
    import std.range : stride;
    // Runs `test`, `match` and (when available) `skip` on the same input,
    // restoring the index in between, and returns the result of `test`.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // the surrogate-pair sub-matcher; it is checked against `len16 != 2` below
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5411
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` / `\0x80` below are a NUL escape followed by
    // the literal text "x00" / "x80", not the bytes 0x00 / 0x80. This looks
    // like a typo for `\x00` / `\x80`, but the inputs are kept as they are
    // because changing them changes what is being tested - confirm intent.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // every input must make `test` throw; the assert message shows the bytes
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5442
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.

    Any other value of `level` (including 0) is rejected at compile time.
+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // the per-level bit widths always add up to 21 bits
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // also reached for level == 0, hence no "levels > 4" wording
        static assert(false,
            "Sorry, toTrie only supports levels 1 to 4, use codepointSetTrie directly");
}
5477
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto lookup = toTrie!3(set);
    // the returned closure captures the trie built above
    return (dchar ch) => lookup[ch];
}
5495
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    // Number of bits the value is declared to fit in.
    enum bitSize = sz;
    // The wrapped value; `alias this` provides the conversion to `T`.
    T _value;
    alias _value this;
}
5517
/*
    Depending on the form of the passed argument `bitSizeOf` returns
    the amount of bits required to represent a given type
    or a return type of a given functor.

    Resolution order:
    1. `T.bitSize` when it exists and is usable as a `size_t`;
    2. the bit size of `ReturnType!T` when that is a `BitPacked` instance;
    3. `T.sizeof * 8` otherwise.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        // recurse on the BitPacked return type, which is handled by case 1
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
5541
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    // pattern-match T against BitPacked!(U, bits) for any U and bits
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5553
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    // on a successful match `U` is bound to the wrapped type
    static if (is(T == BitPacked!(U, bits), U, size_t bits))
    {
        alias TypeOfBitPacked = U;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5565
/*
    Wrapper, used in definition of custom data structures from `Trie` template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within `bits` of bits.

    The wrapper performs no checking of its own: `opCall` simply forwards
    to `Fn`, and `bitSize` carries the promised width.
*/
struct assumeSize(alias Fn, size_t bits)
{
    // The width promised by the user, in bits.
    enum bitSize = bits;
    // Calls `Fn` with the argument and returns its result.
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}
5579
/*
    Functor that extracts the bit range [from, to) of an integral value,
    shifted down so that bit `from` becomes bit 0.
    It exposes `bitSize` just like assumeSize does and so can be used
    directly with the `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        // the extracted value always fits in `bitSize` bits
        assert(result < (1 << bitSize));
    }
    do
    {
        static assert(from < to);
        static if (from != 0)
        {
            return (x >> from) & ((1 << bitSize) - 1);
        }
        else
        {
            // no shift needed when the slice starts at bit 0
            return x & ((1 << to) - 1);
        }
    }
}
5604
// Bits [0, 8) of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    return x & 0xFF;
}
// Bits [8, 16) of `x`, shifted down to the low byte.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
5607 alias lo8 = assumeSize!(low_8, 8);
5608 alias mlo8 = assumeSize!(midlow_8, 8);
5609
// Compile-time checks of `bitSizeOf` for the three kinds of `bitSize` carriers.
@safe pure nothrow @nogc unittest
{
    static assert(bitSizeOf!lo8 == 8);                  // assumeSize wrapper
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);   // to - from
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
}
5616
// Compile-time sequence of the values start, start+1, ..., end-1;
// empty when `start` is not below `end`.
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
    {
        alias Sequence = AliasSeq!();
    }
    else
    {
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    }
}
5624
//---- TRIE TESTS ----
// Builds tries with 1 to 4 levels over small code point sets, over strings
// and over bytes, and checks lookups against the source data.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // Prints per-level footprint statistics; compiled in only with
    // version std_uni_stats, otherwise a no-op.
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                    , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    // the loops stop before 'z' / 'Z': the upper bounds are not tested as members
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two levels: the same intervals repeated at offsets 0/512 and 256/768
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // three levels: the same pattern repeated at offsets 0, 16 and 32
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four levels over a 16-bit key space
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    // only membership is checked here, not absence
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

    // string keys, indexed by their first character only
    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] redundantS = ["tea", "start", "orange"];
    redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
    // using first char only
    assert(redundantS == ["orange", "start", "tea"]);
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5729
// Trie index functor that uses the element at position `idx` of an array
// of `T` as the key part; its declared width is the full bit size of `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr){ return arr[idx]; }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5736
// Trie index functor that uses the last element of an array of `T`
// as the key part; its declared width is the full bit size of `T`.
template useLastItem(T)
{
    size_t impl(const scope T[] arr){ return arr[$-1]; }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5742
// Sum of `bitSizeOf` over every element of `Prefix`; 0 for an empty list.
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
    {
        enum fullBitSize = 0;
    }
    else
    {
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
    }
}
5750
// Computes the sequence of `BitPacked` index types for the non-value levels
// of a trie whose per-level key extractors are `Prefix`. `fullBits` is the
// number of key bits covered by all of `Prefix` together.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}
5771
5772 //============================================================================
5773
// Three-way comparison of two property names that ignores white space,
// '-' and '_' and compares the remaining characters after `std.ascii.toLower`.
// Returns the result of `cmp` over the two normalized sequences.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // true for characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5785
// Names that differ only in case and in '-' compare as equal (result 0).
@safe pure unittest
{
    assert(!comparePropertyName("foo-bar", "fooBar"));
}
5790
// Strict "less than" predicate built on top of `comparePropertyName`.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5796
5797 //============================================================================
5798 // Utilities for compression of Unicode code point sets
5799 //============================================================================
5800
// Appends `val` to `arr` in a variable-length form:
//   val < 128     -> 1 byte, top bit clear;
//   val < 2^^13   -> 2 bytes, first byte tagged with 0b100 in its top 3 bits;
//   otherwise     -> 3 bytes, first byte tagged with 0b101 in its top 3 bits
//                    (`val` is asserted to be below 2^^21).
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        arr ~= cast(ubyte)(twoByteTag | (val >> 8));
        arr ~= cast(ubyte)(val & 0xFF);
        return;
    }
    assert(val < (1 << 21));
    arr ~= cast(ubyte)(threeByteTag | (val >> 16));
    arr ~= cast(ubyte)((val >> 8) & 0xFF);
    arr ~= cast(ubyte)(val & 0xFF);
}
5819
// Decodes one value written by `compressTo`-style encoding from `arr`,
// starting at `idx`, and advances `idx` past the bytes consumed.
// Throws (via `enforce`) when the trailing bytes announced by the first
// byte are not all present; `idx` has then been advanced by one.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the lead byte selects 1 or 2 trailing bytes
    immutable size_t tail = (lead & 0x20) ? 2 : 1;
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint val = lead & 0x1F;
    immutable stop = idx + tail;
    while (idx < stop)
        val = (val << 8) | arr[idx++];
    return val;
}
5834
5835
// Serializes a range of [start, end) pairs into a byte array: every bound
// is stored as its distance from the previously stored bound, encoded with
// `compressTo`. An end bound equal to `lastDchar+1` is not stored at all.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint previous = 0;
    // RLE encode
    foreach (interval; intervals)
    {
        compressTo(interval[0]-previous, encoded);
        previous = interval[0];
        // till the end of the domain so don't store it
        if (interval[1] == lastDchar+1)
            continue;
        compressTo(interval[1]-previous, encoded);
        previous = interval[1];
    }
    return encoded;
}
5854
// Checks the exact byte encoding produced by `compressIntervals`, decoding
// of individual values with `decompressFrom`, and the full round trip.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    // deltas: 80, 47, 1, 1024 - the last one needs the 2-byte form
    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    // the final end bound is lastDchar+1 and therefore omitted
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5877
// Wraps compressed interval `data` in a `DecompressedIntervals` range,
// which yields `CodepointInterval`s decoded on demand.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto intervals = DecompressedIntervals(data);
    return intervals;
}
5883
// Forward range over a compressed interval stream.
// The current interval is decoded eagerly and cached in `_front`;
// exhaustion is signalled by setting `_idx` to `size_t.max`.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;     // encoded bytes
    size_t _idx;                // read position, or size_t.max once empty
    CodepointInterval _front;   // most recently decoded interval

    // Stores the stream and decodes the first interval (if any).
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        immutable atEnd = _idx == _stream.length;
        if (atEnd)
        {
            _idx = size_t.max;
            return;
        }
        // the start is stored relative to the previous interval's end
        immutable uint start = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = start;
        if (_idx != _stream.length)
        {
            // the end is stored relative to the start just decoded
            _front[1] = start + decompressFrom(_stream, _idx);
        }
        else
        {
            // odd length ---> till the end
            _front[1] = lastDchar+1;
        }
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5928
// Compile-time check of the range primitives offered by DecompressedIntervals.
@safe pure nothrow @nogc unittest
{
    static assert(isInputRange!DecompressedIntervals);
    static assert(isForwardRange!DecompressedIntervals);
}
5934
5935 //============================================================================
5936
5937 version (std_uni_bootstrap){}
5938 else
5939 {
5940
// Helper for looking up code point sets: binary-searches `table`
// (treated as sorted by `propertyNameLess` on the `name` field) and
// returns the index of the entry matching `name`, or -1 if there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries ordered before `name` == index of the candidate
    size_t pos = names.lowerBound(name).length;
    if (pos >= names.length)
        return -1;
    if (comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5953
// Looks `name` up in `table` and, when found, stores the decoded set in
// `dest`. Returns whether the name was found; `dest` is untouched otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    auto position = findUnicodeSet!table(name);
    if (position < 0)
        return false;
    dest = Set(asSet(table[position].compressed));
    return true;
}
5965
// Loads the code point set of the general category or binary property
// called `name` into `target`.
// Cumulative categories (L, LC, M, N, P, S, Z, C), "graphical", "any" and
// "ascii" are assembled by hand; every other name is looked up in
// `uniProps.tab`. Names are compared with `comparePropertyName`.
// Returns: true when `name` was recognized, false otherwise.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        // General_Category "C" (Other) is Cc | Cf | Cs | Co | Cn;
        // it is unrelated to the "xo" (Lo, No, So, Po) sub-categories
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not a hand-made name: defer to the generated table
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6064
// CTFE-only helper for checking property names at compile-time:
// true when `name` equals (per `comparePropertyName`) one of the
// hand-written names listed below.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    auto known = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "Graphical",
        "any",
        "ascii"
    ];
    auto hit = find!(candidate => comparePropertyName(candidate, name) == 0)(known);
    immutable missing = hit.empty;
    return !missing;
}
6084
// CTFE-only, not optimized: tells whether `table` has an entry for `name`.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable position = findUnicodeSet!table(name);
    return position >= 0;
}
6090
// Name-based access to the sets of `table`; `kind` is only used to build
// the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search: throws an `Exception` for an unknown `name`.
    static auto opCall(C)(const scope C[] name)
    if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }
    /// Compile-time checked search: an unknown `name` fails compilation.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            CodepointSet found;
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
6118
6119 // Characters that need escaping in string posed as regular expressions
6120 package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
6121 ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~');
6122
// Evaluates the D expression `expr` (mixed in) once and returns the stored
// result on later calls. During CTFE nothing is stored: the expression is
// evaluated and returned directly.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);
    alias Result = typeof(mixin(expr));
    static Result cached;
    static bool ready;
    if (ready)
        return cached;
    cached = mixin(expr);
    ready = true;
    return cached;
}
6137
//property for \w character class
// The set is the union of Alphabetic, Mn, Mc, Me, Nd and Pc, obtained
// through `memoizeExpr` so that it is computed only once at run time.
package(std) @property CodepointSet wordCharacter() @safe
{
    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
| unicode.Me | unicode.Nd | unicode.Pc")();
}
6144
//basic stack, just in case it gets used anywhere else than Parser
// A thin LIFO wrapper over a dynamic array; the top of the stack is the
// last element of `data`.
package(std) struct Stack(T)
{
@safe:
    T[] data;
    @property bool empty(){ return data.empty; }

    @property size_t length(){ return data.length; }

    void push(T val){ data ~= val; }

    // Removes and returns the top element; the stack must not be empty.
    @trusted T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto removed = data[last];
        data = data[0 .. last];
        // outside CTFE, allow the next push to reuse the freed slot in place
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return removed;
    }

    // Reference to the top element; the stack must not be empty.
    @property ref T top()
    {
        assert(!empty);
        return data[$ - 1];
    }
}
6172
//test if a given string starts with hex number of maxDigit that's a valid codepoint
//returns its value and skips these maxDigit chars on success, throws on failure
// Params:
//   str      = input; every successfully read digit is popped from its front
//   maxDigit = exact number of hex digits to read
// Throws: Exception with one of the messages
//   "incomplete escape sequence" - input ended before maxDigit digits,
//   "invalid escape sequence"    - a non-hex character was met,
//   "invalid codepoint"          - the value is above 0x10FFFF.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    enum uint maxCodepoint = 0x10FFFF;
    //std.conv.parse is both @system and bogus
    uint val;
    foreach (_; 0 .. maxDigit)
    {
        enforce(!str.empty, "incomplete escape sequence");
        //accepts ascii only, so it's OK to index directly
        immutable current = str.front;
        uint digit;
        if ('0' <= current && current <= '9')
            digit = current - '0';
        else if ('a' <= current && current <= 'f')
            digit = current - 'a' + 10;
        else if ('A' <= current && current <= 'F')
            digit = current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        // Saturate instead of wrapping around: once the value is out of
        // range it stays out of range, so more than 8 digits cannot
        // overflow `uint` and slip past the final check. The remaining
        // digits are still validated and consumed as before.
        if (val <= maxCodepoint)
            val = val * 16 + digit;
        str.popFront();
    }
    enforce(val <= maxCodepoint, "invalid codepoint");
    return val;
}
6198
// Covers the two error messages of parseUniHex that depend on the digits
// themselves, plus successful parses of various lengths.
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;
    string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
    string[] hex = [ "01", "ff", "00af", "10FFFF" ];
    int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (v; non_hex)
        assert(collectException(parseUniHex(v, v.length)).msg
          .canFind("invalid escape sequence"));
    foreach (i, v; hex)
        assert(parseUniHex(v, v.length) == value[i]);
    // 0x11FFFF is one past the last plane of valid code points
    string over = "0011FFFF";
    assert(collectException(parseUniHex(over, over.length)).msg
      .canFind("invalid codepoint"));
}
6215
// Returns `set` extended with every simple case folding of each of its
// members that belongs to `unicode.LC`.
auto caseEnclose(CodepointSet set)
{
    // iterate over a separate set, so that `set` can be grown in the loop
    auto letters = set & unicode.LC;
    foreach (dchar letter; letters.byCodepoint)
    {
        foreach (variant; simpleCaseFoldings(letter))
        {
            set |= variant;
        }
    }
    return set;
}
6226
/+
    Fetch the codepoint set corresponding to a name (InBlock or binary property).
    With `casefold` the set is first passed through `caseEnclose`;
    with `negated` the (possibly case-enclosed) set is then inverted.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        result = caseEnclose(result);
    return negated ? result.inverted : result;
}
6240
/+
    Parser for regex-style codepoint set specifications of the form '[...]'.

    Wraps `range` (re-exposed through `empty`/`front`/`popFront` so that the
    helper parsers can work on the parser itself). When `casefold_` is set,
    every literal character and character range is added together with
    its simple case foldings.
+/
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //Returns: the set accumulated for this term and the operator that ended it
    //(Operator.None if the term was ended by ']').
    //Throws: on malformed input (via enforce).
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // add a single code point, honouring the casefold_ flag
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto range = simpleCaseFoldings(ch);
                foreach (v; range)
                    set |= v;
            }
            else
                set |= ch;
        }

        // add the inclusive interval [lo, hi], honouring the casefold_ flag;
        // shared by plain (a-z) and escaped (a-\x7a) range endings so that
        // both are case-folded the same way
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        // map the doubled symbol of a set operator (||, --, ~~, &&) to the operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // flush the pending char through addWithFlags like every
                    // other branch, so it is case-folded when requested
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
                //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    //parse a complete (possibly nested) '[...]' set specification
    //using a stack of operand sets and a stack of pending operators
    //Returns: the resulting CodepointSet; `range` is left just past the closing ']'.
    //Throws: on malformed input (via enforce).
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        //apply a single operator to the operand(s) on top of `stack`;
        //returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        //pop and apply operators while `cond` holds for the top of `opstack`;
        //returns false on a syntax error or if the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6687
/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
    This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    import std.exception : enforce;
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        `unicode.InBlockName`, to search a script
        use `unicode.ScriptName`.

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        // (U+0601 ARABIC SIGN SANAH belongs to both)
        assert(arabicBlock['\u0601']);
        assert(arabicScript['\u0601']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is the syllable type (leading jamo), not Letter as in unicode.L short-cut
        auto leadingJamo = unicode.hangulSyllableType("L");
        // check that some leading jamo are present
        foreach (jamo; '\u1110'..'\u115F')
            assert(leadingJamo[jamo]);
        assert(leadingJamo == unicode.hangulSyllableType.L);
    }

    //parse control code of form \cX (X is a single ASCII letter),
    //c assumed to be the current symbol
    static package(std) dchar parseControlCode(Parser)(ref Parser p)
    {
        with(p)
        {
            popFront();
            enforce(!empty, "Unfinished escape sequence");
            enforce(('a' <= front && front <= 'z')
                || ('A' <= front && front <= 'Z'),
                "Only letters are allowed after \\c");
            return front & 0x1f;
        }
    }

    //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
    //\ - assumed to be processed, p - is current
    //Throws: if the spec is unterminated, too long, contains non-ASCII
    //characters or names no known set.
    static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
        bool negated, bool casefold)
    {
        static import std.ascii;
        with(p)
        {
            enum MAX_PROPERTY = 128;
            char[MAX_PROPERTY] result;
            uint k = 0;
            popFront();
            enforce(!empty, "eof parsing unicode property spec");
            if (front == '{')
            {
                popFront();
                while (k < MAX_PROPERTY && !empty && front !='}'
                    && front !=':')
                {
                    if (front != '-' && front != ' ' && front != '_')
                    {
                        // property names are ASCII; a blind cast(char) would
                        // truncate e.g. U+014D to 'M' and match a wrong set
                        enforce(front < 0x80, "invalid property name");
                        result[k++] = cast(char) std.ascii.toLower(front);
                    }
                    popFront();
                }
                enforce(k != MAX_PROPERTY, "invalid property name");
                // the loop also stops on exhausted input: check before touching front
                enforce(!empty && front == '}', "} expected ");
            }
            else
            {//single char properties e.g.: \pL, \pN ...
                enforce(front < 0x80, "invalid property name");
                result[k++] = cast(char) front;
            }
            auto s = getUnicodeSet(result[0 .. k], negated, casefold);
            enforce(!s.empty, "unrecognized unicode property spec");
            popFront();
            return s;
        }
    }

    /**
        Parse unicode codepoint set from given `range` using standard regex
        syntax '[...]'. The range is advanced skipping over regex set definition.
        `casefold` parameter determines if the set should be casefolded - that is
        include both lower and upper case versions for any letters in the set.
    */
    static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
    if (isInputRange!Range && is(ElementType!Range : dchar))
    {
        auto usParser = UnicodeSetParser!Range(range, casefold);
        auto set = usParser.parseSet();
        range = usParser.range;
        return set;
    }

    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        assert(pat == "hello");
    }

private:
    alias ucmp = comparePropertyName;

    // CTFE-able check used by opDispatch: is there any set called `name`?
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // the length guard mirrors loadAny: never slice a name shorter than "In"
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Loads the set called `name`, trying properties, scripts and ("In"-prefixed)
    // blocks in that order; throws if nothing matches.
    static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}
6943
// Run-time lookup must agree with the sets built directly from the generated tables.
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    auto hebrewBlock = asSet(blocks.Hebrew);
    assert(unicode("InHebrew") == hebrewBlock);
    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);
    auto kharoshthiBlock = asSet(blocks.Kharoshthi);
    assert(unicode("In-Kharoshthi") == kharoshthiBlock);
}
6951
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally

// Case labels for the control characters with '\r' left out;
// mixed into a switch by genericDecodeGrapheme, which tests '\r' separately.
enum controlSwitch = `
    case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
    case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// kill unrolled switches

// True if `ch` lies in U+1F1E6 .. U+1F1FF (inclusive).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstIndicator = '\U0001F1E6';
    enum dchar lastIndicator = '\U0001F1FF';
    return firstIndicator <= ch && ch <= lastIndicator;
}
6966
/+
    Shared implementation of grapheme cluster decoding.

    With `getValue == true` the decoded cluster is collected and returned as a
    `Grapheme`; with `getValue == false` the function only advances `range`
    past one cluster and returns nothing.
+/
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    // Consumes one grapheme cluster from the front of `range` (must not be empty).
    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        enum GraphemeState {
            Start,
            CR,
            RI,
            L,
            V,
            LVT
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // consume the current code point `ch`, recording it when a value is requested
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    mixin(controlSwitch);
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                if (ch == '\n')
                    mixin(eat);
                // CR and CRLF end the cluster right here, exactly like the
                // other controls handled in Start: following extend/spacing
                // marks must not be attached to a line break
                goto L_End;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}
7090
7091 public: // Public API continues
7092
/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units to examine
        index = starting index into `input[]`

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index..$];
    immutable unitsBefore = rest.length;
    // decoding without a value only advances `rest` past one cluster
    genericDecodeGrapheme!(false)(rest);
    return unitsBefore - rest.length;
}

///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    enum plainStride = graphemeStride("A", 0);
    static assert(plainStride == 1);

    enum accentedStride = graphemeStride("A\u0301", 0);
    static assert(accentedStride == 3); // \u0301 has 2 UTF-8 code units
}
7136
/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    // the value-returning flavour of the shared decoder
    alias decodeWithValue = genericDecodeGrapheme!true;
    return decodeWithValue(inp);
}

@safe unittest
{
    import std.algorithm.comparison : equal;

    string s = " \u0020\u0308 ";
    Grapheme gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));

    // leading combining marks form a cluster of their own
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));

    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));
}
7170
/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        private Grapheme _front;

        // an empty cached grapheme marks the end of the range
        @property bool empty()
        {
            return _front.length == 0;
        }

        @property Grapheme front()
        {
            return _front;
        }

        // decode the next cluster into the cache, or clear it on exhausted input
        void popFront()
        {
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_range.save, _front);
            }
        }
    }

    // prime the cache so that `front` is valid right away
    auto graphemes = Result!(Range)(range);
    graphemes.popFront();
    return graphemes;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}
7233
// Minimal input-only (no `save`) range over a string,
// for testing non-forward-range input ranges
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty()
    {
        return s.empty;
    }

    @property dchar front()
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reversed = "le\u0308on";
    assert(reversed.walkLength == 5);

    auto gReversed = reversed.byGrapheme;
    assert(gReversed.walkLength == 4);

    // every string width yields a forward range with the same graphemes
    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReversed));
    }}

    // an input-only source yields an input-only result
    auto inputOnly = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(inputOnly)));
    assert(inputOnly.walkLength == 4);
}
7274
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // position of the current code point inside `_range.front`
        private size_t i = 0;

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // step inside the current grapheme while it still has code points
            if (++i < _range.front.length)
                return;
            // otherwise move on to the start of the next grapheme
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
7324
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // already a range of code points: hand it back untouched
        return range;
    }
    else
    {
        // wrapper that forwards the range primitives to the wrapped narrow string
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            void popFront()
            {
                _range.popFront();
            }

            @property auto save()
            {
                return Result(_range.save);
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popBack()
            {
                _range.popBack();
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}

///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    // narrow strings must not expose a length through byCodePoint
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // input-only all the way down
    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7394
7395 /++
7396 $(P A structure designed to effectively pack $(CHARACTERS)
7397 of a $(CLUSTER).
7398 )
7399
7400 $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
7401 always refer to distinct objects. In most actual scenarios a `Grapheme`
7402 fits on the stack and avoids memory allocation overhead for all but quite
7403 long clusters.
7404 )
7405
7406 See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
7407 +/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        // Read from the heap buffer in "big" mode, otherwise from the inline one.
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        // Write into the heap buffer in "big" mode, otherwise into the inline one.
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        // The slice holds a pointer to this object, hence the lifetime warning above.
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        // In small mode the length lives in the low 7 bits of slen_.
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    // Still room in the inline buffer: store and bump the small length.
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // Heap buffer is full: grow capacity by `grow` elements, with
                // overflow-checked arithmetic for the new byte count.
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                // 3 bytes per element, sized for cap_ + 1 elements.
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            // Appends one code point at a time via the dchar overload.
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // Valid iff decoding a single grapheme consumes the whole content.
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: in "big" mode give the copy its own heap buffer so that the
    // two objects do not share (and later double-free) the same allocation.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            // Same sizing rule as in opOpAssign: 3 bytes for each of cap_ + 1 elements.
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer; the inline (small) representation owns nothing.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // Bytes available for inline storage: everything except the final
    // byte of the struct, which is taken by slen_.
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // Number of 3-byte code points that fit into the inline buffer.
    enum small_cap = small_bytes/3;
    // High bit of slen_ marks "big" (heap) mode; low 7 bits hold the small length.
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // "big" representation: heap buffer plus capacity/length in code points
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // "small" representation: inline bytes followed by the length/flag byte
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the inline content into a freshly allocated heap buffer with
    // capacity `grow` and switches the object to "big" mode.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        // Copy out of small_ first: ptr_/cap_/len_ alias the same bytes.
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    // Sets the "big" mode flag bit in slen_.
    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    // Length stored in the low 7 bits of slen_ (meaningful in small mode).
    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // Non-zero when the heap ("big") representation is active.
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7649
7650 static assert(Grapheme.sizeof == size_t.sizeof*4);
7651
7652
7653 @safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
7654 {
7655 import std.algorithm.comparison : equal;
7656 Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
7657 assert(byGrapheme("ЮУЗ").equal(data[]));
7658 }
7659
7660 ///
7661 @safe unittest
7662 {
7663 import std.algorithm.comparison : equal;
7664 import std.algorithm.iteration : filter;
7665 import std.range : isRandomAccessRange;
7666
7667 string bold = "ku\u0308hn";
7668
7669 // note that decodeGrapheme takes parameter by ref
7670 auto first = decodeGrapheme(bold);
7671
7672 assert(first.length == 1);
7673 assert(first[0] == 'k');
7674
7675 // the next grapheme is 2 characters long
7676 auto wideOne = decodeGrapheme(bold);
7677 // slicing a grapheme yields a random-access range of dchar
7678 assert(wideOne[].equal("u\u0308"));
7679 assert(wideOne.length == 2);
7680 static assert(isRandomAccessRange!(typeof(wideOne[])));
7681
7682 // all of the usual range manipulation is possible
7683 assert(wideOne[].filter!isMark().equal("\u0308"));
7684
7685 auto g = Grapheme("A");
7686 assert(g.valid);
7687 g ~= '\u0301';
7688 assert(g[].equal("A\u0301"));
7689 assert(g.valid);
7690 g ~= "B";
7691 // not a valid grapheme cluster anymore
7692 assert(!g.valid);
7693 // still could be useful though
7694 assert(g[].equal("A\u0301B"));
7695 }
7696
@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}
7706
7707 @safe unittest
7708 {
7709 import std.algorithm.comparison : equal;
7710 import std.algorithm.iteration : map;
7711 import std.conv : text;
7712 import std.range : iota;
7713
7714 // not valid clusters (but it just a test)
7715 auto g = Grapheme('a', 'b', 'c', 'd', 'e');
7716 assert(g[0] == 'a');
7717 assert(g[1] == 'b');
7718 assert(g[2] == 'c');
7719 assert(g[3] == 'd');
7720 assert(g[4] == 'e');
7721 g[3] = 'Й';
7722 assert(g[2] == 'c');
7723 assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
7724 assert(g[4] == 'e');
7725 assert(!g.valid);
7726
7727 g ~= 'ц';
7728 g ~= '~';
7729 assert(g[0] == 'a');
7730 assert(g[1] == 'b');
7731 assert(g[2] == 'c');
7732 assert(g[3] == 'Й');
7733 assert(g[4] == 'e');
7734 assert(g[5] == 'ц');
7735 assert(g[6] == '~');
7736 assert(!g.valid);
7737
7738 Grapheme copy = g;
7739 copy[0] = 'X';
7740 copy[1] = '-';
7741 assert(g[0] == 'a' && copy[0] == 'X');
7742 assert(g[1] == 'b' && copy[1] == '-');
7743 assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
7744 copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
7745 assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
7746 copy ~= "xyz";
7747 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
7748 assert(!copy.valid);
7749
7750 Grapheme h;
7751 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
7752 h ~= v;
7753 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
7754 }
7755
7756 /++
7757 $(P Does basic case-insensitive comparison of `r1` and `r2`.
7758 This function uses simpler comparison rule thus achieving better performance
7759 than $(LREF icmp). However keep in mind the warning below.)
7760
7761 Params:
7762 r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7763 r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7764
7765 Returns:
7766 An `int` that is 0 if the strings match,
7767 <0 if `r1` is lexicographically "less" than `r2`,
7768 >0 if `r1` is lexicographically "greater" than `r2`
7769
7770 Warning:
7771 This function only handles 1:1 $(CODEPOINT) mapping
7772 and thus is not sufficient for certain alphabets
7773 like German, Greek and few others.
7774
7775 See_Also:
7776 $(LREF icmp)
7777 $(REF cmp, std,algorithm,comparison)
7778 +/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path needs indexing and `r[a .. $]` slicing on both ranges.
    // The lambda below is only a compile-time probe and is never executed;
    // it is kept identical to the probe used by icmp.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
            && (isDynamicArray!S2 || isRandomAccessRange!S2)
            && !(isInfinite!S1 && isInfinite!S2)
            && __traits(compiles,
                {
                    size_t s = size_t.max / 2;
                    r1 = r1[s .. $];
                    r2 = r2[s .. $];
                }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // Any non-ASCII code unit ends the fast path at position i.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // The common prefix matched case-insensitively: the longer range is "greater".
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix and continue with decoding.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            // Both are ASCII: a plain lower-case comparison is sufficient.
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else// not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                // only lhs is cased: compare the first entry of its bucket with rhs
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with the first entry of its bucket
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: 0 if r2 is exhausted as well, -1 otherwise.
    return int(r2.empty) - 1;
}
7872
7873 ///
7874 @safe @nogc pure nothrow unittest
7875 {
7876 assert(sicmp("Август", "авгусТ") == 0);
7877 // Greek also works as long as there is no 1:M mapping in sight
7878 assert(sicmp("ΌΎ", "όύ") == 0);
7879 // things like the following won't get matched as equal
7880 // Greek small letter iota with dialytika and tonos
7881 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
7882
7883 // while icmp has no problem with that
7884 assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
7885 assert(icmp("ΌΎ", "όύ") == 0);
7886 }
7887
// overloads for the most common cases to reduce compile time
// Each one forwards to the template with explicit const array types.
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
7900
// Tries to match `lhs` against `rhs` (plus, for multi-code-point foldings,
// the front of `rtail`) using the full case folding table.
// Returns 0 on a match; `rtail` is advanced only when a multi-code-point
// entry matched. Otherwise returns a code point to be used for ordering:
// `lhs` itself when it has no table entry, else the first entry of its bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // The bucket of equivalent entries occupies [bucketStart, bucketEnd).
    immutable bucketStart = hit - fTable[hit].n;
    immutable bucketEnd = fTable[hit].size + bucketStart;
    assert(fTable[bucketStart].entry_len == 1);

    foreach (size_t pos; bucketStart .. bucketEnd)
    {
        immutable len = fTable[pos].entry_len;
        if (len == 1)
        {
            // single code point entry: direct comparison
            if (fTable[pos].seq[0] == rhs)
                return 0;
            continue;
        }
        // a long chunk, like 'ss' for German
        dstring folded = fTable[pos].seq[0 .. len];
        // note: skipOver modifies rtail iff the rest of the chunk matches
        if (rhs == folded[0] && rtail.skipOver(folded[1 .. $]))
            return 0;
    }
    // new remapped character for accurate diffs
    return fTable[bucketStart].seq[0];
}
7937
7938 /++
7939 Does case insensitive comparison of `r1` and `r2`.
7940 Follows the rules of full case-folding mapping.
7941 This includes matching as equal german ß with "ss" and
7942 other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
7943 The cost of `icmp` being pedantically correct is
7944 slightly worse performance.
7945
7946 Params:
7947 r1 = a forward range of characters
7948 r2 = a forward range of characters
7949
    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`
7954
7955 See_Also:
7956 $(LREF sicmp)
7957 $(REF cmp, std,algorithm,comparison)
7958 +/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // The fast path needs indexing and `r[a .. $]` slicing on both ranges;
    // the lambda below is only a compile-time probe and is never executed.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
            && (isDynamicArray!S2 || isRandomAccessRange!S2)
            && !(isInfinite!S1 && isInfinite!S2)
            && __traits(compiles,
                {
                    size_t s = size_t.max / 2;
                    r1 = r1[s .. $];
                    r2 = r2[s .. $];
                }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // Any non-ASCII code unit ends the fast path at position i.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // The common prefix matched case-insensitively: the longer range is "greater".
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix and continue with decoding.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ? 0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        // Both fronts are consumed before the comparison, so the helper
        // below sees only the tails that follow lhs and rhs.
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}
8036
8037 ///
8038 @safe @nogc pure nothrow unittest
8039 {
8040 assert(icmp("Rußland", "Russland") == 0);
8041 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
8042 }
8043
8044 /**
8045 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
8046 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
8047 */
8048 @safe @nogc nothrow pure unittest
8049 {
8050 import std.utf : byDchar;
8051
8052 assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
8053 assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
8054 }
8055
8056 // test different character types
8057 @safe unittest
8058 {
8059 assert(icmp("Rußland", "Russland") == 0);
8060 assert(icmp("Rußland"w, "Russland") == 0);
8061 assert(icmp("Rußland", "Russland"w) == 0);
8062 assert(icmp("Rußland"w, "Russland"w) == 0);
8063 assert(icmp("Rußland"d, "Russland"w) == 0);
8064 assert(icmp("Rußland"w, "Russland"d) == 0);
8065 }
8066
// overloads for the most common cases to reduce compile time
// Each one forwards to the template with explicit const array types.
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8077
8078 @safe unittest
8079 {
8080 import std.algorithm.sorting : sort;
8081 import std.conv : to;
8082 import std.exception : assertCTFEable;
8083 assertCTFEable!(
8084 {
8085 static foreach (cfunc; AliasSeq!(icmp, sicmp))
8086 {{
8087 static foreach (S1; AliasSeq!(string, wstring, dstring))
8088 static foreach (S2; AliasSeq!(string, wstring, dstring))
8089 {
8090 assert(cfunc("".to!S1(), "".to!S2()) == 0);
8091 assert(cfunc("A".to!S1(), "".to!S2()) > 0);
8092 assert(cfunc("".to!S1(), "0".to!S2()) < 0);
8093 assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
8094 assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
8095 assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
8096 assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
8097 assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
8098 // Check example:
8099 assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
8100 assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
8101 }
8102 // check that the order is properly agnostic to the case
8103 auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"];
8104 sort!((a,b) => cfunc(a,b) < 0)(strs);
8105 assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]);
8106 }}
8107 assert(icmp("ßb", "ssa") > 0);
8108 // Check example:
8109 assert(icmp("Russland", "Rußland") == 0);
8110 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
8111 assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
8112 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
8113 // https://issues.dlang.org/show_bug.cgi?id=11057
8114 assert( icmp("K", "L") < 0 );
8115 });
8116 }
8117
8118 // https://issues.dlang.org/show_bug.cgi?id=17372
8119 @safe pure unittest
8120 {
8121 import std.algorithm.iteration : joiner, map;
8122 import std.algorithm.sorting : sort;
8123 import std.array : array;
8124 auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
8125 }
8126
8127 // This is package(std) for the moment to be used as a support tool for std.regex
8128 // It needs a better API
8129 /*
8130 Return a range of all $(CODEPOINTS) that casefold to
8131 and from this `ch`.
8132 */
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Input range with length that takes one of two forms:
    //  - "small": a single code point stored inline (idx == uint.max);
    //  - table-backed: `len` consecutive entries of sTable starting at `idx`.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; // uint.max selects the "small" form, i.e. read `c`
        union
        {
            dchar c; // "small" form payload; 0 means the range is empty
            uint len; // table-backed form: number of entries left
        }

        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property bool isSmall() const { return idx == uint.max; }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
                return c;
            return sTable[idx].ch;
        }

        void popFront()
        {
            if (isSmall)
            {
                c = 0;
                return;
            }
            idx++;
            len--;
        }
    }

    immutable slot = simpleCaseTrie[ch];
    // No table entry: the code point folds only to itself.
    if (slot == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[slot];
    // `n` is the offset of this entry inside its bucket, `size` the bucket length.
    immutable first = slot - entry.n;
    return Range(first, entry.size);
}
8207
8208 @safe unittest
8209 {
8210 import std.algorithm.comparison : equal;
8211 import std.algorithm.searching : canFind;
8212 import std.array : array;
8213 import std.exception : assertCTFEable;
8214 assertCTFEable!((){
8215 auto r = simpleCaseFoldings('Э').array;
8216 assert(r.length == 2);
8217 assert(r.canFind('э') && r.canFind('Э'));
8218 auto sr = simpleCaseFoldings('~');
8219 assert(sr.equal("~"));
8220 //A with ring above - casefolds to the same bucket as Angstrom sign
8221 sr = simpleCaseFoldings('Å');
8222 assert(sr.length == 3);
8223 assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
8224 });
8225 }
8226
8227 /++
8228 $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
8229 +/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    // Plain lookup in the module-level combiningClassTrie.
    return combiningClassTrie[ch];
}
8234
8235 ///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}
8248
8249 @safe pure nothrow @nogc unittest
8250 {
8251 foreach (ch; 0 .. 0x80)
8252 assert(combiningClass(ch) == 0);
8253 assert(combiningClass('\u05BD') == 22);
8254 assert(combiningClass('\u0300') == 230);
8255 assert(combiningClass('\u0317') == 220);
8256 assert(combiningClass('\u1939') == 222);
8257 }
8258
8259 /// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
        Compatibility decomposition. The result is a compatibility equivalent sequence.
        Note: Compatibility decomposition is a $(B lossy) conversion,
        typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8270
8271 /**
8272 Shorthand aliases for character decomposition type, passed as a
8273 template parameter to $(LREF decompose).
8274 */
8275 enum {
8276 Canonical = UnicodeDecomposition.Canonical,
8277 Compatibility = UnicodeDecomposition.Compatibility
8278 }
8279
8280 /++
8281 Try to canonically compose 2 $(CHARACTERS).
8282 Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
8283
8284 The assumption is that `first` comes before `second` in the original text,
8285 usually meaning that the first is a starter.
8286
8287 Note: Hangul syllables are not covered by this function.
8288 See `composeJamo` below.
8289 +/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable packed = compositionJumpTrie[first];
    // ushort.max marks "nothing composes with `first`"
    if (packed == ushort.max)
        return dchar.init;

    // unpack offset and length of the candidate run for `first`
    immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;

    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto sortedRhs = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted();
    immutable target = sortedRhs.lowerBound(second).length;

    // lowerBound only yields an insertion point; confirm it is an exact hit
    if (target != cnt)
    {
        immutable entry = compositionTable[idx+target];
        if (entry.rhs == second)
            return entry.composed;
    }
    return dchar.init;
}
8310
8311 ///
8312 @safe unittest
8313 {
8314 assert(compose('A','\u0308') == '\u00C4');
8315 assert(compose('A', 'B') == dchar.init);
8316 assert(compose('C', '\u0301') == '\u0106');
8317 // note that the starter is the first one
8318 // thus the following doesn't compose
8319 assert(compose('\u0308', 'A') == dchar.init);
8320 }
8321
8322 /++
8323 Returns a full $(S_LINK Canonical decomposition, Canonical)
8324 (by default) or $(S_LINK Compatibility decomposition, Compatibility)
8325 decomposition of $(CHARACTER) `ch`.
8326 If no decomposition is available returns a $(LREF Grapheme)
8327 with the `ch` itself.
8328
8329 Note:
8330 This function also decomposes hangul syllables
8331 as prescribed by the standard.
8332
8333 See_Also: $(LREF decomposeHangul) for a restricted version
8334 that takes into account only hangul syllables but
8335 no other decompositions.
8336 +/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;

    // Select the trie/table pair that matches the requested decomposition type.
    static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }
    else static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }

    immutable offset = mapping[ch];
    // A non-zero offset points at a 0-terminated run inside the table.
    if (offset)
        return Grapheme(table[offset .. $].until(0));
    // not found, check hangul arithmetic decomposition
    return decomposeHangul(ch);
}
8357
8358 ///
8359 @safe unittest
8360 {
8361 import std.algorithm.comparison : equal;
8362
8363 assert(compose('A','\u0308') == '\u00C4');
8364 assert(compose('A', 'B') == dchar.init);
8365 assert(compose('C', '\u0301') == '\u0106');
8366 // note that the starter is the first one
8367 // thus the following doesn't compose
8368 assert(compose('\u0308', 'A') == dchar.init);
8369
8370 assert(decompose('Ĉ')[].equal("C\u0302"));
8371 assert(decompose('D')[].equal("D"));
8372 assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
8373 assert(decompose!Compatibility('¹')[].equal("1"));
8374 }
8375
8376 //----------------------------------------------------------------------------
8377 // Hangul specific composition/decomposition
enum jamoSBase = 0xAC00; // first code point of the composed syllable range
enum jamoLBase = 0x1100; // first leading consonant (L) jamo
enum jamoVBase = 0x1161; // first vowel (V) jamo
enum jamoTBase = 0x11A7; // one below the first trailing consonant (T) jamo
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
enum jamoNCount = jamoVCount * jamoTCount; // syllables per leading consonant
enum jamoSCount = jamoLCount * jamoNCount; // total number of composed syllables
8385
// Tests if `ch` is a Hangul leading consonant jamo,
// i.e. lies in [jamoLBase, jamoLBase + jamoLCount).
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above leading jamo range
    return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
}
8392
// Tests if `ch` is a Hangul trailing consonant jamo,
// i.e. lies in (jamoTBase, jamoTBase + jamoTCount).
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}
8400
// Tests if `ch` is a Hangul vowel jamo,
// i.e. lies in [jamoVBase, jamoVBase + jamoVCount).
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}
8407
// Returns the zero-based index of `ch` within the composed Hangul syllable
// range, or -1 when `ch` lies outside of that range.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8413
// internal helper: compose hangul syllables in place, leaving dchar.init
// in the slots freed by the consumed vowel / trailing consonant jamos
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // Only an <L, V> pair can start a syllable; otherwise step by one.
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }

        immutable int lIdx = seq[pos] - jamoLBase;
        immutable int vIdx = seq[pos+1] - jamoVBase;
        immutable int lvIdx = lIdx * jamoNCount + vIdx * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);

        if (hasTrailing)
        {
            // <L, V, T> collapses into a single LVT syllable
            seq[pos] = jamoSBase + lvIdx + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L, V> collapses into a single LV syllable
            seq[pos] = jamoSBase + lvIdx;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8442
8443 //----------------------------------------------------------------------------
8444 public:
8445
8446 /**
8447 Decomposes a Hangul syllable. If `ch` is not a composed syllable
8448 then this function returns $(LREF Grapheme) containing only `ch` as is.
8449 */
Grapheme decomposeHangul(dchar ch) @safe
{
    // -1 means `ch` is not a composed syllable: hand it back unchanged
    immutable idxS = hangulSyllableIndex(ch);
    if (idxS < 0)
        return Grapheme(ch);

    // split the syllable index into its L, V and T components
    immutable leadIdx = idxS / jamoNCount;
    immutable vowelIdx = (idxS % jamoNCount) / jamoTCount;
    immutable trailIdx = idxS % jamoTCount;

    immutable lead = jamoLBase + leadIdx;
    immutable vowel = jamoVBase + vowelIdx;

    // no trailing consonant: <L, V> decomposition
    if (trailIdx <= 0)
        return Grapheme(lead, vowel);
    // there is a trailing consonant (T): <L, V, T> decomposition
    return Grapheme(lead, vowel, jamoTBase + trailIdx);
}
8465
8466 ///
8467 @safe unittest
8468 {
8469 import std.algorithm.comparison : equal;
8470 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
8471 }
8472
8473 /++
8474 Try to compose hangul syllable out of a leading consonant (`lead`),
8475 a `vowel` and optional `trailing` consonant jamos.
8476
8477 On success returns the composed LV or LVT hangul syllable.
8478
8479 If any of `lead` and `vowel` are not a valid hangul jamo
8480 of the respective $(CHARACTER) class returns dchar.init.
8481 +/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    // both the leading consonant and the vowel are mandatory
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable lIdx = lead - jamoLBase;
    immutable vIdx = vowel - jamoVBase;
    immutable lvIdx = lIdx * jamoNCount + vIdx * jamoTCount;
    immutable dchar lvSyllable = jamoSBase + lvIdx;

    // anything that is not a trailing consonant jamo is ignored: plain LV syllable
    if (!isJamoT(trailing))
        return lvSyllable;
    return lvSyllable + (trailing - jamoTBase);
}
8494
8495 ///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant (T), or passing any codepoint
    // that is not a trailing consonant composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8506
// Checks decompose for both decomposition kinds and re-runs the
// decomposeHangul/composeJamo documentation examples.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` with decomposition kind `T` and compares the
    // resulting code points with the expected string `r`
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8530
/**
    Enumeration type for normalization forms,
    passed as template parameter for functions like $(LREF normalize).
*/
enum NormalizationForm {
    /// canonical decomposition followed by recomposition
    NFC,
    /// canonical decomposition only
    NFD,
    /// compatibility decomposition followed by recomposition
    NFKC,
    /// compatibility decomposition only
    NFKD
}
8541
8542
enum {
    /**
        Shorthand aliases for the values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8555
/++
    Returns `input` string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Params:
        norm = the target normalization form, `NFC` by default
        input = the string to normalize

    Returns:
        `input` itself when no part of it needs normalization,
        otherwise a newly built array holding the normalized text.

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
inout(C)[] normalize(NormalizationForm norm=NFC, C)(return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] delimits the first piece that needs work
    auto anchors = splitNormalized!norm(input);
    // both anchors at the end: nothing to normalize, hand back the same slice
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every region: the decomposed code points
    // and, in parallel, their canonical combining classes
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // the prefix before the region is already normalized, copy verbatim
        app.put(input[0 .. anchors[0]]);
        // decompose every code point of the region
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // record combining classes and stable-sort each run of
        // non-zero-class code points by class
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // park the marker at the end so the sort after the loop
                // sees an empty range unless another run starts
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the run (if any) that extends to the end of the buffer
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            // recomposition starts from the first code point of class 0
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                // recompose returns where it stopped; continue from there
                // until the end of the buffer is reached
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            import std.algorithm.mutation : remove;
            // drop the dchar.init entries before emitting
            // (recompose writes dchar.init over merged code points)
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            // allow the buffers to be appended to in place on the next pass
            decomposed.assumeSafeAppend();
            ccc.length = 0;
            ccc.assumeSafeAppend();
        } ();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    }while (anchors[0] != input.length);
    // whatever is left needs no normalization
    app.put(input[0 .. anchors[0]]);
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}
8659
///
@safe unittest
{
    // any encoding works
    wstring greet = "Hello world";
    // already normalized input comes back without copying
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
8674
// Decomposition inside longer strings, plus a re-run of the
// documentation example of normalize.
@safe unittest
{
    import std.conv : text;

    // the text around the affected code point is kept as is
    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    // superscript digits decompose to plain digits under NFKD
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // check example

    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
8696
// Canonically recomposes code points of `input` onto the starter at
// `input[start]`; works in place. A code point that got merged into the
// starter is overwritten with dchar.init as a sentinel.
// `ccc` holds the combining class of each element of `input`.
// Returns the index at which scanning stopped: either `input.length` or the
// index of a class 0 code point that could not be merged into the starter.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // Combining class of the last code point that stayed unmerged;
    // -1 lies outside of 0 .. 255 so the first candidate is never blocked.
    int blockingCC = -1;
    // input[start] is the starter itself, candidates begin right after it
    size_t cur = start + 1;
    while (cur != input.length)
    {
        immutable candidateCC = ccc[cur];
        bool merged = false;
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        // Here S is input[start], C is input[cur] and, the classes being
        // sorted, blockingCC is the highest class seen between the two.
        if (candidateCC > blockingCC)
        {
            immutable composite = compose(input[start], input[cur]);
            if (composite != dchar.init)
            {
                input[start] = composite;
                input[cur] = dchar.init; // sentinel
                // a merged code point must not block the ones after it,
                // so blockingCC is left untouched
                merged = true;
            }
        }
        if (!merged)
        {
            blockingCC = candidateCC;
            // an unmerged starter ends the scan for this starter
            if (blockingCC == 0)
                break;
        }
        ++cur;
    }
    return cur;
}
8750
// Returns a tuple of 2 indexes that split `input` into: text that is
// already normalized, a piece that needs normalization, and the rest of
// the input starting with a stable code point.
// Both indexes equal `input.length` when nothing needs normalization.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevClass = 0;

    foreach (pos, dchar cp; input)
    {
        static if (norm == NFC)
        {
            // for NFC everything below U+0300 is skipped without any lookup
            if (cp < 0x0300)
            {
                prevClass = 0;
                continue;
            }
        }
        immutable ubyte curClass = combiningClass(cp);
        // a non-zero class lower than the preceding one means the
        // combining marks are out of order
        immutable outOfOrder = curClass != 0 && prevClass > curClass;
        if (outOfOrder || notAllowedIn!norm(cp))
            return seekStable!norm(pos, input);
        prevClass = curClass;
    }
    return tuple(input.length, input.length);
}
8781
// Given the index `idx` of a code point that breaks normalization, finds
// the enclosing region delimited by stable code points, i.e. code points
// with combining class 0 that are allowed in form `norm`.
// Returns tuple(region_start, region_end): region_start is the index of the
// closest stable code point before `idx` (0 if there is none), region_end is
// the index of the first stable code point at or after `idx`
// (`input.length` if there is none).
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // walk backwards from idx
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // the stable code point itself belongs to the region
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Drop the code point just inspected. This must shrink the slice
        // from the back: popFront would leave `br.back` unchanged, so the
        // scan would only ever test one code point and fall back to 0.
        br.popBack();
    }
    // NOTE: can't use find here: "find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    return tuple(region_start, region_end);
}
8814
/**
    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
    form `norm`.

    Params:
        norm = the normalization form to check against
        ch = the code point to test

    Returns:
        `true` if `ch` is always allowed in form `norm`, `false` otherwise.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    // plain negation of the private lookup
    return !notAllowedIn!norm(ch);
}
8823
///
@safe unittest
{
    // e.g. Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    // ASCII capital letter
    assert(allowedIn!NFC('Z'));
}
8834
// Not a user friendly name but more direct: looks `ch` up in the
// quick-check trie of form `norm`; `true` means `ch` is NOT always
// allowed in that form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
    {
        // The condition has to be `false`: a string used as the condition
        // is always true, so the assert could never fire.
        static assert(false, "Unknown normalization form");
    }
    return qcTrie[ch];
}
8850
// Same checks as the documentation example of allowedIn.
@safe unittest
{
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
8859
8860 }
8861
8862 version (std_uni_bootstrap)
8863 {
// Hand-written variant that is only compiled when bootstrapping gen_uni.d,
// the tool that generates up to date optimal versions of all of the
// isXXX functions.
@safe pure nothrow @nogc public bool isWhite(dchar c)
{
    import std.ascii : isWhite;

    // ASCII white space and the line/paragraph separators
    if (isWhite(c) || c == lineSep || c == paraSep)
        return true;
    // the contiguous block U+2000 .. U+200A
    if (c >= '\u2000' && c <= '\u200A')
        return true;
    // the remaining isolated code points
    switch (c)
    {
        case '\u0085':
        case '\u00A0':
        case '\u1680':
        case '\u180E':
        case '\u202F':
        case '\u205F':
        case '\u3000':
            return true;
        default:
            return false;
    }
}
8875 }
8876 else
8877 {
8878
// trusted -> avoid bounds check
// Private accessors for the case-mapping tries and tables.
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // lower case: trie lookups by code point, table lookup by index
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    // title case: trie lookups by code point, table lookup by index
    ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    // upper case: trie lookups by code point, table lookup by index
    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
8898
8899 public:
8900
/++
    Whether or not `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))

    Params:
        c = the code point to test

    Returns:
        `true` if `c` is a whitespace $(CHARACTER), `false` otherwise.
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    import std.internal.unicode_tables : isWhiteGen; // generated file
    return isWhiteGen(c); // call pregenerated binary search
}
8912
/++
    Return whether `c` is a Unicode lowercase $(CHARACTER).

    Params:
        c = the code point to test

    Returns:
        `true` if `c` is lowercase, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;

    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return lowerCaseTrie[c];
    // ASCII is answered by std.ascii
    return isLower(c);
}
8924
@safe unittest
{
    // inside this block plain isLower is std.ascii's,
    // .isLower is the one from this module
    import std.ascii : isLower;
    // both must agree on all of ASCII
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    // Cyrillic
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    // every member of the lowerCase set is lower case and not upper case
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
8944
8945
/++
    Return whether `c` is a Unicode uppercase $(CHARACTER).

    Params:
        c = the code point to test

    Returns:
        `true` if `c` is uppercase, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;

    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return upperCaseTrie[c];
    // ASCII is answered by std.ascii
    return isUpper(c);
}
8957
@safe unittest
{
    // inside this block plain isUpper is std.ascii's,
    // .isUpper is the one from this module
    import std.ascii : isUpper;
    // both must agree on all of ASCII
    // (this loop used to re-check isLower instead of isUpper)
    foreach (v; 0 .. 0x80)
        assert(isUpper(v) == .isUpper(v));
    // Cyrillic
    assert(!.isUpper('й'));
    assert(.isUpper('Ж'));
    // Greek HETA
    assert(.isUpper('\u0370'));
    assert(!.isUpper('\u0371'));
    assert(.isUpper('\u039C')); // capital MU
    assert(!.isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!.isUpper('\u1F00'));
    assert(.isUpper('\u1F18'));
    // every member of the upperCase set is upper case and not lower case
    foreach (v; unicode.upperCase.byCodepoint)
        assert(.isUpper(v) && !.isLower(v));
}
8976
8977
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps a single code point to its simple (1:1) title case form;
// code points without such a mapping are returned unchanged.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    if (c >= 0xAA)
    {
        // ushort.max marks "no mapping" in the index
        immutable size_t slot = toTitleSimpleIndex(c);
        if (slot == ushort.max)
            return c;
        return toTitleTab(slot);
    }
    // fast path: below U+00AA only 'a' .. 'z' change
    if (c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}
8999
// (index lookup, index limit, table lookup) bundles that fill the
// `indexFn, maxIdx, tableFn` template parameters of the generic case
// conversion helpers
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
9002
// generic toUpper/toLower on whole string, creates new or returns as is
//
// indexFn maps a code point to an index into the case table, or to
// ushort.max when the code point has no mapping; indexes below maxIdx are
// 1:1 mappings, the rest are 1:m mappings; tableFn reads the table and
// asciiConvert handles ASCII code points.
// Returns `s` itself (or `s.array` for non-string ranges) when no code point
// has a mapping, otherwise a newly allocated converted array.
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementEncodingType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    auto r = s.byDchar;
    // i tracks the offset of r.front in code units of s
    for (size_t i; !r.empty; i += r.front.codeLength!C , r.popFront())
    {
        auto cOuter = r.front;
        ushort idx = indexFn(cOuter);
        if (idx == ushort.max)
            continue;
        // first code point with a mapping found: copy the untouched
        // prefix and convert everything from here on
        auto result = appender!(C[])();
        result.reserve(s.length);
        result.put(s[0 .. i]);
        foreach (dchar c; s[i .. $].byDchar)
        {
            if (c.isASCII)
            {
                result.put(asciiConvert(c));
            }
            else
            {
                idx = indexFn(c);
                if (idx == ushort.max)
                    result.put(c);
                else if (idx < maxIdx)
                {
                    // 1:1 mapping
                    c = tableFn(idx);
                    result.put(c);
                }
                else
                {
                    // 1:m mapping: the first entry packs the sequence
                    // length into the top 8 bits
                    auto val = tableFn(idx);
                    // unpack length + codepoint
                    immutable uint len = val >> 24;
                    result.put(cast(dchar)(val & 0xFF_FFFF));
                    // the remaining code points follow in the table
                    foreach (j; idx+1 .. idx+len)
                        result.put(tableFn(j));
                }
            }
        }
        return result.data;
    }

    // nothing had a mapping
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}
9058
// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;
    // a short slice at the start of a much larger array
    auto s = "abcdefghij".replicate(300);
    s = s[0 .. 10];

    // the result is deliberately discarded
    toUpper(s);

    // the input slice must be left untouched
    assert(s == "abcdefghij");
}
9070
// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // evaluated at compile time: lower-casing an ASCII letter that follows
    // multi-byte code points must not change the length of the result
    static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length);
}
9076
9077
// generic toUpper/toLower on whole range, returns range
//
// Wraps a range of dchar in a range that yields the case-converted code
// points; indexFn, maxIdx, tableFn and asciiConvert have the same meaning
// as in toCase.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
// Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        // empty once the source is exhausted and no buffered
        // code points are pending
        @property bool empty()
        {
            return !nLeft && r.empty;
        }

        // Converts r.front into buf on first access; buf is filled in
        // reverse order so that buf[nLeft - 1] is always the next
        // code point to yield.
        @property auto front()
        {
            import std.ascii : isASCII;

            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no mapping, pass through
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // 1:1 mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        // 1:m mapping
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the current code point was converted so that
            // nLeft is known even if front was never called
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // advance the source only after the whole expansion was consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

    private:
        Range r;             // the wrapped source range
        uint nLeft;          // number of buffered code points still to yield
        dchar[3] buf = void; // expansion of the current source code point
    }

    return ToCaserImpl(str);
}
9162
/*********************
 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * The conversion happens lazily, one $(CHARACTER) at a time.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: wrap it directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode to dchar and go through the branch above
        return asLowerCase(str.byDchar);
    }
}
9198
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: wrap it directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode to dchar and go through the branch above
        return asUpperCase(str.byDchar);
    }
}
9217
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // the result is a range, so compare with equal
    assert("hEllo".asUpperCase.equal("HELLO"));
}
9225
// explicitly undocumented
// Overload for types that convert to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asLowerCase!(StringTypeOf!Range)(str);
}
9233
// explicitly undocumented
// Overload for types that convert to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asUpperCase!(StringTypeOf!Range)(str);
}
9241
// The as*Case and asCapitalized functions must treat a type that converts
// to string through `alias this` exactly like the string itself.
@safe unittest
{
    // non-copyable wrapper that converts to string via alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // calls func on both the wrapper and the plain string
    // and reports whether the results agree
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto a = func(TestAliasedString(s), args);
        auto b = func(s, args);
        static if (is(typeof(equal(a, b))))
        {
            // For ranges, compare contents instead of object identity.
            return equal(a, b);
        }
        else
        {
            return a == b;
        }
    }
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9271
// asLowerCase/asUpperCase: save, agreement with toLower/toUpper,
// iteration without front, and wide/UTF-32 inputs.
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // UTF-16 and UTF-32 inputs; the results of equal are now asserted
    // instead of being silently discarded
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9311
// generic capitalizer on whole range, returns range
//
// The first code point of `str` is converted with the upper case triple
// (indexFnUpper, maxIdxUpper, tableFnUpper); once its expansion has been
// consumed, the remainder of `str` is served through asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
// Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            // after the first code point only the lower-casing range matters
            return lower ? lwr.empty : !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            // Convert the first code point on first access; buf is filled
            // in reverse order so that buf[nLeft - 1] is the next one to yield.
            if (!nLeft)
            {
                immutable dchar c = r.front;
                const idx = indexFnUpper(c);
                if (idx == ushort.max)
                {
                    // no mapping, pass through
                    buf[0] = c;
                    nLeft = 1;
                }
                else if (idx < maxIdxUpper)
                {
                    // 1:1 mapping
                    buf[0] = tableFnUpper(idx);
                    nLeft = 1;
                }
                else
                {
                    // 1:m mapping
                    immutable val = tableFnUpper(idx);
                    // unpack length + codepoint
                    nLeft = val >> 24;
                    if (nLeft == 0)
                        nLeft = 1;
                    assert(nLeft <= buf.length);
                    buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. nLeft)
                        buf[nLeft - j - 1] = tableFnUpper(idx + j);
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
                lwr.popFront();
            else
            {
                // make sure the first code point was converted so that
                // nLeft is known even if front was never called
                if (!nLeft)
                    front;
                assert(nLeft);
                --nLeft;
                if (!nLeft)
                {
                    // first code point fully consumed:
                    // switch to lower-casing the rest
                    r.popFront();
                    lwr = r.asLowerCase();
                    lower = true;
                }
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

    private:
        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;    // expansion of the first code point
        uint nLeft = 0;         // number of buffered code points still to yield
    }

    return ToCapitalizerImpl(str);
}
9402
/*********************
 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, meaning convert the first
 * character to upper case and subsequent characters to lower case.
 *
 * The conversion happens lazily, one $(CHARACTER) at a time.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: wrap it directly
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode to dchar first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9439
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // first character is upper-cased, the rest lower-cased
    assert("hEllo".asCapitalized.equal("Hello"));
}
9447
// Overload for types that convert to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asCapitalized!(StringTypeOf!Range)(str);
}
9454
// The attributes of this block check that building the range and
// reading its front is @safe, pure, nothrow and @nogc.
@safe pure nothrow @nogc unittest
{
    auto r = "hEllo".asCapitalized();
    assert(r.front == 'H');
}
9460
// asCapitalized: save, a table of inputs and expected outputs,
// iteration without front, and wide/UTF-32 inputs.
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // [input, expected]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // UTF-16 and UTF-32 inputs; the results of equal are now asserted
    // instead of being silently discarded
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    // for a single code point capitalizing equals upper-casing
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9508
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of `c` into `buf` starting at index `idx`
// and returns the index right after the last code unit written.
// Code points above U+10FFFF trigger assert(0).
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    // 1 code unit: 0xxxxxxx
    if (c <= 0x7F)
    {
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    // 2 code units: 110xxxxx 10xxxxxx
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // 3 code units: 1110xxxx 10xxxxxx 10xxxxxx
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // 4 code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    assert(0);
}
9542
// encodeTo on a char buffer: 1-byte and 2-byte encodings written in sequence
@safe unittest
{
    char[] s = "abcd".dup;
    size_t i = 0;
    // ASCII takes a single code unit
    i = encodeTo(s, i, 'X');
    assert(s == "Xbcd");

    // U+00A9 takes two code units, written right after the 'X'
    i = encodeTo(s, i, cast(dchar)'\u00A9');
    assert(s == "X\xC2\xA9d");
}
9553
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of `c` into `buf` starting at index `idx`
// and returns the index right after the last code unit written.
// Throws UTFException if `c` is a surrogate code point;
// code points above U+10FFFF trigger assert(0).
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    if (c > 0x10FFFF)
        assert(0);
    if (c > 0xFFFF)
    {
        // outside of the BMP: high surrogate followed by low surrogate
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return idx + 2;
    }
    // a lone surrogate cannot be encoded
    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9575
// UTF-32 variant: stores `c` at `buf[idx]` and returns the next index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9582
// Case-converts `s` in place for as long as the converted text fits into
// the space already consumed; hands over to toCaseInPlaceAlloc as soon as
// a converted code point does not fit or a 1:m mapping is met.
// indexFn, maxIdx and tableFn have the same meaning as in toCase.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    import std.utf : decode, codeLength;
    size_t curIdx = 0;   // read position
    size_t destIdx = 0;  // write position, never ahead of curIdx
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    size_t lastUnchanged = 0; // start of the pending run of untouched code units
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        size_t startIdx = curIdx;
        // decode advances curIdx past the code point
        immutable ch = decode(s, curIdx);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        assert(destIdx <= curIdx);
    }
    // flush the trailing run of untouched code units
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    // the converted text may be shorter than the original
    s = s[0 .. destIdx];
}
9648
// Helper that precalculates the size, in code units of type C, that `str`
// occupies after case conversion.
// indexFn, maxIdx and tableFn have the same meaning as in toCase.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;
        size_t runStart = 0; // start of the pending run of unchanged code units
        size_t pos = 0;
        while (pos != str.length)
        {
            immutable cpStart = pos;
            // decode advances pos past the code point
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unchanged, stays part of the current run

            // account for the unchanged run that ends here
            total += cpStart - runStart;
            runStart = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping
                immutable cased = tableFn(caseIndex);
                total += codeLength!C(cased);
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence length
                // into the top 8 bits, the rest follow in the table
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar cased = val & 0xFF_FFFF;
                total += codeLength!C(cased);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }
        // trailing unchanged run (zero if there is none)
        return total + (str.length - runStart);
    }
}
9689
// toCaseLength instantiated for lower-casing
@safe unittest
{
    alias toLowerLength = toCaseLength!(LowerTriple);
    // pure ASCII: one code unit each
    assert(toLowerLength("abcd") == 4);
    // five 2-byte Cyrillic letters plus three ASCII digits
    assert(toLowerLength("аБВгд456") == 10+3);
}
9696
// Slower code path that preallocates and then copies the case-converted
// stuff to the new string.
// On entry s[0 .. destIdx] holds the already converted text and
// s[curIdx .. $] the text still to convert; on exit `s` refers to the
// newly allocated array.
// indexFn, maxIdx and tableFn have the same meaning as in toCase.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);

        // exact size of the result, so that a single allocation suffices
        auto trueLength = destIdx + caseLength(s[curIdx..$]);
        C[] ns = new C[trueLength];
        ns[0 .. destIdx] = s[0 .. destIdx];
        size_t runStart = curIdx; // start of the pending run of unchanged code units
        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            // decode advances curIdx past the code point
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // skip over
                continue;

            // copy the unchanged run that precedes this code point
            auto runLen = cpStart - runStart;
            ns[destIdx .. destIdx+runLen] = s[runStart .. cpStart];
            runStart = curIdx;
            destIdx += runLen;

            if (caseIndex < maxIdx)   // 1:1 codepoint mapping
            {
                immutable cased = tableFn(caseIndex);
                destIdx = encodeTo(ns, destIdx, cased);
            }
            else  // 1:m codepoint mapping
            {
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                immutable uint len = val >> 24;
                destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(ns, destIdx, tableFn(j));
            }
        }
        // copy the trailing unchanged run, if any
        if (runStart != s.length)
        {
            auto runLen = s.length - runStart;
            ns[destIdx .. destIdx+runLen] = s[runStart..$];
            destIdx += runLen;
        }
        assert(ns.length == destIdx);
        s = ns;
    }
}
9753
/++
    Converts `s` to lowercase (by performing Unicode lowercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any uppercase characters, then `s` is unaltered.

    Params:
        s = the mutable `char`, `wchar` or `dchar` array to convert; it is
            taken by reference and may end up referring to a different slice
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // the generic in-place converter does all the work
    toCaseInPlace!(LowerTriple)(s);
}
// Non-template overloads for the most common cases to reduce compile time;
// each one forwards to the template instantiated for its character type.
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    { toLowerInPlace!char(s); }
    void toLowerInPlace(ref wchar[] s)
    { toLowerInPlace!wchar(s); }
    void toLowerInPlace(ref dchar[] s)
    { toLowerInPlace!dchar(s); }
}
9775
/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.

    Params:
        s = the mutable `char`, `wchar` or `dchar` array to convert; it is
            taken by reference and may end up referring to a different slice
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // the generic in-place converter does all the work
    toCaseInPlace!(UpperTriple)(s);
}
// Non-template overloads for the most common cases to reduce compile time/code size;
// each one forwards to the template instantiated for its character type.
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    { toUpperInPlace!char(s); }
    void toUpperInPlace(ref wchar[] s)
    { toUpperInPlace!wchar(s); }
    void toUpperInPlace(ref dchar[] s)
    { toUpperInPlace!dchar(s); }
}
9797
/++
    Returns the lowercase equivalent of `c` if `c` is a Unicode uppercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // outside the fast path: consult the simple (1:1) mapping tables
        immutable size_t idx = toLowerSimpleIndex(c);
        if (idx == ushort.max)
            return c;
        return toLowerTab(idx);
    }
    // fast path: below 0xAA only 'A' .. 'Z' are changed
    if (c >= 'A' && c <= 'Z')
        return c + 32;
    return c;
}
9824
/++
    Produces an array equal to `s` except that every character is converted
    to lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    auto lowered = toCase!(LowerTriple, std.ascii.toLower)(s);
    return lowered;
}
9843
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    // non-string random access ranges of characters take the same generic path
    auto lowered = toCase!(LowerTriple, std.ascii.toLower)(s);
    return lowered;
}
9851
// Non-template overloads for the most common cases to reduce compile time;
// each one forwards to the template instantiated for its string type.
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    { return toLower!string(s); }
    wstring toLower(return scope wstring s)
    { return toLower!wstring(s); }
    dstring toLower(return scope dstring s)
    { return toLower!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        // a struct that converts to `string` through `alias this`
        static struct String
        {
            string data;
            alias data this;
        }

        // foo is never called here; presumably the point is only that the
        // call compiles - see the linked issue
        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}
9878
9879
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (ch; 0 .. 0x80)
    {
        assert(std.ascii.toLower(ch) == toLower(ch));
    }
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // an uppercase code point either stays put or maps to a lowercase one
    foreach (upperCp; unicode.upperCase.byCodepoint)
    {
        immutable dchar lowered = toLower(upperCp);
        assert(lowered == upperCp || isLower(lowered), format("%s -> %s", upperCp, lowered));
    }
    assert(toLower("АЯ") == "ая");

    // full-string conversions
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
9898
// https://issues.dlang.org/show_bug.cgi?id=9629
@safe unittest
{
    wchar[] whole = "hello þ world"w.dup;
    // converting a one-element slice must write through to the parent array
    auto middle = whole[6 .. 7];
    toUpperInPlace(middle);
    assert(whole == "hello Þ world");
}
9907
9908
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // ASCII input
    string src = "FoL";
    string lowered = toLower(src);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != src);

    char[] inPlace = src.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Latin Extended-A mixed with ASCII
    src = "A\u0100B\u0101d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Cyrillic omega mixed with ASCII
    src = "A\u0460B\u0461d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // one code point that lowercases to two
    src = "\u0130";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
9959
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    // code unit ranges go through the non-string overload
    auto ascii = "FoL".byCodeUnit;
    assert(cmp(ascii.toLower, "fol") == 0);
    auto mixed = "A\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixed.toLower, "a\u0461b\u0461d") == 0);
}
9969
/++
    Returns the uppercase equivalent of `c` if `c` is a Unicode lowercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // outside the fast path: consult the simple (1:1) mapping tables
        immutable size_t idx = toUpperSimpleIndex(c);
        if (idx == ushort.max)
            return c;
        return toUpperTab(idx);
    }
    // fast path: below 0xAA only 'a' .. 'z' are changed
    if (c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}
10003
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // uppercase lazily, then collect the result in an appender
    auto sink = appender!(char[])();
    copy("hello".map!toUpper, sink);
    assert(sink.data == "HELLO");
}
10015
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (ch; 0 .. 0x80)
    {
        assert(std.ascii.toUpper(ch) == toUpper(ch));
    }
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // a lowercase code point stays put or maps to an uppercase or titlecase one
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        immutable dchar raised = toUpper(lowerCp);
        assert(raised == lowerCp || isUpper(raised) || titlecase[raised],
            format("%x -> %x", lowerCp, raised));
    }
}
10032
/++
    Allocates an array equal to `s` except that every character is converted
    to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    auto raised = toCase!(UpperTriple, std.ascii.toUpper)(s);
    return raised;
}
10051
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    // non-string random access ranges of characters take the same generic path
    auto raised = toCase!(UpperTriple, std.ascii.toUpper)(s);
    return raised;
}
10059
// Non-template overloads for the most common cases to reduce compile time;
// each one forwards to the template instantiated for its string type.
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        // a struct that converts to `string` through `alias this`
        static struct String
        {
            string data;
            alias data this;
        }

        // foo is never called here; presumably the point is only that the
        // call compiles - see the linked issue
        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}
10086
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string src = "FoL";
    string raised;
    char[] inPlace;

    // ASCII input
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised, inPlace);
    assert(cmp(raised, "FOL") == 0);
    assert(raised !is src);

    // Latin Extended-A mixed with ASCII
    src = "a\u0100B\u0101d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0100B\u0100D") == 0);
    assert(raised !is src);

    // Cyrillic omega mixed with ASCII
    src = "a\u0460B\u0461d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0460B\u0460D") == 0);
    assert(raised !is src);
}
10115
@safe unittest
{
    // Checks both the allocating and the in-place conversions of `s` against
    // the expected uppercase (`trueUp`) and lowercase (`trueLow`) forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three specifiers: source, actual result and expected result;
        // every format call below has to supply all three arguments,
        // otherwise formatting the failure message itself throws
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        assert(low == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) low, cast(const(ubyte)[]) trueLow));
        assert(up == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) up, cast(const(ubyte)[]) trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversions must keep the same slice
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
            foreach (j; i .. options.length)
                foreach (k; j .. options.length)
                {
                    auto sample = options[i] ~ options[j] ~ options[k];
                    auto sample2 = options[k] ~ options[j] ~ options[i];
                    doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                        lower[i] ~ lower[j] ~ lower[k]);
                    doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                        lower[k] ~ lower[j] ~ lower[i]);
                }
    }}
}
10172
// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto ascii = "FoL".byCodeUnit;
    assert(cmp(ascii.toUpper, "FOL") == 0);
    auto mixed = "a\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixed.toUpper, "A\u0460B\u0460D") == 0);
}
10183
/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // code points from 0xAA upwards are answered by the lookup table
    if (c >= 0xAA)
        return alphaTrie[c];

    // optimization: below that only the ASCII letters qualify
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
10208
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");
    // every member of the set must be reported as alphabetic
    foreach (cp; alphabetic.byCodepoint)
    {
        assert(isAlpha(cp));
    }
    // and membership must match exactly over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in alphabetic) == isAlpha(cp));
    }
}
10217
10218
/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    // plain table lookup, no fast path
    auto marks = markTrie;
    return marks[c];
}
10228
@safe unittest
{
    auto marks = unicode("Mark");
    // every member of the set must be reported as a mark
    foreach (cp; marks.byCodepoint)
    {
        assert(isMark(cp));
    }
    // and membership must match exactly over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in marks) == isMark(cp));
    }
}
10237
/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // non-ASCII code points are answered by the lookup table
    if (c > 0x7F)
        return numberTrie[c];

    // optimization for ascii case: only the decimal digits qualify
    return c >= '0' && c <= '9';
}
10255
@safe unittest
{
    auto numbers = unicode("N");
    // every member of the set must be reported as a number
    foreach (cp; numbers.byCodepoint)
    {
        assert(isNumber(cp));
    }
    // and membership must match exactly over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in numbers) == isNumber(cp));
    }
}
10264
/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // non-ASCII code points combine the two Unicode classifications
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // optimization for ascii case
    return std.ascii.isAlphaNum(c);
}
10290
@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either set must be reported as alphanumeric
    foreach (cp; numbers.byCodepoint)
    {
        assert(isAlphaNum(cp));
    }
    foreach (cp; alphabetic.byCodepoint)
    {
        assert(isAlphaNum(cp));
    }

    // and the result must equal the union over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
    {
        assert(((cp in numbers) || (cp in alphabetic)) == isAlphaNum(cp));
    }
}
10307
/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // non-ASCII code points are answered by the lookup table
    if (c > 0x7F)
        return punctuationTrie[c];

    // optimization for ascii case
    return std.ascii.isPunctuation(c);
}
10327
@safe unittest
{
    // individual samples
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));

    // every member of the P set must be reported as punctuation
    foreach (cp; unicode("P").byCodepoint)
    {
        assert(isPunctuation(cp));
    }
}
10340
/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    // plain table lookup, no fast path
    auto symbols = symbolTrie;
    return symbols[c];
}
10350
@safe unittest
{
    import std.format : format;

    // individual samples
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));

    // every member of the S set must be reported as a symbol
    foreach (cp; unicode("S").byCodepoint)
    {
        assert(isSymbol(cp), format("%04x", cp));
    }
}
10361
/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file

    immutable bool found = isSpaceGen(c);
    return found;
}
10374
@safe unittest
{
    assert(isSpace('\u0020'));

    auto zs = unicode.Zs;
    // every member of Zs must be reported as a space
    foreach (cp; zs.byCodepoint)
    {
        assert(isSpace(cp));
    }
    // and membership must match exactly over the first 0x1000 code points
    foreach (cp; 0 .. 0x1000)
    {
        assert(isSpace(cp) == zs[cp]);
    }
}
10384
10385
/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // plain table lookup, no fast path
    auto graphical = graphicalTrie;
    return graphical[c];
}
10396
10397
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");
    // every member of the set must be reported as graphical
    foreach (cp; graphical.byCodepoint)
    {
        assert(isGraphical(cp), format("%4x", cp));
    }
    // and membership must match exactly over the first 0x4000 code points
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in graphical) == isGraphical(cp));
    }
}
10407
10408
/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file

    immutable bool found = isControlGen(c);
    return found;
}
10419
@safe unittest
{
    // individual samples
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    auto controls = unicode.Cc;
    // every member of Cc must be reported as a control character
    foreach (cp; controls.byCodepoint)
    {
        assert(isControl(cp));
    }
    // and membership must match exactly over the first 0x1000 code points
    foreach (cp; 0 .. 0x1000)
    {
        assert(isControl(cp) == controls[cp]);
    }
}
10431
10432
/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file

    immutable bool found = isFormatGen(c);
    return found;
}
10443
10444
@safe unittest
{
    assert(isFormat('\u00AD'));
    // every member of the Format set must be reported as a format character
    foreach (cp; unicode("Format").byCodepoint)
    {
        assert(isFormat(cp));
    }
}
10451
// Code points for private use and surrogates are not likely to change in the near future;
// if need be they can be generated from Unicode data as well.
10454
/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // three disjoint ranges, checked in ascending order
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
10466
/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // inclusive range 0xD800 .. 0xDFFF
    if (c < 0xD800)
        return false;
    return c <= 0xDFFF;
}
10476
/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // inclusive range 0xD800 .. 0xDBFF
    if (c < 0xD800)
        return false;
    return c <= 0xDBFF;
}
10485
/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // inclusive range 0xDC00 .. 0xDFFF
    if (c < 0xDC00)
        return false;
    return c <= 0xDFFF;
}
10494
/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // plain table lookup, no fast path
    auto unassigned = nonCharacterTrie;
    return unassigned[c];
}
10505
@safe unittest
{
    auto unassigned = unicode("Cn");
    // every member of Cn must be reported as a non-character
    foreach (cp; unassigned.byCodepoint)
    {
        assert(isNonCharacter(cp));
    }
}
10512
10513 private:
10514 // load static data from pre-generated tables into usable datastructures
10515
10516
// Builds a CodepointSet from the intervals obtained by decompressing `compressed`.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10521
// Wraps the offsets, sizes and data of a trie entry in a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
10526
@safe pure nothrow @nogc @property
{
    import std.internal.unicode_tables; // generated file

    // Each accessor wraps one pre-generated table in a trie, keeps it in a
    // static immutable and returns it.
    // The return type is `auto` on purpose: the compiler then only runs
    // semantic on the return type if the function gets used. The accessors
    // are functions rather than templates to not increase the object size
    // of the caller.
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    //normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    //grapheme breaking algorithm tables
    auto mcTrie()
    {
        import std.internal.unicode_grapheme : mcTrieEntries;
        static immutable trie = asTrie(mcTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    //case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    //simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10644
10645 }// version (!std_uni_bootstrap)
10646