xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/src/std/uni/package.d (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21     $(LREF icmp)
22     $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25     $(LREF isAlpha)
26     $(LREF isAlphaNum)
27     $(LREF isCodepointSet)
28     $(LREF isControl)
29     $(LREF isFormat)
30     $(LREF isGraphical)
31     $(LREF isIntegralPair)
32     $(LREF isMark)
33     $(LREF isNonCharacter)
34     $(LREF isNumber)
35     $(LREF isPrivateUse)
36     $(LREF isPunctuation)
37     $(LREF isSpace)
38     $(LREF isSurrogate)
39     $(LREF isSurrogateHi)
40     $(LREF isSurrogateLo)
41     $(LREF isSymbol)
42     $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45     $(LREF NFC)
46     $(LREF NFD)
47     $(LREF NFKD)
48     $(LREF NormalizationForm)
49     $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52     $(LREF decompose)
53     $(LREF decomposeHangul)
54     $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57     $(LREF compose)
58     $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61     $(LREF CodepointInterval)
62     $(LREF CodepointSet)
63     $(LREF InversionList)
64     $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67     $(LREF codepointSetTrie)
68     $(LREF CodepointSetTrie)
69     $(LREF codepointTrie)
70     $(LREF CodepointTrie)
71     $(LREF toTrie)
72     $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75     $(LREF asCapitalized)
76     $(LREF asLowerCase)
77     $(LREF asUpperCase)
78     $(LREF isLower)
79     $(LREF isUpper)
80     $(LREF toLower)
81     $(LREF toLowerInPlace)
82     $(LREF toUpper)
83     $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86     $(LREF isUtfMatcher)
87     $(LREF MatcherConcept)
88     $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91     $(LREF lineSep)
92     $(LREF nelSep)
93     $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96     $(LREF allowedIn)
97     $(LREF combiningClass)
98     $(LREF Grapheme)
99 ))
100 ))
101 
102     $(P All primitives listed operate on Unicode characters and
103         sets of characters. For functions which operate on ASCII characters
104         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106         used throughout this module see the $(S_LINK Terminology, terminology) section
107         below.
108     )
109     $(P The focus of this module is the core needs of developing Unicode-aware
110         applications. To that effect it provides the following optimized primitives:
111     )
112     $(UL
113         $(LI Character classification by category and common properties:
114             $(LREF isAlpha), $(LREF isWhite) and others.
115         )
116         $(LI
117             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118         )
119         $(LI
120             Converting text to any of the four normalization forms via $(LREF normalize).
121         )
122         $(LI
123             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124             by user-perceived characters, that is by $(LREF Grapheme) clusters.
125         )
126         $(LI
127             Decomposing and composing of individual character(s) according to canonical
128             or compatibility rules, see $(LREF compose) and $(LREF decompose),
129             including the specific version for Hangul syllables $(LREF composeJamo)
130             and $(LREF decomposeHangul).
131         )
132     )
133     $(P It's recognized that an application may need further enhancements
134         and extensions, such as less commonly known algorithms,
135         or tailoring existing ones for region specific needs. To help users
136         with building any extra functionality beyond the core primitives,
137         the module provides:
138     )
139     $(UL
140         $(LI
141             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142             Besides the typical set algebra it provides an unusual feature:
143             a D source code generator for detection of $(CODEPOINTS) in this set.
144             This is a boon for meta-programming parser frameworks,
145             and is used internally to power classification in small
146             sets like $(LREF isWhite).
147         )
148         $(LI
149             A way to construct optimal packed multi-stage tables also known as a
150             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152             construct custom tries that map dchar to value.
153             The end result is a fast and predictable $(BIGOH 1) lookup that powers
154             functions like $(LREF isAlpha) and $(LREF combiningClass),
155             but for user-defined data sets.
156         )
157         $(LI
158             A useful technique for Unicode-aware parsers that perform
159             character classification of encoded $(CODEPOINTS)
160             is to avoid unnecessary decoding at all costs.
161             $(LREF utfMatcher) provides an improvement over the usual workflow
162             of decode-classify-process, combining the decoding and classification
163             steps. By extracting necessary bits directly from encoded
164             $(S_LINK Code unit, code units) matchers achieve
165             significant performance improvements. See $(LREF MatcherConcept) for
166             the common interface of UTF matchers.
167         )
168         $(LI
169             Generally useful building blocks for customized normalization:
170             $(LREF combiningClass) for querying combining class
171             and $(LREF allowedIn) for testing the Quick_Check
172             property of a given normalization form.
173         )
174         $(LI
175             Access to a large selection of commonly used sets of $(CODEPOINTS).
176             $(S_LINK Unicode properties, Supported sets) include Script,
177             Block and General Category. The exact contents of a set can be
178             observed in the CLDR utility, on the
179             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180             of the Unicode website.
181             See $(LREF unicode) for easy and (optionally) compile-time checked set
182             queries.
183         )
184     )
185     $(SECTION Synopsis)
186     ---
187     import std.uni;
188     void main()
189     {
190         // initialize code point sets using script/block or property name
191         // now 'set' contains code points from both scripts.
192         auto set = unicode("Cyrillic") | unicode("Armenian");
193         // same thing but simpler and checked at compile-time
194         auto ascii = unicode.ASCII;
195         auto currency = unicode.Currency_Symbol;
196 
197         // easy set ops
198         auto a = set & ascii;
199         assert(a.empty); // as it has no intersection with ascii
200         a = set | ascii;
201         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202 
203         // some properties of code point sets
204         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205         // testing presence of a code point in a set
206         // is just fine, it is O(logN)
207         assert(!b['$']);
208         assert(!b['\u058F']); // Armenian dram sign
209         assert(b['¥']);
210 
211         // building fast lookup tables, these guarantee O(1) complexity
212         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213         auto oneTrie = toTrie!1(b);
214         // 2-level far more compact but typically slightly slower
215         auto twoTrie = toTrie!2(b);
216         // 3-level even smaller, and a bit slower yet
217         auto threeTrie = toTrie!3(b);
218         assert(oneTrie['£']);
219         assert(twoTrie['£']);
220         assert(threeTrie['£']);
221 
222         // build the trie with the most sensible trie level
223         // and bind it as a functor
224         auto cyrillicOrArmenian = toDelegate(set);
225         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226         assert(balance == "ընկեր!");
227         // compatible with bool delegate(dchar)
228         bool delegate(dchar) bindIt = cyrillicOrArmenian;
229 
230         // Normalization
231         string s = "Plain ascii (and not only), is always normalized!";
232         assert(s is normalize(s));// is the same string
233 
234         string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (decomposed form)
235         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236         assert(nS == "Äffin");
237         assert(nS != nonS);
238         string composed = "Äffin";
239 
240         assert(normalize!NFD(composed) == "A\u0308ffin");
241         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242         assert(normalize!NFKD("2¹⁰") == "210");
243     }
244     ---
245     $(SECTION Terminology)
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a character
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and those described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 (`char`),
309         16-bit code units in the UTF-16 (`wchar`),
310         and 32-bit code units in the UTF-32 (`dchar`).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D `dchar` type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark(M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and  which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries  as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it was a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization)
387     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388         or $(S_LINK Compatibility equivalent, compatibility equivalent)
389         characters in the Unicode Standard make it necessary to have a full, formal
390         definition of equivalence for Unicode strings.
391         String equivalence is determined by a process called normalization,
392         whereby strings are converted into forms which are compared
393         directly for identity. This is the primary goal of the normalization process,
394         see the function $(LREF normalize) to convert into any of
395         the four defined forms.
396     )
397     $(P A very important attribute of the Unicode Normalization Forms
398         is that they must remain stable between versions of the Unicode Standard.
399         A Unicode string normalized to a particular Unicode Normalization Form
400         in one version of the standard is guaranteed to remain in that Normalization
401         Form for implementations of future versions of the standard.
402     )
403     $(P The Unicode Standard specifies four normalization forms.
404         Informally, two of these forms are defined by maximal decomposition
405         of equivalent sequences, and two of these forms are defined
406         by maximal $(I composition) of equivalent sequences.
407             $(UL
408             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                 canonical decomposition) of a character sequence.)
410             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                 compatibility decomposition) of a character sequence.)
412             $(LI Normalization Form C (NFC): The canonical composition of the
413                 $(S_LINK Canonical decomposition, canonical decomposition)
414                 of a coded character sequence.)
415             $(LI Normalization Form KC (NFKC): The canonical composition
416             of the $(S_LINK Compatibility decomposition,
417                 compatibility decomposition) of a character sequence.)
418             )
419     )
420     $(P The choice of the normalization form depends on the particular use case.
421         NFC is the best form for general text, since it's more compatible with
422         strings converted from legacy encodings. NFKC is the preferred form for
423         identifiers, especially where there are security concerns. NFD and NFKD
424         are the most useful for internal processing.
425     )
426     $(SECTION Construction of lookup tables)
427     $(P The Unicode standard describes a set of algorithms that
428         depend on having the ability to quickly look up various properties
429         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430         it is not a trivial task to provide a space-efficient solution for
431         the multitude of properties.
432     )
433     $(P Common approaches such as hash-tables or binary search over
434         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435         Hash-tables have enormous memory footprint and binary search
436         over intervals is not fast enough for some heavy-duty algorithms.
437     )
438     $(P The recommended solution (see Unicode Implementation Guidelines)
439         is using multi-stage tables that are an implementation of the
440         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441         keys and a fixed number of stages. For the remainder of the section
442         this will be called a fixed trie. The following describes a particular
443         implementation that is aimed for the speed of access at the expense
444         of ideal size savings.
445     )
446     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447         Split the number of bits in a key (code point, 21 bits) into 2 components
448         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449          and the other is the number of bits in each page of the trie.
450         The layout of the trie is then an array of size 2^^bits-of-index followed
451         by an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452     )
453     $(P The number of pages is variable (but not less than 1)
454         unlike the number of entries in the index. The slots of the index
455         all have to contain a number of a page that is present. The lookup is then
456         just a couple of operations - slice the upper bits,
457         lookup an index for these, take a page at this index and use
458         the lower bits as an offset within this page.
459 
460         Assuming that pages are laid out consecutively
461         in one array at `pages`, the pseudo-code is:
462     )
463     ---
464     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466     ---
467     $(P Where if `elemsPerPage` is a power of 2 the whole process is
468         a handful of simple instructions and 2 array reads. Subsequent levels
469         of the trie are introduced by recursing on this notion - the index array
470         is treated as values. The number of bits in index is then again
471         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472     )
473 
474     $(P For completeness a level 1 trie is simply an array.
475         The current implementation takes advantage of bit-packing values
476         when the range is known to be limited in advance (such as `bool`).
477         See also $(LREF BitPacked) for enforcing it manually.
478         The major size advantage however comes from the fact
479         that multiple $(B identical pages on every level are merged) by construction.
480     )
481     $(P The process of constructing a trie is more involved and is hidden from
482         the user in a form of the convenience functions $(LREF codepointTrie),
483         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484         In general a set or built-in AA with `dchar` type
485         can be turned into a trie. The trie object in this module
486         is read-only (immutable); it's effectively frozen after construction.
487     )
488     $(SECTION Unicode properties)
489     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490         with specific helpers per category nested within. Consult the
491         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492         when in doubt about the contents of a particular set.
493     )
494     $(P General category sets listed below are only accessible with the
495         $(LREF unicode) shorthand accessor.)
496         $(BOOKTABLE $(B General category ),
497              $(TR $(TH Abb.) $(TH Long form)
498                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499             $(TR $(TD L) $(TD Letter)
500                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501             $(TR $(TD Ll) $(TD Lowercase_Letter)
502                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503             $(TR $(TD Lm) $(TD Modifier_Letter)
504                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505             $(TR $(TD Lo) $(TD Other_Letter)
506                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507             $(TR $(TD Lt) $(TD Titlecase_Letter)
508               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509             $(TR $(TD Lu) $(TD Uppercase_Letter)
510               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511             $(TR $(TD M) $(TD Mark)
512               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513             $(TR $(TD Mc) $(TD Spacing_Mark)
514               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515             $(TR $(TD Me) $(TD Enclosing_Mark)
516               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517             $(TR $(TD Mn) $(TD Nonspacing_Mark)
518               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519             $(TR $(TD C) $(TD Other)
520               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521             $(TR $(TD Cc) $(TD Control) $(TD Pf)
522               $(TD Final_Punctuation)   $(TD -) $(TD Any))
523             $(TR $(TD Cf) $(TD Format)
524               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525     )
526     $(P Sets for other commonly useful properties that are
527         accessible with $(LREF unicode):)
528         $(BOOKTABLE $(B Common binary properties),
529             $(TR $(TH Name) $(TH Name) $(TH Name))
530             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545             $(TR $(TD ID_Continue) $(TD Other_Math)  )
546     )
547     $(P Below is the table with block names accepted by $(LREF unicode.block).
548         Note that the shorthand version $(LREF unicode) requires "In"
549         to be prepended to the names of blocks so as to disambiguate
550         scripts and blocks.
551     )
552     $(BOOKTABLE $(B Blocks),
553         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627     )
628     $(P Below is the table with script names accepted by $(LREF unicode.script)
629         and by the shorthand version $(LREF unicode):)
630         $(BOOKTABLE $(B Scripts),
631             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665     )
666     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667         $(BOOKTABLE $(B Hangul syllable type),
668             $(TR $(TH Abb.) $(TH Long form))
669             $(TR $(TD L)   $(TD Leading_Jamo))
670             $(TR $(TD LV)  $(TD LV_Syllable))
671             $(TR $(TD LVT) $(TD LVT_Syllable) )
672             $(TR $(TD T)   $(TD Trailing_Jamo))
673             $(TR $(TD V)   $(TD Vowel_Jamo))
674     )
675     References:
676         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678         $(HTTP www.unicode.org, The Unicode Consortium),
679         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
685     Trademarks:
686         Unicode(tm) is a trademark of Unicode, Inc.
687 
688     Copyright: Copyright 2013 -
689     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690     Authors:   Dmitry Olshansky
691     Source:    $(PHOBOSSRC std/uni/package.d)
692     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693 
694 Macros:
695 
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706 
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709     front, hasLength, hasSlicing, isForwardRange, isInputRange,
710     isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712     isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714 
715 debug(std_uni) import std.stdio; // writefln, writeln
716 
717 private:
718 
719 
// Element-wise copy of `src` into `dest`, visiting indices from the last one
// down to zero. Both slices must have the same length.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
726 
// Element-wise copy of `src` into `dest`, visiting indices from zero upwards.
// Both slices must have the same length.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
733 
// Compile-time flag: true on targets listed below, false everywhere else.
// PackedPtrImpl consults it before reading/writing packed items through
// ushort/uint pointers.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
743 
// `public` is spelled out because these follow the module-level `private:` label.
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line (NEL).
747 
// test the intro example
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table is essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by U+0308 (combining diaeresis)
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
806 
// Largest code point value (0x10FFFF) used as the upper bound in this module.
enum lastDchar = 0x10FFFF;
808 
// Converts `from` to the integral type T with an explicit cast.
// The value is asserted to lie within [T.min, T.max] first.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(T.min <= from && from <= T.max);
    return cast(T) from;
}
815 
// Converts `from` to the BitPacked type T: the value is asserted to fit in
// bitSizeOf!T bits, cast to the underlying type and wrapped in T.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    enum largest = 2^^bitSizeOf!T-1;
    assert(from <= largest);
    return T(cast(TypeOfBitPacked!T) from);
}
822 
// Identity overload: source and target types already match, so the value
// is handed back untouched.
auto force(T, F)(F from)
if (is(F == T))
{
    return from;
}
828 
// Repeats the bit pattern held in `val` `times` times, assuming its length
// is `bits`. Copies are laid out back to back starting at the least
// significant bit.
//
// Params:
//   times = (template) number of copies wanted
//   bits  = (template) width of one copy in bits
//   val   = the pattern; expected to fit in `bits` bits
// Returns: a size_t holding `times` adjacent copies of the pattern.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // single-bit pattern: the result is simply `times` ones (or zero)
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t rather than the int literal `1`, so that
            // 32 <= times < 64 works on 64-bit targets as well
            return val ? (size_t(1) << times) - 1 : 0;
    }
    else static if (times % 2)
        // odd count: peel one copy off, replicate the rest
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
846 
// Checks replicateBits for 1..10 copies of a 3-bit all-ones pattern and of a
// 2-bit `01` pattern.
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 are 3*i consecutive ones, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
859 
// multiple arrays squashed into one memory block
//
// Level `i` stores `sz[i]` items of type `Types[i]`, bit-packed into the
// size_t words of `storage` beginning at word `offsets[i]`. Levels are laid
// out one after another in the order of `Types`.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zeroed storage for one array per type; `sizes[i]` is the
    // item count of level `i`. Exactly `Types.length` sizes are expected.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            // each level starts right after the words of the previous one
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and raw words) without
    // copying the words.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over level `n`.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer (no length information) to the start of level `n`.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of items in level `n`.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level `n` to `new_size` items, moving the data of all
        // following levels so that every level keeps its contents.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // first storage word of level n (raw_ptr treats level 0 as word 0)
                static if (n == 0)
                    enum size_t head = 0;
                else
                    immutable size_t head = offsets[n];
                // one past the last storage word currently reserved for level n
                static if (n != dim-1)
                    immutable size_t tail = offsets[n+1];
                else
                    immutable size_t tail = storage.length;
                // words that must stay to hold the remaining items; releasing
                // spaceFor(removed items) instead could free a word still in use
                immutable size_t keep = spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                if (tail - head > keep)
                {
                    immutable size_t delta = tail - head - keep;
                    // move all data past this array towards the front,
                    // first-one-goes-first so overlapping words are read
                    // before they are overwritten
                    static if (n != dim-1)
                    {
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                    storage.length -= delta;
                }
            }
            // else - NOP
        }
    }

    // Bytes of storage used by level `n`, or by the whole structure when
    // `n` is left at its default.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw storage words to `sink` as three
    // comma-separated bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first storage word of level `n`.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
987 
// Exercises MultiArray resizing: levels are grown in various orders and the
// contents of the other levels are re-checked after every resize. The whole
// scenario runs both at compile time and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // growing level 0 must leave level 1 intact
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // growing the last level keeps its first 17 items
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink level 0, grow level 1, then refill both
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // compile-time evaluation
    auto rt = dg(); // run-time evaluation
}
1060 
// Five-level MultiArray of 3/4/3/6-bit packed values plus a bool level:
// verifies that writes to one level never disturb the others.
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit ranges of a 16-bit index that select an item on levels 0..3
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // expects item i of level lvl to equal i
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores i into item i of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // size the levels last-to-first, each to the full range of its bit width
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    // earlier levels must be unaffected by the later fills
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1125 
// Number of size_t words needed to hold `new_len` items of `_bits` bits each,
// with the item width first rounded to a power of two (see PackedArrayView).
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    static if (_bits == 1)
        enum size_t bits = 1;
    else
        enum size_t bits = nextPow2(_bits - 1);

    static if (bits <= wordBits)
    {
        // several items share one word
        enum perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord; // rounded up
    }
    else
    {
        // one item spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
1141 
// True for types that can be stored bit-packed: BitPacked wrappers,
// integral types, bool and character types.
enum isBitPackableType(T) = isBitPacked!T
    || isIntegral!T || is(T == bool) || isSomeChar!T;
1147 
1148 //============================================================================
// Selects the PackedArrayViewImpl instance for T, with the item width
// rounded to a power of two (1 bit stays 1 bit).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    static if (bits > 1)
        alias PackedArrayView = PackedArrayViewImpl!(T, nextPow2(bits - 1));
    else
        alias PackedArrayView = PackedArrayViewImpl!(T, 1);
}
1157 
// Unsafe and fast access to a chunk of RAM as if it contains packed values.
// Selects the PackedPtrImpl instance for T, with the item width rounded to
// a power of two (1 bit stays 1 bit).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    static if (bits > 1)
        alias PackedPtr = PackedPtrImpl!(T, nextPow2(bits - 1));
    else
        alias PackedPtr = PackedPtrImpl!(T, 1);
}
1167 
// Unchecked indexed access to items of `bits` bits each, packed `factor` per
// size_t word starting at `origin`. No length is tracked; callers are
// responsible for staying in range.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: pick word n / factor, shift item n % factor down, mask it.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the item's bit field in its word, then OR in `val`.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: an item is exactly a ubyte/ushort/uint/ulong, so the storage
    // can be reinterpreted as an array of that type.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        // U is the native unsigned type as wide as one packed item
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Direct U-typed load on little-endian targets at run time;
        // CTFE and other targets go through simpleIndex.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Direct U-typed store on little-endian targets at run time;
        // CTFE and other targets go through simpleWrite.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Sub-byte (or otherwise non-native) item width: always shift and mask.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the low `bits` bits set
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin;
}
1274 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items starting at item index `ofs` (counted from the
// beginning of the underlying storage). All public indices are relative to
// the view, i.e. valid indices are 0 .. limit.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // origin - first word of the underlying storage
    // offset - index of the first item visible through the view
    // items  - number of items visible through the view
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns: true if every item in [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // the range ends before the next word boundary: check item by item;
        // comparing the *rounded* start keeps the head loop below from
        // running past `e`
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Sets every item in [start, end) to `val`, writing whole words at a
    // time for the word-aligned middle part of the range.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over items [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` is relative to the view, just like `limit`; adding `ofs`
        // here would wrongly reject valid slices of offsetted views
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Item-wise equality with another view; compares whole words when both
    // views start on a word boundary and cover a whole number of words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an item index up/down to a multiple of `factor` (a word boundary)
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1432 
1433 
// Random-access range over the index window [from, to) of an indexable
// container, which is referenced through a pointer and never copied.
private struct SliceOverIndexed(T)
{
    // whether the container supports `c[i] = item` / `c[a .. b] = item`
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // ---- size queries ----
    @property bool empty()const { return from == to; }

    @property size_t length()const { return to-from;}

    alias opDollar = length;

    // ---- element access ----
    @property auto front()const { return (*arr)[from]; }

    @property auto back()const { return (*arr)[to-1]; }

    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    do
    {
        immutable absolute = from+idx;
        return (*arr)[absolute];
    }

    static if (assignableIndex)
    {
        @property void front(Item val) { (*arr)[from] = val; }

        @property void back(Item val) { (*arr)[to-1] = val; }

        void opIndexAssign(Item val, size_t idx)
        in
        {
            assert(idx < to - from);
        }
        do
        {
            immutable absolute = from+idx;
            (*arr)[absolute] = val;
        }
    }

    // ---- slicing ----
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(V)(V val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    // ---- range primitives ----
    @property auto save() inout { return this; }

    void popFront() { from++; }

    void popBack() { to--; }

    // Element-wise comparison against anything with `length` and indexing.
    bool opEquals(R)(auto ref R arr) const
    {
        if (arr.length != length)
            return false;
        foreach (i; 0 .. length)
        {
            if (this[i] != arr[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1511 
@safe pure nothrow @nogc unittest
{
    // the view over a plain array must satisfy the random-access range trait
    alias View = SliceOverIndexed!(int[]);
    static assert(isRandomAccessRange!View);
}
1516 
// Builds a read-only SliceOverIndexed covering [a, b) of the object `*x`.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
1522 
// Mutable counterpart of the overload taking `const(T)*`.
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
1530 
// Exercises SliceOverIndexed through sliceOverIndexed: range primitives,
// element and slice assignment, re-slicing, equality, and an empty view.
@safe unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // an empty view over a null array; previously `nullArr` was declared
    // but the view was built over `idxArray`, leaving this case untested
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
    assert(nullSlice.length == 0);
}
1566 
// Wraps `items` packed elements stored at `ptr` in a PackedArrayView that
// starts at offset 0.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    alias View = typeof(return);
    return View(ptr, 0, items);
}
1571 
1572 
1573 //============================================================================
1574 // Partially unrolled binary search using Shar's method
1575 //============================================================================
1576 
/*
    Generates D source, meant to be mixed in, for the unrolled tail of a
    halving-step binary search over steps smaller than `size`.

    The generated code refers to `range`, `needle`, `pred`, `idx` and `m`,
    which the mixin site must have in scope. It switches on the position of
    the highest set bit of `m` and falls through one `case` per halving
    step, finishing with the single-element check in `case 0`.

    Params:
        size = upper bound on the step; must pass isPow2OrZero

    Returns: the generated code as a string
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr(0) has an undefined result, so a zero step (range of length 1)
    // is routed to `case 0` explicitly instead of going through bsr
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = bsr(size);
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        // in this snippet every "m" becomes the literal step and "pow" the
        // case label; the header above is not subject to the replacement
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1611 
// True when `sz` is zero or has exactly one bit set.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // clearing the lowest set bit leaves nothing only for 0 and powers of two
    immutable size_t withoutLowestBit = sz & (sz - 1);
    return withoutLowestBit == 0;
}
1617 
/*
    Halving-step binary search over a range whose length is zero or a power
    of two.

    Starting from index 0, the step `m` begins at half the length and is
    halved every round; `idx` advances by `m` whenever
    `pred(range[idx+m], needle)` holds. A final check of `range[idx]` adds one.

    Returns: the resulting index, or 0 for an empty range
*/
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    // the assert admits a zero length, but the final check below reads
    // range[0]; an empty range has nothing to search
    if (range.length == 0)
        return 0;
    size_t idx = 0, m = range.length/2;
    while (m != 0)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    if (pred(range[idx], needle))
        idx += 1;
    return idx;
}
1633 
/*
    Same search as uniformLowerBound, but once the step drops below `max`
    the remaining rounds are executed by the switch that
    genUnrolledSwitchSearch generates.

    Returns: the resulting index, or 0 for an empty range
*/
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    // lengths 0 and 1 give a zero step `m`: the mixed-in code would call
    // bsr on zero (undefined result) and, for an empty range, read range[0].
    // Answer these two cases directly.
    if (range.length < 2)
        return (range.length == 1 && pred(range[0], needle)) ? 1 : 0;
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1649 
// Lifts a search that only handles power-of-two lengths (`uniLowerBound`)
// to ranges of any length by choosing one of two power-of-two windows.
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;

        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);

        // largest power-of-two prefix of the range
        size_t head = truncPow2(range.length);
        if (!pred(range[head-1], needle))
            return uniLowerBound!pred(range[0 .. head], needle);

        // search in another 2^^k area that fully covers the tail of range
        size_t tail = nextPow2(range.length - head + 1);
        return range.length - tail + uniLowerBound!pred(range[$-tail..$], needle);
    }
}
1672 
1673 alias sharLowerBound = sharMethod!uniformLowerBound;
1674 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1675 
// Checks both Shar searches against std.range's lowerBound on a sorted
// array of multiples of five, and on an empty array.
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference answer: number of elements below `needle`
    auto stdLowerBound(T)(T[] range, T needle)
    {
        auto below = assumeSorted(range).lowerBound(needle);
        return below.length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto expected = stdLowerBound(arr, i);
        assert(expected == sharLowerBound(arr, i));
        assert(expected == sharSwitchLowerBound(arr, i));
    }
    arr = [];
    auto onEmpty = stdLowerBound(arr, 33);
    assert(onEmpty == sharLowerBound(arr, 33));
    assert(onEmpty == sharSwitchLowerBound(arr, 33));
}
1699 //============================================================================
1700 
1701 @safe
1702 {
// hope to see similar stuff in public interface... once Allocators are out
1704 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1705 
/*
    Replaces `dest[from .. to]` with the contents of `stuff`, growing or
    shrinking `dest` when the lengths differ.

    `dest` is resized through its `length` property when `Policy` is `void`,
    and through `Policy.realloc` otherwise.

    Returns: the index in `dest` just past the inserted elements
*/
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    immutable size_t removed = to - from;
    immutable size_t inserted = stuff.length;
    immutable size_t stuffEnd = from + inserted;

    if (inserted > removed)
    {// replace increases length
        immutable size_t grow = inserted - removed;
        static if (is(Policy == void))
            dest.length = dest.length + grow;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length + grow);
        // move the old tail to its new place first, then fill the gap
        copyBackwards(dest[to .. dest.length - grow],
            dest[to + grow .. dest.length]);
        copyForward(stuff, dest[from .. stuffEnd]);
    }
    else if (inserted < removed)
    {// replace decreases length
        immutable size_t shrink = removed - inserted;
        copy(stuff, dest[from .. stuffEnd]);
        // pull the old tail left over the freed space, then cut the storage
        copyForward(dest[to .. dest.length],
            dest[stuffEnd .. dest.length - shrink]);
        static if (is(Policy == void))
            dest.length = dest.length - shrink;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length - shrink);
    }
    else
    {// same length: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    return stuffEnd;
}
1740 
1741 
// Simple storage manipulation policy: every operation works on ordinary
// D dynamic arrays.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // copy of `arr`
    static T[] dup(T)(const T[] arr)
    {
        auto copy = arr.dup;
        return copy;
    }

    // new array of `size` elements
    static T[] alloc(T)(size_t size)
    {
        return new T[](size);
    }

    // `arr` resized to `sz` elements
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // replaces dest[from .. to] with `stuff`
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // appends a single value
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // appends every element of an input range
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // mutable arrays: in debug builds overwrite the contents with a marker
    // value before dropping the reference
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            alias Elem = typeof(T.init[0]);
            arr[] = cast(Elem)(0xdead_beef);
        }
        arr = null;
    }

    // qualified arrays: only drop the reference
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }
}
1796 
// Storage manipulation policy with the same set of operations as GcPolicy,
// but backed by malloc/realloc/free through std.internal.memory and
// core.memory.pureFree.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // copy of `arr` in freshly allocated storage
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates `size` elements; an empty request yields `null`.
    // Aborts (assert(0)) when the byte count overflows size_t.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // A zero-byte malloc is allowed to return null.
        // NOTE(review): this assumes enforceMalloc treats a null result as an
        // allocation failure - confirm. Returning null here mirrors how
        // realloc below already handles a zero size, and destroy accepts it.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` elements; a zero size frees the storage and
    // returns `null`. Aborts (assert(0)) on byte-count overflow.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // replaces dest[from .. to] with `stuff`, resizing through this policy
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // appends a single value
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        // growing by one must not wrap the length
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // appends every element of an input range with a known length
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // frees the storage (if any) and resets `arr` to null
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1893 
1894 //build hack
1895 alias _RealArray = CowArray!ReallocPolicy;
1896 
// ReallocPolicy.replaceImpl: growing, shrinking and same-size replacement.
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // replaces orig[from .. to] with `toReplace`, reports whether the
        // outcome equals `result`, and frees `orig` on the way out
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            replaceImpl(orig, from, to, toReplace);
            scope(exit) destroy(orig);
            return equal(orig, result);
        }
        // policy-allocated copy of the arguments
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1926 
/**
    Tests if `T` is some kind of a set of code points, i.e. an instance of
    $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1937 
/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
     ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // indices 0 and 1 must be readable as `V` ...
    private enum hasBothEnds = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // ... while index 2 must be rejected at compile time
    private enum hasThird = is(typeof((T x){ V c = x[2]; }));
    enum isIntegralPair = hasBothEnds && !hasThird;
}
1954 
1955 
1956 /**
1957     The recommended default type for set of $(CODEPOINTS).
1958     For details, see the current implementation: $(LREF InversionList).
1959 */
1960 public alias CodepointSet = InversionList!GcPolicy;
1961 
1962 
1963 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1964 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1965 // hence below doesn't seem to work
1966 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1967 
/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // both bounds; `alias this` exposes them through plain indexing
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    // stores `low` at index 0 and `high` at index 1
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }
    // equal to anything indexable whose elements 0 and 1 match the bounds
    bool opEquals(T)(T val) const
    {
        if (this[0] != val[0])
            return false;
        return this[1] == val[1];
    }
    // element 0
    @property ref inout(uint) a() return inout
    {
        return _tuple[0];
    }
    // element 1
    @property ref inout(uint) b() return inout
    {
        return _tuple[1];
    }
}
1993 
1994 /**
1995     $(P
1996     `InversionList` is a set of $(CODEPOINTS)
1997     represented as an array of open-right [a, b$(RPAREN)
1998     intervals (see $(LREF CodepointInterval) above).
1999     The name comes from the way the representation reads left to right.
2000     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2001     plus a singular value 60 looks like this:
2002     )
2003     ---
2004     10, 50, 60, 61, 80, 90
2005     ---
2006     $(P
2007     The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2009     the contrary). Then switch positive/negative after each
2010     number passed from left to right.
2011     )
2012     $(P This way negative spans until 10, then positive until 50,
2013     then negative until 60, then positive until 61, and so on.
2014     As seen this provides a space-efficient storage of highly redundant data
2015     that comes in long runs. A description which Unicode $(CHARACTER)
2016     properties fit nicely. The technique itself could be seen as a variation
2017     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2018     )
2019 
2020     $(P Sets are value types (just like `int` is) thus they
2021         are never aliased.
2022     )
2023         Example:
2024         ---
2025         auto a = CodepointSet('a', 'z'+1);
2026         auto b = CodepointSet('A', 'Z'+1);
2027         auto c = a;
2028         a = a | b;
2029         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2030         assert(a != c);
2031         ---
2032     $(P See also $(LREF unicode) for simpler construction of sets
2033         from predefined ones.
2034     )
2035 
2036     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2037     The value semantics are achieved by using the
2038     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2039     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2040     )
2041 
2042     Note:
2043     $(P It's not recommended to rely on the template parameters
2044     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2045     The type and parameters may change when the standard
2046     allocators design is finalized.
2047     Use $(LREF isCodepointSet) with templates or just stick with the default
2048     alias $(LREF CodepointSet) throughout the whole code base.
2049     )
2050 */
2051 public struct InversionList(SP=GcPolicy)
2052 {
2053     import std.range : assumeSorted;
2054 
2055     /**
2056         Construct from another code point set of any type.
2057     */
2058     this(Set)(Set set) pure
2059         if (isCodepointSet!Set)
2060     {
2061         uint[] arr;
2062         foreach (v; set.byInterval)
2063         {
2064             arr ~= v.a;
2065             arr ~= v.b;
2066         }
2067         data = CowArray!(SP).reuse(arr);
2068     }
2069 
2070     /**
2071         Construct a set from a forward range of code point intervals.
2072     */
2073     this(Range)(Range intervals) pure
2074         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2075     {
2076         uint[] arr;
2077         foreach (v; intervals)
2078         {
2079             SP.append(arr, v.a);
2080             SP.append(arr, v.b);
2081         }
2082         data = CowArray!(SP).reuse(arr);
2083         sanitize(); //enforce invariant: sort intervals etc.
2084     }
2085 
2086     //helper function that avoids sanity check to be CTFE-friendly
2087     private static fromIntervals(Range)(Range intervals) pure
2088     {
2089         import std.algorithm.iteration : map;
2090         import std.range : roundRobin;
2091         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2092             intervals.save.map!"a[1]"());
2093         InversionList set;
2094         set.data = CowArray!(SP)(flattened);
2095         return set;
2096     }
2097     //ditto untill sort is CTFE-able
2098     private static fromIntervals()(uint[] intervals...) pure
2099     in
2100     {
2101         import std.conv : text;
2102         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2103         for (uint i = 0; i < intervals.length; i += 2)
2104         {
2105             auto a = intervals[i], b = intervals[i+1];
2106             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2107         }
2108     }
2109     do
2110     {
2111         InversionList set;
2112         set.data = CowArray!(SP)(intervals);
2113         return set;
2114     }
2115 
2116     /**
2117         Construct a set from plain values of code point intervals.
2118     */
2119     this()(uint[] intervals...)
2120     in
2121     {
2122         import std.conv : text;
2123         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2124         for (uint i = 0; i < intervals.length; i += 2)
2125         {
2126             auto a = intervals[i], b = intervals[i+1];
2127             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2128         }
2129     }
2130     do
2131     {
2132         data = CowArray!(SP)(intervals);
2133         sanitize(); //enforce invariant: sort intervals etc.
2134     }
2135 
2136     ///
2137     pure @safe unittest
2138     {
2139         import std.algorithm.comparison : equal;
2140 
2141         auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
2142         foreach (v; 'a'..'z'+1)
2143             assert(set[v]);
2144         // Cyrillic lowercase interval
2145         foreach (v; 'а'..'я'+1)
2146             assert(set[v]);
2147         //specific order is not required, intervals may interesect
2148         auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
2149         //the same end result
2150         assert(set2.byInterval.equal(set.byInterval));
2151         // test constructor this(Range)(Range intervals)
2152         auto chessPiecesWhite = CodepointInterval(9812, 9818);
2153         auto chessPiecesBlack = CodepointInterval(9818, 9824);
2154         auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2155         foreach (v; '♔'..'♟'+1)
2156             assert(set3[v]);
2157     }
2158 
2159     /**
2160         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2161     */
2162     @property auto byInterval() scope
2163     {
2164         // TODO: change this to data[] once the -dip1000 errors have been fixed
2165         // see e.g. https://github.com/dlang/phobos/pull/6638
2166         import std.array : array;
2167         return Intervals!(typeof(data.array))(data.array);
2168     }
2169 
2170     @safe unittest
2171     {
2172         import std.algorithm.comparison : equal;
2173         import std.typecons : tuple;
2174 
2175         auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2176 
2177         assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2178     }
2179 
2180     package(std) @property const(CodepointInterval)[] intervals() const
2181     {
2182         import std.array : array;
2183         return Intervals!(typeof(data[]))(data[]).array;
2184     }
2185 
2186     /**
2187         Tests the presence of code point `val` in this set.
2188     */
2189     bool opIndex(uint val) const
2190     {
2191         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2192         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2193         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2194     }
2195 
2196     ///
2197     pure @safe unittest
2198     {
2199         auto gothic = unicode.Gothic;
2200         // Gothic letter ahsa
2201         assert(gothic['\U00010330']);
2202         // no ascii in Gothic obviously
2203         assert(!gothic['$']);
2204     }
2205 
2206 
2207     // Linear scan for `ch`. Useful only for small sets.
2208     // TODO:
2209     // used internally in std.regex
2210     // should be properly exposed in a public API ?
2211     package(std) auto scanFor()(dchar ch) const
2212     {
2213         immutable len = data.length;
2214         for (size_t i = 0; i < len; i++)
2215             if (ch < data[i])
2216                 return i & 1;
2217         return 0;
2218     }
2219 
2220     /// Number of $(CODEPOINTS) in this set
2221     @property size_t length()
2222     {
2223         size_t sum = 0;
2224         foreach (iv; byInterval)
2225         {
2226             sum += iv.b - iv.a;
2227         }
2228         return sum;
2229     }
2230 
2231 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2232 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2233 //============================================================================
2234 public:
2235     /**
2236         $(P Sets support natural syntax for set algebra, namely: )
2237         $(BOOKTABLE ,
2238             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2239             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2240             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2241             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2242             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2243         )
2244     */
2245     This opBinary(string op, U)(U rhs)
2246         if (isCodepointSet!U || is(U:dchar))
2247     {
2248         static if (op == "&" || op == "|" || op == "~")
2249         {// symmetric ops thus can swap arguments to reuse r-value
2250             static if (is(U:dchar))
2251             {
2252                 auto tmp = this;
2253                 mixin("tmp "~op~"= rhs; ");
2254                 return tmp;
2255             }
2256             else
2257             {
2258                 static if (is(Unqual!U == U))
2259                 {
2260                     // try hard to reuse r-value
2261                     mixin("rhs "~op~"= this;");
2262                     return rhs;
2263                 }
2264                 else
2265                 {
2266                     auto tmp = this;
2267                     mixin("tmp "~op~"= rhs;");
2268                     return tmp;
2269                 }
2270             }
2271         }
2272         else static if (op == "-") // anti-symmetric
2273         {
2274             auto tmp = this;
2275             tmp -= rhs;
2276             return tmp;
2277         }
2278         else
2279             static assert(0, "no operator "~op~" defined for Set");
2280     }
2281 
2282     ///
2283     pure @safe unittest
2284     {
2285         import std.algorithm.comparison : equal;
2286         import std.range : iota;
2287 
2288         auto lower = unicode.LowerCase;
2289         auto upper = unicode.UpperCase;
2290         auto ascii = unicode.ASCII;
2291 
2292         assert((lower & upper).empty); // no intersection
2293         auto lowerASCII = lower & ascii;
2294         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2295         // throw away all of the lowercase ASCII
2296         assert((ascii - lower).length == 128 - 26);
2297 
2298         auto onlyOneOf = lower ~ ascii;
2299         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2300         assert(onlyOneOf['$']); // ASCII and not lowercase
2301         assert(!onlyOneOf['a']); // ASCII and lowercase
2302         assert(onlyOneOf['я']); // not ASCII but lowercase
2303 
2304         // throw away all cased letters from ASCII
2305         auto noLetters = ascii - (lower | upper);
2306         assert(noLetters.length == 128 - 26*2);
2307     }
2308 
2309     /// The 'op=' versions of the above overloaded operators.
2310     ref This opOpAssign(string op, U)(U rhs)
2311         if (isCodepointSet!U || is(U:dchar))
2312     {
2313         static if (op == "|")    // union
2314         {
2315             static if (is(U:dchar))
2316             {
2317                 this.addInterval(rhs, rhs+1);
2318                 return this;
2319             }
2320             else
2321                 return this.add(rhs);
2322         }
2323         else static if (op == "&")   // intersection
2324                 return this.intersect(rhs);// overloaded
2325         else static if (op == "-")   // set difference
2326                 return this.sub(rhs);// overloaded
2327         else static if (op == "~")   // symmetric set difference
2328         {
2329             auto copy = this & rhs;
2330             this |= rhs;
2331             this -= copy;
2332             return this;
2333         }
2334         else
2335             static assert(0, "no operator "~op~" defined for Set");
2336     }
2337 
2338     /**
2339         Tests the presence of codepoint `ch` in this set,
2340         the same as $(LREF opIndex).
2341     */
2342     bool opBinaryRight(string op: "in", U)(U ch) const
2343         if (is(U : dchar))
2344     {
2345         return this[ch];
2346     }
2347 
2348     ///
2349     pure @safe unittest
2350     {
2351         assert('я' in unicode.Cyrillic);
2352         assert(!('z' in unicode.Cyrillic));
2353     }
2354 
2355 
2356 
2357     /**
2358      * Obtains a set that is the inversion of this set.
2359      *
2360      * See_Also: $(LREF inverted)
2361      */
2362     auto opUnary(string op: "!")()
2363     {
2364         return this.inverted;
2365     }
2366 
2367     /**
2368         A range that spans each $(CODEPOINT) in this set.
2369     */
2370     @property auto byCodepoint()
2371     {
2372         static struct CodepointRange
2373         {
2374             this(This set)
2375             {
2376                 r = set.byInterval;
2377                 if (!r.empty)
2378                     cur = r.front.a;
2379             }
2380 
2381             @property dchar front() const
2382             {
2383                 return cast(dchar) cur;
2384             }
2385 
2386             @property bool empty() const
2387             {
2388                 return r.empty;
2389             }
2390 
2391             void popFront()
2392             {
2393                 cur++;
2394                 while (cur >= r.front.b)
2395                 {
2396                     r.popFront();
2397                     if (r.empty)
2398                         break;
2399                     cur = r.front.a;
2400                 }
2401             }
2402         private:
2403             uint cur;
2404             typeof(This.init.byInterval) r;
2405         }
2406 
2407         return CodepointRange(this);
2408     }
2409 
2410     ///
2411     pure @safe unittest
2412     {
2413         import std.algorithm.comparison : equal;
2414         import std.range : iota;
2415 
2416         auto set = unicode.ASCII;
2417         set.byCodepoint.equal(iota(0, 0x80));
2418     }
2419 
2420     /**
        $(P Obtain a textual representation of this set in the form of
        open-right intervals and feed it to `sink`.
2423         )
2424         $(P Used by various standard formatting facilities such as
2425          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2426          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2427         )
2428         Example:
2429         ---
2430         import std.conv;
2431         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2432         ---
2433     */
2434 
2435     private import std.format.spec : FormatSpec;
2436 
2437     /***************************************
2438      * Obtain a textual representation of this InversionList
2439      * in form of open-right intervals.
2440      *
2441      * The formatting flag is applied individually to each value, for example:
2442      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2443      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2444      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2445      */
2446     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2447     {
2448         import std.format.write : formatValue;
2449         auto range = byInterval;
2450         if (range.empty)
2451             return;
2452 
2453         while (1)
2454         {
2455             auto i = range.front;
2456             range.popFront();
2457 
2458             put(sink, "[");
2459             formatValue(sink, i.a, fmt);
2460             put(sink, "..");
2461             formatValue(sink, i.b, fmt);
2462             put(sink, ")");
2463             if (range.empty) return;
2464             put(sink, " ");
2465         }
2466     }
2467 
2468     ///
2469     pure @safe unittest
2470     {
2471         import std.conv : to;
2472         import std.format : format;
2473         import std.uni : unicode;
2474 
2475         assert(unicode.Cyrillic.to!string ==
2476             "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
2477 
2478         // The specs '%s' and '%d' are equivalent to the to!string call above.
2479         assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
2480 
2481         assert(format("%#x", unicode.Cyrillic) ==
2482             "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
2483             ~"[0xa640..0xa698) [0xa69f..0xa6a0)");
2484 
2485         assert(format("%#X", unicode.Cyrillic) ==
2486             "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
2487             ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
2488     }
2489 
2490     pure @safe unittest
2491     {
2492         import std.exception : assertThrown;
2493         import std.format : format, FormatException;
2494         assertThrown!FormatException(format("%z", unicode.ASCII));
2495     }
2496 
2497 
2498     /**
2499         Add an interval [a, b$(RPAREN) to this set.
2500     */
2501     ref add()(uint a, uint b)
2502     {
2503         addInterval(a, b);
2504         return this;
2505     }
2506 
2507     ///
2508     pure @safe unittest
2509     {
2510         CodepointSet someSet;
2511         someSet.add('0', '5').add('A','Z'+1);
2512         someSet.add('5', '9'+1);
2513         assert(someSet['0']);
2514         assert(someSet['5']);
2515         assert(someSet['9']);
2516         assert(someSet['Z']);
2517     }
2518 
2519 private:
2520 
2521   package(std)  // used from: std.regex.internal.parser
2522     ref intersect(U)(U rhs)
2523         if (isCodepointSet!U)
2524     {
2525         Marker mark;
2526         foreach ( i; rhs.byInterval)
2527         {
2528             mark = this.dropUpTo(i.a, mark);
2529             mark = this.skipUpTo(i.b, mark);
2530         }
2531         this.dropUpTo(uint.max, mark);
2532         return this;
2533     }
2534 
2535     ref intersect()(dchar ch)
2536     {
2537         foreach (i; byInterval)
2538             if (i.a <= ch && ch < i.b)
2539                 return this = This.init.add(ch, ch+1);
2540         this = This.init;
2541         return this;
2542     }
2543 
2544     pure @safe unittest
2545     {
2546         assert(unicode.Cyrillic.intersect('-').byInterval.empty);
2547     }
2548 
2549     ref sub()(dchar ch)
2550     {
2551         return subChar(ch);
2552     }
2553 
2554     // same as the above except that skip & drop parts are swapped
2555   package(std)  // used from: std.regex.internal.parser
2556     ref sub(U)(U rhs)
2557         if (isCodepointSet!U)
2558     {
2559         Marker mark;
2560         foreach (i; rhs.byInterval)
2561         {
2562             mark = this.skipUpTo(i.a, mark);
2563             mark = this.dropUpTo(i.b, mark);
2564         }
2565         return this;
2566     }
2567 
2568   package(std)  // used from: std.regex.internal.parse
2569     ref add(U)(U rhs)
2570         if (isCodepointSet!U)
2571     {
2572         Marker start;
2573         foreach (i; rhs.byInterval)
2574         {
2575             start = addInterval(i.a, i.b, start);
2576         }
2577         return this;
2578     }
2579 
2580 // end of mixin-able part
2581 //============================================================================
2582 public:
2583     /**
2584         Obtains a set that is the inversion of this set.
2585 
2586         See the '!' $(LREF opUnary) for the same but using operators.
2587     */
    @property auto inverted()
    {
        // work on a copy, `this` is left untouched
        InversionList inversion = this;
        if (inversion.data.length == 0)
        {
            // the inverse of the empty set is the whole code point range
            inversion.addInterval(0, lastDchar+1);
            return inversion;
        }
        // Flip the lower end: a set that does not start at 0 gains a leading
        // boundary at 0, a set that starts at 0 loses that boundary.
        if (inversion.data[0] != 0)
            genericReplace(inversion.data, 0, 0, [0]);
        else
            genericReplace(inversion.data, 0, 1, cast(uint[]) null);
        // Flip the upper end the same way, using lastDchar+1 as the boundary.
        // The last boundary is read from `this`, not from the modified copy.
        if (data[data.length-1] != lastDchar+1)
            genericReplace(inversion.data,
                inversion.data.length, inversion.data.length, [lastDchar+1]);
        else
            genericReplace(inversion.data,
                inversion.data.length-1, inversion.data.length, cast(uint[]) null);

        return inversion;
    }
2609 
2610     ///
2611     pure @safe unittest
2612     {
2613         auto set = unicode.ASCII;
2614         // union with the inverse gets all of the code points in the Unicode
2615         assert((set | set.inverted).length == 0x110000);
2616         // no intersection with the inverse
2617         assert((set & set.inverted).empty);
2618     }
2619 
    // Builds the D source text of a function `bool funcName(dchar ch)` that
    // returns true when `ch` falls into one of the [a, b) intervals of `range`.
    // An empty `funcName` is replaced by the word "function".
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // below this many intervals a linear chain of comparisons is emitted
        enum maxBinary = 3;

        // Emits a braced block that tests the intervals one after another
        // and ends with `return false;`.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    // single code point: one equality test
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    // two code points: two equality tests
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Picks between the linear chain and a further bisection,
        // depending on how many intervals are left.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // Alternative generator based on a switch statement.
        // Not used yet: if/else binary search is far better with DMD as of 2.061
        // and GDC is doing fine job either way
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits a three-way branch around the interval at `idx`: intervals
        // before it, the interval itself, and intervals after it.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ "    ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point,  >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch,  >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }
2712 
2713     /**
2714         Generates string with D source code of unary function with name of
2715         `funcName` taking a single `dchar` argument. If `funcName` is empty
2716         the code is adjusted to be a lambda function.
2717 
2718         The function generated tests if the $(CODEPOINT) passed
2719         belongs to this set or not. The result is to be used with string mixin.
2720         The intended usage area is aggressive optimization via meta programming
2721         in parser generators and the like.
2722 
        Note: Use with care for relatively small or regular sets. It
        could end up being slower than just using multi-staged tables.
2725 
2726         Example:
2727         ---
2728         import std.stdio;
2729 
        // construct set directly from [a, b$(RPAREN) intervals
2731         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2732         writeln(set);
2733         writeln(set.toSourceCode("func"));
2734         ---
2735 
2736         The above outputs something along the lines of:
2737         ---
2738         bool func(dchar ch)  @safe pure nothrow @nogc
2739         {
2740             if (ch < 45)
2741             {
2742                 if (ch == 10 || ch == 11) return true;
2743                 return false;
2744             }
2745             else if (ch < 65) return true;
2746             else
2747             {
2748                 if (ch < 100) return false;
2749                 if (ch < 200) return true;
2750                 return false;
2751             }
2752         }
2753         ---
2754     */
2755     string toSourceCode(string funcName="")
2756     {
2757         import std.array : array;
2758         auto range = byInterval.array();
2759         return toSourceCode(range, funcName);
2760     }
2761 
2762     /**
2763         True if this set doesn't contain any $(CODEPOINTS).
2764     */
2765     @property bool empty() const
2766     {
2767         return data.length == 0;
2768     }
2769 
2770     ///
2771     pure @safe unittest
2772     {
2773         CodepointSet emptySet;
2774         assert(emptySet.length == 0);
2775         assert(emptySet.empty);
2776     }
2777 
2778 private:
2779     alias This = typeof(this);
2780     alias Marker = size_t;
2781 
2782     // a random-access range of integral pairs
2783     static struct Intervals(Range)
2784     {
2785         import std.range.primitives : hasAssignableElements;
2786 
2787         this(Range sp) scope
2788         {
2789             slice = sp;
2790             start = 0;
2791             end = sp.length;
2792         }
2793 
2794         this(Range sp, size_t s, size_t e) scope
2795         {
2796             slice = sp;
2797             start = s;
2798             end = e;
2799         }
2800 
2801         @property auto front()const
2802         {
2803             immutable a = slice[start];
2804             immutable b = slice[start+1];
2805             return CodepointInterval(a, b);
2806         }
2807 
2808         //may break sorted property - but we need std.sort to access it
2809         //hence package(std) protection attribute
2810         static if (hasAssignableElements!Range)
2811         package(std) @property void front(CodepointInterval val)
2812         {
2813             slice[start] = val.a;
2814             slice[start+1] = val.b;
2815         }
2816 
2817         @property auto back()const
2818         {
2819             immutable a = slice[end-2];
2820             immutable b = slice[end-1];
2821             return CodepointInterval(a, b);
2822         }
2823 
2824         //ditto about package
2825         static if (hasAssignableElements!Range)
2826         package(std) @property void back(CodepointInterval val)
2827         {
2828             slice[end-2] = val.a;
2829             slice[end-1] = val.b;
2830         }
2831 
2832         void popFront()
2833         {
2834             start += 2;
2835         }
2836 
2837         void popBack()
2838         {
2839             end -= 2;
2840         }
2841 
2842         auto opIndex(size_t idx) const
2843         {
2844             immutable a = slice[start+idx*2];
2845             immutable b = slice[start+idx*2+1];
2846             return CodepointInterval(a, b);
2847         }
2848 
2849         //ditto about package
2850         static if (hasAssignableElements!Range)
2851         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2852         {
2853             slice[start+idx*2] = val.a;
2854             slice[start+idx*2+1] = val.b;
2855         }
2856 
2857         auto opSlice(size_t s, size_t e)
2858         {
2859             return Intervals(slice, s*2+start, e*2+start);
2860         }
2861 
2862         @property size_t length()const {  return slice.length/2; }
2863 
2864         @property bool empty()const { return start == end; }
2865 
2866         @property auto save(){ return this; }
2867     private:
2868         size_t start, end;
2869         Range slice;
2870     }
2871 
2872     // called after construction from intervals
2873     // to make sure invariants hold
    // Sorts the intervals stored in `data` by their lower bound and merges
    // the ones that overlap or touch, then shrinks `data` to the merged result.
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
        // - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0; // index of the last kept interval
        size_t j = 1; // index of the interval under examination
        while (j < len)
        {
            // interval j starts at or before the end of interval i: merge
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        // number of intervals that survived the merge
        len = i + 1;
        // check the result: non-empty intervals with a gap between neighbours
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        // two boundaries per interval
        data.length = len * 2;
    }
2920 
2921     // special case for normal InversionList
2922     ref subChar(dchar ch)
2923     {
2924         auto mark = skipUpTo(ch);
2925         if (mark != data.length
2926             && data[mark] == ch && data[mark-1] == ch)
2927         {
2928             // it has split, meaning that ch happens to be in one of intervals
2929             data[mark] = data[mark]+1;
2930         }
2931         return this;
2932     }
2933 
2934     //
    // Merges the interval [a, b) into the sorted boundary array `data`.
    // `hint` is an index into `data` from which the search for `a` starts;
    // add(U) passes the value returned by one call as the hint of the next.
    // An odd boundary index means "inside an interval" (positive), an even
    // one means "between intervals" (negative).
    Marker addInterval(int a, int b, Marker hint=Marker.init) scope
    in
    {
        assert(a <= b);
    }
    do
    {
        import std.range : assumeSorted, SearchPolicy;
        auto range = assumeSorted(data[]);
        size_t pos;
        // index of the first boundary that is not below a
        size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
        if (a_idx == range.length)
        {
            //  [---+++----++++----++++++]
            //  [                         a  b]
            data.append(a, b);
            return data.length-1;
        }
        // index of the first boundary that is not below b
        size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
        // replacement boundaries; only the first to_insert entries are used
        uint[3] buf = void;
        uint to_insert;
        debug(std_uni)
        {
            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
        }
        if (b_idx == range.length)
        {
            //  [-------++++++++----++++++-]
            //  [      s     a                 b]
            if (a_idx & 1)// a in positive
            {
                buf[0] = b;
                to_insert = 1;
            }
            else// a in negative
            {
                buf[0] = a;
                buf[1] = b;
                to_insert = 2;
            }
            pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
            return pos - 1;
        }

        // the boundary found at b's position
        uint top = data[b_idx];

        debug(std_uni)
        {
            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
            writefln("a=%s; b=%s; top=%s;", a, b, top);
        }
        if (a_idx & 1)
        {// a in positive
            if (b_idx & 1)// b in positive
            {
                //  [-------++++++++----++++++-]
                //  [       s    a        b    ]
                buf[0] = top;
                to_insert = 1;
            }
            else // b in negative
            {
                //  [-------++++++++----++++++-]
                //  [       s    a   b         ]
                if (top == b)
                {
                    // b touches the start of the next interval: join with it
                    assert(b_idx+1 < data.length);
                    buf[0] = data[b_idx+1];
                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
                    return pos - 1;
                }
                buf[0] = b;
                buf[1] = top;
                to_insert = 2;
            }
        }
        else
        { // a in negative
            if (b_idx & 1) // b in positive
            {
                //  [----------+++++----++++++-]
                //  [     a     b              ]
                buf[0] = a;
                buf[1] = top;
                to_insert = 2;
            }
            else// b in negative
            {
                //  [----------+++++----++++++-]
                //  [  a       s      b        ]
                if (top == b)
                {
                    // b touches the start of the next interval: join with it
                    assert(b_idx+1 < data.length);
                    buf[0] = a;
                    buf[1] = data[b_idx+1];
                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
                    return pos - 1;
                }
                buf[0] = a;
                buf[1] = b;
                buf[2] = top;
                to_insert = 3;
            }
        }
        pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
        debug(std_uni)
        {
            // NOTE(review): two format specifiers but three arguments - confirm intent
            writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
            writeln("inserting ", buf[0 .. to_insert]);
        }
        return pos - 1;
    }
3047 
3048     //
    // Removes the boundaries of `data` from index `pos` up to the first
    // boundary greater than `a`; when that boundary closes an interval, `a`
    // is put in as the new start of the interval.
    // `pos` must be the index of an interval start (even).
    Marker dropUpTo(uint a, Marker pos=Marker.init)
    in
    {
        assert(pos % 2 == 0); // at start of interval
    }
    do
    {
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        if (range.empty)
            return pos;
        size_t idx = pos;
        // with the "a <= b" predicate lowerBound covers every boundary <= a
        idx += range.lowerBound(a).length;

        debug(std_uni)
        {
            writeln("dropUpTo full length=", data.length);
            writeln(pos,"~~~", idx);
        }
        // everything from pos onwards is dropped
        if (idx == data.length)
            return genericReplace(data, pos, idx, cast(uint[])[]);
        if (idx & 1)
        {   // a in positive
            //[--+++----++++++----+++++++------...]
            //      |<---si       s  a  t
            genericReplace(data, pos, idx, [a]);
        }
        else
        {   // a in negative
            //[--+++----++++++----+++++++-------+++...]
            //      |<---si              s  a  t
            genericReplace(data, pos, idx, cast(uint[])[]);
        }
        return pos;
    }
3083 
3084     //
    // Starting the search at index `pos`, finds the first boundary of `data`
    // greater than `a`. If `a` falls strictly inside an interval, that
    // interval is split at `a`. The returned index is always even (the start
    // of an interval) or data.length.
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be  0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        // with the "a <= b" predicate lowerBound covers every boundary <= a
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx];
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            // a is the first code point of the interval: return its start
            if (a == start)
                return idx-1;
            // split it up
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }
3115 
3116     CowArray!SP data;
3117 }
3118 
pure @safe unittest
{
    import std.conv : to;
    // ASCII is the single interval [0, 128)
    immutable rendered = unicode.ASCII.to!string();
    assert(rendered == "[0..128)");
}
3124 
3125 // pedantic version for ctfe, and aligned-access only architectures
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // element `idx` occupies the three bytes starting at offset idx*3;
    // they are read one by one and assembled in the platform's byte order
    immutable size_t base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base + 1];
    immutable uint third = ptr[base + 2];
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}
3136 
3137 // ditto
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    // split the low 24 bits of `val` into bytes and store them one by one
    // at offset idx*3, in the platform's byte order
    immutable size_t base = idx * 3;
    immutable ubyte low = cast(ubyte) val;
    immutable ubyte middle = cast(ubyte)(val >> 8);
    immutable ubyte high = cast(ubyte)(val >> 16);
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base + 1] = middle;
        ptr[base + 2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base + 1] = middle;
        ptr[base + 2] = low;
    }
}
3154 
3155 // unaligned x86-like read/write functions
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // Load a whole 32-bit word starting at byte offset 3*idx and keep the
    // three bytes that belong to element `idx`. The load touches one byte
    // beyond the element.
    const(uint)* word = cast(const(uint)*)(ptr + 3*idx);
    immutable uint raw = *word;
    version (LittleEndian)
        return raw & 0xFF_FFFF;
    else
        return raw >> 8;
}
3164 
3165 // ditto
// Stores the low 24 bits of `val` as element `idx` of the packed array at
// `ptr` using one 32-bit read-modify-write at byte offset 3*idx. The byte
// following the element is read and written back unchanged.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // mask `val` so that bits above the 24th cannot leak into the
        // neighbouring byte; safeWrite24 truncates the same way
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
    {
        // the shift already discards the bits above the 24th
        *dest = (val << 8) | (*dest & 0xFF);
    }
}
3174 
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // The word-sized load is used only at run time on targets that report
    // hasUnalignedReads; CTFE and all other targets go byte by byte.
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            return safeRead24(ptr, idx);
        else
            return unalignedRead24(ptr, idx);
    }
    else
        return safeRead24(ptr, idx);
}
3182 
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    // The word-sized store is used only at run time on targets that report
    // hasUnalignedReads; CTFE and all other targets go byte by byte.
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            safeWrite24(ptr, val, idx);
        else
            unalignedWrite24(ptr, val, idx);
    }
    else
        safeWrite24(ptr, val, idx);
}
3190 
// Array of uint whose storage can be shared between copies. The reference
// count is kept in one extra slot right after the payload, so the raw `data`
// array is one element longer than the reported `length`. Storage is obtained
// and released through the policy `SP`.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` by appending the reference count slot to it.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range of known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // an empty range leaves `data` unallocated; there is nothing to copy
        // and `data[0..$-1]` must not be evaluated on a zero-length array
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Builds the array from a forward range, counting its elements first.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see the constructor above: skip the copy for an empty range
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // a copy shares the storage and bumps the reference count
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // the last owner hands the storage back to the policy
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less then actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // writing detaches this instance from shared storage first
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // a mutable slice may be written through, so shared storage is
    // duplicated before the slice is handed out
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    //
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends the elements of `range` to the end of the array.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally, like in the other members that use `copy`
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values to the end of the array.
    void append()(uint[] val...)
    {
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // compares the payload only, never the reference counts
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Lets go of the storage: either decrements the shared count or, for the
    // last owner, hands the storage to the policy's destroy.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches from shared storage by copying the payload into a fresh
    // allocation owned by this instance alone.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}
3403 
pure @safe unittest// CowArray tests (the u24 names look like a leftover from an older Uint24 array type)
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates the array received by reference while a by-value copy (u24_c)
    // is alive; the copy must keep its old contents throughout.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Copies and assigns the by-value argument around, mutates one of the
    // copies through funcRef and checks the others are unaffected.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    // run the same checks for both storage policies
    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        // overwrite the middle [10, 90) through a mutable slice
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3490 
pure @safe unittest// core set primitives test (add, skipUpTo, dropUpTo) for both storage policies
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3566 
3567 
// test that the constructor works with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // out-of-order, overlapping and adjacent intervals mixed together
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3611 
3612 
pure @safe unittest
{   // full set operations: union (|), intersection (&), difference (-), symmetric difference (~)
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // intersections
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        // differences
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // symmetric difference
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3716 
3717 }
3718 
pure @safe unittest// set operations against a single dchar operand
{
    import std.conv : text;
    CodepointSet a = CodepointSet(10, 100, 120, 200);
    // 'A' is code point 65: removing it splits [10, 100) around 65
    assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
    // 'B' is code point 66: the intersection is the single-element interval [66, 67)
    assert((a & 'B') == CodepointSet(66, 67));
}
3726 
pure @safe unittest// iteration (byInterval, byCodepoint) & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    // NOTE: only the ReallocPolicy instantiation is exercised here
    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        // every code point produced by iteration must also test as a member
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through byInterval; compiled only when CodeList is CodepointSet
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3766 
3767 //============================================================================
3768 // Generic Trie template and various ways to build it
3769 //============================================================================
3770 
// Debug helper: textual dump of `x`, abbreviated for long inputs.
// Up to 32 elements are printed in full; anything longer is rendered as
// the first 16 elements, the marker "~...~", then the last 16 elements.
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16;
    if (x.length <= 2 * edge)
        return text(x);
    immutable tailStart = x.length - edge;
    return text(x[0 .. edge], "~...~", x[tailStart .. x.length]);
}
3782 
/**
    Maps `Key` to an integer index that fits in `size_t`.
    Every predicate in `Prefix` is applied to the key, left to right, and the
    results are concatenated bitwise: before a predicate's result is OR-ed in,
    the accumulated index is shifted left by that predicate's `bitSize`.

    Consequently the first (leftmost) predicate supplies the most significant
    bits of the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        enum last = Prefix.length - 1;
        size_t packed = 0;
        // compile-time unrolled over all predicates but the last one
        foreach (level, Unused; Prefix[0 .. last])
        {
            packed |= Prefix[level](key);
            // make room for the bits of the next predicate
            packed <<= Prefix[level + 1].bitSize;
        }
        packed |= Prefix[last](key);
        return packed;
    }
}
3807 
/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    Values are fed in order of non-decreasing mapped index (see `putValue`
    and `putRange`); gaps are filled with the filler value passed to the
    constructor, and `build` pads the remainder and wraps the table in a Trie.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // product of 2^^bitSize over all predicates, i.e. the size of the index
    // space the predicates can address
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // translatedMaxIndex rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    // per-level bookkeeping; idx_ones is initialised in the constructor but
    // not otherwise read inside this struct
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // this function assumes no holes in the input so
    // indices are going one by one
    // Writes `numVals` copies of `val` at the current write position of
    // `level`, spilling to new pages (and thereby to the level above) whenever
    // a page boundary is reached.
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // shortcut: when an all-zero page is already known and `val` is
            // T.init, whole pages are not written again - the level above just
            // receives numVals/pageSize references to the known zero page
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    // Called when the write position of `level` reaches a page boundary.
    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start offset of the page that has just been completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // linear scan over all earlier pages looking for an identical one
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // j == last means the scan found no duplicate: keep the page
        if (j == last)
        {
    L_allocate_page: // NOTE: no goto inside this struct targets this label
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zero page of this level (used by addValue)
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index: // NOTE: no goto inside this struct targets this label
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message used by the enforce checks in putRange and putValue
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        // (size_t.max is the "not known yet" marker tested in addValue/spillToNextPageImpl)
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.

        The interval is open on the right: the slot mapped by `b` is not set.
        Throws (via `enforce`) if the mapped indices go backwards.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.

        Throws (via `enforce`) if the mapped index of `key` is below the
        next expected index.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the rest of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // two calls: the remaining count (size_t.max - curIndex + 1)
                // may not be representable in a single size_t
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4102 
/**
    $(P A generic, read-only Trie (multi-stage lookup table) with a number of
    stages fixed at compile time. The design goal is optimal speed with
    smallest footprint size.
    )
    $(P There are intentionally no public constructors.
     To obtain an instance use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;

    // An integral first argument is an exclusive upper bound on the mapped
    // index; whatever follows it is the list of per-level predicates.
    private enum hasBoundsCheck = is(typeof(Args[0]) : size_t);
    static if (hasBoundsCheck)
    {
        private enum maxIndex = Args[0];
        private alias Prefix = Args[1..$];
    }
    else
    {
        private alias Prefix = Args;
    }

    // wraps an already built table (this is what TrieBuilder.build calls)
    private this()(typeof(_table) table)
    {
        this._table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        this._table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is done with an `assert`, so it is only
        active in builds that keep assertions enabled; a violation
        results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        enum leaf = Prefix.length - 1;
        // the first predicate selects the slot in the top-level table
        size_t pos = cast(size_t) Prefix[0](key);
        // each upper level stores a page number for the level below; the next
        // predicate supplies the offset inside that page
        foreach (lvl, Unused; Prefix[0 .. leaf])
        {
            pos = cast(size_t)((_table.ptr!lvl[pos] << Prefix[lvl + 1].bitSize)
                + Prefix[lvl + 1](key));
        }
        return _table.ptr!leaf[pos];
    }

    /// Forwards to the underlying table's `bytes!n`.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by `2^^bitSize` of level `n`, rounded to nearest.
    @property size_t pages(size_t n)() const
    {
        enum pageBits = Prefix[n].bitSize;
        return (bytes!n + 2^^(pageBits-1)) / 2^^pageBits;
    }

    /// Forwards to the underlying table's `store`, writing into `sink`.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // one index array per level plus the value array at the last level
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4193 
// Builds a sequence of `sliceBits` instances that cut the bits below `top`
// into consecutive pieces whose widths are given by `sizes`,
// left-to-right, the most significant bits first.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
    {
        // nothing left to slice
        alias GetBitSlicing = AliasSeq!();
    }
    else
    {
        // lower bound of the current piece, which is the `top` of the rest
        enum low = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(low, top),
                      GetBitSlicing!(low, sizes[1..$]));
    }
}
4205 
// Curried predicate: `callableWith!T` yields a template that tells whether
// `Pred` can be called with a `T` and whether the type it returns, after
// `TypeOfBitPacked` is applied, satisfies `isBitPackableType`.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false; // not callable with a T at all
    }
}
4219 
/*
    True if every member of `Prefix` is usable as a `Trie` predicate
    for keys of type `Key`, i.e. it is callable with a single `Key`
    argument and yields a bit-packable value (see `callableWith`).
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
4231 
/*
    Check if `Args` is a valid argument list for a `Trie` with keys of
    type `Key`: either a set of valid predicates, or a maximum key value
    (convertible to `Key`) followed by valid predicates.

    An empty `Args` is never valid.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if (Args.length == 1)
    {
        // `Key` has to be passed along explicitly: with `Args` alone the
        // single predicate would be taken for the key type and the prefix
        // would be empty, which makes the check vacuously true.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
        enum isValidArgsForTrie = false;
}
4246 
// Adds up the compile-time integers `ints`; an empty list sums to zero.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total;
    foreach (term; ints)
        total += term;
    return total;
}
4254 
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of `sizes` must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
        if (isCodepointSet!Set)
    {
        alias Levels = GetBitSlicing!(21, sizes);
        // everything outside of the set maps to false
        auto tb = TrieBuilder!(bool, dchar, lastDchar+1, Levels)(false);
        foreach (interval; set.byInterval)
            tb.putRange(interval[0], interval[1], true);
        return tb.build();
    }
}
4294 
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, Prefix);
    alias CodepointSetTrie = typeof(Builder(false).build());
}
4302 
/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically, unlike `codepointSetTrie`, it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
            if (isCodepointSet!Set)
        {
            // `codepointSetTrie` is constrained on its per-level bit sizes
            // summing up to 21, so `sizes` must be forwarded explicitly:
            // a bare `codepointSetTrie(set)` has no sizes and can not
            // be instantiated.
            return codepointSetTrie!sizes(set);
        }
    }

    ///
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    ///
    auto codepointTrie(R)(R range, T defValue=T.init)
        if (isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs: the trailing `true` asks
        // `buildTrie` to sort the range first
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4347 
// Usage example: map every Greek code point to a small user-defined value
// through a 3-level `codepointTrie` built from an associative array.
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 3:
        // padded to 6 hex digits it is 000688, i.e. three zero nibbles
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        // luck is the highest number of occurrences of any single nibble
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4399 
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    alias CodepointTrie = typeof(Builder(T.init).build());
}
4407 
// "Less than" for (value, key) tuples that orders them by `Pred`
// applied to the key, i.e. to the second field of the tuple.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhs = Pred(a[1]);
        auto rhs = Pred(b[1]);
        return lhs < rhs;
    }
}
4417 
/**
    General-purpose front end for constructing `Trie`s
    without driving `TrieBuilder` by hand.

    Comes with a number of convenience overloads.
    `Args` is either a maximum key value followed by the predicates
    that construct an index from a key, or the predicates alone.

    In the latter case, i.e. when the first argument is not a value
    convertible to `Key`, all of `Args` are treated as predicates
    and the maximum Key is deduced from them.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    static if (is(typeof(Args[0]) : Key))
        alias Prefix = Args[1..$]; // leading upper bound on Key is not a predicate
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // comparators for multi-sort, one per predicate, in predicate order
    template GetComparators(size_t n)
    {
        static if (n == 0)
            alias GetComparators = AliasSeq!();
        else
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
    }

    /*
        Build `Trie` from a range of Value-Key pairs (value first,
        key second), assuming the range is sorted by Key
        as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps `Key` to an integer.

        See_Also: $(REF sort, std,_algorithm),
        $(REF SortedRange, std,range),
        $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (pair; range)
            tb.putValue(pair[1], pair[0]);
        return tb.build();
    }

    /*
        For a bool `Value` (or BitPacked!(bool, x)) a `Trie` can be
        built from a range of open-right intervals of `Key`s.
        Ordering requirements (and the behavior when they are violated)
        are the same as for the Value-Key range overload.

        Keys inside of the intervals map to !`filler`.
        With the default filler that means keys inside of the intervals
        map to true and everything else maps to false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (span; range)
            tb.putRange(span[0], span[1], !filler);
        return tb.build();
    }

    /*
        Same as the Value-Key range overload, except that the range
        is sorted in place first if `unsorted` is true.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        if (unsorted)
        {
            alias Comps = GetComparators!(Prefix.length);
            multiSort!(Comps)(range);
        }
        return buildTrie(range, filler);
    }

    /*
        For a bool `Value` (or BitPacked!(bool, x)) a `Trie` can be
        built simply from an input range of `Key`s.
        Ordering requirements (and the behavior when they are violated)
        are the same as for the Value-Key range overload.

        Keys found in the range map to !`filler`.
        With the default filler that means listed keys map to true
        and everything else maps to false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (key; range)
            tb.putValue(key, !filler);
        return tb.build();
    }

    /*
        For an unsigned integer `Key` a `Trie` can be constructed
        from an array of values, the array index serving as the key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if (isUnsigned!Key)
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (position, item; array)
            tb.putValue(position, item);
        return tb.build();
    }

    /*
        Builds `Trie` from associative array: the entries are copied into
        an array of Value-Key pairs that is then sorted and built from.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto pairs = array(zip(map.values, map.keys));
        return buildTrie(pairs, filler, true); // sort it
    }
}
4553 
// Helper used in place of assumeSize to reduce mangled name length
// and to help DMD inline Trie functors.
// Passes the argument through unchanged and declares it `bits` wide.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4561 
// Same idea as `clamp`, but picks element number `idx` of an indexable
// argument and declares that element `bits` wide.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4567 
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
       assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on
        a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        return this;
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII is not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
4669 
/**
    Test if `M` is an UTF Matcher for ranges of `C`.

    `M` qualifies if `match` and `test` accept both a `C[]` and
    the internal decoder range over `C[]` and return `bool`.
    `skip` is optional, but if present it has to satisfy the same requirement.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // The checks have to be `static assert`s: this lambda is only ever
    // compiled, never run, so a run-time `assert` of an `is` expression
    // could not reject anything.
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    static if (is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});
4687 
// The matchers returned by `utfMatcher` must satisfy `isUtfMatcher`
// for both mutable and immutable code units of the matching width.
pure @safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}
4697 
// What a matcher lookup does with the input range after testing a code point.
enum Mode
{
    alwaysSkip,  // advance past the code point whatever the outcome (`skip`)
    neverSkip,   // leave the range untouched (`test`)
    skipOnMatch  // advance only if the code point is in the set (`match`)
}
4703 
// Gives the host type a `fwdStr` helper that calls the host's method named
// `fn` on a string, with the string reinterpreted in place as the range type
// returned by `byCodeUnit`; the method therefore advances the caller's string.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        alias Wrapped = typeof(byCodeUnit(str));
        return mixin(fn ~ "(*cast(Wrapped*)&str)");
    }
}
4713 
// Building blocks of the UTF-8 matcher: one Trie per encoded length
// (1 to 4 code units) plus the code that dispatches on the lead byte.
template Utf8Matcher()
{
    // UTF-8 sequences are 1 to 4 code units long
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // Reports malformed input by throwing UTFException.
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    // Trie types for encoded lengths 1, 2, 3 and 4, in that order
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    // Trie type for a given encoded length (1-based)
    alias Table(int size) = Tables[size-1];

    // payload bits of the lead byte: 0x1F, 0x0F, 0x07 for sizes 2, 3, 4
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // marker bits of the lead byte: 0xC0, 0xE0, 0xF0 for sizes 2, 3, 4
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Strips the 0b10 marker off a continuation byte and returns its
    // 6 payload bits; anything outside of 0x80 .. 0xBF is a bad encoding.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Encodes `ch` as UTF-8 and strips the marker bits from every byte,
    // which yields the key format used by the `sz`-byte table.
    static auto encode(size_t sz)(dchar ch)
        if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by encoded length, builds one Trie per length
    // and bundles them into a matcher that handles all 4 lengths.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        //NOTE: the generated code refers to `ch`, `inp` and `mode` by name,
        //they must be in scope wherever `dispatch` is mixed in
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // advances `inp` only if the code point at its front is in the set
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // always advances `inp` past the code point at its front
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // never advances `inp`
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // string overloads, forwarded to the range versions above
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // Matcher that owns the tables for the encoded lengths listed in `Sizes`.
    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // sub-matcher limited to `SizesToPick`, it keeps a pointer to this matcher
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Decodes a `size`-unit sequence at the front of `inp` into a table
        // key, rejects truncated, malformed and overlong sequences through
        // badEncoding, then looks the key up and advances `inp` as `mode` tells.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[1]
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x1_0000-0x10_FFFF
                //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);
                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // View of a matcher `I` that only handles the encoded lengths in `Sizes`;
    // tables and lookups are reached through the stored pointer `m`.
    struct CherryPick(I, Sizes...)
    {
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        I* m;
        @property auto tab(int i)() const { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}
4999 
5000 template Utf16Matcher()
5001 {
    // UTF-16 sequences are 1 or 2 code units long
    enum validSize(int sz) = sz >= 1 && sz <= 2;
5003 
    // Reports malformed input by throwing UTFException.
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-16 sequence");
    }
5009 
    // 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
    //2-stage BMP: upper 9 bits of a code unit, then the lower 7 bits
    alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
    //4-stage - full Unicode
    //assume that 0xD800 & 0xDC00 bits are cleared
    //thus leaving 10 bit per wchar to worry about
    //each 10-bit half is split into two levels: 6+4 bits and 4+6 bits
    alias UniSpec = AliasSeq!(bool, wchar[2],
        assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
        assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
    );
    // Trie types built from the specs above
    alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
    alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
    alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5024 
5025     auto encode2(dchar ch)
5026     {
5027         ch -= 0x1_0000;
5028         assert(ch <= 0xF_FFFF);
5029         wchar[2] ret;
5030         //do not put surrogate bits, they are sliced off
5031         ret[0] = cast(wchar)(ch >> 10);
5032         ret[1] = (ch & 0xFFF);
5033         return ret;
5034     }
5035 
5036     auto build(Set)(Set set)
5037     {
5038         import std.algorithm.iteration : map;
5039         auto ascii = set & unicode.ASCII;
5040         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5041             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5042         auto other = set - (bmp | ascii);
5043         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5044         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5045         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5046         alias Ret = Impl!(1,2);
5047         return Ret(asciiT, bmpT, otherT);
5048     }
5049 
5050     //bootstrap full UTF-16 matcher interace from
5051     //sizeFlags, lookupUni and ascii
5052     mixin template DefMatcher()
5053     {
5054         public bool match(Range)(ref Range inp) const
5055             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5056                 !isDynamicArray!Range)
5057         {
5058             enum mode = Mode.skipOnMatch;
5059             assert(!inp.empty);
5060             immutable ch = inp[0];
5061             static if (sizeFlags & 1)
5062             {
5063                 if (ch < 0x80)
5064                 {
5065                   if (ascii[ch])
5066                   {
5067                       inp.popFront();
5068                       return true;
5069                   }
5070                   else
5071                       return false;
5072                 }
5073                 return lookupUni!mode(inp);
5074             }
5075             else
5076                 return lookupUni!mode(inp);
5077         }
5078 
5079         static if (Sizes.length == 2)
5080         {
5081             public bool skip(Range)(ref Range inp) const
5082                 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5083                     !isDynamicArray!Range)
5084             {
5085                 enum mode = Mode.alwaysSkip;
5086                 assert(!inp.empty);
5087                 immutable ch = inp[0];
5088                 static if (sizeFlags & 1)
5089                 {
5090                     if (ch < 0x80)
5091                     {
5092                         inp.popFront();
5093                         return ascii[ch];
5094                     }
5095                     else
5096                         return lookupUni!mode(inp);
5097                 }
5098                 else
5099                     return lookupUni!mode(inp);
5100             }
5101         }
5102 
5103         public bool test(Range)(ref Range inp) const
5104             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5105                 !isDynamicArray!Range)
5106         {
5107             enum mode = Mode.neverSkip;
5108             assert(!inp.empty);
5109             auto ch = inp[0];
5110             static if (sizeFlags & 1)
5111                 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
5112             else
5113                 return lookupUni!mode(inp);
5114         }
5115 
5116         bool match(C)(ref C[] str) const
5117             if (isSomeChar!C)
5118         {
5119             return fwdStr!"match"(str);
5120         }
5121 
5122         bool skip(C)(ref C[] str) const
5123             if (isSomeChar!C)
5124         {
5125             return fwdStr!"skip"(str);
5126         }
5127 
5128         bool test(C)(ref C[] str) const
5129             if (isSomeChar!C)
5130         {
5131             return fwdStr!"test"(str);
5132         }
5133 
5134         mixin ForwardStrings; //dispatch strings to range versions
5135     }
5136 
5137     struct Impl(Sizes...)
5138         if (Sizes.length >= 1 && Sizes.length <= 2)
5139     {
5140     private:
5141         import std.meta : allSatisfy;
5142         static assert(allSatisfy!(validSize, Sizes),
5143             "Only lengths of 1 and 2 code units are possible in UTF-16");
5144         static if (Sizes.length > 1)
5145             enum sizeFlags = Sizes[0] | Sizes[1];
5146         else
5147             enum sizeFlags = Sizes[0];
5148 
5149         static if (sizeFlags & 1)
5150         {
5151             Ascii ascii;
5152             Bmp bmp;
5153         }
5154         static if (sizeFlags & 2)
5155         {
5156             Uni uni;
5157         }
5158         mixin DefMatcher;
5159 
5160         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
5161         {
5162             return CherryPick!(Impl, SizesToPick)(&this);
5163         }
5164 
5165         bool lookupUni(Mode mode, Range)(ref Range inp) const
5166         {
5167             wchar x = cast(wchar)(inp[0] - 0xD800);
5168             //not a high surrogate
5169             if (x > 0x3FF)
5170             {
5171                 //low surrogate
5172                 if (x <= 0x7FF) badEncoding();
5173                 static if (sizeFlags & 1)
5174                 {
5175                     auto ch = inp[0];
5176                     static if (mode == Mode.alwaysSkip)
5177                         inp.popFront();
5178                     static if (mode == Mode.skipOnMatch)
5179                     {
5180                         if (bmp[ch])
5181                         {
5182                             inp.popFront();
5183                             return true;
5184                         }
5185                         else
5186                             return false;
5187                     }
5188                     else
5189                         return bmp[ch];
5190                 }
5191                 else //skip is not available for sub-matchers, so just false
5192                     return false;
5193             }
5194             else
5195             {
5196                 import std.range : popFrontN;
5197                 static if (sizeFlags & 2)
5198                 {
5199                     if (inp.length < 2)
5200                         badEncoding();
5201                     wchar y = cast(wchar)(inp[1] - 0xDC00);
5202                     //not a low surrogate
5203                     if (y > 0x3FF)
5204                         badEncoding();
5205                     wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
5206                     static if (mode == Mode.alwaysSkip)
5207                         inp.popFrontN(2);
5208                     static if (mode == Mode.skipOnMatch)
5209                     {
5210                         if (uni[needle])
5211                         {
5212                             inp.popFrontN(2);
5213                             return true;
5214                         }
5215                         else
5216                             return false;
5217                     }
5218                     else
5219                         return uni[needle];
5220                 }
5221                 else //ditto
5222                     return false;
5223             }
5224         }
5225     }
5226 
5227     struct CherryPick(I, Sizes...)
5228         if (Sizes.length >= 1 && Sizes.length <= 2)
5229     {
5230     private:
5231         import std.meta : allSatisfy;
5232         I* m;
5233         enum sizeFlags = I.sizeFlags;
5234 
5235         static if (sizeFlags & 1)
5236         {
5237             @property auto ascii()() const { return m.ascii; }
5238         }
5239 
5240         bool lookupUni(Mode mode, Range)(ref Range inp) const
5241         {
5242             return m.lookupUni!mode(inp);
5243         }
5244         mixin DefMatcher;
5245         static assert(allSatisfy!(validSize, Sizes),
5246             "Only lengths of 1 and 2 code units are possible in UTF-16");
5247     }
5248 }
5249 
// Shorthand: builds a matcher for `set` through Utf8Matcher's `build`.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}
5254 
// Shorthand: builds a matcher for `set` through Utf16Matcher's `build`.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}
5259 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    Only `char` (UTF-8) and `wchar` (UTF-16) code units are accepted;
    any other `Char` is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // the message is kept on one line so that the diagnostic does not
        // contain a stray line break and indentation
        static assert(false,
            "UTF-32 needs no decoding, and thus is not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5280 
5281 
// Wraps the code units of `s` in a random-access range that keeps the whole
// array plus a start index, which speeds up forward iteration.
// `offset` is the index of the first unit of the resulting range.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units, shortened only by popBack
        size_t idx; // position of the current front within `str`

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[str.length - 1];
        }

        void popFront()
        {
            ++idx;
        }

        void popBack()
        {
            str = str[0 .. str.length - 1];
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        @property bool empty()
        {
            return idx == str.length;
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx + i];
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx + b], idx + a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5307 
// Walks a mixed ASCII / multi-byte string with the full UTF-8 letter matcher
// and two of its sub-matchers, checking results and cursor positions.
pure @safe unittest
{
    string sample = "hi! ネемног砀 текста";
    auto units = sample.decoder;
    auto full =  utf8Matcher(unicode.Letter);
    auto oneByte = full.subMatcher!(1);
    auto multiByte = full.subMatcher!(2,3,4);
    // 'h'
    assert(oneByte.test(units));
    assert(!multiByte.match(units));
    assert(full.skip(units));
    assert(units.idx == 1);

    // 'i'
    assert(!multiByte.match(units));
    assert(oneByte.test(units));
    assert(full.skip(units));
    assert(units.idx == 2);
    // '!' is not a letter
    assert(!oneByte.match(units));

    assert(!full.test(units));
    assert(!full.skip(units));

    // ' ' is not a letter either
    assert(!oneByte.test(units));
    assert(!full.test(units));
    assert(!full.skip(units));
    // first multi-byte letter
    assert(full.test(units));
    foreach (n; 0 .. 7)
    {
        assert(!oneByte.test(units));
        assert(multiByte.test(units));
        assert(full.skip(units));
    }
    assert(!full.test(units));
    assert(!full.skip(units));
    //the same with match where applicable
    units = sample.decoder;
    assert(full.match(units));
    assert(units.idx == 1);
    assert(full.match(units));
    assert(units.idx == 2);
    // a failed match leaves the position alone
    assert(!full.match(units));
    assert(units.idx == 2);
    assert(!full.skip(units));
    assert(!full.skip(units));

    foreach (n; 0 .. 7)
    {
        assert(!oneByte.test(units));
        assert(full.test(units));
        assert(full.match(units));
    }
    auto mark = units.idx;
    assert(!full.match(units));
    assert(units.idx == mark);
}
5362 
// Cross-checks test / match / skip of the UTF-8 and UTF-16 matchers and of
// their sub-matchers on a sample of all letters.
pure @safe unittest
{
    import std.range : stride;
    // Runs `test`, `match` and (where the matcher has it) `skip` on `r`,
    // checks that they agree and restores the position of `r` afterwards.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // two code units, i.e. surrogate pairs (was a second copy of `bmp`)
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5411 
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` is the escape `\0` followed by the characters
    // "x00"; `\x00` was presumably intended. The literals are left as they
    // are because changing them changes the inputs under test.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // the failing input is dumped as hex when no exception was thrown
        assert(collectException((){
            auto s = msg; // `test` takes its argument by ref
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5442 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` selects the number of trie levels,
    allowed values are: 1, 2, 3 or 4. Each level count is a different
    speed versus size trade-off.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.
+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (level < 1 || level > 4)
    {
        static assert(false,
            "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly");
    }
    else static if (level == 1)
    {
        return codepointSetTrie!(21)(set);
    }
    else static if (level == 2)
    {
        return codepointSetTrie!(10, 11)(set);
    }
    else static if (level == 3)
    {
        return codepointSetTrie!(8, 5, 8)(set);
    }
    else
    {
        // level == 4
        return codepointSetTrie!(6, 4, 4, 7)(set);
    }
}
5477 
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P The result is a 'tester' usable as a unary predicate
    in algorithms like std.algorithm.find. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto trie = toTrie!3(set);
    // the trie is captured by the returned delegate
    return delegate (dchar ch) { return trie[ch]; };
}
5495 
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    T _value;
    alias _value this;

    enum bitSize = sz;
}
5517 
/*
    Yields the number of bits needed for the single argument:
    its `bitSize` member if it has one usable as `size_t`,
    otherwise the bit size of its return type when that is a `BitPacked`,
    otherwise eight times its `sizeof`.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias Arg = Args[0];
    // tested via "compiles" instead of is(typeof(Arg.bitSize) : size_t)
    static if (__traits(compiles, { size_t val = Arg.bitSize; }))
    {
        enum bitSizeOf = Arg.bitSize;
    }
    else static if (is(ReturnType!Arg dummy == BitPacked!(U, bits), U, size_t bits))
    {
        // a functor returning BitPacked: defer to the returned type
        enum bitSizeOf = bitSizeOf!(ReturnType!Arg);
    }
    else
    {
        enum bitSizeOf = Arg.sizeof*8;
    }
}
5541 
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5553 
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if (is(T == BitPacked!(U, bits), U, size_t bits))
    {
        alias TypeOfBitPacked = U;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5565 
/*
    Wrapper used when defining custom data structures with the `Trie` template.
    It pairs a callable `Fn` with the promise that whatever `Fn` returns
    fits within `bits` bits; calling the wrapper simply calls `Fn`.
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;

    static auto ref opCall(Arg)(auto ref Arg input)
    {
        return Fn(input);
    }
}
5579 
/*
    Helper that extracts the bits [from, to) of an integral value.
    It exposes `bitSize` itself and can therefore be used directly
    as an index function with the `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        // the slice never exceeds the advertised width
        assert(result < (1 << bitSize));
    }
    do
    {
        static assert(from < to);
        enum mask = (1 << (to - from)) - 1;
        static if (from == 0)
        {
            // no shift needed for the lowest bits
            return x & mask;
        }
        else
        {
            return (x >> from) & mask;
        }
    }
}
5604 
// Lowest byte of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    return x & 0xFF;
}

// Second lowest byte of `x` (bits 8 .. 15).
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
// 8-bit wide index functions built from the two helpers above
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);

// bitSizeOf for a BitPacked type, for sliceBits and for an assumeSize wrapper
@safe pure nothrow @nogc unittest
{
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    static assert(bitSizeOf!lo8 == 8);
}
5616 
// Compile-time sequence of the integers start, start+1, ..., end-1;
// empty when start >= end.
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
    {
        alias Sequence = AliasSeq!();
    }
    else
    {
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    }
}
5624 
//---- TRIE TESTS ----
// Builds tries of one to four levels over small sets and compares
// the lookups against the sets they were built from.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;

    // Prints the memory footprint of a trie; does nothing unless compiled
    // with version std_uni_stats.
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;

    // one level
    auto letters = Set('A','Z','a','z');
    auto flat = buildTrie!(bool, uint, 256, lo8)(letters.byInterval);// simple bool array
    foreach (int c; 'a' .. 'z')
        assert(flat[c]);
    foreach (int c; 'A' .. 'Z')
        assert(flat[c]);
    foreach (int c; 0 .. 'A')
        assert(!flat[c]);
    foreach (int c; 'Z' .. 'a')
        assert(!flat[c]);
    trieStats(flat);

    // two levels, the interval pattern repeats every 512
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (cp; redundant2.byCodepoint)
        assert(trie2[cp], text(cast(uint) cp, " - ", trie2[cp]));
    foreach (k; 0 .. 1024)
    {
        assert(trie2[k] == (k in redundant2));
    }

    // three levels
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (k; 0 .. max3)
        assert(trie3[k] == (k in redundant3), text(cast(uint) k));

    // four levels; only membership of the set's elements is checked
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (k; 0 .. max4)
    {
        if (k in redundant4)
            assert(trie4[k], text(cast(uint) k));
    }
    trieStats(trie4);

    // string keys
    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] words = ["tea", "start", "orange"];
    words.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(words);
    // using first char only
    assert(words == ["orange", "start", "tea"]);
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    auto allBytes = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(allBytes);
    trieStats(bt);
    foreach (k; 0 .. 256)
        assert(bt[cast(ubyte) k]);
}
5729 
// Index function returning element `idx` of an array of `T`,
// declared to fit in as many bits as `T` has.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t itemAt(const scope T[] items)
    {
        return items[idx];
    }
    alias useItemAt = assumeSize!(itemAt, 8*T.sizeof);
}
5736 
// Index function returning the last element of an array of `T`,
// declared to fit in as many bits as `T` has.
template useLastItem(T)
{
    size_t lastItem(const scope T[] items)
    {
        return items[items.length - 1];
    }
    alias useLastItem = assumeSize!(lastItem, 8*T.sizeof);
}
5742 
// Sum of `bitSizeOf` over all elements of `Prefix`; 0 for an empty list.
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
    {
        enum fullBitSize = 0;
    }
    else
    {
        enum fullBitSize = bitSizeOf!(Prefix[0]) + fullBitSize!(Prefix[1..$]);
    }
}
5750 
// Computes the BitPacked types of the index levels for the given prefix
// functions; the last (value) level gets no index type.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {
        // the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus it's size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        enum upperBits = fullBits - bitSizeOf!(Prefix[$-1]);
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, upperBits, Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), upperBits)
            );
    }
}
5771 
5772 //============================================================================
5773 
// Three-way comparison of two property names. Both names are lower-cased
// with std.ascii.toLower, and white space, '-' and '_' are ignored.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // true for characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5785 
// '-' and letter case do not matter when comparing property names
@safe pure unittest
{
    assert(comparePropertyName("foo-bar", "fooBar") == 0);
}
5790 
// "Less than" predicate on property names, based on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5796 
5797 //============================================================================
5798 // Utilities for compression of Unicode code point sets
5799 //============================================================================
5800 
// Appends `val` to `arr` in a variable-length form:
//   val < 2^^7  : 1 byte,  the value itself
//   val < 2^^13 : 2 bytes, 0b100 in the top bits of the first byte
//   val < 2^^21 : 3 bytes, 0b101 in the top bits of the first byte
// Multi-byte values are stored most significant byte first.
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        arr ~= cast(ubyte)(0x80 | (val >> 8));
        arr ~= cast(ubyte)(val & 0xFF);
        return;
    }
    assert(val < (1 << 21));
    arr ~= cast(ubyte)(0xA0 | (val >> 16));
    arr ~= cast(ubyte)((val >> 8) & 0xFF);
    arr ~= cast(ubyte)(val & 0xFF);
}
5819 
// Reads one value in the format written by compressTo from `arr`, starting
// at `idx`, and advances `idx` past it.
// Throws when the bytes announced by the first byte are not all present.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    immutable tail = ((lead >> 5) & 1) + 1; // [1, 2] bytes follow
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint acc = lead & 0x1F;
    foreach (b; arr[idx .. idx + tail])
        acc = (acc << 8) | b;
    idx += tail;
    return acc;
}
5834 
5835 
// Packs a range of intervals into bytes: every boundary is stored with
// compressTo as its distance from the previously stored boundary.
// An upper boundary equal to lastDchar+1 is not stored at all.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] packed;
    uint prev = 0;
    // RLE encode
    foreach (iv; intervals)
    {
        compressTo(iv[0]-prev, packed);
        prev = iv[0];
        if (iv[1] == lastDchar+1) // till the end of the domain so don't store it
            continue;
        compressTo(iv[1]-prev, packed);
        prev = iv[1];
    }
    return packed;
}
5854 
// Known encodings for two interval lists, decoding of single values
// and a full round trip through compress / decompress.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto first = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] firstPacked = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(first) == firstPacked);
    // the last interval runs to the end of the domain, its end is not stored
    auto second = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] secondPacked = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(second) == secondPacked);
    size_t  pos = 0;
    assert(decompressFrom(firstPacked, pos) == 80);
    assert(decompressFrom(firstPacked, pos) == 47);
    assert(decompressFrom(firstPacked, pos) == 1);
    assert(decompressFrom(firstPacked, pos) == (1 << 10));
    pos = 0;
    assert(decompressFrom(secondPacked, pos) == 0);
    assert(decompressFrom(secondPacked, pos) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(first)), first));
    assert(equal(decompressIntervals(compressIntervals(second)), second));
}
5877 
// Wraps compressed interval data in a range of `CodepointInterval`
// that decodes it lazily, one interval per step.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto intervals = DecompressedIntervals(data);
    return intervals;
}
5883 
// Forward range over the intervals stored in a compressed byte stream.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed data
    size_t _idx;              // read position, size_t.max once exhausted
    CodepointInterval _front; // most recently decoded interval

    // Decodes the first interval right away.
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    // Decodes the next interval, or marks the range as empty when
    // the stream is used up.
    void popFront()
    {
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // boundaries are stored relative to the previous boundary
        immutable uint lower = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = lower;
        if (_idx != _stream.length)
            _front[1] = lower + decompressFrom(_stream, _idx);
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5928 
// DecompressedIntervals has to satisfy the range concepts it is used with
@safe pure nothrow @nogc unittest
{
    static assert(isForwardRange!DecompressedIntervals);
    static assert(isInputRange!DecompressedIntervals);
}
5934 
5935 //============================================================================
5936 
5937 version (std_uni_bootstrap){}
5938 else
5939 {
5940 
// Helper for looking up code point sets: binary search of `table`
// (taken as sorted by propertyNameLess on `name`) for an entry whose name
// matches `name` under comparePropertyName.
// Returns the index of the entry, or -1 if there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    immutable size_t pos = names.lowerBound(name).length;
    if (pos >= names.length)
        return -1;
    if (comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5953 
// Looks `name` up in `table` and, when it is found, stores the set built
// from the entry's compressed data in `dest`.
// Returns whether the name was found; `dest` is untouched otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5965 
/*
    Fills `target` with the set denoted by `name`.

    The cumulative general categories and the pseudo-properties
    ("graphical", "any", "ascii") are assembled by hand below; every other
    name is looked up in the generated `uniProps.tab`.

    Returns: true when `name` was recognized, false otherwise (in which case
    `target` is not assigned).
*/
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;

    // Letter = Lu | Ll | Lt | Lo | Lm
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
        return true;
    }
    // Cased Letter = Ll | Lu | Lt (title case)
    if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);
        return true;
    }
    // Mark = Mn | Mc | Me
    if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
        return true;
    }
    // Number = Nd | Nl | No
    if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
        return true;
    }
    // Punctuation = Pc | Pd | Ps | Pe | Pi | Pf | Po
    if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
        return true;
    }
    // Symbol = Sm | Sc | Sk | So
    if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
        return true;
    }
    // Separator = Zs | Zl | Zp
    if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
        return true;
    }
    // NOTE(review): this unions the "*o" sub-categories (Co, Lo, No, So, Po),
    // which does not look like the Unicode general category C
    // (Cc | Cf | Cs | Co | Cn) - confirm the intent before relying on it.
    if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Co);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.No);
        target |= asSet(uniProps.So);
        target |= asSet(uniProps.Po);
        return true;
    }
    // graphical = Alphabetic | Mark | Number | Punctuation | Zs | Symbol
    if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
        return true;
    }
    // every code point: [0, 0x110000)
    if (ucmp(name, "any") == 0)
    {
        target = Set.fromIntervals(0, 0x110000);
        return true;
    }
    // 7-bit ASCII: [0, 0x80)
    if (ucmp(name, "ascii") == 0)
    {
        target = Set.fromIntervals(0, 0x80);
        return true;
    }
    // not a hand-made property: fall back to the generated table
    return loadUnicodeSet!(uniProps.tab)(name, target);
}
6064 
// CTFE-only helper for checking property names at compile-time:
// tells whether `name` matches one of the hand-made property names below.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    auto knownNames = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "Graphical",
        "any",
        "ascii"
    ];
    // linear scan; comparison follows the property-name matching rules
    foreach (candidate; knownNames)
    {
        if (comparePropertyName(candidate, name) == 0)
            return true;
    }
    return false;
}
6084 
// ditto, CTFE-only, not optimized: true when `table` has an entry for `name`
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable pos = findUnicodeSet!table(name);
    return pos >= 0;
}
6090 
// Mixin providing by-name lookup of sets stored in `table`;
// `kind` only appears in the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            // existence was verified at compile time, result can be ignored
            CodepointSet found;
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
6118 
// Characters that need escaping when a string is posed as a regular
// expression. In this file the list is expanded into `case` labels of the
// escape handling in UnicodeSetParser, where `\X` then stands for `X` itself.
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6122 
// Evaluates the D expression `expr` once per template instance and returns
// the stored result on later calls. During CTFE no static storage is
// available, so the expression is simply evaluated each time.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);
    alias T = typeof(mixin(expr));
    static T cached;
    static bool ready;
    if (ready)
        return cached;
    cached = mixin(expr);
    ready = true;
    return cached;
}
6137 
// Set behind the \w character class:
// Alphabetic | Mn | Mc | Me | Nd | Pc, memoized after the first use.
package(std) @property CodepointSet wordCharacter() @safe
{
    enum wordClassExpr = "unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc";
    return memoizeExpr!(wordClassExpr)();
}
6144 
// Basic array-backed stack, kept generic in case it gets used anywhere
// other than the parser.
package(std) struct Stack(T)
{
@safe:
    T[] data;

    // true when nothing is stored
    @property bool empty()
    {
        return data.empty;
    }

    // number of stored elements
    @property size_t length()
    {
        return data.length;
    }

    // puts `val` on top
    void push(T val)
    {
        data ~= val;
    }

    // removes and returns the top element; the stack must not be empty
    @trusted T pop()
    {
        assert(!empty);
        immutable lastIdx = data.length - 1;
        auto val = data[lastIdx];
        data = data[0 .. lastIdx];
        // let the next push reuse the freed slot (not available in CTFE)
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return val;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6172 
/*
    Reads exactly `maxDigit` hexadecimal digits from the front of `str` and
    returns the code point they denote.

    Params:
        str = input range of characters; every digit that was successfully
              read is popped off, also when a later check fails
        maxDigit = number of hex digits to consume

    Returns: the parsed value, guaranteed to be <= 0x10FFFF.

    Throws: `Exception` with
        "incomplete escape sequence" if the input ends too early,
        "invalid escape sequence" on a non-hex character,
        "invalid codepoint" if the value exceeds 0x10FFFF.
*/
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    enum uint maxCodepoint = 0x10FFFF;
    uint val;
    // Set once the running value leaves the valid range. From then on the
    // accumulation stops, so `val` can never wrap around, however many
    // digits are requested.
    bool outOfRange = false;
    for (size_t k = 0; k < maxDigit; k++)
    {
        enforce(!str.empty, "incomplete escape sequence");
        immutable current = str.front;
        uint digit;
        if ('0' <= current && current <= '9')
            digit = current - '0';
        else if ('a' <= current && current <= 'f')
            digit = current - 'a' + 10;
        else if ('A' <= current && current <= 'F')
            digit = current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        if (!outOfRange)
        {
            // val <= 0x10FFFF here, hence val * 16 + 15 fits into 32 bits
            val = val * 16 + digit;
            outOfRange = val > maxCodepoint;
        }
        str.popFront();
    }
    // reported only after all digits were validated, so a malformed digit
    // still takes precedence over an out-of-range value
    enforce(!outOfRange, "invalid codepoint");
    return val;
}
6198 
// parseUniHex: rejects non-hex digits, parses valid input,
// refuses values above 0x10FFFF
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;
    string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
    string[] hex = [ "01", "ff", "00af", "10FFFF" ];
    int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (bad; non_hex)
    {
        auto err = collectException(parseUniHex(bad, bad.length));
        assert(err.msg.canFind("invalid escape sequence"));
    }
    foreach (i; 0 .. hex.length)
    {
        auto digits = hex[i];
        assert(parseUniHex(digits, digits.length) == value[i]);
    }
    string over = "0011FFFF";
    auto rangeErr = collectException(parseUniHex(over, over.length));
    assert(rangeErr.msg.canFind("invalid codepoint"));
}
6215 
// Returns `set` extended with the simple case foldings of each of its
// cased letters (the members that are also in unicode.LC).
auto caseEnclose(CodepointSet set)
{
    auto casedPart = set & unicode.LC;
    foreach (dchar cp; casedPart.byCodepoint)
        foreach (folded; simpleCaseFoldings(cp))
            set |= folded;
    return set;
}
6226 
/+
    Fetch the codepoint set corresponding to a name (InBlock or binary
    property), optionally closed over simple case foldings (`casefold`)
    and then optionally inverted (`negated`).
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        result = caseEnclose(result);
    // case folding is applied first, inversion last
    return negated ? result.inverted : result;
}
6240 
/*
    Parser for bracketed set expressions such as `[a-z&&[^aeiou]]`.

    Wraps an input range of characters and exposes the range primitives itself,
    so that helpers like parseUniHex / unicode.parseControlCode can consume
    input through it. When `casefold_` is set, every literal character and
    every character range is added together with its simple case foldings.
*/
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    /*
        Parses one unit of a CodepointSet spec: a run of literal characters,
        escape sequences and character ranges. Also fetches the set operation
        that follows it.

        Stops (without consuming) at '[' or ']', or right after a twin-symbol
        operator (`||`, `--`, `~~`, `&&`).

        Returns: the accumulated set and the operator that follows:
        `Operator.Union` when stopped at '[', the twin operator when one was
        read, `Operator.None` otherwise.

        Throws: `Exception` on malformed escapes, inverted ranges or
        premature end of input.
    */
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // adds a single character, honouring the case-folding flag
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto range = simpleCaseFoldings(ch);
                foreach (v; range)
                    set |= v;
            }
            else
                set |= ch;
        }

        // adds the inclusive range [first, lastIncl], honouring the
        // case-folding flag; shared by plain and escaped range ends so that
        // `a-z` and `a-\x7a` behave the same
        void addRangeWithFlags(ref CodepointSet set, uint first, uint lastIncl)
        {
            enforce(first <= lastIncl, "inverted range");
            if (casefold_)
            {
                for (uint ch = first; ch <= lastIncl; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(first, lastIncl + 1);
        }

        // maps the character of a doubled operator symbol to its operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // pending char is a plain literal: add it like any other
                    // one, i.e. with case folding if requested
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                // a lone symbol is an ordinary character
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one `case` label per escapable character, all sharing
                // the statements that follow the loop
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    // a private-use character right after '\' gets the
                    // "no matching ']'" diagnostic instead
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                 case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    /*
        Parses a complete bracketed set expression starting at the current
        '[' and returns the resulting set. Input is consumed up to and
        including the matching ']'.

        Throws: `Exception` on syntax errors, including input that ends
        before the expression is closed.
    */
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the operand(s) on top of `stack`;
        // returns false for anything that is not an applicable operator
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators for as long as `cond` holds for the top
        // one; returns false on a syntax error or if the operator stack
        // runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            // A trailing twin operator (e.g. "[a--") may have consumed the
            // last character while brackets are still open: report it as a
            // parse error rather than reading past the end of the input.
            enforce(!empty, "unexpected end of character class");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                // an outer set is still open, so more input must follow
                // (e.g. "[[a]" is unterminated)
                enforce(!empty, "unexpected end of character class");
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6687 
6688 /**
6689     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6690     a block, script or general category.
6691 
6692     It uses well defined standard rules of property name lookup.
6693     This includes fuzzy matching of names, so that
6694     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6695     and yield the same set of white space $(CHARACTERS).
6696 */
6697 @safe public struct unicode
6698 {
6699     import std.exception : enforce;
6700     /**
6701         Performs the lookup of set of $(CODEPOINTS)
6702         with compile-time correctness checking.
6703         This short-cut version combines 3 searches:
6704         across blocks, scripts, and common binary properties.
6705 
6706         Note that since scripts and blocks overlap the
6707         usual trick to disambiguate is used - to get a block use
6708         `unicode.InBlockName`, to search a script
6709         use `unicode.ScriptName`.
6710 
6711         See_Also: $(LREF block), $(LREF script)
6712         and (not included in this search) $(LREF hangulSyllableType).
6713     */
6714 
6715     static @property auto opDispatch(string name)() pure
6716     {
6717         static if (findAny(name))
6718             return loadAny(name);
6719         else
6720             static assert(false, "No unicode set by name "~name~" was found.");
6721     }
6722 
    ///
    @safe unittest
    {
        import std.exception : collectException;
        // compile-time checked lookup by name
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        // (the block name below is deliberately misspelled)
        assert(collectException(unicode("InCyrilliac")));
    }
6743 
    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.

        Params:
            name = name of the set to look up; any character type implicitly
                   convertible to `dchar` is accepted
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        // thin wrapper: the actual lookup lives in loadAny
        return loadAny(name);
    }
6759 
    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See_Also: $(S_LINK Unicode properties, table of properties)
        for available sets.
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        // provides opCall (run-time) and opDispatch (compile-time) lookups
        mixin SetSearcher!(blocks.tab, "block");
    }
6775 
    ///
    @safe unittest
    {
        // use .block for explicitness;
        // the short-cut lookup needs the "In" prefix for the same block
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }
6782 
    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        Use `unicode.script.ScriptName` for compile-time checked access
        and `unicode.script(name)` for a run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        // provides opCall (run-time) and opDispatch (compile-time) lookups
        mixin SetSearcher!(scripts.tab, "script");
    }
6794 
    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['؁']);
        assert(arabicScript['؁']);
        // but they are different
        assert(arabicBlock != arabicScript);
        // in the short-cut lookup the "In" prefix selects the block,
        // the bare name selects the script
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }
6808 
    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        // provides opCall (run-time) and opDispatch (compile-time) lookups
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }
6825 
    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        // (Hangul syllable type L stands for leading jamo)
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading jamo are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        // run-time and compile-time lookups agree
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }
6836 
6837     //parse control code of form \cXXX, c assumed to be the current symbol
6838     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6839     {
6840         with(p)
6841         {
6842             popFront();
6843             enforce(!empty, "Unfinished escape sequence");
6844             enforce(('a' <= front && front <= 'z')
6845                 || ('A' <= front && front <= 'Z'),
6846             "Only letters are allowed after \\c");
6847             return front & 0x1f;
6848         }
6849     }
6850 
6851     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6852     //\ - assumed to be processed, p - is current
6853     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6854         bool negated, bool casefold)
6855     {
6856         static import std.ascii;
6857         with(p)
6858         {
6859             enum MAX_PROPERTY = 128;
6860             char[MAX_PROPERTY] result;
6861             uint k = 0;
6862             popFront();
6863             enforce(!empty, "eof parsing unicode property spec");
6864             if (front == '{')
6865             {
6866                 popFront();
6867                 while (k < MAX_PROPERTY && !empty && front !='}'
6868                     && front !=':')
6869                 {
6870                     if (front != '-' && front != ' ' && front != '_')
6871                         result[k++] = cast(char) std.ascii.toLower(front);
6872                     popFront();
6873                 }
6874                 enforce(k != MAX_PROPERTY, "invalid property name");
6875                 enforce(front == '}', "} expected ");
6876             }
6877             else
6878             {//single char properties e.g.: \pL, \pN ...
6879                 enforce(front < 0x80, "invalid property name");
6880                 result[k++] = cast(char) front;
6881             }
6882             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6883             enforce(!s.empty, "unrecognized unicode property spec");
6884             popFront();
6885             return s;
6886         }
6887     }
6888 
6889     /**
6890         Parse unicode codepoint set from given `range` using standard regex
6891         syntax '[...]'. The range is advanced skiping over regex set definition.
6892         `casefold` parameter determines if the set should be casefolded - that is
6893         include both lower and upper case versions for any letters in the set.
6894     */
6895     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6896     if (isInputRange!Range && is(ElementType!Range : dchar))
6897     {
6898         auto usParser = UnicodeSetParser!Range(range, casefold);
6899         auto set = usParser.parseSet();
6900         range = usParser.range;
6901         return set;
6902     }
6903 
6904     ///
6905     @safe unittest
6906     {
6907         import std.uni : unicode;
6908         string pat = "[a-zA-Z0-9]hello";
6909         auto set = unicode.parseSet(pat);
6910         // check some of the codepoints
6911         assert(set['a'] && set['A'] && set['9']);
6912         assert(pat == "hello");
6913     }
6914 
6915 private:
6916     alias ucmp = comparePropertyName;
6917 
6918     static bool findAny(string name)
6919     {
6920         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6921         return isPrettyPropertyName(name)
6922             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6923             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6924     }
6925 
6926     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6927     {
6928         import std.conv : to;
6929         import std.internal.unicode_tables : blocks, scripts; // generated file
6930         Set set;
6931         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6932             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6933                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6934         if (loaded)
6935             return set;
6936         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6937     }
6938 
6939     // FIXME: re-disable once the compiler is fixed
6940     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6941     //@disable ~this();
6942 }
6943 
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    // a block is looked up through the "In" prefix
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    // "separator" is the union of the Zs, Zl and Zp tables
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    // a dash between the "In" prefix and the block name is accepted
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}
6951 
6952 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6953 
// case labels for all control characters (U+0000-U+001F, U+007F-U+009F) except '\r'
6955 enum controlSwitch = `
6956     case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
6957     case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
6958 `;
6959 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6960 // kill unrolled switches
6961 
// True if `ch` lies in the regional indicator symbol range U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstIndicator = '\U0001F1E6';
    enum dchar lastIndicator = '\U0001F1FF';
    return firstIndicator <= ch && ch <= lastIndicator;
}
6966 
// Shared implementation of grapheme cluster decoding.
//
// With `getValue == true` the function returns the decoded Grapheme;
// with `getValue == false` it returns nothing and merely advances `range`
// past one cluster (used for measuring and validation).
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    // Consumes exactly one grapheme cluster from the front of `range`.
    // `range` must not be empty (checked by an assert).
    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        enum GraphemeState {
            Start,
            CR,
            RI,
            L,
            V,
            LVT
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // accept the current code point: record it (if a value is requested)
        // and advance the input
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                // the first code point always belongs to the cluster
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    mixin(controlSwitch);
                        // a control character is a cluster of its own,
                        // nothing may extend it
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                // CR LF is kept together
                if (ch == '\n')
                    mixin(eat);
                // Like any other control, CR / CR LF is never extended by
                // following marks (UAX #29, GB4: always break after
                // CR, LF and Control), so skip the extend loop.
                goto L_End;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        // absorb everything that attaches to the cluster decoded so far
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}
7090 
7091 public: // Public API continues
7092 
/++
    Computes the length of the grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units containing the cluster
        index = starting index into `input[]`

    Returns:
        length of the grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index..$];
    immutable before = rest.length;
    // decoding advances `rest` past exactly one cluster
    genericDecodeGrapheme!(false)(rest);
    return before - rest.length;
}
7114 
7115 ///
7116 @safe unittest
7117 {
7118     assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
7120     string city = "A\u030Arhus";
7121     size_t first = graphemeStride(city, 0);
7122     assert(first == 3); //\u030A has 2 UTF-8 code units
7123     assert(city[0 .. first] == "A\u030A");
7124     assert(city[first..$] == "rhus");
7125 }
7126 
7127 @safe unittest
7128 {
7129     // Ensure that graphemeStride is usable from CTFE.
7130     enum c1 = graphemeStride("A", 0);
7131     static assert(c1 == 1);
7132 
7133     enum c2 = graphemeStride("A\u0301", 0);
7134     static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
7135 }
7136 
/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Params:
        inp = range to read from; it is advanced past the decoded cluster
              and must not be empty

    Returns:
        The decoded $(LREF Grapheme).

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    return genericDecodeGrapheme!true(inp);
}
7152 
@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    string s = " \u0020\u0308 ";
    // a plain space is a cluster of its own
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    // a space followed by a combining diaeresis is decoded as one cluster
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    // leading combining marks are grouped together, U+1100 is decoded separately
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    // U+11A8 keeps the following combining mark, U+AC01 is decoded separately
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));
}
7170 
/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        private Grapheme _front;

        // a zero-length cached cluster doubles as the end-of-range marker
        @property bool empty()
        {
            return !_front.length;
        }

        @property Grapheme front()
        {
            return _front;
        }

        // decode the next cluster, or reset the cache once the source is exhausted
        void popFront()
        {
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto primed = Result!(Range)(range);
    // decode the first cluster so that `front` and `empty` are valid right away
    primed.popFront();
    return primed;
}
7217 
7218 ///
7219 @safe unittest
7220 {
7221     import std.algorithm.comparison : equal;
7222     import std.range.primitives : walkLength;
7223     import std.range : take, drop;
7224     auto text = "noe\u0308l"; // noël using e + combining diaeresis
7225     assert(text.walkLength == 5); // 5 code points
7226 
7227     auto gText = text.byGrapheme;
7228     assert(gText.walkLength == 4); // 4 graphemes
7229 
7230     assert(gText.take(3).equal("noe\u0308".byGrapheme));
7231     assert(gText.drop(3).equal("l".byGrapheme));
7232 }
7233 
// For testing non-forward-range input ranges:
// wraps a string and forwards only `empty`, `front` and `popFront`
// (there is deliberately no `save`).
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    bool empty() @property { return s.empty; }
    dchar front() @property { return s.front; }
    void popFront() { s.popFront(); }
}
7244 
7245 @safe unittest
7246 {
7247     import std.algorithm.comparison : equal;
7248     import std.array : array;
7249     import std.range : retro;
7250     import std.range.primitives : walkLength;
7251     assert("".byGrapheme.walkLength == 0);
7252 
7253     auto reverse = "le\u0308on";
7254     assert(reverse.walkLength == 5);
7255 
7256     auto gReverse = reverse.byGrapheme;
7257     assert(gReverse.walkLength == 4);
7258 
7259     static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
7260     {{
7261         assert(text.walkLength == 5);
7262         static assert(isForwardRange!(typeof(text)));
7263 
7264         auto gText = text.byGrapheme;
7265         static assert(isForwardRange!(typeof(gText)));
7266         assert(gText.walkLength == 4);
7267         assert(gText.array.retro.equal(gReverse));
7268     }}
7269 
7270     auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
7271     static assert(!isForwardRange!(typeof(nonForwardRange)));
7272     assert(nonForwardRange.walkLength == 4);
7273 }
7274 
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // position inside the grapheme currently at the front of `_range`
        private size_t i = 0;

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // still inside the current grapheme: just step forward
            if (i + 1 < _range.front.length)
            {
                ++i;
                return;
            }

            // current grapheme exhausted: move on to the next one
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
7324 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // already a range of code points: hand it back unchanged
        return range;
    }
    else
    {
        // Wrap narrow strings so that only the range interface is exposed.
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            void popFront()
            {
                _range.popFront;
            }

            @property auto save()
            {
                return Result(_range.save);
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popBack()
            {
                _range.popBack;
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7349 
7350 ///
7351 @safe unittest
7352 {
7353     import std.array : array;
7354     import std.conv : text;
7355     import std.range : retro;
7356 
7357     string s = "noe\u0308l"; // noël
7358 
7359     // reverse it and convert the result to a string
7360     string reverse = s.byGrapheme
7361         .array
7362         .retro
7363         .byCodePoint
7364         .text;
7365 
7366     assert(reverse == "le\u0308on"); // lëon
7367 }
7368 
7369 @safe unittest
7370 {
7371     import std.algorithm.comparison : equal;
7372     import std.range.primitives : walkLength;
7373     import std.range : retro;
7374     assert("".byGrapheme.byCodePoint.equal(""));
7375 
7376     string text = "noe\u0308l";
7377     static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));
7378 
7379     auto gText = InputRangeString(text).byGrapheme;
7380     static assert(!isForwardRange!(typeof(gText)));
7381 
7382     auto cpText = gText.byCodePoint;
7383     static assert(!isForwardRange!(typeof(cpText)));
7384 
7385     assert(cpText.walkLength == text.walkLength);
7386 
7387     auto plainCp = text.byCodePoint;
7388     static assert(isForwardRange!(typeof(plainCp)));
7389     assert(equal(plainCp, text));
7390     assert(equal(retro(plainCp.save), retro(text.save)));
7391     // Check that we still have length for dstring
7392     assert("абвгд"d.byCodePoint.length == 5);
7393 }
7394 
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        return isBig ? len_ : smallLength;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    // still room in the in-place buffer
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // grow the heap buffer by a fixed step; the allocation holds
                // cap_ + 1 elements of 3 bytes each
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // valid means: decoding a single cluster consumes all the contents
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy of a heap-backed grapheme gets its own buffer.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if there is one.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // size of the in-place buffer: the whole struct minus the length/flag byte
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // number of 3-byte code points that fit in place
    enum small_cap = small_bytes/3;
    // top bit of slen_ marks the heap ("big") representation,
    // the remaining bits hold the in-place length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // in-place ("small") representation; slen_ is the last byte of the union
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the in-place contents to a freshly allocated heap buffer of
    // capacity `grow` and switches to the big representation.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (size_t i = 0; i < k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7649 
7650 static assert(Grapheme.sizeof == size_t.sizeof*4);
7651 
7652 
7653 @safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
7654 {
7655     import std.algorithm.comparison : equal;
7656     Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
7657     assert(byGrapheme("ЮУЗ").equal(data[]));
7658 }
7659 
7660 ///
7661 @safe unittest
7662 {
7663     import std.algorithm.comparison : equal;
7664     import std.algorithm.iteration : filter;
7665     import std.range : isRandomAccessRange;
7666 
7667     string bold = "ku\u0308hn";
7668 
7669     // note that decodeGrapheme takes parameter by ref
7670     auto first = decodeGrapheme(bold);
7671 
7672     assert(first.length == 1);
7673     assert(first[0] == 'k');
7674 
7675     // the next grapheme is 2 characters long
7676     auto wideOne = decodeGrapheme(bold);
7677     // slicing a grapheme yields a random-access range of dchar
7678     assert(wideOne[].equal("u\u0308"));
7679     assert(wideOne.length == 2);
7680     static assert(isRandomAccessRange!(typeof(wideOne[])));
7681 
7682     // all of the usual range manipulation is possible
7683     assert(wideOne[].filter!isMark().equal("\u0308"));
7684 
7685     auto g = Grapheme("A");
7686     assert(g.valid);
7687     g ~= '\u0301';
7688     assert(g[].equal("A\u0301"));
7689     assert(g.valid);
7690     g ~= "B";
7691     // not a valid grapheme cluster anymore
7692     assert(!g.valid);
7693     // still could be useful though
7694     assert(g[].equal("A\u0301B"));
7695 }
7696 
7697 @safe unittest
7698 {
7699     auto g = Grapheme("A\u0302");
7700     assert(g[0] == 'A');
7701     assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
7703     assert(g[1] == '~');
7704     assert(!g.valid);
7705 }
7706 
7707 @safe unittest
7708 {
7709     import std.algorithm.comparison : equal;
7710     import std.algorithm.iteration : map;
7711     import std.conv : text;
7712     import std.range : iota;
7713 
    // not a valid cluster (but it's just a test)
7715     auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
7716     assert(g[0] == 'a');
7717     assert(g[1] == 'b');
7718     assert(g[2] == 'c');
7719     assert(g[3] == 'd');
7720     assert(g[4] == 'e');
7721     g[3] = 'Й';
7722     assert(g[2] == 'c');
7723     assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
7724     assert(g[4] == 'e');
7725     assert(!g.valid);
7726 
7727     g ~= 'ц';
7728     g ~= '~';
7729     assert(g[0] == 'a');
7730     assert(g[1] == 'b');
7731     assert(g[2] == 'c');
7732     assert(g[3] == 'Й');
7733     assert(g[4] == 'e');
7734     assert(g[5] == 'ц');
7735     assert(g[6] == '~');
7736     assert(!g.valid);
7737 
7738     Grapheme copy = g;
7739     copy[0] = 'X';
7740     copy[1] = '-';
7741     assert(g[0] == 'a' && copy[0] == 'X');
7742     assert(g[1] == 'b' && copy[1] == '-');
7743     assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
7744     copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
7745     assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
7746     copy ~= "xyz";
7747     assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
7748     assert(!copy.valid);
7749 
7750     Grapheme h;
7751     foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
7752         h ~= v;
7753     assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
7754 }
7755 
7756 /++
7757     $(P Does basic case-insensitive comparison of `r1` and `r2`.
7758     This function uses simpler comparison rule thus achieving better performance
7759     than $(LREF icmp). However keep in mind the warning below.)
7760 
7761     Params:
7762         r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7763         r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7764 
7765     Returns:
7766         An `int` that is 0 if the strings match,
7767         &lt;0 if `r1` is lexicographically "less" than `r2`,
7768         &gt;0 if `r1` is lexicographically "greater" than `r2`
7769 
7770     Warning:
7771     This function only handles 1:1 $(CODEPOINT) mapping
7772     and thus is not sufficient for certain alphabets
7773     like German, Greek and few others.
7774 
7775     See_Also:
7776         $(LREF icmp)
7777         $(REF cmp, std,algorithm,comparison)
7778 +/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path requires indexing and `[i .. $]` slicing that yields the
    // range's own type. The delegate inside __traits(compiles) is never run,
    // it only has to type-check; its text is kept identical to the probe in
    // icmp so the two functions cannot drift apart.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        // compare code units only up to the length of the shorter input
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // any non-ASCII code unit ends the fast path at position i
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // the common prefix matched; the longer input is the "greater" one
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the already compared ASCII prefix
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        // decodeFront consumes one code point from the range it is given
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else//  not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                // only lhs is cased: compare its bucket's first entry with rhs
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with its bucket's first entry
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: 0 if r2 is exhausted as well, -1 otherwise
    return int(r2.empty) - 1;
}
7872 
///
@safe @nogc pure nothrow unittest
{
    // strings that differ only by 1:1 case mappings compare equal
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
7887 
// Non-template overloads for the most common cases to reduce compile time.
// Each one forwards to the template instantiated for the matching slice type.
int sicmp(scope const(char)[] str1, scope const(char)[] str2) @safe @nogc pure nothrow
{
    alias Str = const(char)[];
    return sicmp!(Str, Str)(str1, str2);
}

int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2) @safe @nogc pure nothrow
{
    alias Str = const(wchar)[];
    return sicmp!(Str, Str)(str1, str2);
}

int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2) @safe @nogc pure nothrow
{
    alias Str = const(dchar)[];
    return sicmp!(Str, Str)(str1, str2);
}
7900 
// Helper of icmp. Returns 0 when `lhs` matches `rhs` under full case folding,
// where a multi-code-point folding may additionally consume the front of
// `rtail`. Otherwise returns a code point to be used for computing the
// ordering: `lhs` itself when it has no case-folding bucket, else the first
// entry of its bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // [first, last) spans the whole bucket that `lhs` belongs to
    immutable first = hit - fTable[hit].n;
    immutable last = first + fTable[hit].size;
    assert(fTable[first].entry_len == 1);

    foreach (size_t i; first .. last)
    {
        immutable len = fTable[i].entry_len;
        if (len == 1)
        {
            if (fTable[i].seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dstring seq = fTable[i].seq[0 .. len];
        // note that skipOver modifies rtail iff the whole remainder matched
        if (rhs == seq[0] && rtail.skipOver(seq[1 .. $]))
            return 0;
    }
    return fTable[first].seq[0]; // new remapped character for accurate diffs
}
7937 
7938 /++
7939     Does case insensitive comparison of `r1` and `r2`.
7940     Follows the rules of full case-folding mapping.
    This includes matching as equal German ß with "ss" and
7942     other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
7943     The cost of `icmp` being pedantically correct is
7944     slightly worse performance.
7945 
7946     Params:
7947         r1 = a forward range of characters
7948         r2 = a forward range of characters
7949 
7950     Returns:
7951         An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`
7954 
7955     See_Also:
7956         $(LREF sicmp)
7957         $(REF cmp, std,algorithm,comparison)
7958 +/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // The fast path requires indexing and `[i .. $]` slicing that yields the
    // range's own type; the delegate below is only type-checked, never run.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        // compare code units only up to the length of the shorter input
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // any non-ASCII code unit ends the fast path at position i
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // the common prefix matched; the longer input is the "greater" one
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the already compared ASCII prefix
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    // from here on both inputs are compared code point by code point
    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ? 0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        // both fronts are consumed before the comparison, so the helpers
        // below receive only the tails that follow lhs and rhs
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}
8036 
///
@safe @nogc pure nothrow unittest
{
    // full case folding: ß matches "ss"
    assert(icmp("Rußland", "Russland") == 0);
    // a precomposed letter on one side matches its expanded form on the other
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}
8043 
/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    // same comparisons as above, but on byDchar ranges instead of strings
    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}
8055 
// test different character types: icmp has to accept arguments whose
// code unit types (char, wchar, dchar) differ from each other
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
8066 
// Non-template overloads for the most common cases to reduce compile time.
// Each one forwards to the template instantiated for the matching slice type.
int icmp(const(char)[] str1, const(char)[] str2) @safe @nogc pure nothrow
{
    alias Str = const(char)[];
    return icmp!(Str, Str)(str1, str2);
}

int icmp(const(wchar)[] str1, const(wchar)[] str2) @safe @nogc pure nothrow
{
    alias Str = const(wchar)[];
    return icmp!(Str, Str)(str1, str2);
}

int icmp(const(dchar)[] str1, const(dchar)[] str2) @safe @nogc pure nothrow
{
    alias Str = const(dchar)[];
    return icmp!(Str, Str)(str1, str2);
}
8077 
// Runs the shared expectations for icmp and sicmp over every pairing of
// string, wstring and dstring; assertCTFEable executes the delegate both at
// compile time and at run time.
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    // ß folds to "ss", so the ordering is decided by 'b' vs 'a'
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8117 
// https://issues.dlang.org/show_bug.cgi?id=17372
// icmp has to compile for forward ranges that are not arrays, here the
// result of joiner
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
8126 
8127 // This is package(std) for the moment to be used as a support tool for std.regex
8128 // It needs a better API
8129 /*
8130     Return a range of all $(CODEPOINTS) that casefold to
8131     and from this `ch`.
8132 */
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Result range with two modes: "small" holds a single code point inline,
    // otherwise it is a window of `len` consecutive entries of sTable that
    // starts at `idx`.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; // uint.max selects the small mode, then only `c` is read
        union
        {
            dchar c; // small mode: the code point, 0 stands for an empty range
            uint len; // table mode: number of entries left
        }

        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property bool isSmall() const { return idx == uint.max; }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
                return c;
            return sTable[idx].ch;
        }

        void popFront()
        {
            if (isSmall)
            {
                // the single element is consumed
                c = 0;
                return;
            }
            ++idx;
            --len;
        }
    }

    immutable pos = simpleCaseTrie[ch];
    if (pos == EMPTY_CASE_TRIE)
        return Range(ch); // no table bucket: the range holds only `ch`
    auto entry = sTable[pos];
    // step back to the first entry of the bucket and span all of it
    return Range(pos - entry.n, entry.size);
}
8207 
// Checks simpleCaseFoldings for a cased letter, an uncased character and a
// three-member bucket, both at compile time and at run time.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        // a character without case foldings yields just itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
8226 
/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)

    Params:
        ch = the $(CODEPOINT) to look up

    Returns:
        The value stored for `ch` in the combining class trie, as a `ubyte`.
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}
8234 
///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that a "tilde" should be
    // placed after a "ring below" in a sequence
}
8248 
// Every ASCII code point has combining class 0; the remaining asserts pin a
// few non-zero classes.
@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8258 
/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8270 
/**
    Shorthand aliases for the character decomposition types, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
8279 
8280 /++
8281     Try to canonically compose 2 $(CHARACTERS).
8282     Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
8283 
8284     The assumption is that `first` comes before `second` in the original text,
8285     usually meaning that the first is a starter.
8286 
8287     Note: Hangul syllables are not covered by this function.
8288     See `composeJamo` below.
8289 +/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable packed = compositionJumpTrie[first];
    if (packed == ushort.max)
        return dchar.init;

    // unpack offset and length of the table run belonging to `first`
    immutable idx = packed & composeIdxMask;
    immutable cnt = packed >> composeCntShift;

    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto seconds = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted();
    // number of entries in the run whose rhs sorts before `second`
    immutable target = seconds.lowerBound(second).length;
    if (target == cnt)
        return dchar.init;

    immutable entry = compositionTable[idx+target];
    if (entry.rhs == second)
        return entry.composed;
    return dchar.init;
}
8310 
///
@safe unittest
{
    // a starter followed by a combining mark composes
    assert(compose('A','\u0308') == '\u00C4');
    // a pair that does not compose yields dchar.init
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}
8321 
8322 /++
8323     Returns a full $(S_LINK Canonical decomposition, Canonical)
8324     (by default) or $(S_LINK Compatibility decomposition, Compatibility)
8325     decomposition of $(CHARACTER) `ch`.
8326     If no decomposition is available returns a $(LREF Grapheme)
8327     with the `ch` itself.
8328 
8329     Note:
8330     This function also decomposes hangul syllables
8331     as prescribed by the standard.
8332 
8333     See_Also: $(LREF decomposeHangul) for a restricted version
8334     that takes into account only hangul syllables  but
8335     no other decompositions.
8336 +/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;

    // select the mapping trie and the decomposition table for the chosen type
    static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }
    else static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }

    immutable offset = mapping[ch];
    if (offset)
    {
        // the decomposition starts at `offset` and runs up to the next 0
        return Grapheme(table[offset .. $].until(0));
    }
    // not found, check hangul arithmetic decomposition
    return decomposeHangul(ch);
}
8357 
///
@safe unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    assert(decompose('Ĉ')[].equal("C\u0302"));
    // a code point without a decomposition is returned as is
    assert(decompose('D')[].equal("D"));
    // Hangul syllables are decomposed as well
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}
8375 
8376 //----------------------------------------------------------------------------
8377 // Hangul specific composition/decomposition
// First code points of the ranges used by the Hangul arithmetic:
// composed syllables (S), leading consonants (L), vowels (V) and
// trailing consonants (T).
enum jamoSBase = 0xAC00;
enum jamoLBase = 0x1100;
enum jamoVBase = 0x1161;
enum jamoTBase = 0x11A7;
// Sizes of the L, V and T ranges.
enum jamoLCount = 19;
enum jamoVCount = 21;
enum jamoTCount = 28;
// Number of syllables per leading consonant, and number of syllables overall.
enum jamoNCount = jamoVCount * jamoTCount;
enum jamoSCount = jamoLCount * jamoNCount;
8385 
// Tests if `ch` is a Hangul leading consonant jamo, i.e. lies in
// [jamoLBase, jamoLBase + jamoLCount).
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above leading jamo range
    return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
}
8392 
// Tests if `ch` is a Hangul trailing consonant jamo, i.e. lies in
// (jamoTBase, jamoTBase + jamoTCount).
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}
8400 
// Tests if `ch` is a Hangul vowel jamo, i.e. lies in
// [jamoVBase, jamoVBase + jamoVCount).
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return  ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}
8407 
// Returns the offset of `ch` from jamoSBase when it lies within the
// jamoSCount composed syllables, and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8413 
// internal helper: compose hangul syllables leaving dchar.init in holes
// Works in place: each <L,V> or <L,V,T> run is replaced by the composed
// syllable in the first slot, the other slots of the run become dchar.init.
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            // no <L,V> pair starts here
            ++pos;
            continue;
        }
        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        // an optional trailing consonant extends the run to three slots
        immutable bool hasT = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if (hasT)
        {
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+2] = dchar.init;
        }
        else
        {
            seq[pos] = jamoSBase + indexLV;
        }
        seq[pos+1] = dchar.init;
        pos += hasT ? 3 : 2;
    }
}
8442 
8443 //----------------------------------------------------------------------------
8444 public:
8445 
/**
    Decomposes a Hangul syllable into its jamos. If `ch` is not a composed
    syllable then this function returns a $(LREF Grapheme) containing only
    `ch` as is.
*/
Grapheme decomposeHangul(dchar ch) @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    immutable bool isSyllable = idxS >= 0 && idxS < jamoSCount;
    if (!isSyllable)
        return Grapheme(ch);

    // split the syllable index into its L, V and T components
    immutable idxT = idxS % jamoTCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxL = idxS / jamoNCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;
    if (idxT <= 0)
        return Grapheme(partL, partV); // <L, V> decomposition
    // there is a trailing consonant (T); <L,V,T> decomposition
    return Grapheme(partL, partV, jamoTBase + idxT);
}
8465 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    // a syllable with a trailing consonant decomposes into three jamos
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}
8472 
/++
    Try to compose a hangul syllable out of a leading consonant (`lead`),
    a `vowel` and an optional `trailing` consonant jamo.

    On success returns the composed LV or LVT hangul syllable.

    If `lead` or `vowel` is not a valid hangul jamo of the respective
    $(CHARACTER) class, returns dchar.init. A `trailing` that is not a
    trailing consonant jamo is ignored and the LV syllable is returned.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;
    if (isJamoT(trailing))
        return syllable + (trailing - jamoTBase);
    return syllable;
}
8494 
///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant (T), or passing any codepoint
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an invalid lead or vowel yields dchar.init
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8506 
// Table-driven decompositions of both types, followed by a re-check of the
// documented Hangul examples.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // asserts that decompose!T(ch) yields exactly the code points of r
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant (T)
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8530 
/**
    Enumeration type for normalization forms,
    passed as a template parameter to functions like $(LREF normalize).
*/
enum NormalizationForm {
    NFC,
    NFD,
    NFKC,
    NFKD
}
8541 
8542 
enum {
    /**
        Shorthand aliases for the values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8555 
8556 /++
8557     Returns `input` string normalized to the chosen form.
8558     Form C is used by default.
8559 
8560     For more information on normalization forms see
8561     the $(S_LINK Normalization, normalization section).
8562 
8563     Note:
8564     In cases where the string in question is already normalized,
8565     it is returned unmodified and no memory allocation happens.
8566 +/
inout(C)[] normalize(NormalizationForm norm=NFC, C)(return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] and anchors[1] are offsets into `input`: everything before
    // anchors[0] is copied verbatim, anchors[0] .. anchors[1] is reprocessed.
    // NOTE(review): presumably the verbatim part is the already normalized
    // prefix - confirm against splitNormalized.
    auto anchors = splitNormalized!norm(input);
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every reprocessed segment
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        app.put(input[0 .. anchors[0]]);
        // fully decompose the segment; NFC/NFD use the canonical mapping,
        // NFKC/NFKD the compatibility one
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        // ccc[i] will hold the combining class of decomposed[i]
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // stable-sort every run of non-zero combining classes by class
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // no open run: makes the final sort below a no-op unless
                // another run starts
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the run that is still open at the end of the buffer, if any
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                // keep recomposing from the returned position until
                // recompose reports the end of the buffer
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            import std.algorithm.mutation : remove;
            // drop the dchar.init holes before appending
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            decomposed.assumeSafeAppend();
            ccc.length = 0;
            ccc.assumeSafeAppend();
        } ();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    }while (anchors[0] != input.length);
    // append the remaining tail that needs no processing
    app.put(input[0 .. anchors[0]]);
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}
8659 
///
@safe unittest
{
    // normalize accepts strings of any UTF encoding
    wstring greeting = "Hello world";
    // input that is already normalized comes back as the very same slice
    assert(normalize(greeting) is greeting);

    // U+03D3 (Greek upsilon with acute and hook symbol) is a code point
    // for which all 4 normalization forms differ from one another
    string upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
8674 
@safe unittest
{
    import std.conv : text;

    // canonical decomposition of a CJK compatibility ideograph
    auto cjk = normalize!NFD("abc\uF904def");
    assert(cjk == "abc\u6ED1def", text(cjk));
    // compatibility decomposition turns superscript digits into plain ones
    auto digits = normalize!NFKD("2¹⁰");
    assert(digits == "210", digits);
    // precomposed letter splits into base letter + combining diaeresis
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // the documented example must keep working as well

    // any encoding works
    wstring greeting = "Hello world";
    assert(normalize(greeting) is greeting); // the same exact slice

    // U+03D3: all 4 normalization forms are different
    string upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
8696 
// Canonically recomposes code points of `input` onto the starter at index
// `start`. Works in place: every code point merged into the starter is
// overwritten with the `dchar.init` sentinel. `ccc` holds the combining
// class of each element of `input`.
// Returns the index at which scanning stopped: either the next starter
// that could not be merged, or `input.length`.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // Highest combining class left standing between the starter and the
    // current position; -1 keeps it outside of the 0 .. 255 range.
    int blockingCC = -1;
    // input[start] is the starter itself, so scanning begins right after it
    size_t pos = start + 1;
    while (pos != input.length)
    {
        immutable cc = ccc[pos];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        // Here S is input[start], C is input[pos], and since the classes
        // are sorted blockingCC is the maximum class between them.
        bool merged = false;
        if (cc > blockingCC)
        {
            immutable composed = compose(input[start], input[pos]);
            if (composed != dchar.init)
            {
                input[start] = composed;
                input[pos] = dchar.init; // sentinel marks the slot as consumed
                merged = true;
            }
        }
        // A merged code point must not influence composition with the
        // following ones; an unmerged one becomes the new blocker.
        if (!merged)
        {
            blockingCC = cc;
            // an unmerged starter ends the run that belongs to input[start]
            if (blockingCC == 0)
                break;
        }
        ++pos;
    }
    return pos;
}
8750 
// Scans `input` for the first spot that may violate normalization form `norm`.
// Returns a tuple of 2 indexes that split the input into: already normalized
// text, the piece that needs normalization, and the rest of the input that
// starts with a stable code point. Both indexes equal `input.length` when
// nothing needs normalization.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (idx, dchar ch; input)
    {
        // fast path for NFC: code points below U+0300 are skipped outright
        static if (norm == NFC)
            if (ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        immutable ubyte curCC = combiningClass(ch);
        // either combining classes are out of order, or the code point
        // is not unconditionally allowed in this form
        immutable outOfOrder = prevCC > curCC && curCC != 0;
        if (outOfOrder || notAllowedIn!norm(ch))
            return seekStable!norm(idx, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
8781 
// Given the index `idx` of a code point that breaks normalization form `norm`,
// finds the surrounding region that has to be re-normalized.
// Returns a tuple (region_start, region_end) of code unit indexes into `input`:
// region_start is the offset of the closest stable code point before `idx`
// (0 if there is none), region_end is the offset of the first stable code
// point at or after `idx` (`input.length` if there is none). A code point is
// stable when its combining class is 0 and it is always allowed in `norm`.
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // walk backwards from idx, one code point at a time
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Drop the code point just examined. The previous popFront() left
        // br.back unchanged, so the same code point was re-tested until the
        // slice ran empty and the region needlessly started at 0.
        br.popBack();
    }
    // @@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8814 
/**
    Checks whether dchar `ch` is unconditionally permitted (Quick_Check=YES)
    in the normalization form `norm`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    immutable rejected = notAllowedIn!norm(ch);
    return !rejected;
}
8823 
///
@safe unittest
{
    // ASCII is always allowed ...
    assert(allowedIn!NFC('Z'));
    // ... and so is e.g. Cyrillic, in every normalization form
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
}
8834 
// Not a user friendly name but more direct: returns the entry of the
// quick-check trie of normalization form `norm` for `ch`, which is true
// when `ch` is NOT unconditionally allowed in that form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
    {
        // A lone string used as the condition is always true, so the old
        // `static assert("..."~norm)` could never fire; fail explicitly.
        static assert(0, "Unknown normalization form");
    }
    return qcTrie[ch];
}
8850 
@safe unittest
{
    // a Cyrillic letter passes the quick check of every form
    enum dchar cyrillic = 'я';
    assert(allowedIn!NFC(cyrillic));
    assert(allowedIn!NFD(cyrillic));
    assert(allowedIn!NFKC(cyrillic));
    assert(allowedIn!NFKD(cyrillic));
    // and so does plain ASCII
    enum dchar ascii = 'Z';
    assert(allowedIn!NFC(ascii));
}
8859 
8860 }
8861 
8862 version (std_uni_bootstrap)
8863 {
8864     // old version used for bootstrapping of gen_uni.d that generates
8865     // up to date optimal versions of all of isXXX functions
8866     @safe pure nothrow @nogc public bool isWhite(dchar c)
8867     {
8868         import std.ascii : isWhite;
8869         return isWhite(c) ||
8870                c == lineSep || c == paraSep ||
8871                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
8872                (c >= '\u2000' && c <= '\u200A') ||
8873                c == '\u202F' || c == '\u205F' || c == '\u3000';
8874     }
8875 }
8876 else
8877 {
8878 
// @trusted so that the lookups below avoid bounds checks
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // Plain functions that hide the template instances of the generated
    // tries and tables, see https://issues.dlang.org/show_bug.cgi?id=13232

    // --- lower case ---
    ushort toLowerIndex(dchar c)
    {
        return toLowerIndexTrie[c];
    }

    ushort toLowerSimpleIndex(dchar c)
    {
        return toLowerSimpleIndexTrie[c];
    }

    dchar toLowerTab(size_t idx)
    {
        return toLowerTable[idx];
    }

    // --- title case ---
    ushort toTitleIndex(dchar c)
    {
        return toTitleIndexTrie[c];
    }

    ushort toTitleSimpleIndex(dchar c)
    {
        return toTitleSimpleIndexTrie[c];
    }

    dchar toTitleTab(size_t idx)
    {
        return toTitleTable[idx];
    }

    // --- upper case ---
    ushort toUpperIndex(dchar c)
    {
        return toUpperIndexTrie[c];
    }

    ushort toUpperSimpleIndex(dchar c)
    {
        return toUpperSimpleIndexTrie[c];
    }

    dchar toUpperTab(size_t idx)
    {
        return toUpperTable[idx];
    }
}
8898 
8899 public:
8900 
/++
    Tells whether `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // delegate to the pregenerated binary search from the generated tables
    import std.internal.unicode_tables : isWhiteGen;

    immutable white = isWhiteGen(c);
    return white;
}
8912 
/++
    Tells whether `c` is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;

    // ASCII is answered by std.ascii, everything else by the trie
    return isASCII(c) ? isLower(c) : lowerCaseTrie[c];
}
8924 
@safe unittest
{
    import std.ascii : isLower;

    // the ASCII range must agree with std.ascii
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));

    // Cyrillic
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));

    // Greek: HETA pair, capital MU, beta
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C'));
    assert(.isLower('\u03B2'));

    // extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));

    // every member of the lowerCase set is lower case and not upper case
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
8944 
8945 
/++
    Tells whether `c` is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;

    // ASCII is answered by std.ascii, everything else by the trie
    return isASCII(c) ? isUpper(c) : upperCaseTrie[c];
}
8957 
@safe unittest
{
    // Renamed import keeps the unqualified `isUpper` below bound to std.uni.
    import std.ascii : asciiIsUpper = isUpper;

    // The ASCII range must agree with std.ascii. This loop used to be a
    // copy of the isLower test and never exercised isUpper at all.
    foreach (v; 0 .. 0x80)
        assert(asciiIsUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the upperCase set is upper case and not lower case
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
8976 
8977 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps `c` to its simple (single code point) title case form,
// or returns `c` itself when there is no such mapping.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // fast path: below U+00AA only 'a' .. 'z' are converted
    if (c >= 'a' && c <= 'z')
        return c - 32;
    if (c < 0xAA)
        return c;

    // ushort.max in the index trie means "no mapping"
    immutable size_t idx = toTitleSimpleIndex(c);
    return idx == ushort.max ? c : toTitleTab(idx);
}
8999 
// Argument bundles (index function, simple-mapping limit, table accessor)
// that are expanded into the leading template parameters of the generic
// case conversion routines in this file.
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
9002 
// Generic toUpper/toLower over a whole string: returns the input as is
// (strings) or as a fresh array (other ranges) when nothing changes,
// otherwise builds a new array with the converted contents.
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementEncodingType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // number of leading code units that need no conversion
    size_t prefixLen = 0;
    for (auto scan = s.byDchar; !scan.empty; scan.popFront())
    {
        immutable dchar head = scan.front;
        if (indexFn(head) == ushort.max)
        {
            // unchanged so far, keep scanning
            prefixLen += codeLength!C(head);
            continue;
        }

        // first code point with a case mapping: copy the untouched prefix
        // and convert everything from here on into a new buffer
        auto result = appender!(C[])();
        result.reserve(s.length);
        result.put(s[0 .. prefixLen]);
        foreach (dchar c; s[prefixLen .. $].byDchar)
        {
            if (c.isASCII)
            {
                result.put(asciiConvert(c));
                continue;
            }
            immutable ushort idx = indexFn(c);
            if (idx == ushort.max)
            {
                // no mapping
                result.put(c);
            }
            else if (idx < maxIdx)
            {
                // 1:1 mapping
                result.put(tableFn(idx));
            }
            else
            {
                // 1:m mapping, the first entry packs length + code point
                auto packed = tableFn(idx);
                immutable uint len = packed >> 24;
                result.put(cast(dchar)(packed & 0xFF_FFFF));
                foreach (j; idx+1 .. idx+len)
                    result.put(tableFn(j));
            }
        }
        return result.data;
    }

    // nothing had to be converted
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}
9058 
// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;

    // a short slice of a much larger buffer
    auto big = "abcdefghij".replicate(300);
    auto s = big[0 .. 10];

    // converting must leave the original slice untouched
    toUpper(s);

    assert(s == "abcdefghij");
}
9070 
// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // evaluated at compile time: both inputs must lower to the same length
    enum upperLen = `몬스터/A`.toLower.length;
    enum lowerLen = `몬스터/a`.toLower.length;
    static assert(upperLen == lowerLen);
}
9076 
9077 
// Generic lazy toUpper/toLower over a whole range, returns a range
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            return nLeft == 0 && r.empty;
        }

        @property auto front()
        {
            if (nLeft == 0)
                refill();
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the current source element has been converted
            if (nLeft == 0)
                refill();
            assert(nLeft);
            --nLeft;
            // advance the source once all of its output is consumed
            if (nLeft == 0)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.r = r.save;
                return copy;
            }
        }

      private:
        // Converts r.front and stores the result in buf in reverse order,
        // so that buf[nLeft - 1] is always the next code point to hand out.
        void refill()
        {
            import std.ascii : isASCII;

            dchar c = r.front;
            if (c.isASCII)
            {
                buf[0] = asciiConvert(c);
                nLeft = 1;
                return;
            }

            const idx = indexFn(c);
            if (idx == ushort.max)
            {
                // no mapping
                buf[0] = c;
                nLeft = 1;
            }
            else if (idx < maxIdx)
            {
                // 1:1 mapping
                buf[0] = tableFn(idx);
                nLeft = 1;
            }
            else
            {
                // 1:m mapping, the first entry packs length + code point
                immutable packed = tableFn(idx);
                nLeft = packed >> 24;
                if (nLeft == 0)
                    nLeft = 1;
                assert(nLeft <= buf.length);
                buf[nLeft - 1] = cast(dchar)(packed & 0xFF_FFFF);
                foreach (j; 1 .. nLeft)
                    buf[nLeft - j - 1] = tableFn(idx + j);
            }
        }

        Range r;
        uint nLeft;
        dchar[3] buf = void;
    }

    return ToCaserImpl(str);
}
9162 
/*********************
 * Lazily converts an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * No memory is allocated.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */
auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchars, convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first
        return asLowerCase(str.byDchar);
    }
}
9198 
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchars, convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first
        return asUpperCase(str.byDchar);
    }
}
9217 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto shouted = "hEllo".asUpperCase;
    assert(shouted.equal("HELLO"));
}
9225 
// explicitly undocumented
// Overload for types that implicitly convert to a string: forwards to the
// range version instantiated with the underlying string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9233 
// explicitly undocumented
// Overload for types that implicitly convert to a string: forwards to the
// range version instantiated with the underlying string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9241 
@safe unittest
{
    // non-copyable wrapper that converts to string through alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // func must give the same answer for the wrapper and for the plain string
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;

        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        // For ranges, compare contents instead of object identity.
        static if (is(typeof(equal(viaWrapper, viaString))))
            return equal(viaWrapper, viaString);
        else
            return viaWrapper == viaString;
    }

    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9271 
@safe unittest
{
    import std.array : array;

    // a saved copy must iterate independently of the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    // lazy conversion must agree with the eager toUpper/toLower
    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wstring and dstring inputs; the results of these comparisons used to
    // be silently discarded, so they are asserted now
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9311 
// Generic lazy capitalizer over a whole range, returns a range: the first
// code point is upper cased, the rest is handed over to asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            if (lower)
                return lwr.empty;
            return nLeft == 0 && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            if (nLeft == 0)
                fillUpper();
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
            {
                lwr.popFront();
                return;
            }

            // make sure the first code point has been converted
            if (nLeft == 0)
                fillUpper();
            assert(nLeft);
            --nLeft;
            if (nLeft == 0)
            {
                // upper cased head fully consumed: switch to lower casing
                r.popFront();
                lwr = r.asLowerCase();
                lower = true;
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.r = r.save;
                copy.lwr = lwr.save;
                return copy;
            }
        }

      private:
        // Upper cases r.front and stores the result in buf in reverse order,
        // so that buf[nLeft - 1] is always the next code point to hand out.
        void fillUpper()
        {
            immutable dchar c = r.front;
            const idx = indexFnUpper(c);
            if (idx == ushort.max)
            {
                // no mapping
                buf[0] = c;
                nLeft = 1;
            }
            else if (idx < maxIdxUpper)
            {
                // 1:1 mapping
                buf[0] = tableFnUpper(idx);
                nLeft = 1;
            }
            else
            {
                // 1:m mapping, the first entry packs length + code point
                immutable packed = tableFnUpper(idx);
                nLeft = packed >> 24;
                if (nLeft == 0)
                    nLeft = 1;
                assert(nLeft <= buf.length);
                buf[nLeft - 1] = cast(dchar)(packed & 0xFF_FFFF);
                foreach (j; 1 .. nLeft)
                    buf[nLeft - j - 1] = tableFnUpper(idx + j);
            }
        }

        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;
        uint nLeft = 0;
    }

    return ToCapitalizerImpl(str);
}
9402 
/*********************
 * Lazily capitalizes an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, that is converts the first
 * character to upper case and subsequent characters to lower case.
 *
 * No memory is allocated.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */
auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchars, capitalize directly
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9439 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto capitalized = "hEllo".asCapitalized;
    assert(capitalized.equal("Hello"));
}
9447 
// Overload for types that implicitly convert to a string: forwards to the
// range version instantiated with the underlying string type.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9454 
@safe pure nothrow @nogc unittest
{
    // front alone must be usable under all of the attributes above
    auto capitalized = "hEllo".asCapitalized();
    immutable first = capitalized.front;
    assert(first == 'H');
}
9460 
@safe unittest
{
    import std.array : array;

    // a saved copy must iterate independently of the original
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // pairs of [input, expected capitalization]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wstring and dstring inputs; the results of these comparisons used to
    // be silently discarded, so they are asserted now
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9508 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of `c` into `buf` starting at index `idx`
// and returns the index right past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    // 1 code unit
    if (c <= 0x7F)
    {
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    // 2 code units
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // 3 code units
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // 4 code units
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // beyond the Unicode code space
    assert(0);
}
9542 
@safe unittest
{
    char[] buffer = "abcd".dup;

    // single code unit
    size_t pos = encodeTo(buffer, 0, 'X');
    assert(buffer == "Xbcd");

    // two code units, written right after the first character
    pos = encodeTo(buffer, pos, cast(dchar)'\u00A9');
    assert(buffer == "X\xC2\xA9d");
}
9553 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of `c` into `buf` starting at index `idx`
// and returns the index right past the last code unit written.
// Throws UTFException when `c` is an isolated surrogate code point.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    // supplementary planes: surrogate pair
    if (c > 0xFFFF)
    {
        if (c > 0x10FFFF)
            assert(0); // beyond the Unicode code space
        immutable offset = c - 0x10000;
        buf[idx] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    // basic multilingual plane: single code unit, surrogates are rejected
    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9575 
// UTF-32 flavour: stores `c` at `buf[idx]` and returns the next index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9582 
// Case converts `s` in its own buffer for as long as every converted code
// point fits into the space already consumed; hands over to the allocating
// slow path otherwise. On return `s` is re-sliced to the converted length.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;

    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);

    // In-buffer move of code units [from .. to) to a new start index `dest`,
    // returns the index right past the moved data.
    // The trick is that it may not need to copy at all: while nothing has
    // shrunk yet the indexes coincide and only the position is bumped.
    // Once a re-cased char came out smaller than the original a real copy
    // is needed; a later char that got bigger may regain the pace, so in
    // the end it sometimes flip-flops between the 2 cases.
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        if (dest == from)
            return to;
        // got to copy
        foreach (C unit; str[from .. to])
            str[dest++] = unit;
        return dest;
    }

    size_t readPos = 0;     // next code unit to decode
    size_t writePos = 0;    // next code unit to write
    size_t pendingFrom = 0; // start of the unchanged run not yet moved

    while (readPos != s.length)
    {
        immutable charStart = readPos;
        immutable ch = decode(s, readPos);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
            continue;

        // flush the unchanged run that precedes this code point
        writePos = moveTo(s, writePos, pendingFrom, charStart);
        pendingFrom = readPos;

        // 1:m codepoint mapping, slow codepath
        if (caseIndex >= maxIdx)
            return slowToCase(s, charStart, writePos);

        // 1:1 codepoint mapping
        immutable cased = tableFn(caseIndex);
        immutable casedLen = codeLength!C(cased);
        if (casedLen + writePos > readPos) // no place to fit cased char
        {
            // switch to slow codepath, where we allocate
            return slowToCase(s, charStart, writePos);
        }
        writePos = encodeTo(s, writePos, cased);
        assert(writePos <= readPos);
    }
    // move the trailing unchanged run, if any
    if (pendingFrom != s.length)
        writePos = moveTo(s, writePos, pendingFrom, s.length);
    s = s[0 .. writePos];
}
9648 
// Measures, without allocating, how many code units of type `C` the
// case-converted form of a string occupies.
//   indexFn - maps a code point to an index into the case table,
//             or to ushort.max when the code point is left unchanged
//   maxIdx  - indices below this value are 1:1 mappings, the rest are 1:m
//   tableFn - yields the table entry stored at a given index
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;
        size_t total = 0;    // code units counted so far
        size_t pending = 0;  // start of the unchanged run not yet counted
        size_t pos = 0;
        while (pos != str.length)
        {
            immutable cpStart = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unchanged, stays part of the pending run

            // both mapping kinds first account for the preceding unchanged run
            total += cpStart - pending;
            pending = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping: a single replacement code point
                immutable mapped = tableFn(caseIndex);
                total += codeLength!C(mapped);
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence length in
                // the top 8 bits and the first code point in the lower 24
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar first = val & 0xFF_FFFF;
                total += codeLength!C(first);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }
        // trailing unchanged run (contributes zero when there is none)
        total += str.length - pending;
        return total;
    }
}
9689 
@safe unittest
{
    alias lowerLen = toCaseLength!(LowerTriple);
    // pure ASCII: one code unit per character
    assert(lowerLen("abcd") == 4);
    // five Cyrillic letters (two UTF-8 code units each) plus three digits
    assert(lowerLen("аБВгд456") == 10+3);
}
9696 
// Slow path of in-place case conversion: measures the converted text,
// allocates a buffer of exactly that size and writes the result into it.
// On entry s[0 .. destIdx] holds already converted text and conversion
// resumes at s[curIdx]; on exit `s` refers to the new buffer.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
        immutable finalLen = destIdx + caseLength(s[curIdx..$]);
        C[] result = new C[finalLen];
        result[0 .. destIdx] = s[0 .. destIdx];
        size_t runStart = curIdx; // start of the unchanged run not yet copied
        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unchanged, copied later together with its run

            // flush the unchanged run preceding this code point
            immutable runLen = cpStart - runStart;
            result[destIdx .. destIdx+runLen] = s[runStart .. cpStart];
            runStart = curIdx;
            destIdx += runLen;

            if (caseIndex < maxIdx)
            {
                // 1:1 codepoint mapping
                immutable mapped = tableFn(caseIndex);
                destIdx = encodeTo(result, destIdx, mapped);
            }
            else
            {
                // 1:m codepoint mapping
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                immutable uint len = val >> 24;
                destIdx = encodeTo(result, destIdx, cast(dchar)(val & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(result, destIdx, tableFn(j));
            }
        }
        if (runStart != s.length)
        {
            // copy whatever unchanged text is left at the end
            immutable tailLen = s.length - runStart;
            result[destIdx .. destIdx+tailLen] = s[runStart..$];
            destIdx += tailLen;
        }
        assert(result.length == destIdx);
        s = result;
    }
}
9753 
/++
    Lowercases `s` in place using the Unicode lowercase mapping.
    A few characters grow when lowercased; if the result no longer fits,
    the function reallocates exactly once.
    When `s` contains no uppercase characters it is left unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!LowerTriple(s);
}
// non-template overloads for the usual array types, to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
9775 
/++
    Uppercases `s` in place using the Unicode uppercase mapping.
    A few characters grow when uppercased; if the result no longer fits,
    the function reallocates exactly once.
    When `s` contains no lowercase characters it is left unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!UpperTriple(s);
}
// non-template overloads for the usual array types, to reduce compile
// time and code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
9797 
/++
    Returns the lowercase equivalent of `c` if `c` is a Unicode uppercase
    $(CHARACTER); otherwise returns `c` itself.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toLower which takes a full
    string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // general case: consult the simple (1:1) case mapping table
        immutable size_t idx = toLowerSimpleIndex(c);
        if (idx == ushort.max)
            return c; // no mapping recorded
        return toLowerTab(idx);
    }
    // fast path: in this range only ASCII 'A' .. 'Z' are converted
    if (c >= 'A' && c <= 'Z')
        return c + 32;
    return c;
}
9824 
/++
    Produces an array equal to `s` with every character converted to
    lowercase (by performing Unicode lowercase mapping).
    If no character of `s` is affected, `s` itself is returned provided
    `s` is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // the ASCII routine is handed to toCase alongside the Unicode tables
    alias asciiLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiLower)(s);
}
9843 
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    // the ASCII routine is handed to toCase alongside the Unicode tables
    alias asciiLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiLower)(s);
}
9851 
// non-template overloads for the built-in string types, to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // a struct that converts to string through `alias this`
        // must still be accepted by toLower

        static struct Wrapped
        {
            string data;
            alias data this;
        }

        // compile-only check, never called
        void instantiate()
        {
            auto lowered = toLower(Wrapped(""));
        }
    }
}
9878 
9879 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(std.ascii.toLower(code) == toLower(code));

    // single code points outside ASCII
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // an uppercase code point either maps to itself or to a lowercase one
    foreach (upperCp; unicode.upperCase.byCodepoint)
    {
        dchar lowered = upperCp.toLower();
        assert(lowered == upperCp || isLower(lowered), format("%s -> %s", upperCp, lowered));
    }

    // whole-string conversions
    assert(toLower("АЯ") == "ая");

    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
9898 
// https://issues.dlang.org/show_bug.cgi?id=9629
// converting a slice in place must write through to the parent array
@safe unittest
{
    wchar[] whole = "hello þ world"w.dup;
    auto slice = whole[6 .. 7];
    toUpperInPlace(slice);
    assert(whole == "hello Þ world");
}
9907 
9908 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // plain ASCII
    string src = "FoL";
    string lowered = toLower(src);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != src);

    char[] inPlace = src.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Latin Extended-A mixed with ASCII
    src = "A\u0100B\u0101d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Cyrillic mixed with ASCII
    src = "A\u0460B\u0461d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // a single code point that lowercases to two
    src = "\u0130";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar cp = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(cp));
    assert(toLower(cp) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    cp = '\u1f87';
    assert(isLower(cp));
    assert(toUpper(cp) == '\u1F8F');
}
9959 
// toLower on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    auto asciiUnits = "FoL".byCodeUnit;
    assert(asciiUnits.toLower.cmp("fol") == 0);

    auto mixedUnits = "A\u0460B\u0461d".byCodeUnit;
    assert(mixedUnits.toLower.cmp("a\u0461b\u0461d") == 0);
}
9969 
/++
    Returns the uppercase equivalent of `c` if `c` is a Unicode lowercase
    $(CHARACTER); otherwise returns `c` itself.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toUpper which takes a full
    string instead.

    toUpper can be passed to $(REF map, std,algorithm,iteration)
    to obtain an algorithm that converts a range of characters to upper case
    without allocating memory.
    A string can then be built by sending the result with
    $(REF copy, std,algorithm,mutation) to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // general case: consult the simple (1:1) case mapping table
        immutable size_t idx = toUpperSimpleIndex(c);
        if (idx == ushort.max)
            return c; // no mapping recorded
        return toUpperTab(idx);
    }
    // fast path: in this range only ASCII 'a' .. 'z' are converted
    if (c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}
10003 
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // convert lazily, then collect the characters into a buffer
    auto sink = appender!(char[])();
    "hello".map!toUpper.copy(sink);
    assert(sink.data == "HELLO");
}
10015 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(std.ascii.toUpper(code) == toUpper(code));

    // single code points outside ASCII
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // a lowercase code point maps to itself, to an uppercase one
    // or to a titlecase letter
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        dchar raised = lowerCp.toUpper();
        assert(raised == lowerCp || isUpper(raised) || titlecase[raised],
            format("%x -> %x", lowerCp, raised));
    }
}
10032 
/++
    Produces an array equal to `s` with every character converted to
    uppercase (by performing Unicode uppercase mapping).
    If no character of `s` is affected, `s` itself is returned provided
    `s` is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // the ASCII routine is handed to toCase alongside the Unicode tables
    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
10051 
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    // the ASCII routine is handed to toCase alongside the Unicode tables
    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
10059 
// non-template overloads for the built-in string types, to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(return scope wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(return scope dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // a struct that converts to string through `alias this`
        // must still be accepted by toUpper

        static struct Wrapped
        {
            string data;
            alias data this;
        }

        // compile-only check, never called
        void instantiate()
        {
            auto raised = toUpper(Wrapped(""));
        }
    }
}
10086 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string src = "FoL";
    string raised;
    char[] inPlace;

    // plain ASCII
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised, inPlace);
    assert(cmp(raised, "FOL") == 0);
    assert(raised !is src);

    // Latin Extended-A mixed with ASCII
    src = "a\u0100B\u0101d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0100B\u0100D") == 0);
    assert(raised !is src);

    // Cyrillic mixed with ASCII
    src = "a\u0460B\u0461d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0460B\u0460D") == 0);
    assert(raised !is src);
}
10115 
// Cross-checks the allocating and the in-place case conversions against
// known results, for every string type and for concatenated inputs.
@safe unittest
{
    // Converts `s` both ways with both APIs and compares each result with
    // the expected upper (`trueUp`) and lower (`trueLow`) case forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three specifiers: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // every message supplies all three arguments `diff` asks for;
        // with fewer, format would throw instead of describing the failure
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        // inputs of increasing difficulty, with their expected conversions
        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs in-place conversion must not reallocate
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10172 
// toUpper on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto asciiUnits = "FoL".byCodeUnit;
    assert(asciiUnits.toUpper.cmp("FOL") == 0);

    auto mixedUnits = "a\u0460B\u0461d".byCodeUnit;
    assert(mixedUnits.toUpper.cmp("A\u0460B\u0460D") == 0);
}
10183 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // everything from U+00AA upwards is looked up in the table
    if (c >= 0xAA)
        return alphaTrie[c];

    // shortcut: below that only the ASCII letters are accepted
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
10208 
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");
    // every member of the set is recognised
    foreach (member; alphabetic.byCodepoint)
        assert(isAlpha(member));
    // set membership and isAlpha agree over the low code points
    foreach (code; 0 .. 0x4000)
        assert((code in alphabetic) == isAlpha(code));
}
10217 
10218 
/++
    Tells whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    auto table = markTrie;
    return table[c];
}
10228 
@safe unittest
{
    auto marks = unicode("Mark");
    // every member of the set is recognised
    foreach (member; marks.byCodepoint)
        assert(isMark(member));
    // set membership and isMark agree over the low code points
    foreach (code; 0 .. 0x4000)
        assert((code in marks) == isMark(code));
}
10237 
/++
    Tells whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // anything outside ASCII is looked up in the table
    if (c > 0x7F)
        return numberTrie[c];

    // ASCII shortcut: only the decimal digits count
    return '0' <= c && c <= '9';
}
10255 
@safe unittest
{
    auto numbers = unicode("N");
    // every member of the set is recognised
    foreach (member; numbers.byCodepoint)
        assert(isNumber(member));
    // set membership and isNumber agree over the low code points
    foreach (code; 0 .. 0x4000)
        assert((code in numbers) == isNumber(code));
}
10264 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // outside ASCII combine the two Unicode predicates
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ASCII shortcut
    return std.ascii.isAlphaNum(c);
}
10290 
@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either set are recognised
    foreach (member; numbers.byCodepoint)
        assert(isAlphaNum(member));

    foreach (member; alphabetic.byCodepoint)
        assert(isAlphaNum(member));

    // membership in the union and isAlphaNum agree over the low code points
    foreach (code; 0 .. 0x4000)
    {
        assert(((code in numbers) || (code in alphabetic)) == isAlphaNum(code));
    }
}
10307 
/++
    Tells whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // anything outside ASCII is looked up in the table
    if (c > 0x7F)
        return punctuationTrie[c];

    // ASCII shortcut
    return std.ascii.isPunctuation(c);
}
10327 
@safe unittest
{
    // ASCII punctuation
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    // punctuation outside ASCII
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    // every member of the set is recognised
    foreach (member; unicode("P").byCodepoint)
        assert(isPunctuation(member));
}
10340 
/++
    Tells whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    auto table = symbolTrie;
    return table[c];
}
10350 
@safe unittest
{
    import std.format : format;
    // a few individual symbols
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    // every member of the set is recognised
    foreach (member; unicode("S").byCodepoint)
        assert(isSymbol(member), format("%04x", member));
}
10361 
/++
    Tells whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    static import std.internal.unicode_tables; // generated file
    return std.internal.unicode_tables.isSpaceGen(c);
}
10374 
@safe unittest
{
    assert(isSpace('\u0020'));
    auto spaces = unicode.Zs;
    // every member of the set is recognised
    foreach (member; spaces.byCodepoint)
        assert(isSpace(member));
    // set membership and isSpace agree over the low code points
    foreach (code; 0 .. 0x1000)
        assert(isSpace(code) == spaces[code]);
}
10384 
10385 
/++
    Tells whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    auto table = graphicalTrie;
    return table[c];
}
10396 
10397 
@safe unittest
{
    import std.format : format;
    auto graphical = unicode("Graphical");
    // every member of the set is recognised
    foreach (member; graphical.byCodepoint)
        assert(isGraphical(member), format("%4x", member));
    // set membership and isGraphical agree over the low code points
    foreach (code; 0 .. 0x4000)
        assert((code in graphical) == isGraphical(code));
}
10407 
10408 
/++
    Tells whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    static import std.internal.unicode_tables; // generated file
    return std.internal.unicode_tables.isControlGen(c);
}
10419 
@safe unittest
{
    // spot checks on both sides of the control ranges
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto controls = unicode.Cc;
    // every member of the set is recognised
    foreach (member; controls.byCodepoint)
        assert(isControl(member));
    // set membership and isControl agree over the low code points
    foreach (code; 0 .. 0x1000)
        assert(isControl(code) == controls[code]);
}
10431 
10432 
/++
    Tells whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    static import std.internal.unicode_tables; // generated file
    return std.internal.unicode_tables.isFormatGen(c);
}
10443 
10444 
@safe unittest
{
    // U+00AD SOFT HYPHEN is a formatting character
    assert(isFormat('\u00AD'));
    // every member of the set is recognised
    foreach (member; unicode("Format").byCodepoint)
        assert(isFormat(member));
}
10451 
// code points for private use and surrogates are not likely to change in the near future;
// if need be they can be generated from Unicode data as well
10454 
/++
    Tells whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // private use area inside the Basic Multilingual Plane
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;
    // plane 15
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;
    // plane 16
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
10466 
/++
    Tells whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    enum dchar first = 0xD800, last = 0xDFFF;
    return c >= first && c <= last;
}
10476 
/++
    Tells whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    enum dchar first = 0xD800, last = 0xDBFF;
    return c >= first && c <= last;
}
10485 
/++
    Tells whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    enum dchar first = 0xDC00, last = 0xDFFF;
    return c >= first && c <= last;
}
10494 
/++
    Tells whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    auto table = nonCharacterTrie;
    return table[c];
}
10505 
@safe unittest
{
    auto unassigned = unicode("Cn");
    // every member of the set is recognised
    foreach (member; unassigned.byCodepoint)
        assert(isNonCharacter(member));
}
10512 
10513 private:
10514 // load static data from pre-generated tables into usable datastructures
10515 
10516 
// Builds a CodepointSet from a compressed interval table.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10521 
// Wraps the offsets, sizes and data of a pre-generated table entry
// in a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
10526 
@safe pure nothrow @nogc @property
{
    import std.internal.unicode_tables; // generated file

    // Each accessor below wraps one pre-generated table with asTrie,
    // keeps the result in a function-local `static immutable` and returns it.
    //
    // The `auto` return type matters: the compiler then runs semantic
    // analysis on the return type only when the function is actually used.
    // These are plain functions rather than templates so that they do not
    // increase the object size of the caller.

    // general classification tables
    auto lowerCaseTrie()
    {
        static immutable res = asTrie(lowerCaseTrieEntries);
        return res;
    }

    auto upperCaseTrie()
    {
        static immutable res = asTrie(upperCaseTrieEntries);
        return res;
    }

    auto simpleCaseTrie()
    {
        static immutable res = asTrie(simpleCaseTrieEntries);
        return res;
    }

    auto fullCaseTrie()
    {
        static immutable res = asTrie(fullCaseTrieEntries);
        return res;
    }

    auto alphaTrie()
    {
        static immutable res = asTrie(alphaTrieEntries);
        return res;
    }

    auto markTrie()
    {
        static immutable res = asTrie(markTrieEntries);
        return res;
    }

    auto numberTrie()
    {
        static immutable res = asTrie(numberTrieEntries);
        return res;
    }

    auto punctuationTrie()
    {
        static immutable res = asTrie(punctuationTrieEntries);
        return res;
    }

    auto symbolTrie()
    {
        static immutable res = asTrie(symbolTrieEntries);
        return res;
    }

    auto graphicalTrie()
    {
        static immutable res = asTrie(graphicalTrieEntries);
        return res;
    }

    auto nonCharacterTrie()
    {
        static immutable res = asTrie(nonCharacterTrieEntries);
        return res;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable res = asTrie(nfcQCTrieEntries);
        return res;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable res = asTrie(nfdQCTrieEntries);
        return res;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable res = asTrie(nfkcQCTrieEntries);
        return res;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable res = asTrie(nfkdQCTrieEntries);
        return res;
    }

    // grapheme breaking algorithm tables
    auto mcTrie()
    {
        import std.internal.unicode_grapheme : mcTrieEntries;
        static immutable res = asTrie(mcTrieEntries);
        return res;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable res = asTrie(graphemeExtendTrieEntries);
        return res;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable res = asTrie(hangulLVTrieEntries);
        return res;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable res = asTrie(hangulLVTTrieEntries);
        return res;
    }

    // composition/decomposition tables
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable res = asTrie(combiningClassTrieEntries);
        return res;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable res = asTrie(compatMappingTrieEntries);
        return res;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable res = asTrie(canonMappingTrieEntries);
        return res;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable res = asTrie(compositionJumpTrieEntries);
        return res;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable res = asTrie(toUpperIndexTrieEntries);
        return res;
    }

    auto toLowerIndexTrie()
    {
        static immutable res = asTrie(toLowerIndexTrieEntries);
        return res;
    }

    auto toTitleIndexTrie()
    {
        static immutable res = asTrie(toTitleIndexTrieEntries);
        return res;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable res = asTrie(toUpperSimpleIndexTrieEntries);
        return res;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable res = asTrie(toLowerSimpleIndexTrieEntries);
        return res;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable res = asTrie(toTitleSimpleIndexTrieEntries);
        return res;
    }

}
10644 
10645 }// version (!std_uni_bootstrap)
10646