1// Written in the D programming language.
2
3/++
4    $(P The `std.uni` module provides an implementation
5    of fundamental Unicode algorithms and data structures.
6    This doesn't include UTF encoding and decoding primitives,
7    see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8    for this functionality. )
9
10$(SCRIPT inhibitQuickIndex = 1;)
11$(DIVC quickindex,
12$(BOOKTABLE,
13$(TR $(TH Category) $(TH Functions))
14$(TR $(TD Decode) $(TD
15    $(LREF byCodePoint)
16    $(LREF byGrapheme)
17    $(LREF decodeGrapheme)
18    $(LREF graphemeStride)
19))
20$(TR $(TD Comparison) $(TD
21    $(LREF icmp)
22    $(LREF sicmp)
23))
24$(TR $(TD Classification) $(TD
25    $(LREF isAlpha)
26    $(LREF isAlphaNum)
27    $(LREF isCodepointSet)
28    $(LREF isControl)
29    $(LREF isFormat)
30    $(LREF isGraphical)
31    $(LREF isIntegralPair)
32    $(LREF isMark)
33    $(LREF isNonCharacter)
34    $(LREF isNumber)
35    $(LREF isPrivateUse)
36    $(LREF isPunctuation)
37    $(LREF isSpace)
38    $(LREF isSurrogate)
39    $(LREF isSurrogateHi)
40    $(LREF isSurrogateLo)
41    $(LREF isSymbol)
42    $(LREF isWhite)
43))
44$(TR $(TD Normalization) $(TD
45    $(LREF NFC)
46    $(LREF NFD) $(LREF NFKC)
47    $(LREF NFKD)
48    $(LREF NormalizationForm)
49    $(LREF normalize)
50))
51$(TR $(TD Decompose) $(TD
52    $(LREF decompose)
53    $(LREF decomposeHangul)
54    $(LREF UnicodeDecomposition)
55))
56$(TR $(TD Compose) $(TD
57    $(LREF compose)
58    $(LREF composeJamo)
59))
60$(TR $(TD Sets) $(TD
61    $(LREF CodepointInterval)
62    $(LREF CodepointSet)
63    $(LREF InversionList)
64    $(LREF unicode)
65))
66$(TR $(TD Trie) $(TD
67    $(LREF codepointSetTrie)
68    $(LREF CodepointSetTrie)
69    $(LREF codepointTrie)
70    $(LREF CodepointTrie)
71    $(LREF toTrie)
72    $(LREF toDelegate)
73))
74$(TR $(TD Casing) $(TD
75    $(LREF asCapitalized)
76    $(LREF asLowerCase)
77    $(LREF asUpperCase)
78    $(LREF isLower)
79    $(LREF isUpper)
80    $(LREF toLower)
81    $(LREF toLowerInPlace)
82    $(LREF toUpper)
83    $(LREF toUpperInPlace)
84))
85$(TR $(TD Utf8Matcher) $(TD
86    $(LREF isUtfMatcher)
87    $(LREF MatcherConcept)
88    $(LREF utfMatcher)
89))
90$(TR $(TD Separators) $(TD
91    $(LREF lineSep)
92    $(LREF nelSep)
93    $(LREF paraSep)
94))
95$(TR $(TD Building blocks) $(TD
96    $(LREF allowedIn)
97    $(LREF combiningClass)
98    $(LREF Grapheme)
99))
100))
101
102    $(P All primitives listed operate on Unicode characters and
103        sets of characters. For functions which operate on ASCII characters
104        and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105        For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106        used throughout this module see the $(S_LINK Terminology, terminology) section
107        below.
108    )
109    $(P The focus of this module is the core needs of developing Unicode-aware
110        applications. To that effect it provides the following optimized primitives:
111    )
112    $(UL
113        $(LI Character classification by category and common properties:
114            $(LREF isAlpha), $(LREF isWhite) and others.
115        )
116        $(LI
117            Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118        )
119        $(LI
120            Converting text to any of the four normalization forms via $(LREF normalize).
121        )
122        $(LI
123            Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124            by user-perceived characters, that is by $(LREF Grapheme) clusters.
125        )
126        $(LI
127            Decomposing and composing of individual character(s) according to canonical
128            or compatibility rules, see $(LREF compose) and $(LREF decompose),
129            including the specific version for Hangul syllables $(LREF composeJamo)
130            and $(LREF decomposeHangul).
131        )
132    )
133    $(P It's recognized that an application may need further enhancements
134        and extensions, such as less commonly known algorithms,
135        or tailoring existing ones for region specific needs. To help users
136        with building any extra functionality beyond the core primitives,
137        the module provides:
138    )
139    $(UL
140        $(LI
141            $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142            Besides the typical set algebra it provides an unusual feature:
143            a D source code generator for detection of $(CODEPOINTS) in this set.
144            This is a boon for meta-programming parser frameworks,
145            and is used internally to power classification in small
146            sets like $(LREF isWhite).
147        )
148        $(LI
149            A way to construct optimal packed multi-stage tables also known as a
150            special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151            The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152            construct custom tries that map dchar to value.
153            The end result is a fast and predictable $(BIGOH 1) lookup that powers
154            functions like $(LREF isAlpha) and $(LREF combiningClass),
155            but for user-defined data sets.
156        )
157        $(LI
158            A useful technique for Unicode-aware parsers that perform
159            character classification of encoded $(CODEPOINTS)
160            is to avoid unnecessary decoding at all costs.
161            $(LREF utfMatcher) provides an improvement over the usual workflow
162            of decode-classify-process, combining the decoding and classification
163            steps. By extracting necessary bits directly from encoded
164            $(S_LINK Code unit, code units) matchers achieve
165            significant performance improvements. See $(LREF MatcherConcept) for
166            the common interface of UTF matchers.
167        )
168        $(LI
169            Generally useful building blocks for customized normalization:
170            $(LREF combiningClass) for querying combining class
171            and $(LREF allowedIn) for testing the Quick_Check
172            property of a given normalization form.
173        )
174        $(LI
175            Access to a large selection of commonly used sets of $(CODEPOINTS).
176            $(S_LINK Unicode properties, Supported sets) include Script,
177            Block and General Category. The exact contents of a set can be
178            observed in the CLDR utility, on the
179            $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180            of the Unicode website.
181            See $(LREF unicode) for easy and (optionally) compile-time checked set
182            queries.
183        )
184    )
185    $(SECTION Synopsis)
186    ---
187    import std.uni;
188    void main()
189    {
190        // initialize code point sets using script/block or property name
191        // now 'set' contains code points from both scripts.
192        auto set = unicode("Cyrillic") | unicode("Armenian");
193        // same thing but simpler and checked at compile-time
194        auto ascii = unicode.ASCII;
195        auto currency = unicode.Currency_Symbol;
196
197        // easy set ops
198        auto a = set & ascii;
199        assert(a.empty); // as it has no intersection with ascii
200        a = set | ascii;
201        auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202
203        // some properties of code point sets
204        assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205        // testing presence of a code point in a set
206        // is just fine, it is O(logN)
207        assert(!b['$']);
208        assert(!b['\u058F']); // Armenian dram sign
209        assert(b['¥']);
210
211        // building fast lookup tables, these guarantee O(1) complexity
212        // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213        auto oneTrie = toTrie!1(b);
214        // 2-level far more compact but typically slightly slower
215        auto twoTrie = toTrie!2(b);
216        // 3-level even smaller, and a bit slower yet
217        auto threeTrie = toTrie!3(b);
218        assert(oneTrie['£']);
219        assert(twoTrie['£']);
220        assert(threeTrie['£']);
221
222        // build the trie with the most sensible trie level
223        // and bind it as a functor
224        auto cyrillicOrArmenian = toDelegate(set);
225        auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226        assert(balance == "ընկեր!");
227        // compatible with bool delegate(dchar)
228        bool delegate(dchar) bindIt = cyrillicOrArmenian;
229
230        // Normalization
231        string s = "Plain ascii (and not only), is always normalized!";
232        assert(s is normalize(s));// is the same string
233
234        string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (U+0308)
235        auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236        assert(nS == "Äffin");
237        assert(nS != nonS);
238        string composed = "Äffin";
239
240        assert(normalize!NFD(composed) == "A\u0308ffin");
241        // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242        assert(normalize!NFKD("2¹⁰") == "210");
243    }
244    ---
245    $(SECTION Terminology)
246    $(P The following is a list of important Unicode notions
247    and definitions. Any conventions used specifically in this
248    module alone are marked as such. The descriptions are based on the formal
249    definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250    chapter three of The Unicode Standard Core Specification.)
251    )
252    $(P $(DEF Abstract character) A unit of information used for the organization,
253        control, or representation of textual data.
254        Note that:
255        $(UL
256            $(LI When representing data, the nature of that data
257                is generally symbolic as opposed to some other
258                kind of data (for example, visual).
259            )
260             $(LI An abstract character has no concrete form
261                and should not be confused with a $(S_LINK Glyph, glyph).
262            )
263            $(LI An abstract character does not necessarily
264                correspond to what a user thinks of as a “character”
265                and should not be confused with a $(LREF Grapheme).
266            )
267            $(LI The abstract characters encoded (see Encoded character)
268                are known as Unicode abstract characters.
269            )
270            $(LI Abstract characters not directly
271                encoded by the Unicode Standard can often be
272                represented by the use of combining character sequences.
273            )
274        )
275    )
276    $(P $(DEF Canonical decomposition)
277        The decomposition of a character or character sequence
278        that results from recursively applying the canonical
279        mappings found in the Unicode Character Database
280        and those described in Conjoining Jamo Behavior
281        (section 12 of
282        $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283    )
284    $(P $(DEF Canonical composition)
285        The precise definition of the Canonical composition
286        is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287        Unicode Conformance) section 11.
288        Informally it's the process that does the reverse of the canonical
289        decomposition with the addition of certain rules
290        that e.g. prevent legacy characters from appearing in the composed result.
291    )
292    $(P $(DEF Canonical equivalent)
293        Two character sequences are said to be canonical equivalents if
294        their full canonical decompositions are identical.
295    )
296    $(P $(DEF Character) Typically differs by context.
297        For the purpose of this documentation the term $(I character)
298        implies $(I encoded character), that is, a code point having
299        an assigned abstract character (a symbolic meaning).
300    )
301    $(P $(DEF Code point) Any value in the Unicode codespace;
302        that is, the range of integers from 0 to 10FFFF (hex).
303        Not all code points are assigned to encoded characters.
304    )
305    $(P $(DEF Code unit) The minimal bit combination that can represent
306        a unit of encoded text for processing or interchange.
307        Depending on the encoding this could be:
308        8-bit code units in the UTF-8 (`char`),
309        16-bit code units in the UTF-16 (`wchar`),
310        and 32-bit code units in the UTF-32 (`dchar`).
311        $(I Note that in UTF-32, a code unit is a code point
312        and is represented by the D `dchar` type.)
313    )
314    $(P $(DEF Combining character) A character with the General Category
315        of Combining Mark(M).
316        $(UL
317            $(LI All characters with non-zero canonical combining class
318            are combining characters, but the reverse is not the case:
319            there are combining characters with a zero combining class.
320            )
321            $(LI These characters are not normally used in isolation
322            unless they are being described. They include such characters
323            as accents, diacritics, Hebrew points, Arabic vowel signs,
324            and Indic matras.
325            )
326        )
327    )
328    $(P $(DEF Combining class)
329        A numerical value used by the Unicode Canonical Ordering Algorithm
330        to determine which sequences of combining marks are to be
331        considered canonically equivalent and  which are not.
332    )
333    $(P $(DEF Compatibility decomposition)
334        The decomposition of a character or character sequence that results
335        from recursively applying both the compatibility mappings and
336        the canonical mappings found in the Unicode Character Database, and those
337        described in Conjoining Jamo Behavior, until no characters
338        can be further decomposed.
339    )
340    $(P $(DEF Compatibility equivalent)
341        Two character sequences are said to be compatibility
342        equivalents if their full compatibility decompositions are identical.
343    )
344    $(P $(DEF Encoded character) An association (or mapping)
345        between an abstract character and a code point.
346    )
347    $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348        having been rasterized or otherwise imaged onto some display surface.
349    )
350    $(P $(DEF Grapheme base) A character with the property
351        Grapheme_Base, or any standard Korean syllable block.
352    )
353    $(P $(DEF Grapheme cluster) Defined as the text between
354        grapheme boundaries  as specified by Unicode Standard Annex #29,
355        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356        Important general properties of a grapheme:
357        $(UL
358            $(LI The grapheme cluster represents a horizontally segmentable
359            unit of text, consisting of some grapheme base (which may
360            consist of a Korean syllable) together with any number of
361            nonspacing marks applied to it.
362            )
363            $(LI  A grapheme cluster typically starts with a grapheme base
364            and then extends across any subsequent sequence of nonspacing marks.
365            A grapheme cluster is most directly relevant to text rendering and
366            processes such as cursor placement and text selection in editing,
367            but may also be relevant to comparison and searching.
368            )
369            $(LI For many processes, a grapheme cluster behaves as if it was a
370            single character with the same properties as its grapheme base.
371            Effectively, nonspacing marks apply $(I graphically) to the base,
372            but do not change its properties.
373            )
374        )
375        $(P This module defines a number of primitives that work with graphemes:
376        $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377        All of them are using $(I extended grapheme) boundaries
378        as defined in the aforementioned standard annex.
379        )
380    )
381    $(P $(DEF Nonspacing mark) A combining character with the
382        General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383    )
384    $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385    )
386    $(SECTION Normalization)
387    $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388        or $(S_LINK Compatibility equivalent, compatibility equivalent)
389        characters in the Unicode Standard make it necessary to have a full, formal
390        definition of equivalence for Unicode strings.
391        String equivalence is determined by a process called normalization,
392        whereby strings are converted into forms which are compared
393        directly for identity. This is the primary goal of the normalization process,
394        see the function $(LREF normalize) to convert into any of
395        the four defined forms.
396    )
397    $(P A very important attribute of the Unicode Normalization Forms
398        is that they must remain stable between versions of the Unicode Standard.
399        A Unicode string normalized to a particular Unicode Normalization Form
400        in one version of the standard is guaranteed to remain in that Normalization
401        Form for implementations of future versions of the standard.
402    )
403    $(P The Unicode Standard specifies four normalization forms.
404        Informally, two of these forms are defined by maximal decomposition
405        of equivalent sequences, and two of these forms are defined
406        by maximal $(I composition) of equivalent sequences.
407            $(UL
408            $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                canonical decomposition) of a character sequence.)
410            $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                compatibility decomposition) of a character sequence.)
412            $(LI Normalization Form C (NFC): The canonical composition of the
413                $(S_LINK Canonical decomposition, canonical decomposition)
414                of a coded character sequence.)
415            $(LI Normalization Form KC (NFKC): The canonical composition
416            of the $(S_LINK Compatibility decomposition,
417                compatibility decomposition) of a character sequence.)
418            )
419    )
420    $(P The choice of the normalization form depends on the particular use case.
421        NFC is the best form for general text, since it's more compatible with
422        strings converted from legacy encodings. NFKC is the preferred form for
423        identifiers, especially where there are security concerns. NFD and NFKD
424        are the most useful for internal processing.
425    )
426    $(SECTION Construction of lookup tables)
427    $(P The Unicode standard describes a set of algorithms that
428        depend on having the ability to quickly look up various properties
429        of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430        it is not a trivial task to provide a space-efficient solution for
431        the multitude of properties.
432    )
433    $(P Common approaches such as hash-tables or binary search over
434        sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435        Hash-tables have an enormous memory footprint and binary search
436        over intervals is not fast enough for some heavy-duty algorithms.
437    )
438    $(P The recommended solution (see Unicode Implementation Guidelines)
439        is using multi-stage tables that are an implementation of the
440        $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441        keys and a fixed number of stages. For the remainder of the section
442        this will be called a fixed trie. The following describes a particular
443        implementation that is aimed at speed of access at the expense
444        of ideal size savings.
445    )
446    $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447        Split the number of bits in a key (code point, 21 bits) into 2 components
448        (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449         and the other is the number of bits in each page of the trie.
450        The layout of the trie is then an array of size 2^^bits-of-index followed by
451        an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452    )
453    $(P The number of pages is variable (but not less than 1)
454        unlike the number of entries in the index. The slots of the index
455        all have to contain a number of a page that is present. The lookup is then
456        just a couple of operations - slice the upper bits,
457        lookup an index for these, take a page at this index and use
458        the lower bits as an offset within this page.
459
460        Assuming that pages are laid out consecutively
461        in one array at `pages`, the pseudo-code is:
462    )
463    ---
464    auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465    pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466    ---
467    $(P Where if `elemsPerPage` is a power of 2 the whole process is
468        a handful of simple instructions and 2 array reads. Subsequent levels
469        of the trie are introduced by recursing on this notion - the index array
470        is treated as values. The number of bits in index is then again
471        split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472    )
473
474    $(P For completeness a level 1 trie is simply an array.
475        The current implementation takes advantage of bit-packing values
476        when the range is known to be limited in advance (such as `bool`).
477        See also $(LREF BitPacked) for enforcing it manually.
478        The major size advantage however comes from the fact
479        that multiple $(B identical pages on every level are merged) by construction.
480    )
481    $(P The process of constructing a trie is more involved and is hidden from
482        the user in the form of the convenience functions $(LREF codepointTrie),
483        $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484        In general a set or built-in AA with `dchar` type
485        can be turned into a trie. The trie object in this module
486        is read-only (immutable); it's effectively frozen after construction.
487    )
488    $(SECTION Unicode properties)
489    $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490        with specific helpers per category nested within. Consult the
491        $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492        when in doubt about the contents of a particular set.
493    )
494    $(P General category sets listed below are only accessible with the
495        $(LREF unicode) shorthand accessor.)
496        $(BOOKTABLE $(B General category ),
497             $(TR $(TH Abb.) $(TH Long form)
498                $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499            $(TR $(TD L) $(TD Letter)
500                $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501            $(TR $(TD Ll) $(TD Lowercase_Letter)
502                $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503            $(TR $(TD Lm) $(TD Modifier_Letter)
504                $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505            $(TR $(TD Lo) $(TD Other_Letter)
506                $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507            $(TR $(TD Lt) $(TD Titlecase_Letter)
508              $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509            $(TR $(TD Lu) $(TD Uppercase_Letter)
510              $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511            $(TR $(TD M) $(TD Mark)
512              $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513            $(TR $(TD Mc) $(TD Spacing_Mark)
514              $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515            $(TR $(TD Me) $(TD Enclosing_Mark)
516              $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517            $(TR $(TD Mn) $(TD Nonspacing_Mark)
518              $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519            $(TR $(TD C) $(TD Other)
520              $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521            $(TR $(TD Cc) $(TD Control) $(TD Pf)
522              $(TD Final_Punctuation)   $(TD -) $(TD Any))
523            $(TR $(TD Cf) $(TD Format)
524              $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525    )
526    $(P Sets for other commonly useful properties that are
527        accessible with $(LREF unicode):)
528        $(BOOKTABLE $(B Common binary properties),
529            $(TR $(TH Name) $(TH Name) $(TH Name))
530            $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531            $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532            $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533            $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534            $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535            $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536            $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537            $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538            $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539            $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540            $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541            $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542            $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543            $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544            $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545            $(TR $(TD ID_Continue) $(TD Other_Math)  )
546    )
547    $(P Below is the table with block names accepted by $(LREF unicode.block).
548        Note that the shorthand version $(LREF unicode) requires "In"
549        to be prepended to the names of blocks so as to disambiguate
550        scripts and blocks.
551    )
552    $(BOOKTABLE $(B Blocks),
553        $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554        $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555        $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556        $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557        $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558        $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559        $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560        $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561        $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562        $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563        $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564        $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565        $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566        $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567        $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568        $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569        $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570        $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571        $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572        $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573        $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574        $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575        $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576        $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577        $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578        $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579        $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580        $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581        $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582        $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583        $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584        $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585        $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586        $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587        $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588        $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589        $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590        $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591        $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592        $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593        $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594        $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595        $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596        $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597        $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598        $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599        $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600        $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601        $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602        $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603        $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604        $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605        $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606        $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607        $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608        $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609        $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610        $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611        $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612        $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613        $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614        $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615        $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616        $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617        $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618        $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619        $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620        $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621        $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622        $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623        $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624        $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625        $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626        $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627    )
628    $(P Below is the table with script names accepted by $(LREF unicode.script)
629        and by the shorthand version $(LREF unicode):)
630        $(BOOKTABLE $(B Scripts),
631            $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632            $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633            $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634            $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635            $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636            $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637            $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638            $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639            $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640            $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641            $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642            $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643            $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644            $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645            $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646            $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647            $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648            $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649            $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650            $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651            $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652            $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653            $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654            $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655            $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656            $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657            $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658            $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659            $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660            $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661            $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662            $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663            $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664            $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665    )
666    $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667        $(BOOKTABLE $(B Hangul syllable type),
668            $(TR $(TH Abb.) $(TH Long form))
669            $(TR $(TD L)   $(TD Leading_Jamo))
670            $(TR $(TD LV)  $(TD LV_Syllable))
671            $(TR $(TD LVT) $(TD LVT_Syllable) )
672            $(TR $(TD T)   $(TD Trailing_Jamo))
673            $(TR $(TD V)   $(TD Vowel_Jamo))
674    )
675    References:
676        $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677        $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678        $(HTTP www.unicode.org, The Unicode Consortium),
679        $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
685    Trademarks:
686        Unicode(tm) is a trademark of Unicode, Inc.
687
688    Copyright: Copyright 2013 -
689    License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690    Authors:   Dmitry Olshansky
691    Source:    $(PHOBOSSRC std/uni/package.d)
692    Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693
694Macros:
695
696SECTION = <h3><a id="$1">$0</a></h3>
697DEF = <div><a id="$1"><i>$0</i></a></div>
698S_LINK = <a href="#$1">$+</a>
699CODEPOINT = $(S_LINK Code point, code point)
700CODEPOINTS = $(S_LINK Code point, code points)
701CHARACTER = $(S_LINK Character, character)
702CHARACTERS = $(S_LINK Character, characters)
703CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704+/
705module std.uni;
706
707import std.meta : AliasSeq;
708import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709    front, hasLength, hasSlicing, isForwardRange, isInputRange,
710    isRandomAccessRange, popFront, put, save;
711import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712    isSomeString, Unqual, isDynamicArray;
713// debug = std_uni;
714
715debug(std_uni) import std.stdio; // writefln, writeln
716
717private:
718
719
// Element-wise copy of `src` into `dest`, visiting indices from the last one
// down to the first. Both slices must have the same length. The descending
// order keeps the result correct when the slices overlap and `dest` starts
// after `src` (data being shifted towards the end of a buffer).
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
726
// Element-wise copy of `src` into `dest`, visiting indices from the first one
// up to the last. Both slices must have the same length. The ascending order
// keeps the result correct when the slices overlap and `dest` starts before
// `src` (data being shifted towards the start of a buffer).
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
733
// Compile-time flag: true on targets listed below, false everywhere else.
// PackedPtrImpl consults it before enabling its direct ushort/uint access path.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
743
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator (LINE SEPARATOR).
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator (PARAGRAPH SEPARATOR).
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line (NEL).
747
// test the intro example
// Non-ASCII characters are spelled with \u escapes so that the test does not
// depend on how this source file is encoded; the intended character is named
// in the trailing comment of each line.
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['\u00A5']); // yen sign

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['\u00A3']); // pound sign
    assert(twoTrie['\u00A3']);
    assert(threeTrie['\u00A3']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    // "Hello " followed by five Armenian letters and '!'
    auto balance = find!(cyrillicOrArmenian)("Hello \u0568\u0576\u056F\u0565\u0580!");
    assert(balance == "\u0568\u0576\u056F\u0565\u0580!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "\u00C4ffin"); // precomposed A with diaeresis
    assert(nS != nonS);
    string composed = "\u00C4ffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2\u00B9\u2070") == "210"); // superscript one, superscript zero
}
806
// the highest Unicode code point, U+10FFFF
enum lastDchar = 0x10FFFF;
808
// Checked narrowing to an integral type `T` that differs from the source
// type `F`: asserts that `from` lies within [T.min, T.max], then casts.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from >= T.min && from <= T.max);
    return cast(T) from;
}
815
// Checked conversion to a BitPacked type `T` that differs from the source
// type `F`: asserts that `from` does not exceed the largest value
// representable in bitSizeOf!T bits, then wraps it into `T`.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(TypeOfBitPacked!T) from);
}
822
// Identity overload: source and target types are the same, nothing to check.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}
828
// repeat X times the bit-pattern in val assuming its length is 'bits'
//
// Params:
//   times = how many copies of the pattern to produce (compile-time)
//   bits  = width of the pattern in bits (compile-time)
//   val   = the pattern itself, expected to fit into `bits` bits
// Returns: `times` copies of `val` laid out back to back, lowest bits first.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit repeated: either all `times` low bits are set or none
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0; // shifting by the full width is not allowed
        else
            // shift a size_t, not an int, so that 32 <= times < 64 works on 64-bit
            return val ? (size_t(1) << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: peel off one copy, recurse on the even remainder
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern, halve the number of copies
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
846
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 form a run of 3*i set bits, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
859
// multiple arrays squashed into one memory block
//
// Level `i` holds `sz[i]` items of type `Types[i]`, bit-packed into whole
// `size_t` words (see spaceFor). The levels are laid out one after another in
// `storage`; `offsets[i]` is the word index at which level `i` starts.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for the given number of items per
    // level; exactly one size per entry of `Types` is expected.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and raw words) without
    // copying the words.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over the items of level `n`.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer to the start of level `n`.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of items in level `n`.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level `n` to `new_size` items, moving the data of all
        // following levels so that the levels stay contiguous.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Release only the words that are really freed: the space
                // needed before minus the space needed after. Using the
                // space of the removed items alone can round up and cut
                // into words that still hold live items.
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array towards the start,
                // forward direction (destination lies before the source)
                static if (n != dim-1)
                {
                    if (delta != 0)
                    {
                        immutable tail = offsets[n+1];
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the words that
    // belong to level `n`. Computed from the offsets rather than from raw
    // pointers so that every instantiation is valid in @safe code.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else
        {
            // level 0 always starts at the beginning of storage (see raw_ptr)
            static if (n == 0)
                immutable size_t begin = 0;
            else
                immutable size_t begin = offsets[n];
            static if (n != Types.length-1)
                immutable size_t end = offsets[n+1];
            else
                immutable size_t end = storage.length;
            return (end - begin)*size_t.sizeof;
        }
    }

    // Writes offsets, sizes and raw words to `sink` as three comma-separated
    // bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of level `n`.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
987
// Resizes and refills the levels of a MultiArray in various orders and checks
// that the contents of the other levels survive; the same scenario is run
// both at compile time and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // level k must hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // level k must hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // store 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // store n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels; level 0 is shrunk and level 1 extended before filling
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // evaluated at compile time
    auto rt = dg(); // evaluated at run time
}
1060
@system unittest
{// more bitpacking tests
    import std.conv : text;

    // five levels with item widths of 3, 4, 3, 6 bits and a bool level
    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit-field extractors whose widths match levels 0 .. 3 above
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // every item of level lvl must equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // store its own index into every item of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // grow the levels starting from the last one
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    // filling one level must not disturb the levels filled before it
    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1125
// Number of `size_t` words needed to store `new_len` items that are `_bits`
// bits wide. The item width is first rounded up to a power of two, the same
// way PackedArrayView does it.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    static if (_bits == 1)
        enum size_t bits = 1;
    else
        enum size_t bits = nextPow2(_bits - 1);// see PackedArrayView

    static if (bits <= wordBits)
    {
        // several items (or exactly one) per word
        enum perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord; // rounded up
    }
    else
    {
        // one item spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
1141
// True for the types that can be stored in packed form: BitPacked wrappers,
// integral types, bool and character types.
enum isBitPackableType(T) =
    isSomeChar!T || is(T == bool) || isIntegral!T || isBitPacked!T;
1147
1148//============================================================================
// Selects the PackedArrayViewImpl instance for `T`: items are stored using
// the bit size of `T` rounded up to a power of two (a 1-bit type stays 1 bit).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum width = bitSizeOf!T;
    private enum storedWidth = width > 1 ? nextPow2(width - 1) : 1;
    alias PackedArrayView = PackedArrayViewImpl!(T, storedWidth);
}
1157
// unsafe and fast access to a chunk of RAM as if it contains packed values
// Selects the PackedPtrImpl instance for `T`: items are stored using the bit
// size of `T` rounded up to a power of two (a 1-bit type stays 1 bit).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum width = bitSizeOf!T;
    private enum storedWidth = width > 1 ? nextPow2(width - 1) : 1;
    alias PackedPtr = PackedPtrImpl!(T, storedWidth);
}
1167
// Unchecked indexed access to items of type `T`, each stored in `bits` bits,
// packed `factor` to a machine word starting at `origin`. No bounds are
// tracked here; see PackedArrayViewImpl for the checked view.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: pick word `q`, shift item `r` down to bit 0 and mask it.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the target field in word `q`, then OR in `val`.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: the item width matches a built-in unsigned type `U`, so the
    // storage can be indexed directly as an array of `U`.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Direct `U` access is used only on little-endian targets at run
        // time; CTFE and other targets go through the shift-and-mask path.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Same selection between direct and shift-and-mask access as opIndex.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Generic path: every access goes through shift-and-mask.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the low `bits` bits set
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin;
}
1274
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items starting at item index `ofs` of the packed
// storage; all indices taken by the members below are relative to `ofs`.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view over `items` items that start `offset` items past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns: true if every item in [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range does not reach past the first word boundary: check it
        // item by item. Comparing the rounded-up start (not `s` itself)
        // keeps the loops below from looking at items beyond `e`.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // leading items up to the first word boundary
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // trailing items after the last whole word
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Assigns `val` to every item in [start, end); whole words in the middle
    // of the range are written at once with a replicated bit pattern.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over the items [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` counts items relative to `ofs`, so `to` is compared as is
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Item-wise equality; when both views are word aligned and cover whole
    // words the underlying words are compared directly.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an item index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1432
1433
// Random-access range over the index window [from, to) of an indexable
// object that is reached through a pointer; element access is forwarded to
// `(*arr)[...]` with the window start added to the index.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Element `idx` of the window.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    do
    {
        return (*arr)[idx+from];
    }

    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < to - from);
    }
    do
    {
        (*arr)[idx+from] = val;
    }

    // Sub-window [a, b), both relative to the start of this window.
    auto opSlice(size_t a, size_t b)
    {
        immutable lo = from+a;
        immutable hi = from+b;
        return typeof(this)(lo, hi, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(V)(V val, size_t start, size_t end)
    {
        (*arr)[from+start .. from+end] = val;
    }

    // Copy of the whole window.
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const { return to-from; }

    alias opDollar = length;

    @property bool empty()const { return to == from; }

    @property auto front()const { return (*arr)[from]; }

    static if (assignableIndex)
    @property void front(Item val) { (*arr)[from] = val; }

    @property auto back()const { return (*arr)[to-1]; }

    static if (assignableIndex)
    @property void back(Item val) { (*arr)[to-1] = val; }

    @property auto save() inout { return this; }

    void popFront() { ++from; }

    void popBack() { --to; }

    // Equal when the lengths match and all elements compare equal pairwise.
    bool opEquals(Other)(auto ref Other arr) const
    {
        if (arr.length != length)
            return false;
        foreach (i; 0 .. length)
        {
            if (this[i] != arr[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1511
// SliceOverIndexed over a plain array must satisfy the random-access range API
@safe pure nothrow @nogc unittest
{
    static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
}
1516
// Builds a read-only window [a, b) over the object `x` points to.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    return typeof(return)(a, b, x);
}
1522
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Builds a mutable window [a, b) over the object `x` points to.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    return typeof(return)(a, b, x);
}
1530
// Exercises the range primitives, indexing, slicing, slice assignment and
// comparison of SliceOverIndexed on a small int array.
@safe unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // front/back getters and setters, popFront/popBack
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // indexing, opDollar and re-slicing
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // comparison with another view and slice assignment
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);

    // an empty view over a null array (previously `nullArr` was declared but
    // never used and the view was taken over `idxArray` instead)
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
}
1566
// Convenience factory: wraps `items` packed values starting at `ptr` (offset 0)
// in a PackedArrayView, propagating the `inout` qualifier of the pointer.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    alias View = inout(PackedArrayView!T);
    return View(ptr, 0, items);
}
1571
1572
1573//============================================================================
1574// Partially unrolled binary search using Shar's method
1575//============================================================================
1576
/*
    Generates the source of an unrolled tail for a uniform binary search.

    The returned code is meant to be mixed into a function that has these
    names in scope: `m` (current stride), `idx` (running lower bound),
    `range`, `needle` and `pred`. It emits a `switch` over `bsr(m)+1` whose
    cases fall through from the largest stride (`size/2`) down to stride 1,
    followed by `case 0`, which performs the final single-element probe.

    Params:
        size = upper limit for the stride handled by the switch; must be a
               power of two (or zero, as far as the assert is concerned)
    Returns: D source code as a string.
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // `bsr(0)` is undefined, so a zero stride (one-element range) is routed
    // to `case 0` explicitly instead of going through `bsr`.
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = bsr(size);
    // one case per stride: case i handles stride 2^^(i-1), then falls through
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    // final probe of the element at idx itself
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1611
// True when `sz` has at most one bit set, i.e. it is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // Clearing the lowest set bit leaves nothing behind only for 0 and 2^^k.
    immutable withoutLowestBit = sz & (sz - 1);
    return withoutLowestBit == 0;
}
1617
// Uniform (halving-stride) lower bound over a range whose length is a power
// of two: returns the number of leading elements for which `pred(e, needle)`
// holds, assuming `pred` is true for a prefix of `range` only.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0;
    // halve the stride each round, keeping idx on the last known "true" side
    for (size_t m = range.length/2; m != 0; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    // final probe decides whether the element at idx itself is included
    return pred(range[idx], needle) ? idx + 1 : idx;
}
1633
// Variant of uniformLowerBound that runs the plain halving loop only while the
// stride is at least `max`, and handles the remaining strides with the
// compile-time generated switch from genUnrolledSwitchSearch.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    // the mixed-in code reads `m`, `range`, `needle`, `pred` and updates `idx`,
    // so these local names must not be changed
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1649
/*
    Lifts a lower-bound search that only works on power-of-two lengths
    (`uniLowerBound`) to ranges of arbitrary length.

    The range is covered by two power-of-two windows: the first `n` elements
    (`n` = largest power of two not exceeding the length) and a window of `k`
    elements ending at the end of the range. A probe of element `n-1` selects
    which window to search.

    Returns: the number of leading elements for which `pred(e, needle)` holds.
*/
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            size_t k = nextPow2(range.length - n + 1);
            // nextPow2 is strictly greater than its argument, so for lengths
            // of the form 2^^j - 1 it yields 2*n, which is longer than the
            // range itself. A window of n elements always covers the tail
            // (length - n < n) and always fits, so fall back to it.
            if (k > range.length)
                k = n;
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}
1672
1673alias sharLowerBound = sharMethod!uniformLowerBound;
1674alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1675
// Cross-checks sharLowerBound and sharSwitchLowerBound against
// std.range's SortedRange.lowerBound on a sorted array of multiples of 5,
// for every needle in [0, MAX+5), and on an empty array.
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference implementation
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    // empty input
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
}
1699//============================================================================
1700
1701@safe
1702{
// hope to see similar stuff in the public interface... once Allocators are out
//@@@BUG moveFront and friends? dunno, for now it's POD-only
1705
// Replaces dest[from .. to] with the contents of `stuff`, growing or shrinking
// `dest` as needed. With the default `Policy` (void) the array is resized via
// its `length` property, otherwise via `Policy.realloc`.
// Returns: the index in `dest` just past the inserted elements.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    immutable removed = to - from;
    immutable inserted = stuff.length;
    immutable insertEnd = from + inserted;
    if (inserted > removed)
    {// replace increases length
        immutable growth = inserted - removed;
        static if (is(Policy == void))
            dest.length = dest.length+growth;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+growth);
        // shift the tail right first, then drop the new elements in
        copyBackwards(dest[to .. dest.length-growth],
            dest[to+growth .. dest.length]);
        copyForward(stuff, dest[from .. insertEnd]);
    }
    else if (inserted < removed)
    {// replace decreases length
        immutable shrink = removed - inserted;
        // write the new elements, then pull the tail left and trim
        copy(stuff, dest[from .. insertEnd]);
        copyForward(dest[to .. dest.length],
            dest[insertEnd .. dest.length-shrink]);
        static if (is(Policy == void))
            dest.length = dest.length - shrink;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-shrink);
    }
    else
    {// same size: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    return insertEnd;
}
1740
1741
// Simple storage manipulation policy
// All operations go through built-in (GC-managed) dynamic array facilities:
// `.dup`, `new`, `.length`, `~=` and the in-place insert/replace helpers.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Returns a mutable copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // Allocates a new array of `size` elements.
    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    // Resizes `arr` to `sz` elements and returns the resized slice.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with `stuff`, resizing `dest` as needed.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value (converted with `force!T`).
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends all elements of an input range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Releases a mutable array: the reference is simply dropped; in debug
    // builds the old contents are overwritten with a marker value first.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }

    // Releases a qualified (non-mutable) array: only the reference is dropped.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }
}
1796
// ditto
// Storage policy with the same interface as GcPolicy, but backed by the
// C heap: memory comes from enforceMalloc/enforceRealloc and is released with
// pureFree. Element counts are converted to byte counts with overflow checks.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a freshly allocated copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates an uninitialized block of `size` elements.
    // A request for zero elements yields a null slice, the same value
    // `realloc` returns for a zero size.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // malloc(0) is allowed to return null; enforceMalloc is expected to
        // treat a null result as an allocation failure, so never ask for
        // zero bytes.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes the block behind `arr` to `size` elements and returns the new
    // slice; a zero `size` frees the block and returns null.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with `stuff`, resizing through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value (converted with `force!T`).
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all elements of an input range with a known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the block behind `arr` (if any) and nulls the slice.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1893
1894//build hack
1895alias _RealArray = CowArray!ReallocPolicy;
1896
// Checks ReallocPolicy.replaceImpl for growing, shrinking and size-preserving
// replacements on malloc-backed arrays.
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Replaces orig[from .. to] with `toReplace`, compares against `result`
        // and frees `orig` on the way out. `file`/`line` are not used.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if (!equal(orig, result))
                    return false;
            }
            return true;
        }
        // Copies the arguments into storage owned by ReallocPolicy.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1926
/**
    Tests if `T` is some kind of a set of code points, i.e. an instance of
    $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1937
/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
     ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // elements 0 and 1 exist and convert to V ...
    private enum hasFirstTwo = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // ... and there is no element 2
    private enum hasThird = is(typeof((T x){ V c = x[2]; }));
    enum isIntegralPair = hasFirstTwo && !hasThird;
}
1954
1955
1956/**
1957    The recommended default type for set of $(CODEPOINTS).
1958    For details, see the current implementation: $(LREF InversionList).
1959*/
1960public alias CodepointSet = InversionList!GcPolicy;
1961
1962
1963//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1964// which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1965// hence below doesn't seem to work
1966// public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1967
/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // storage: [0] is the lower bound, [1] the upper bound
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    // Builds the interval from its two bounds.
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }

    // Compares both bounds against anything indexable with [0] and [1].
    bool opEquals(T)(T val) const
    {
        if (this[0] != val[0])
            return false;
        return this[1] == val[1];
    }

    // Named access to the lower bound.
    @property ref inout(uint) a() return inout
    {
        return _tuple[0];
    }

    // Named access to the upper bound.
    @property ref inout(uint) b() return inout
    {
        return _tuple[1];
    }
}
1993
1994/**
1995    $(P
1996    `InversionList` is a set of $(CODEPOINTS)
1997    represented as an array of open-right [a, b$(RPAREN)
1998    intervals (see $(LREF CodepointInterval) above).
1999    The name comes from the way the representation reads left to right.
2000    For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2001    plus a singular value 60 looks like this:
2002    )
2003    ---
2004    10, 50, 60, 61, 80, 90
2005    ---
2006    $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2009    the contrary). Then switch positive/negative after each
2010    number passed from left to right.
2011    )
2012    $(P This way negative spans until 10, then positive until 50,
2013    then negative until 60, then positive until 61, and so on.
2014    As seen this provides a space-efficient storage of highly redundant data
2015    that comes in long runs. A description which Unicode $(CHARACTER)
2016    properties fit nicely. The technique itself could be seen as a variation
2017    on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2018    )
2019
2020    $(P Sets are value types (just like `int` is) thus they
2021        are never aliased.
2022    )
2023        Example:
2024        ---
2025        auto a = CodepointSet('a', 'z'+1);
2026        auto b = CodepointSet('A', 'Z'+1);
2027        auto c = a;
2028        a = a | b;
2029        assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2030        assert(a != c);
2031        ---
2032    $(P See also $(LREF unicode) for simpler construction of sets
2033        from predefined ones.
2034    )
2035
2036    $(P Memory usage is 8 bytes per each contiguous interval in a set.
2037    The value semantics are achieved by using the
2038    $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2039    and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2040    )
2041
2042    Note:
2043    $(P It's not recommended to rely on the template parameters
2044    or the exact type of a current $(CODEPOINT) set in `std.uni`.
2045    The type and parameters may change when the standard
2046    allocators design is finalized.
2047    Use $(LREF isCodepointSet) with templates or just stick with the default
2048    alias $(LREF CodepointSet) throughout the whole code base.
2049    )
2050*/
2051public struct InversionList(SP=GcPolicy)
2052{
2053    import std.range : assumeSorted;
2054
2055    /**
2056        Construct from another code point set of any type.
2057    */
2058    this(Set)(Set set) pure
2059        if (isCodepointSet!Set)
2060    {
2061        uint[] arr;
2062        foreach (v; set.byInterval)
2063        {
2064            arr ~= v.a;
2065            arr ~= v.b;
2066        }
2067        data = CowArray!(SP).reuse(arr);
2068    }
2069
2070    /**
2071        Construct a set from a forward range of code point intervals.
2072    */
2073    this(Range)(Range intervals) pure
2074        if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2075    {
2076        uint[] arr;
2077        foreach (v; intervals)
2078        {
2079            SP.append(arr, v.a);
2080            SP.append(arr, v.b);
2081        }
2082        data = CowArray!(SP).reuse(arr);
2083        sanitize(); //enforce invariant: sort intervals etc.
2084    }
2085
2086    //helper function that avoids sanity check to be CTFE-friendly
2087    private static fromIntervals(Range)(Range intervals) pure
2088    {
2089        import std.algorithm.iteration : map;
2090        import std.range : roundRobin;
2091        auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2092            intervals.save.map!"a[1]"());
2093        InversionList set;
2094        set.data = CowArray!(SP)(flattened);
2095        return set;
2096    }
    //ditto until sort is CTFE-able
    // Takes the bounds as a flat list a0, b0, a1, b1, ... and stores them
    // as-is (no call to sanitize).
    private static fromIntervals()(uint[] intervals...) pure
    in
    {
        import std.conv : text;
        // bounds come in pairs, and every pair must be a non-empty interval
        assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
        for (uint i = 0; i < intervals.length; i += 2)
        {
            auto a = intervals[i], b = intervals[i+1];
            assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
        }
    }
    do
    {
        InversionList set;
        set.data = CowArray!(SP)(intervals);
        return set;
    }
2115
    /**
        Construct a set from plain values of code point intervals.

        The values are read as consecutive pairs a0, b0, a1, b1, ... each
        describing an interval [a, b$(RPAREN).
    */
    this()(uint[] intervals...)
    in
    {
        import std.conv : text;
        // bounds come in pairs, and every pair must be a non-empty interval
        assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
        for (uint i = 0; i < intervals.length; i += 2)
        {
            auto a = intervals[i], b = intervals[i+1];
            assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
        }
    }
    do
    {
        data = CowArray!(SP)(intervals);
        sanitize(); //enforce invariant: sort intervals etc.
    }
2135
2136    ///
2137    pure @safe unittest
2138    {
2139        import std.algorithm.comparison : equal;
2140
2141        auto set = CodepointSet('a', 'z'+1, '��', '��'+1);
2142        foreach (v; 'a'..'z'+1)
2143            assert(set[v]);
2144        // Cyrillic lowercase interval
2145        foreach (v; '��'..'��'+1)
2146            assert(set[v]);
2147        //specific order is not required, intervals may interesect
2148        auto set2 = CodepointSet('��', '��'+1, 'a', 'd', 'b', 'z'+1);
2149        //the same end result
2150        assert(set2.byInterval.equal(set.byInterval));
2151        // test constructor this(Range)(Range intervals)
2152        auto chessPiecesWhite = CodepointInterval(9812, 9818);
2153        auto chessPiecesBlack = CodepointInterval(9818, 9824);
2154        auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2155        foreach (v; '���'..'���'+1)
2156            assert(set3[v]);
2157    }
2158
    /**
        Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
    */
    @property auto byInterval() scope
    {
        // TODO: change this to data[] once the -dip1000 errors have been fixed
        // see e.g. https://github.com/dlang/phobos/pull/6638
        import std.array : array;
        // the Intervals range is built over `data.array` rather than `data[]`
        // (see the TODO above)
        return Intervals!(typeof(data.array))(data.array);
    }
2169
    // byInterval yields open-right intervals: 'D'+1 shows up as 'E', 'd'+1 as 'e'.
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.typecons : tuple;

        auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

        assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
    }
2179
    // Returns all intervals of this set collected into a newly built array
    // (via std.array.array). Usable on const sets, unlike byInterval.
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }
2185
2186    /**
2187        Tests the presence of code point `val` in this set.
2188    */
2189    bool opIndex(uint val) const
2190    {
2191        // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2192        // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2193        return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2194    }
2195
    ///
    pure @safe unittest
    {
        auto gothic = unicode.Gothic;
        // Gothic letter ahsa
        assert(gothic['\U00010330']);
        // no ascii in Gothic obviously
        assert(!gothic['$']);
    }
2205
2206
2207    // Linear scan for `ch`. Useful only for small sets.
2208    // TODO:
2209    // used internally in std.regex
2210    // should be properly exposed in a public API ?
2211    package(std) auto scanFor()(dchar ch) const
2212    {
2213        immutable len = data.length;
2214        for (size_t i = 0; i < len; i++)
2215            if (ch < data[i])
2216                return i & 1;
2217        return 0;
2218    }
2219
2220    /// Number of $(CODEPOINTS) in this set
2221    @property size_t length()
2222    {
2223        size_t sum = 0;
2224        foreach (iv; byInterval)
2225        {
2226            sum += iv.b - iv.a;
2227        }
2228        return sum;
2229    }
2230
2231// bootstrap full set operations from 4 primitives (suitable as a template mixin):
2232// addInterval, skipUpTo, dropUpTo & byInterval iteration
2233//============================================================================
2234public:
    /**
        $(P Sets support natural syntax for set algebra, namely: )
        $(BOOKTABLE ,
            $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
            $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
            $(TR $(TD |) $(TD a ∪ b) $(TD union) )
            $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
            $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
        )
    */
    This opBinary(string op, U)(U rhs)
        if (isCodepointSet!U || is(U:dchar))
    {
        static if (op == "&" || op == "|" || op == "~")
        {// symmetric ops thus can swap arguments to reuse r-value
            static if (is(U:dchar))
            {
                // single code point: work on a copy of this set
                auto tmp = this;
                mixin("tmp "~op~"= rhs; ");
                return tmp;
            }
            else
            {
                static if (is(Unqual!U == U))
                {
                    // try hard to reuse r-value
                    // (`rhs` was passed by value, so it can be updated in place)
                    mixin("rhs "~op~"= this;");
                    return rhs;
                }
                else
                {
                    // qualified rhs cannot be modified: work on a copy of this set
                    auto tmp = this;
                    mixin("tmp "~op~"= rhs;");
                    return tmp;
                }
            }
        }
        else static if (op == "-") // anti-symmetric
        {
            // operand order matters here, so always start from a copy of this set
            auto tmp = this;
            tmp -= rhs;
            return tmp;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }
2281
2282    ///
2283    pure @safe unittest
2284    {
2285        import std.algorithm.comparison : equal;
2286        import std.range : iota;
2287
2288        auto lower = unicode.LowerCase;
2289        auto upper = unicode.UpperCase;
2290        auto ascii = unicode.ASCII;
2291
2292        assert((lower & upper).empty); // no intersection
2293        auto lowerASCII = lower & ascii;
2294        assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2295        // throw away all of the lowercase ASCII
2296        assert((ascii - lower).length == 128 - 26);
2297
2298        auto onlyOneOf = lower ~ ascii;
2299        assert(!onlyOneOf['��']); // not ASCII and not lowercase
2300        assert(onlyOneOf['$']); // ASCII and not lowercase
2301        assert(!onlyOneOf['a']); // ASCII and lowercase
2302        assert(onlyOneOf['��']); // not ASCII but lowercase
2303
2304        // throw away all cased letters from ASCII
2305        auto noLetters = ascii - (lower | upper);
2306        assert(noLetters.length == 128 - 26*2);
2307    }
2308
    /// The 'op=' versions of the above overloaded operators.
    ref This opOpAssign(string op, U)(U rhs)
        if (isCodepointSet!U || is(U:dchar))
    {
        static if (op == "|")    // union
        {
            static if (is(U:dchar))
            {
                // a single code point is the interval [rhs, rhs+1)
                this.addInterval(rhs, rhs+1);
                return this;
            }
            else
                return this.add(rhs);
        }
        else static if (op == "&")   // intersection
                return this.intersect(rhs);// overloaded
        else static if (op == "-")   // set difference
                return this.sub(rhs);// overloaded
        else static if (op == "~")   // symmetric set difference
        {
            // (this | rhs) - (this & rhs); the intersection has to be taken
            // before `this` is modified
            auto copy = this & rhs;
            this |= rhs;
            this -= copy;
            return this;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }
2337
2338    /**
2339        Tests the presence of codepoint `ch` in this set,
2340        the same as $(LREF opIndex).
2341    */
2342    bool opBinaryRight(string op: "in", U)(U ch) const
2343        if (is(U : dchar))
2344    {
2345        return this[ch];
2346    }
2347
2348    ///
2349    pure @safe unittest
2350    {
2351        assert('��' in unicode.Cyrillic);
2352        assert(!('z' in unicode.Cyrillic));
2353    }
2354
2355
2356
2357    /**
2358     * Obtains a set that is the inversion of this set.
2359     *
2360     * See_Also: $(LREF inverted)
2361     */
2362    auto opUnary(string op: "!")()
2363    {
2364        return this.inverted;
2365    }
2366
    /**
        A range that spans each $(CODEPOINT) in this set.
    */
    @property auto byCodepoint()
    {
        // Input range that walks the intervals of a set and yields every
        // code point of each interval in turn.
        static struct CodepointRange
        {
            this(This set)
            {
                r = set.byInterval;
                // start at the lower bound of the first interval, if any
                if (!r.empty)
                    cur = r.front.a;
            }

            @property dchar front() const
            {
                return cast(dchar) cur;
            }

            // exhausted once there are no intervals left
            @property bool empty() const
            {
                return r.empty;
            }

            void popFront()
            {
                cur++;
                // stepped past the (exclusive) upper bound: move on to the
                // lower bound of the next interval, or stop if there is none
                while (cur >= r.front.b)
                {
                    r.popFront();
                    if (r.empty)
                        break;
                    cur = r.front.a;
                }
            }
        private:
            uint cur;                       // current code point
            typeof(This.init.byInterval) r; // remaining intervals, current one at front
        }

        return CodepointRange(this);
    }
2409
2410    ///
2411    pure @safe unittest
2412    {
2413        import std.algorithm.comparison : equal;
2414        import std.range : iota;
2415
2416        auto set = unicode.ASCII;
2417        set.byCodepoint.equal(iota(0, 0x80));
2418    }
2419
2420    /**
        $(P Obtain textual representation of this set in form of
        open-right intervals and feed it to `sink`.
2423        )
2424        $(P Used by various standard formatting facilities such as
2425         $(REF formattedWrite, std,format), $(REF write, std,stdio),
2426         $(REF writef, std,stdio), $(REF to, std,conv) and others.
2427        )
2428        Example:
2429        ---
2430        import std.conv;
2431        assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2432        ---
2433    */
2434
2435    private import std.format.spec : FormatSpec;
2436
2437    /***************************************
2438     * Obtain a textual representation of this InversionList
2439     * in form of open-right intervals.
2440     *
2441     * The formatting flag is applied individually to each value, for example:
2442     * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2443     * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2444     * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2445     */
2446    void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2447    {
2448        import std.format.write : formatValue;
2449        auto range = byInterval;
2450        if (range.empty)
2451            return;
2452
2453        while (1)
2454        {
2455            auto i = range.front;
2456            range.popFront();
2457
2458            put(sink, "[");
2459            formatValue(sink, i.a, fmt);
2460            put(sink, "..");
2461            formatValue(sink, i.b, fmt);
2462            put(sink, ")");
2463            if (range.empty) return;
2464            put(sink, " ");
2465        }
2466    }
2467
    ///
    pure @safe unittest
    {
        import std.conv : to;
        import std.format : format;
        import std.uni : unicode;

        assert(unicode.Cyrillic.to!string ==
            "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");

        // The specs '%s' and '%d' are equivalent to the to!string call above.
        assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);

        // '#' adds the 0x / 0X prefix to every bound
        assert(format("%#x", unicode.Cyrillic) ==
            "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
            ~"[0xa640..0xa698) [0xa69f..0xa6a0)");

        assert(format("%#X", unicode.Cyrillic) ==
            "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
            ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
    }
2489
    // An unknown format character ('%z') is rejected with a FormatException.
    pure @safe unittest
    {
        import std.exception : assertThrown;
        import std.format : format, FormatException;
        assertThrown!FormatException(format("%z", unicode.ASCII));
    }
2496
2497
2498    /**
2499        Add an interval [a, b$(RPAREN) to this set.
2500    */
2501    ref add()(uint a, uint b)
2502    {
2503        addInterval(a, b);
2504        return this;
2505    }
2506
    ///
    pure @safe unittest
    {
        CodepointSet someSet;
        // add returns the set itself, so calls can be chained
        someSet.add('0', '5').add('A','Z'+1);
        // the upper bound is exclusive: '5' is only covered by this second interval
        someSet.add('5', '9'+1);
        assert(someSet['0']);
        assert(someSet['5']);
        assert(someSet['9']);
        assert(someSet['Z']);
    }
2518
2519private:
2520
2521  package(std)  // used from: std.regex.internal.parser
2522    ref intersect(U)(U rhs)
2523        if (isCodepointSet!U)
2524    {
2525        Marker mark;
2526        foreach ( i; rhs.byInterval)
2527        {
2528            mark = this.dropUpTo(i.a, mark);
2529            mark = this.skipUpTo(i.b, mark);
2530        }
2531        this.dropUpTo(uint.max, mark);
2532        return this;
2533    }
2534
2535    ref intersect()(dchar ch)
2536    {
2537        foreach (i; byInterval)
2538            if (i.a <= ch && ch < i.b)
2539                return this = This.init.add(ch, ch+1);
2540        this = This.init;
2541        return this;
2542    }
2543
    // Intersecting with a code point outside the set ('-' is not Cyrillic)
    // must leave no intervals behind.
    pure @safe unittest
    {
        assert(unicode.Cyrillic.intersect('-').byInterval.empty);
    }
2548
2549    ref sub()(dchar ch)
2550    {
2551        return subChar(ch);
2552    }
2553
2554    // same as the above except that skip & drop parts are swapped
2555  package(std)  // used from: std.regex.internal.parser
2556    ref sub(U)(U rhs)
2557        if (isCodepointSet!U)
2558    {
2559        Marker mark;
2560        foreach (i; rhs.byInterval)
2561        {
2562            mark = this.skipUpTo(i.a, mark);
2563            mark = this.dropUpTo(i.b, mark);
2564        }
2565        return this;
2566    }
2567
2568  package(std)  // used from: std.regex.internal.parse
2569    ref add(U)(U rhs)
2570        if (isCodepointSet!U)
2571    {
2572        Marker start;
2573        foreach (i; rhs.byInterval)
2574        {
2575            start = addInterval(i.a, i.b, start);
2576        }
2577        return this;
2578    }
2579
2580// end of mixin-able part
2581//============================================================================
2582public:
2583    /**
2584        Obtains a set that is the inversion of this set.
2585
2586        See the '!' $(LREF opUnary) for the same but using operators.
2587    */
2588    @property auto inverted()
2589    {
2590        InversionList inversion = this;
2591        if (inversion.data.length == 0)
2592        {
2593            inversion.addInterval(0, lastDchar+1);
2594            return inversion;
2595        }
2596        if (inversion.data[0] != 0)
2597            genericReplace(inversion.data, 0, 0, [0]);
2598        else
2599            genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2600        if (data[data.length-1] != lastDchar+1)
2601            genericReplace(inversion.data,
2602                inversion.data.length, inversion.data.length, [lastDchar+1]);
2603        else
2604            genericReplace(inversion.data,
2605                inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2606
2607        return inversion;
2608    }
2609
    ///
    pure @safe unittest
    {
        auto set = unicode.ASCII;
        // the union with the inverse covers every Unicode code point (0x110000 of them)
        assert((set | set.inverted).length == 0x110000);
        // a set shares no code points with its inverse
        assert((set & set.inverted).empty);
    }
2619
    // Generates the D source text of a function `bool funcName(dchar ch)` that
    // returns true when `ch` falls into one of the [a, b) intervals of `range`.
    // An empty `funcName` is replaced by the literal name "function".
    // The emitted code is a tree of if/else comparisons: `bisect` splits the
    // interval list around one interval, `binaryScope` recurses until fewer
    // than `maxBinary` intervals remain, and `linearScope` emits the leaves.
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // interval lists shorter than this are emitted as a linear chain of tests
        enum maxBinary = 3;
        // Emits a braced block testing `ch` against each interval in turn;
        // the block ends with `return false;`.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    if (ival[0] != 0) // dchar is unsigned, so a `< 0` test is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Chooses between a linear chain (short lists) and a further bisection
        // around the middle interval.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // Not used yet: an if/else binary search is far better with DMD as of 2.061,
        // and GDC does a fine job either way.
        // Emits a `switch (ch)` with one case (or case range) per interval.
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits a three-way split around the interval at `idx`: intervals
        // before it, the interval itself, intervals after it.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ "    ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point,  >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch,  >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond:
        // tillAscii is the index of the first interval starting above 0x80
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }
2712
2713    /**
2714        Generates string with D source code of unary function with name of
2715        `funcName` taking a single `dchar` argument. If `funcName` is empty
2716        the code is adjusted to be a lambda function.
2717
2718        The function generated tests if the $(CODEPOINT) passed
2719        belongs to this set or not. The result is to be used with string mixin.
2720        The intended usage area is aggressive optimization via meta programming
2721        in parser generators and the like.
2722
2723        Note: Use with care for relatively small or regular sets. It
2724        could end up being slower then just using multi-staged tables.
2725
2726        Example:
2727        ---
2728        import std.stdio;
2729
2730        // construct set directly from [a, b$RPAREN intervals
2731        auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2732        writeln(set);
2733        writeln(set.toSourceCode("func"));
2734        ---
2735
2736        The above outputs something along the lines of:
2737        ---
2738        bool func(dchar ch)  @safe pure nothrow @nogc
2739        {
2740            if (ch < 45)
2741            {
2742                if (ch == 10 || ch == 11) return true;
2743                return false;
2744            }
2745            else if (ch < 65) return true;
2746            else
2747            {
2748                if (ch < 100) return false;
2749                if (ch < 200) return true;
2750                return false;
2751            }
2752        }
2753        ---
2754    */
2755    string toSourceCode(string funcName="")
2756    {
2757        import std.array : array;
2758        auto range = byInterval.array();
2759        return toSourceCode(range, funcName);
2760    }
2761
2762    /**
2763        True if this set doesn't contain any $(CODEPOINTS).
2764    */
2765    @property bool empty() const
2766    {
2767        return data.length == 0;
2768    }
2769
    ///
    pure @safe unittest
    {
        // a default-constructed set holds no code points
        CodepointSet emptySet;
        assert(emptySet.length == 0);
        assert(emptySet.empty);
    }
2777
2778private:
2779    alias This = typeof(this);
2780    alias Marker = size_t;
2781
2782    // a random-access range of integral pairs
2783    static struct Intervals(Range)
2784    {
2785        import std.range.primitives : hasAssignableElements;
2786
2787        this(Range sp) scope
2788        {
2789            slice = sp;
2790            start = 0;
2791            end = sp.length;
2792        }
2793
2794        this(Range sp, size_t s, size_t e) scope
2795        {
2796            slice = sp;
2797            start = s;
2798            end = e;
2799        }
2800
2801        @property auto front()const
2802        {
2803            immutable a = slice[start];
2804            immutable b = slice[start+1];
2805            return CodepointInterval(a, b);
2806        }
2807
2808        //may break sorted property - but we need std.sort to access it
2809        //hence package(std) protection attribute
2810        static if (hasAssignableElements!Range)
2811        package(std) @property void front(CodepointInterval val)
2812        {
2813            slice[start] = val.a;
2814            slice[start+1] = val.b;
2815        }
2816
2817        @property auto back()const
2818        {
2819            immutable a = slice[end-2];
2820            immutable b = slice[end-1];
2821            return CodepointInterval(a, b);
2822        }
2823
2824        //ditto about package
2825        static if (hasAssignableElements!Range)
2826        package(std) @property void back(CodepointInterval val)
2827        {
2828            slice[end-2] = val.a;
2829            slice[end-1] = val.b;
2830        }
2831
2832        void popFront()
2833        {
2834            start += 2;
2835        }
2836
2837        void popBack()
2838        {
2839            end -= 2;
2840        }
2841
2842        auto opIndex(size_t idx) const
2843        {
2844            immutable a = slice[start+idx*2];
2845            immutable b = slice[start+idx*2+1];
2846            return CodepointInterval(a, b);
2847        }
2848
2849        //ditto about package
2850        static if (hasAssignableElements!Range)
2851        package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2852        {
2853            slice[start+idx*2] = val.a;
2854            slice[start+idx*2+1] = val.b;
2855        }
2856
2857        auto opSlice(size_t s, size_t e)
2858        {
2859            return Intervals(slice, s*2+start, e*2+start);
2860        }
2861
2862        @property size_t length()const {  return slice.length/2; }
2863
2864        @property bool empty()const { return start == end; }
2865
2866        @property auto save(){ return this; }
2867    private:
2868        size_t start, end;
2869        Range slice;
2870    }
2871
    // called after construction from intervals
    // to make sure invariants hold:
    // sorts the intervals by their start and merges every interval that
    // overlaps or touches the previously kept one, then shrinks `data`
    // to the merged intervals.
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
        // - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0; // index of the last kept (merged) interval
        size_t j = 1; // index of the next candidate
        while (j < len)
        {
            // candidate starts at or before the end of the kept interval: merge
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        // number of intervals that survived the merge
        len = i + 1;
        // check the kept intervals: each non-empty and strictly before the next
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        // two boundaries per interval
        data.length = len * 2;
    }
2920
2921    // special case for normal InversionList
2922    ref subChar(dchar ch)
2923    {
2924        auto mark = skipUpTo(ch);
2925        if (mark != data.length
2926            && data[mark] == ch && data[mark-1] == ch)
2927        {
2928            // it has split, meaning that ch happens to be in one of intervals
2929            data[mark] = data[mark]+1;
2930        }
2931        return this;
2932    }
2933
2934    //
2935    Marker addInterval(int a, int b, Marker hint=Marker.init) scope
2936    in
2937    {
2938        assert(a <= b);
2939    }
2940    do
2941    {
2942        import std.range : assumeSorted, SearchPolicy;
2943        auto range = assumeSorted(data[]);
2944        size_t pos;
2945        size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2946        if (a_idx == range.length)
2947        {
2948            //  [---+++----++++----++++++]
2949            //  [                         a  b]
2950            data.append(a, b);
2951            return data.length-1;
2952        }
2953        size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
2954        uint[3] buf = void;
2955        uint to_insert;
2956        debug(std_uni)
2957        {
2958            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2959        }
2960        if (b_idx == range.length)
2961        {
2962            //  [-------++++++++----++++++-]
2963            //  [      s     a                 b]
2964            if (a_idx & 1)// a in positive
2965            {
2966                buf[0] = b;
2967                to_insert = 1;
2968            }
2969            else// a in negative
2970            {
2971                buf[0] = a;
2972                buf[1] = b;
2973                to_insert = 2;
2974            }
2975            pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2976            return pos - 1;
2977        }
2978
2979        uint top = data[b_idx];
2980
2981        debug(std_uni)
2982        {
2983            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2984            writefln("a=%s; b=%s; top=%s;", a, b, top);
2985        }
2986        if (a_idx & 1)
2987        {// a in positive
2988            if (b_idx & 1)// b in positive
2989            {
2990                //  [-------++++++++----++++++-]
2991                //  [       s    a        b    ]
2992                buf[0] = top;
2993                to_insert = 1;
2994            }
2995            else // b in negative
2996            {
2997                //  [-------++++++++----++++++-]
2998                //  [       s    a   b         ]
2999                if (top == b)
3000                {
3001                    assert(b_idx+1 < data.length);
3002                    buf[0] = data[b_idx+1];
3003                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3004                    return pos - 1;
3005                }
3006                buf[0] = b;
3007                buf[1] = top;
3008                to_insert = 2;
3009            }
3010        }
3011        else
3012        { // a in negative
3013            if (b_idx & 1) // b in positive
3014            {
3015                //  [----------+++++----++++++-]
3016                //  [     a     b              ]
3017                buf[0] = a;
3018                buf[1] = top;
3019                to_insert = 2;
3020            }
3021            else// b in negative
3022            {
3023                //  [----------+++++----++++++-]
3024                //  [  a       s      b        ]
3025                if (top == b)
3026                {
3027                    assert(b_idx+1 < data.length);
3028                    buf[0] = a;
3029                    buf[1] = data[b_idx+1];
3030                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3031                    return pos - 1;
3032                }
3033                buf[0] = a;
3034                buf[1] = b;
3035                buf[2] = top;
3036                to_insert = 3;
3037            }
3038        }
3039        pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
3040        debug(std_uni)
3041        {
3042            writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
3043            writeln("inserting ", buf[0 .. to_insert]);
3044        }
3045        return pos - 1;
3046    }
3047
    // Removes from the set everything between the marker `pos` and the code
    // point `a`: the boundaries data[pos .. idx] are replaced, where idx is
    // the index of the first boundary greater than `a`.
    // `pos` must be the start of an interval (an even index).
    // Returns a Marker for further dropUpTo/skipUpTo calls.
    Marker dropUpTo(uint a, Marker pos=Marker.init)
    in
    {
        assert(pos % 2 == 0); // at start of interval
    }
    do
    {
        // with the "a <= b" predicate lowerBound(a) holds all boundaries <= a
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        if (range.empty)
            return pos;
        size_t idx = pos;
        idx += range.lowerBound(a).length;

        debug(std_uni)
        {
            writeln("dropUpTo full length=", data.length);
            writeln(pos,"~~~", idx);
        }
        // every remaining boundary is <= a: drop the whole tail
        if (idx == data.length)
            return genericReplace(data, pos, idx, cast(uint[])[]);
        if (idx & 1)
        {   // a in positive
            //[--+++----++++++----+++++++------...]
            //      |<---si       s  a  t
            // `a` becomes the new start of the interval it falls into
            genericReplace(data, pos, idx, [a]);
        }
        else
        {   // a in negative
            //[--+++----++++++----+++++++-------+++...]
            //      |<---si              s  a  t
            genericReplace(data, pos, idx, cast(uint[])[]);
        }
        return pos;
    }
3083
    // Moves a marker forward from `pos` to the code point `a` without removing
    // anything; an interval that contains `a` strictly inside is split into
    // [start, a) and [a, top).
    // Returns an even index (start of an interval), or data.length when there
    // is no boundary greater than `a`.
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be  0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        // with the "a <= b" predicate lowerBound(a) holds all boundaries <= a,
        // so idx is the index of the first boundary greater than a
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx];
            // NOTE(review): idx was computed as the first boundary greater than
            // a, so this branch looks unreachable - confirm before relying on it
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            // a is the first code point of the interval: return its start index
            if (a == start)
                return idx-1;
            // split it up
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }
3115
3116    CowArray!SP data;
3117}
3118
// The ASCII set is the single interval [0, 128) and prints as such.
pure @safe unittest
{
    import std.conv : to;
    assert(unicode.ASCII.to!string() == "[0..128)");
}
3124
// Pedantic version for CTFE and aligned-access-only architectures.
// Reads the idx-th 24-bit value of a packed array one byte at a time;
// the byte order follows the target endianness.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3; // each element occupies 3 bytes
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}
3136
// Pedantic version for CTFE and aligned-access-only architectures.
// Stores the low 24 bits of `val` as the idx-th element of a packed array,
// one byte at a time; the byte order follows the target endianness.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3; // each element occupies 3 bytes
    immutable ubyte low = cast(ubyte) val;
    immutable ubyte middle = cast(ubyte)(val >> 8);
    immutable ubyte high = cast(ubyte)(val >> 16);
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}
3154
// Unaligned x86-like read.
// Loads a whole 4-byte word starting at the idx-th 24-bit element and keeps
// the 3 bytes that belong to it; note that 4 bytes are dereferenced.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(const(uint)*)(ptr + idx*3);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3164
// Unaligned x86-like write.
// Stores the low 24 bits of `val` as the idx-th element of a packed array
// with a single 4-byte read-modify-write; the fourth byte (the first byte of
// the next element) is preserved.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // mask val so that bits above 24 cannot leak into the neighbouring
        // byte; this matches safeWrite24 and the big-endian path below
        *dest = (val & 0x00FF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
        *dest = (val << 8) | (*dest & 0xFF);
}
3174
// Reads the idx-th 24-bit value of a packed array.
// The unaligned variant is used only at run time on targets where
// hasUnalignedReads is set; CTFE and all other targets use the bytewise one.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3182
// Writes `val` as the idx-th 24-bit value of a packed array.
// The unaligned variant is used only at run time on targets where
// hasUnalignedReads is set; CTFE and all other targets use the bytewise one.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            safeWrite24(ptr, val, idx);
        else
            unalignedWrite24(ptr, val, idx);
    }
    else
        safeWrite24(ptr, val, idx);
}
3190
/*
    Reference-counted, copy-on-write array of `uint`.

    Storage is obtained through the policy `SP` (alloc/realloc/append/destroy).
    The payload and its reference count live in one `uint[]`: the elements come
    first and the last slot holds the count. An empty array owns no memory and
    has no count. Copies share the storage; a mutating access through a shared
    copy first duplicates the payload.
*/
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` by appending the ref-count slot to it through the policy.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Copies the elements of a range whose length is known up front.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        copy(range, data[0..$-1]);
    }

    // Copies the elements of a forward range; its length is found by walking a saved copy.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        copy(range, data[0..$-1]);
    }

    // A copy shares the storage, so only the count is bumped.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // The last owner releases the storage; any other owner just drops its share.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // an empty array has no storage and therefore no ref-count slot
    @property bool empty() const { return data.length == 0; }

    // report one less than the actual size (the last slot is the ref-count)
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    // Resizes to `len` elements (+ an extra slot for ref-count).
    // A shared storage is left to the other owners and replaced by a fresh copy.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access never copies.
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access: detach from the other owners first.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // A mutable slice may be written through, so detach from the other owners first.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // A const slice is taken without copying.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally like in every other method that uses copy,
        // so this template does not depend on a module-scope import
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        length = length + val.length;
        // the new elements sit right before the ref-count slot
        data[$-val.length-1 .. $-1] = val[];
    }

    // Compares the elements only; the ref-counts are ignored.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this owner's share of the storage and becomes empty.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches from a shared storage by moving to a private copy of it.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    // elements followed by one ref-count slot; empty when nothing is stored
    uint[] data;
}
3403
// Exercises CowArray with both storage policies: element access, resizing,
// appending, and that copies stay independent of each other.
pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates the array passed by reference while a copy (u24_c) is alive;
    // the copy must keep its old contents throughout.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Works on by-value copies: copy construction, assignment and equality.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        // the slice type must be usable with range algorithms
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        // func2 received a copy, so arr is unchanged
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        // construction from a range, then overwriting a sub-slice in place
        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3490
pure @safe unittest// core set primitives test
{
    // Checks `add`, `skipUpTo` and `dropUpTo` for every InversionList policy.
    // Arguments to CodeList(...) are interval bounds: [a0, b0) [a1, b1) ...
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // keep everything below 60, then drop what lies between 60 and 110
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        // skipping into the middle of an interval splits it at that point
        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3566
3567
// Test that the CodepointSet constructor accepts intervals in any order,
// including overlapping and adjacent ones, and normalizes them.
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    // Cyrillic small letters U+0430 .. U+044F are passed first, followed by
    // the capital letters U+0410 .. U+042F that sort before them.
    // The literals are written as escapes so the test does not depend on the
    // encoding of this source file.
    auto c1 = CodepointSet('\u0430', '\u044F'+1, '\u0410', '\u042F'+1);
    foreach (ch; chain(iota('\u0430', '\u044F'+1), iota('\u0410', '\u042F'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3611
3612
pure @safe unittest
{   // full set operations
    // Checks union (|), intersection (&), difference (-) and symmetric
    // difference (~) for every InversionList policy, in both operand orders.
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // intersections
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        // differences
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // symmetric difference
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3716
3717}
3718
pure @safe unittest // set operators with a single dchar on the right-hand side
{
    import std.conv : text;
    auto base = CodepointSet(10, 100, 120, 200);

    // subtracting 'A' (65) punches a one-element hole into [10, 100)
    auto withoutA = base - 'A';
    assert(withoutA == CodepointSet(10, 65, 66, 100, 120, 200), text(withoutA));

    // intersecting with 'B' (66) leaves only [66, 67)
    auto onlyB = base & 'B';
    assert(onlyB == CodepointSet(66, 67));
}
3726
pure @safe unittest// iteration & opIndex
{
    // Checks that byInterval / byCodepoint enumerate exactly what was put in
    // the set and that opIndex reports membership for every enumerated item.
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n'); // [A, N) [a, n), i.e. A-M and a-m
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        // (the reverse-iteration check is only compiled with -version=bug8949)
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // NOTE(review): the loop above only instantiates the ReallocPolicy list;
        // this branch is compiled only if that type is CodepointSet - confirm.
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3766
3767//============================================================================
3768// Generic Trie template and various ways to build it
3769//============================================================================
3770
// Debug helper producing a shortened textual dump of an array.
// Inputs longer than 32 elements are rendered as their first 16 and last 16
// elements joined by "~...~"; anything shorter is rendered in full.
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16;
    if (x.length <= 2 * edge)
        return text(x);
    return text(x[0 .. edge],"~...~", x[x.length-edge .. x.length]);
}
3782
3783/**
3784    Maps `Key` to a suitable integer index within the range of `size_t`.
3785    The mapping is constructed by applying predicates from `Prefix` left to right
3786    and concatenating the resulting bits.
3787
3788    The first (leftmost) predicate defines the most significant bits of
3789    the resulting index.
3790 */
3791template mapTrieIndex(Prefix...)
3792{
3793    size_t mapTrieIndex(Key)(Key key)
3794        if (isValidPrefixForTrie!(Key, Prefix))
3795    {
3796        alias p = Prefix;
3797        size_t idx;
3798        foreach (i, v; p[0..$-1])
3799        {
3800            idx |= p[i](key);
3801            idx <<= p[i+1].bitSize;
3802        }
3803        idx |= p[$-1](key);
3804        return idx;
3805    }
3806}
3807
3808/*
3809    `TrieBuilder` is a type used for incremental construction
3810    of $(LREF Trie)s.
3811
3812    See $(LREF buildTrie) for generic helpers built on top of it.
3813*/
3814@trusted private struct TrieBuilder(Value, Key, Args...)
3815if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
3816{
3817    import std.exception : enforce;
3818
3819private:
3820    // last index is not stored in table, it is used as an offset to values in a block.
3821    static if (is(Value == bool))// always pack bool
3822        alias V = BitPacked!(Value, 1);
3823    else
3824        alias V = Value;
3825    static auto deduceMaxIndex(Preds...)()
3826    {
3827        size_t idx = 1;
3828        foreach (v; Preds)
3829            idx *= 2^^v.bitSize;
3830        return idx;
3831    }
3832
3833    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
3834    {
3835        alias Prefix = Args[1..$];
3836        enum lastPageSize = 2^^Prefix[$-1].bitSize;
3837        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
3838        enum roughedMaxIndex =
3839            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
3840        // check warp around - if wrapped, use the default deduction rule
3841        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
3842            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
3843    }
3844    else
3845    {
3846        alias Prefix = Args;
3847        enum maxIndex = deduceMaxIndex!(Prefix)();
3848    }
3849
3850    alias getIndex = mapTrieIndex!(Prefix);
3851
3852    enum lastLevel = Prefix.length-1;
3853    struct ConstructState
3854    {
3855        size_t idx_zeros, idx_ones;
3856    }
3857    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
3858    size_t[Prefix.length] indices;
3859    // default filler value to use
3860    Value defValue;
3861    // this is a full-width index of next item
3862    size_t curIndex;
3863    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
3864    ConstructState[Prefix.length] state;
3865    // the table being constructed
3866    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;
3867
3868    @disable this();
3869
3870    //shortcut for index variable at level 'level'
3871    @property ref idx(size_t level)(){ return indices[level]; }
3872
3873    // this function assumes no holes in the input so
3874    // indices are going one by one
3875    void addValue(size_t level, T)(T val, size_t numVals)
3876    {
3877        alias j = idx!level;
3878        enum pageSize = 1 << Prefix[level].bitSize;
3879        if (numVals == 0)
3880            return;
3881        auto ptr = table.slice!(level);
3882        if (numVals == 1)
3883        {
3884            static if (level == Prefix.length-1)
3885                ptr[j] = val;
3886            else
3887            {// can incur narrowing conversion
3888                assert(j < ptr.length);
3889                ptr[j] = force!(typeof(ptr[j]))(val);
3890            }
3891            j++;
3892            if (j % pageSize == 0)
3893                spillToNextPage!level(ptr);
3894            return;
3895        }
3896        // longer row of values
3897        // get to the next page boundary
3898        immutable nextPB = (j + pageSize) & ~(pageSize-1);
3899        immutable n =  nextPB - j;// can fill right in this page
3900        if (numVals < n) //fits in current page
3901        {
3902            ptr[j .. j+numVals]  = val;
3903            j += numVals;
3904            return;
3905        }
3906        static if (level != 0)//on the first level it always fits
3907        {
3908            numVals -= n;
3909            //write till the end of current page
3910            ptr[j .. j+n]  = val;
3911            j += n;
3912            //spill to the next page
3913            spillToNextPage!level(ptr);
3914            // page at once loop
3915            if (state[level].idx_zeros != size_t.max && val == T.init)
3916            {
3917                alias NextIdx = typeof(table.slice!(level-1)[0]);
3918                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
3919                    numVals/pageSize);
3920                ptr = table.slice!level; //table structure might have changed
3921                numVals %= pageSize;
3922            }
3923            else
3924            {
3925                while (numVals >= pageSize)
3926                {
3927                    numVals -= pageSize;
3928                    ptr[j .. j+pageSize]  = val;
3929                    j += pageSize;
3930                    spillToNextPage!level(ptr);
3931                }
3932            }
3933            if (numVals)
3934            {
3935                // the leftovers, an incomplete page
3936                ptr[j .. j+numVals]  = val;
3937                j += numVals;
3938            }
3939        }
3940    }
3941
3942    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
3943    {
3944        // last level (i.e. topmost) has 1 "page"
3945        // thus it need not to add a new page on upper level
3946        static if (level != 0)
3947            spillToNextPageImpl!(level)(ptr);
3948    }
3949
3950    // this can re-use the current page if duplicate or allocate a new one
3951    // it also makes sure that previous levels point to the correct page in this level
3952    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
3953    {
3954        alias NextIdx = typeof(table.slice!(level-1)[0]);
3955        NextIdx next_lvl_index;
3956        enum pageSize = 1 << Prefix[level].bitSize;
3957        assert(idx!level % pageSize == 0);
3958        immutable last = idx!level-pageSize;
3959        const slice = ptr[idx!level - pageSize .. idx!level];
3960        size_t j;
3961        for (j=0; j<last; j+=pageSize)
3962        {
3963            if (ptr[j .. j+pageSize] == slice)
3964            {
3965                // get index to it, reuse ptr space for the next block
3966                next_lvl_index = force!NextIdx(j/pageSize);
3967                version (none)
3968                {
3969                import std.stdio : writefln, writeln;
3970                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
3971                        ,level
3972                        ,indices[level-1], pageSize, j, j+pageSize);
3973                writeln("LEVEL(", level
3974                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
3975                writeln("LEVEL(", level
3976                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
3977                }
3978                idx!level -= pageSize; // reuse this page, it is duplicate
3979                break;
3980            }
3981        }
3982        if (j == last)
3983        {
3984    L_allocate_page:
3985            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
3986            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
3987            {
3988                state[level].idx_zeros = next_lvl_index;
3989            }
3990            // allocate next page
3991            version (none)
3992            {
3993            import std.stdio : writefln;
3994            writefln("LEVEL(%s) page allocated: %s"
3995                     , level, arrayRepr(slice[0 .. pageSize]));
3996            writefln("LEVEL(%s) index: %s ; page at this index %s"
3997                     , level
3998                     , next_lvl_index
3999                     , arrayRepr(
4000                         table.slice!(level)
4001                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
4002                        ));
4003            }
4004            table.length!level = table.length!level + pageSize;
4005        }
4006    L_know_index:
4007        // for the previous level, values are indices to the pages in the current level
4008        addValue!(level-1)(next_lvl_index, 1);
4009        ptr = table.slice!level; //re-load the slice after moves
4010    }
4011
4012    // idx - full-width index to fill with v (full-width index != key)
4013    // fills everything in the range of [curIndex, idx) with filler
4014    void putAt(size_t idx, Value v)
4015    {
4016        assert(idx >= curIndex);
4017        immutable numFillers = idx - curIndex;
4018        addValue!lastLevel(defValue, numFillers);
4019        addValue!lastLevel(v, 1);
4020        curIndex = idx + 1;
4021    }
4022
4023    // ditto, but sets the range of [idxA, idxB) to v
4024    void putRangeAt(size_t idxA, size_t idxB, Value v)
4025    {
4026        assert(idxA >= curIndex);
4027        assert(idxB >= idxA);
4028        size_t numFillers = idxA - curIndex;
4029        addValue!lastLevel(defValue, numFillers);
4030        addValue!lastLevel(v, idxB - idxA);
4031        curIndex = idxB; // open-right
4032    }
4033
4034    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
4035        "duplicate key->value mapping";
4036
4037public:
4038    /**
4039        Construct a builder, where `filler` is a value
4040        to indicate empty slots (or "not found" condition).
4041    */
4042    this(Value filler)
4043    {
4044        curIndex = 0;
4045        defValue = filler;
4046        // zeros-page index, ones-page index
4047        foreach (ref v; state)
4048            v = ConstructState(size_t.max, size_t.max);
4049        table = typeof(table)(indices);
4050        // one page per level is a bootstrap minimum
4051        foreach (i, Pred; Prefix)
4052            table.length!i = (1 << Pred.bitSize);
4053    }
4054
4055    /**
4056        Put a value `v` into interval as
4057        mapped by keys from `a` to `b`.
4058        All slots prior to `a` are filled with
4059        the default filler.
4060    */
4061    void putRange(Key a, Key b, Value v)
4062    {
4063        auto idxA = getIndex(a), idxB = getIndex(b);
4064        // indexes of key should always grow
4065        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
4066        putRangeAt(idxA, idxB, v);
4067    }
4068
4069    /**
4070        Put a value `v` into slot mapped by `key`.
4071        All slots prior to `key` are filled with the
4072        default filler.
4073    */
4074    void putValue(Key key, Value v)
4075    {
4076        auto idx = getIndex(key);
4077        enforce(idx >= curIndex, errMsg);
4078        putAt(idx, v);
4079    }
4080
4081    /// Finishes construction of Trie, yielding an immutable Trie instance.
4082    auto build()
4083    {
4084        static if (maxIndex != 0) // doesn't cover full range of size_t
4085        {
4086            assert(curIndex <= maxIndex);
4087            addValue!lastLevel(defValue, maxIndex - curIndex);
4088        }
4089        else
4090        {
4091            if (curIndex != 0 // couldn't wrap around
4092                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
4093            {
4094                addValue!lastLevel(defValue, size_t.max - curIndex);
4095                addValue!lastLevel(defValue, 1);
4096            }
4097            // else curIndex already completed the full range of size_t by wrapping around
4098        }
4099        return Trie!(V, Key, maxIndex, Prefix)(table);
4100    }
4101}
4102
4103/**
4104    $(P A generic Trie data-structure for a fixed number of stages.
4105    The design goal is optimal speed with smallest footprint size.
4106    )
4107    $(P It's intentionally read-only and doesn't provide constructors.
4108     To construct one use a special builder,
4109     see $(LREF TrieBuilder) and $(LREF buildTrie).
4110    )
4111
4112*/
4113@trusted private struct Trie(Value, Key, Args...)
4114if (isValidPrefixForTrie!(Key, Args)
4115    || (isValidPrefixForTrie!(Key, Args[1..$])
4116    && is(typeof(Args[0]) : size_t)))
4117{
4118    import std.range.primitives : isOutputRange;
4119    static if (is(typeof(Args[0]) : size_t))
4120    {
4121        private enum maxIndex = Args[0];
4122        private enum hasBoundsCheck = true;
4123        private alias Prefix = Args[1..$];
4124    }
4125    else
4126    {
4127        private enum hasBoundsCheck = false;
4128        private alias Prefix = Args;
4129    }
4130
4131    private this()(typeof(_table) table)
4132    {
4133        _table = table;
4134    }
4135
4136    // only for constant Tries constructed from precompiled tables
4137    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
4138        const(size_t)[] data) const
4139    {
4140        _table = typeof(_table)(offsets, sizes, data);
4141    }
4142
4143    /**
4144        $(P Lookup the `key` in this `Trie`. )
4145
4146        $(P The lookup always succeeds if key fits the domain
4147        provided during construction. The whole domain defined
4148        is covered so instead of not found condition
4149        the sentinel (filler) value could be used. )
4150
4151        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
4152        define a domain of `Trie` keys and the sentinel value. )
4153
4154        Note:
4155        Domain range-checking is only enabled in debug builds
4156        and results in assertion failure.
4157    */
4158    TypeOfBitPacked!Value opIndex()(Key key) const
4159    {
4160        static if (hasBoundsCheck)
4161            assert(mapTrieIndex!Prefix(key) < maxIndex);
4162        size_t idx;
4163        alias p = Prefix;
4164        idx = cast(size_t) p[0](key);
4165        foreach (i, v; p[0..$-1])
4166            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
4167        return _table.ptr!(p.length-1)[idx];
4168    }
4169
4170    ///
4171    @property size_t bytes(size_t n=size_t.max)() const
4172    {
4173        return _table.bytes!n;
4174    }
4175
4176    ///
4177    @property size_t pages(size_t n)() const
4178    {
4179        return (bytes!n+2^^(Prefix[n].bitSize-1))
4180                /2^^Prefix[n].bitSize;
4181    }
4182
4183    ///
4184    void store(OutRange)(scope OutRange sink) const
4185        if (isOutputRange!(OutRange, char))
4186    {
4187        _table.store(sink);
4188    }
4189
4190private:
4191    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
4192}
4193
// Builds a sequence of `sliceBits` predicates that cut the lowest `top` bits
// into consecutive pieces whose widths are given by `sizes`. Pieces are
// produced left-to-right starting from the most significant bits, so the
// first one covers [top - sizes[0], top).
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!();
    else
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(top - sizes[0], top),
                      GetBitSlicing!(top - sizes[0], sizes[1..$]));
}
4205
// Curried predicate: `callableWith!T` yields a template that tells whether
// `Pred` can be called with a `T` and returns something whose unpacked type
// is bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init))))
        {
            // the call compiles - now check what it returns
            alias Result = typeof(Pred(T.init));
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        }
        else
            enum callableWith = false;
    }
}
4219
4220/*
4221    Check if `Prefix` is a valid set of predicates
4222    for `Trie` template having `Key` as the type of keys.
4223    This requires all predicates to be callable, take
4224    single argument of type `Key` and return unsigned value.
4225*/
4226template isValidPrefixForTrie(Key, Prefix...)
4227{
4228    import std.meta : allSatisfy;
4229    enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
4230}
4231
4232/*
4233    Check if `Args` is a set of maximum key value followed by valid predicates
4234    for `Trie` template having `Key` as the type of keys.
4235*/
4236template isValidArgsForTrie(Key, Args...)
4237{
4238    static if (Args.length > 1)
4239    {
4240        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
4241            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
4242    }
4243    else
4244        enum isValidArgsForTrie = isValidPrefixForTrie!Args;
4245}
4246
// Adds up the compile-time integer arguments `ints`; an empty list sums to 0.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total;
    foreach (term; ints)
    {
        total += term;
    }
    return total;
}
4254
4255/**
4256    A shorthand for creating a custom multi-level fixed Trie
4257    from a `CodepointSet`. `sizes` are numbers of bits per level,
4258    with the most significant bits used first.
4259
4260    Note: The sum of `sizes` must be equal 21.
4261
4262    See_Also: $(LREF toTrie), which is even simpler.
4263
4264    Example:
4265    ---
4266    {
4267        import std.stdio;
4268        auto set = unicode("Number");
4269        auto trie = codepointSetTrie!(8, 5, 8)(set);
4270        writeln("Input code points to test:");
4271        foreach (line; stdin.byLine)
4272        {
4273            int count=0;
4274            foreach (dchar ch; line)
4275                if (trie[ch])// is number
4276                    count++;
4277            writefln("Contains %d number code points.", count);
4278        }
4279    }
4280    ---
4281*/
4282public template codepointSetTrie(sizes...)
4283if (sumOfIntegerTuple!sizes == 21)
4284{
4285    auto codepointSetTrie(Set)(Set set)
4286        if (isCodepointSet!Set)
4287    {
4288        auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
4289        foreach (ival; set.byInterval)
4290            builder.putRange(ival[0], ival[1], true);
4291        return builder.build();
4292    }
4293}
4294
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    // the type is whatever `build` returns for an identically configured builder
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes));
    alias CodepointSetTrie = typeof(Builder(false).build());
}
4302
4303/**
4304    A slightly more general tool for building fixed `Trie`
4305    for the Unicode data.
4306
4307    Specifically unlike `codepointSetTrie` it's allows creating mappings
4308    of `dchar` to an arbitrary type `T`.
4309
4310    Note: Overload taking `CodepointSet`s will naturally convert
4311    only to bool mapping `Trie`s.
4312
4313    CodepointTrie is the type of Trie as generated by codepointTrie function.
4314*/
4315public template codepointTrie(T, sizes...)
4316if (sumOfIntegerTuple!sizes == 21)
4317{
4318    alias Prefix = GetBitSlicing!(21, sizes);
4319
4320    static if (is(TypeOfBitPacked!T == bool))
4321    {
4322        auto codepointTrie(Set)(const scope Set set)
4323            if (isCodepointSet!Set)
4324        {
4325            return codepointSetTrie(set);
4326        }
4327    }
4328
4329    ///
4330    auto codepointTrie()(T[dchar] map, T defValue=T.init)
4331    {
4332        return buildTrie!(T, dchar, Prefix)(map, defValue);
4333    }
4334
4335    // unsorted range of pairs
4336    ///
4337    auto codepointTrie(R)(R range, T defValue=T.init)
4338        if (isInputRange!R
4339            && is(typeof(ElementType!R.init[0]) : T)
4340            && is(typeof(ElementType!R.init[1]) : dchar))
4341    {
4342        // build from unsorted array of pairs
4343        // TODO: expose index sorting functions for Trie
4344        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
4345    }
4346}
4347
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // (the code point is always taken as 6 hex digits, so leading
        // zeros count too)
        // e.g. arabic letter DDAL (\u0688), i.e. 000688, has a "luck factor" of 3
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        // the luck factor is the count of the most frequent nibble
        uint luck;
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4399
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    // stage descriptors derived from `sizes` (which must sum up to 21)
    alias Prefix = GetBitSlicing!(21, sizes);
    // builder instantiated with the same arguments as for the type of codepointSetTrie
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    // the result is whatever type such a builder produces
    alias CodepointTrie = typeof(Builder(T.init).build());
}
4407
// Comparator factory for (value, key) tuples: orders two tuples by
// what `Pred` yields for their keys (element [1]); values are ignored.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;

    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhsIndex = Pred(a[1]);
        auto rhsIndex = Pred(b[1]);
        return lhsIndex < rhsIndex;
    }
}
4417
4418/**
4419    The most general utility for construction of `Trie`s
4420    short of using `TrieBuilder` directly.
4421
4422    Provides a number of convenience overloads.
4423    `Args` is tuple of maximum key value followed by
4424    predicates to construct index from key.
4425
4426    Alternatively if the first argument is not a value convertible to `Key`
4427    then the whole tuple of `Args` is treated as predicates
4428    and the maximum Key is deduced from predicates.
4429*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    // Split `Args`: an optional leading upper bound on Key is dropped,
    // whatever remains are the index predicates.
    static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
    {
        alias Prefix = Args[1..$];
    }
    else
        alias Prefix = Args;

    // NOTE(review): not referenced by any of the overloads in this template
    alias getIndex = mapTrieIndex!(Prefix);

    // for multi-sort
    // yields cmpK0 comparators for Prefix[0] .. Prefix[n-1], in that order
    template GetComparators(size_t n)
    {
        static if (n > 0)
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
        else
            alias GetComparators = AliasSeq!();
    }

    /*
        Build `Trie` from a range of a Key-Value pairs,
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps `Key` to an integer.

        Each element of the range holds the value at index 0
        and the key at index 1.

        See_Also: $(REF sort, std,_algorithm),
        $(REF SortedRange, std,range),
        $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v[1], v[0]); // (key, value)
        return builder.build();
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` from a range of open-right intervals of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !`filler` i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (ival; range)
            builder.putRange(ival[0], ival[1], !filler);
        return builder.build();
    }

    /*
        Same as the Key-Value range overload, but when `unsorted` is true
        the range is first sorted in place with `multiSort`, using one
        comparator per index predicate; then the sorted overload is called.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
            multiSort!(Comps)(range);
        return buildTrie(range, filler);
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` simply from an input range of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !`filler` i.e. the opposite of filler.
        If no filler provided keys map to true, and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v, !filler);
        return builder.build();
    }

    /*
        If `Key` is unsigned integer `Trie` could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if (isUnsigned!Key)
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (idx, v; array)
            builder.putValue(idx, v);
        return builder.build();
    }

    /*
        Builds `Trie` from associative array.

        The pairs are copied into an array of (value, key) tuples
        which is then passed to the overload that sorts its input.

        Note: this overload declares its own `Key` and `Value` parameters
        that shadow the ones of the enclosing template.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto range = array(zip(map.values, map.keys));
        return buildTrie(range, filler, true); // sort it
    }
}
4553
// Helper used in place of assumeSize to reduce the mangled name
// and to help DMD inline Trie functors.
// Despite the name nothing is clamped: the argument is returned as is,
// `bits` is only published as `bitSize`.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4561
// Same kind of helper as `clamp`, but for indexable keys:
// yields element `idx` of the argument and publishes `bits` as `bitSize`.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4567
4568/**
4569    Conceptual type that outlines the common properties of all UTF Matchers.
4570
4571    Note: For illustration purposes only, every method
4572    call results in assertion failure.
4573    Use $(LREF utfMatcher) to obtain a concrete matcher
4574    for UTF-8 or UTF-16 encodings.
4575*/
public struct MatcherConcept
{
    /**
        $(P Perform a semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on a
        set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        // never reached, only there to give `auto` a type to deduce
        return this;
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
4669
/**
    Test if `M` is an UTF Matcher for ranges of `C`.

    `M` qualifies if `match` and `test` return `bool` both for a `C[]`
    and for the code unit range produced by `decoder`; `skip` is optional,
    but if it is callable with that code unit range it must return `bool`
    for both kinds of argument as well.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // static asserts: a plain `assert(is(...))` compiles even if the
    // `is` expression is false and so would never reject anything
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    static if (is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});

pure @safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
    // a type without match/test is not a matcher
    static assert(!isUtfMatcher!(int, char));
}
4697
// Tells a matcher look-up what to do with the input range afterwards.
enum Mode
{
    alwaysSkip,  // pop the tested code point whatever the outcome
    neverSkip,   // leave the range as it is
    skipOnMatch, // pop the tested code point only if it matched
}
4703
// Mixed into matchers: provides `fwdStr`, used by the `C[]` overloads of
// match/skip/test to reach the range-based overloads.
mixin template ForwardStrings()
{
    // Reinterprets `str` in place as the type returned by `byCodeUnit(str)`
    // and calls the method named `fn` on it. The call goes through a pointer
    // to `str` itself, so whatever `fn` does to the range shows up in `str`.
    // NOTE(review): the cast assumes that type is layout-compatible with
    // `C[]` (hence @trusted) - confirm against std.utf.byCodeUnit.
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        alias type = typeof(byCodeUnit(str));
        return mixin(fn~"(*cast(type*)&str)");
    }
}
4713
// Everything needed to build and run a matcher over UTF-8 code units:
// one look-up table per encoded length (1 to 4 code units), the `build`
// factory, the full matcher `Impl` and the sub-matcher `CherryPick`.
template Utf8Matcher()
{
    // encoded lengths that exist in UTF-8
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // common exit for every malformed sequence
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    // table types, ordered by encoded length
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    // table type for sequences of `size` code units
    alias Table(int size) = Tables[size-1];

    // low (7 - size) bits set: the payload bits of a lead byte
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // top `size` bits set: what is left of a lead byte of a `size`-unit
    // sequence once its payload bits are masked off
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Strips the marker of a continuation byte leaving its 6 payload bits;
    // throws if `ch` is outside of 0x80 .. 0xBF (char is unsigned, so
    // anything below 0x80 wraps around and fails the check as well).
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Builds the table key for `ch`: its UTF-8 encoding with the marker bits
    // masked off in every code unit, cut to the first `sz` code units.
    static auto encode(size_t sz)(dchar ch)
        if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by encoded length, builds a table for each part
    // and packs them into a matcher that handles all four lengths.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        // the multi-unit lengths, i.e. Sizes without 1
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        // The result is an if-else chain with one test of the lead byte `ch`
        // per size in UniSizes, each calling lookup!(size, mode)(inp).
        // The final else throws if all 4 sizes are handled,
        // otherwise it reports a mismatch.
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        // mixed into match/skip/test below, expects `ch`, `inp` and `mode` in scope
        enum dispatch = genDispatch();

        // Tests the code point at the front of `inp`,
        // pops it off only if it belongs to the set.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Tests the code point at the front of `inp`
            // and pops it off whatever the result.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // Tests the code point at the front of `inp`, never pops anything.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // Overloads for arrays, forwarded to the range versions via fwdStr.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // The matcher proper: owns one table per entry of `Sizes`.
    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // Sub-matcher for a subset of lengths, holds a pointer to this matcher.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Decodes a sequence of `size` code units at the front of `inp` into
        // a table key, validates it and looks it up; `mode` decides whether
        // `inp` is advanced. Throws on truncated input, bad continuation
        // bytes and overlong encodings.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[1]
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x10000-0x10FFFF
                //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);
                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // Sub-matcher restricted to `Sizes`: owns no tables, everything goes
    // through the pointer to the parent matcher of type `I`.
    struct CherryPick(I, Sizes...)
    {
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        I* m;
        // table of the parent for sequences of `i` code units (returned by value)
        @property auto tab(int i)() const { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}
4999
// Everything needed to build and run a matcher over UTF-16 code units:
// tables for ASCII, the rest of the BMP and surrogate pairs, the `build`
// factory, the full matcher `Impl` and the sub-matcher `CherryPick`.
template Utf16Matcher()
{
    // encoded lengths that exist in UTF-16
    enum validSize(int sz) = sz >= 1 && sz <= 2;

    // common exit for every malformed sequence
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-16 sequence");
    }

    // 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
    //2-stage BMP
    alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
    //4-stage - full Unicode
    //assume that 0xD800 & 0xDC00 bits are cleared
    //thus leaving 10 bit per wchar to worry about
    alias UniSpec = AliasSeq!(bool, wchar[2],
        assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
        assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
    );
    alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
    alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
    alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());

    // Builds the table key for a code point above the BMP: the payload
    // of the high and of the low surrogate, 10 bits each.
    auto encode2(dchar ch)
    {
        ch -= 0x1_0000;
        assert(ch <= 0xF_FFFF);
        wchar[2] ret;
        //do not put surrogate bits, they are sliced off
        ret[0] = cast(wchar)(ch >> 10);
        // exactly the 10 bits of the low surrogate: the same key that
        // lookupUni produces with `inp[1] & 0x3ff` and what the 4 + 6 bits
        // declared for this code unit in UniSpec can hold
        ret[1] = (ch & 0x3FF);
        return ret;
    }

    // Splits `set` into ASCII, the rest of the BMP (surrogates excluded)
    // and everything else, builds a table for each part and packs them
    // into a matcher that handles both lengths.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
            - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
        auto other = set - (bmp | ascii);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
        auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
        alias Ret = Impl!(1,2);
        return Ret(asciiT, bmpT, otherT);
    }

    //bootstrap full UTF-16 matcher interface from
    //sizeFlags, lookupUni and ascii
    mixin template DefMatcher()
    {
        // Tests the code point at the front of `inp`,
        // pops it off only if it belongs to the set.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (sizeFlags & 1)
            {
                if (ch < 0x80)
                {
                    if (ascii[ch])
                    {
                        inp.popFront();
                        return true;
                    }
                    else
                        return false;
                }
                return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }

        static if (Sizes.length == 2)
        {
            // Tests the code point at the front of `inp`
            // and pops it off whatever the result.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                immutable ch = inp[0];
                static if (sizeFlags & 1)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return ascii[ch];
                    }
                    else
                        return lookupUni!mode(inp);
                }
                else
                    return lookupUni!mode(inp);
            }
        }

        // Tests the code point at the front of `inp`, never pops anything.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (sizeFlags & 1)
                return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
            else
                return lookupUni!mode(inp);
        }

        // Overloads for arrays, forwarded to the range versions via fwdStr.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings; //dispatch strings to range versions
    }

    // The matcher proper: owns the tables selected by `Sizes`.
    struct Impl(Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // bit 0 set: single code units are handled, bit 1 set: pairs are
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // Sub-matcher for a subset of lengths, holds a pointer to this matcher.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Looks up the code point at the front of `inp` in `bmp` or in `uni`
        // depending on whether it starts with a high surrogate; `mode`
        // decides whether `inp` is advanced. Throws on a lone low surrogate
        // and on a high surrogate that is not followed by a low one.
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            // wraps around for anything below 0xD800
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // same key layout as produced by encode2
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }

    // Sub-matcher: owns no tables, everything goes through the pointer
    // to the parent matcher of type `I`.
    struct CherryPick(I, Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        I* m;
        // NOTE(review): taken from the parent, not computed from `Sizes`,
        // so `Sizes` only decides whether `skip` is generated - confirm
        // that this is intended
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            @property auto ascii()() const { return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
}
5249
// Builds a matcher for `set` using the default `Utf8Matcher` instantiation.
private auto utf8Matcher(Set)(Set set)
{
    auto matcher = Utf8Matcher!().build(set);
    return matcher;
}
5254
// Builds a matcher for `set` using the default `Utf16Matcher` instantiation.
private auto utf16Matcher(Set)(Set set)
{
    auto matcher = Utf16Matcher!().build(set);
    return matcher;
}
5259
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    Params:
        Char = code unit type; a type implicitly convertible to `char`
               selects the UTF-8 matcher, one convertible to `wchar` selects
               the UTF-16 matcher, anything else is rejected at compile time
        set = set of $(CODEPOINTS) to build the matcher from

    Returns:
        The result of `utf8Matcher(set)` or `utf16Matcher(set)`.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // the message is concatenated from two literals so that no line break
        // or source indentation ends up inside the diagnostic text
        static assert(false, "UTF-32 needs no decoding, "
            ~ "and thus is not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5280
5281
// Wraps a slice of code units together with a running index, so that moving
// forward only bumps the index instead of re-slicing the array.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;     // code units; only ever shortened from the back
        size_t idx;  // index of the current front element inside `str`

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        @property bool empty()
        {
            return idx == str.length;
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        @property auto save()
        {
            return this;
        }

        void popFront()
        {
            idx++;
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx+i];
        }

        // both bounds are relative to the current front
        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx+b], idx+a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    auto result = Decoder(s, offset);
    return result;
}
5307
// Walks one UTF-8 string with the full matcher and two sub-matchers, checking
// how `test`, `match` and `skip` move (or do not move) the decoder index.
pure @safe unittest
{
    // Layout required by the assertions below: two ASCII letters, '!', ' ',
    // then exactly seven non-ASCII letters followed by a non-letter.
    // The previous literal had been damaged by a re-encoding (it consisted of
    // U+FFFD replacement characters), so the text is spelled with \u escapes:
    // seven Cyrillic letters, a space, six more Cyrillic letters.
    string rs = "hi! \u043D\u0435\u043C\u043D\u043E\u0433\u043E \u0442\u0435\u043A\u0441\u0442\u0430";
    auto codec = rs.decoder;
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);
    // 'h'
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // 'i'
    assert(!uni.match(codec));
    assert(asc.test(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);
    assert(!asc.match(codec));

    // '!' is not a letter; skip still advances past it
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    // ' ' is not a letter either
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    // first of the seven non-ASCII letters
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    // the space after the first word
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match must leave the index alone
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5362
// Encodes every third letter of `unicode.L` as UTF-8 and UTF-16 and checks
// that the full matchers accept it and that `test`, `match` and `skip` agree.
pure @safe unittest
{
    import std.range : stride;
    // Runs `test`, `match` and (when the matcher provides it) `skip` on the
    // same position of `r`, asserts they agree, restores the index and
    // returns the common verdict.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // was `subMatcher!1`, a copy of the line above; this matcher is checked
    // against two-code-unit encodings below, so it must pick size 2
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5411
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` / `\0x80` below are the escape `\0` followed by the
    // literal characters "x00" / "x80", probably typos for `\x00` / `\x80`.
    // They are left untouched: every entry must still be rejected, and the
    // corrected byte sequences have not been checked against the matcher.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // on failure the assert message is a hex dump of the offending input
        // (an unused local `idx` was removed from this lambda)
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5442
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Params:
        level = number of trie levels, from 1 to 4; any other value is
                rejected at compile time
        set = set of $(CODEPOINTS) to build the trie from

    Returns:
        The result of `codepointSetTrie` instantiated with the bit split
        chosen for `level`.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // validated up front: the former message only mentioned "levels > 4"
    // although level 0 is rejected as well
    static assert(level >= 1 && level <= 4,
        "Sorry, toTrie supports only levels 1 to 4, use codepointSetTrie directly");

    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
}
5477
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto trie = toTrie!3(set);
    // the delegate captures `trie` and answers by indexing it
    return delegate(dchar ch) { return trie[ch]; };
}
5495
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    // number of bits promised to be sufficient for the value
    enum bitSize = sz;
    // the wrapped value itself
    T _value;
    // `alias this` provides the implicit conversion to `T`
    alias _value this;
}
5517
/*
    Depending on the form of the passed argument `bitSizeOf` returns
    the amount of bits required to represent a given type
    or a return type of a given functor.

    Resolution order, first match wins:
    1. the argument exposes a `bitSize` member usable as `size_t`;
    2. the argument's return type is a `BitPacked` instantiation;
    3. otherwise the full storage size of the argument in bits.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    // case 1: anything that declares its own `bitSize`
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    // case 2: a callable whose return type is BitPacked - recurse on that type
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    // case 3: fall back to the size of the argument itself
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
5541
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    // the pattern match itself already yields the boolean answer
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5553
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    // on a successful match `U` is bound to the wrapped type
    static if (is(T == BitPacked!(U, bits), U, size_t bits))
    {
        alias TypeOfBitPacked = U;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5565
/*
    Wrapper, used in definition of custom data structures from `Trie` template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within `bits` of bits.

    The promise is not checked here: `opCall` forwards to `Fn` unchanged and
    `bitSize` merely records the declared width.
*/
struct assumeSize(alias Fn, size_t bits)
{
    // declared width of the result, in bits
    enum bitSize = bits;
    // calls `Fn` with the argument and returns whatever it returns
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}
5579
/*
    Callable that extracts the bits [from, to) of an integral value and moves
    them down to bit 0. It exposes `bitSize` itself, so it can be passed to
    the `Trie` template directly, the same way an `assumeSize` wrapper is.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        assert(result < (1 << to-from));
    }
    do
    {
        static assert(from < to);
        // mask with the low `to - from` bits set
        enum mask = (1 << (to - from)) - 1;
        static if (from != 0)
            return (x >> from) & mask;
        else
            return x & mask; // nothing to shift when the slice starts at bit 0
    }
}
5604
// Lowest byte (bits 0 .. 7) of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    return cast(ubyte) x;
}
// Second lowest byte (bits 8 .. 15) of `x`, moved down to bit 0.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
// `low_8` and `midlow_8` tagged as producing 8-bit results
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
5609
// bitSizeOf must report the declared width for each supported kind of argument
@safe pure nothrow @nogc unittest
{
    // a BitPacked type carries its width in the type
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
    // sliceBits declares `to - from` bits
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    // an assumeSize wrapper declares the width it was given
    static assert(bitSizeOf!lo8 == 8);
}
5616
// Compile-time sequence of the values start, start+1, ..., end-1;
// empty when `start` is not below `end`.
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
        alias Sequence = AliasSeq!();
    else
        alias Sequence = AliasSeq!(start, Sequence!(start + 1, end));
}
5624
//---- TRIE TESTS ----
// Builds tries with one to four levels from small sets and checks every
// lookup against membership in the source set; also covers a string-keyed
// trie and a trie over all ubyte values.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // Prints per-level footprint statistics; compiles to nothing unless the
    // `std_uni_stats` version is set.
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    // single level: the loops below treat both intervals as half-open
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two levels: the same interval shapes repeat at offsets 0/512 and 256/768
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // three levels: the same interval shapes repeat at offsets 0, 16 and 32
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four levels over a 16-bit key space; only positive membership is checked
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

        // string keys, indexed by their first character only
        alias mapToS = mapTrieIndex!(useItemAt!(0, char));
        string[] redundantS = ["tea", "start", "orange"];
        redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
        auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
        // using first char only
        assert(redundantS == ["orange", "start", "tea"]);
        assert(strie["test"], text(strie["test"]));
        assert(!strie["aea"]);
        assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5729
// Trie index functor: uses element `idx` of an array key, tagged with the
// full bit width of the element type `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr)
    {
        return arr[idx];
    }

    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5736
// Trie index functor: uses the last element of an array key, tagged with the
// full bit width of the element type `T`.
template useLastItem(T)
{
    size_t impl(const scope T[] arr)
    {
        return arr[arr.length - 1];
    }

    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5742
// Sum of `bitSizeOf` over all entries of `Prefix`; 0 for an empty list.
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
        enum fullBitSize = bitSizeOf!(Prefix[0]) + fullBitSize!(Prefix[1 .. $]);
}
5750
// Computes the sequence of BitPacked index types for all levels of a trie
// except the last (value) level, given the key type, the total number of key
// bits still to be covered (`fullBits`) and the per-level prefix functors.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus it's size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                // index types of the levels above, with the last prefix dropped
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                // this level's entry: typed by what the second-to-last prefix
                // returns, and as wide as the bits left without the last prefix
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}
5771
5772//============================================================================
5773
// Three-way comparison of two property names using loose matching: both sides
// are lowercased with `std.ascii.toLower`, and white space, '-' and '_' are
// ignored. Returns the result of `cmp` on the normalized character sequences.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;

    // keeps every character that is not a separator
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }

    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5785
// names that differ only in case and separators compare as equal
@safe pure unittest
{
    assert(comparePropertyName("foo-bar", "fooBar") == 0);
}
5790
// Strict "less than" ordering on property names, built on
// `comparePropertyName`.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5796
5797//============================================================================
5798// Utilities for compression of Unicode code point sets
5799//============================================================================
5800
// Appends `val` to `arr` in a variable-length form:
//   val < 128   -> 1 byte, the value itself
//   val < 2^13  -> 2 bytes, top bits 100 then 13 value bits
//   otherwise   -> 3 bytes, top bits 101 then 21 value bits (val < 2^21 asserted)
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        // high 5 bits share the first byte with the tag
        arr ~= cast(ubyte)(twoByteTag | (val >> 8));
        arr ~= cast(ubyte) val;
        return;
    }
    assert(val < (1 << 21));
    arr ~= cast(ubyte)(threeByteTag | (val >> 16));
    arr ~= cast(ubyte)(val >> 8);
    arr ~= cast(ubyte) val;
}
5819
// Reads one value in the format written by `compressTo`, starting at
// `arr[idx]`, and advances `idx` past it. Throws (via `enforce`) when the
// trailing bytes announced by the first byte are not all present.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the first byte selects one or two trailing bytes
    immutable size_t trailing = (lead & 0x20) ? 2 : 1;
    enforce(idx + trailing <= arr.length, "bad code point interval encoding");
    uint result = lead & 0x1F;
    foreach (b; arr[idx .. idx + trailing])
        result = (result << 8) | b;
    idx += trailing;
    return result;
}
5834
5835
// Encodes a range of [start, end) pairs as a byte stream: each boundary is
// written with `compressTo` as the difference from the previously written
// boundary. An end equal to `lastDchar+1` is not written at all.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint previous = 0;
    // RLE encode
    foreach (interval; intervals)
    {
        compressTo(interval[0] - previous, encoded);
        previous = interval[0];
        // till the end of the domain so don't store it
        if (interval[1] == lastDchar+1)
            continue;
        compressTo(interval[1] - previous, encoded);
        previous = interval[1];
    }
    return encoded;
}
5854
// Checks the byte layout produced by compressIntervals, value-by-value
// decoding with decompressFrom, and the full round trip.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    // deltas 80, 47, 1 take one byte each; 1 << 10 needs the two-byte form
    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    // the last interval ends at lastDchar+1, so its end is not stored
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    // round trip
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5877
// Wraps compressed interval data in a `DecompressedIntervals` range, which
// yields `CodepointInterval` values decoded one at a time.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto intervals = DecompressedIntervals(data);
    return intervals;
}
5883
// Forward range over a compressed interval stream. One interval is decoded
// ahead of time and kept in `_front`; `_idx == size_t.max` marks exhaustion.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;    // compressed data
    size_t _idx;               // read position, or size_t.max once exhausted
    CodepointInterval _front;  // most recently decoded interval

    // decodes the first interval right away (or marks the range empty)
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        immutable streamEnd = _stream.length;
        if (_idx == streamEnd)
        {
            _idx = size_t.max;
            return;
        }
        // each stored value is an offset from the previously decoded boundary
        immutable uint start = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = start;
        if (_idx != streamEnd)
            _front[1] = start + decompressFrom(_stream, _idx);
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property DecompressedIntervals save() return scope
    {
        return this;
    }
}
5928
// DecompressedIntervals must satisfy the forward range interface
@safe pure nothrow @nogc unittest
{
    static assert(isForwardRange!DecompressedIntervals);
    static assert(isInputRange!DecompressedIntervals);
}
5934
5935//============================================================================
5936
5937version (std_uni_bootstrap){}
5938else
5939{
5940
// Binary search for `name` among the `name` fields of `table`, which is
// assumed to be sorted by `propertyNameLess`. Returns the index of the
// matching entry, or -1 when there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries ordered before `name` == position of a potential match
    immutable size_t pos = names.lowerBound(name).length;
    if (pos >= names.length)
        return -1;
    if (comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5953
// Looks `name` up in `table` and, when found, stores the set built from the
// entry's compressed data in `dest`. Returns whether the name was found;
// `dest` is left untouched otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5965
// Loads the code point set for property `name` into `target`.
// Names are compared with `comparePropertyName`. Cumulative names are
// assembled here from the individual sets in `uniProps`; any other name is
// looked up in `uniProps.tab`.
// Returns: true when `target` was assigned, false when the name is unknown.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    // NOTE(review): this branch unions Co with Lo, No, So and Po, i.e. the
    // "other" member of several categories. The Unicode general category
    // "Other" (C) is normally the union of the C* categories - confirm which
    // meaning is intended before relying on it.
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Co);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.No);
        target |= asSet(uniProps.So);
        target |= asSet(uniProps.Po);
    }
    // "graphical": Alphabetic plus marks, numbers, punctuation,
    // space separators (Zs only) and symbols
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    // the whole code point range [0, 0x110000)
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    // the 7-bit range [0, 0x80)
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not a hand-built name: the table lookup decides the result
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6064
// CTFE-only helper for checking property names at compile-time.
// Returns true when `name` matches one of the listed names under
// `comparePropertyName`.
// NOTE(review): "C"/"Other" are handled by loadProperty but are not listed
// here - confirm whether that omission is deliberate.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    auto known = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "Graphical",
        "any",
        "ascii"
    ];
    // `find` returns a non-empty remainder exactly when some entry matched
    auto rest = find!(candidate => comparePropertyName(candidate, name) == 0)(known);
    return !rest.empty;
}
6084
// ditto, CTFE-only, not optimized
// True when `table` has an entry whose name matches `name`.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable pos = findUnicodeSet!table(name);
    return pos >= 0;
}
6090
// Provides two ways of looking up a set in `table`: by a run-time string
// (`opCall`, throws on an unknown name) and by a compile-time identifier
// (`opDispatch`, fails compilation on an unknown name). `kind` is only used
// in the error text.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet result;
        if (!loadUnicodeSet!table(name, result))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return result;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            // the name is known to exist, so the result flag is not checked
            CodepointSet result;
            loadUnicodeSet!table(name, result);
            return result;
        }
    }
}
6118
// Characters that need to be escaped when a string is used as a regular expression
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6122
// Evaluates the D expression given as the string `expr` and caches the
// result in a function-local static, so later run-time calls return the
// stored value. During CTFE the expression is evaluated on every call.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);

    alias T = typeof(mixin(expr));
    static bool initialized;
    static T slot;

    if (initialized)
        return slot;

    // first run-time call: compute once and remember
    slot =  mixin(expr);
    initialized = true;
    return slot;
}
6137
//property for \w character class
// The set expression is passed as source text to `memoizeExpr`.
package(std) @property CodepointSet wordCharacter() @safe
{
    enum expr = "unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc";
    return memoizeExpr!(expr)();
}
6144
//basic stack, just in case it gets used anywhere else then Parser
// Backed by a dynamic array; the top of the stack is the last element.
package(std) struct Stack(T)
{
@safe:
    T[] data;

    @property bool empty()
    {
        return data.length == 0;
    }

    @property size_t length()
    {
        return data.length;
    }

    void push(T val)
    {
        data ~= val;
    }

    // Removes and returns the top element; the stack must not be empty.
    @trusted T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto val = data[last];
        data = data[0 .. last];
        // outside CTFE, mark the shortened slice as safe to append to in place
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return val;
    }

    // Reference to the top element; the stack must not be empty.
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6172
//test if a given string starts with hex number of maxDigit that's a valid codepoint
//returns it's value and skips these maxDigit chars on success, throws on failure
//
// Params:
//   str      = input range; exactly `maxDigit` elements are consumed on success
//   maxDigit = number of hexadecimal digits to read
// Returns: the parsed code point.
// Throws: Exception with "incomplete escape sequence" when the input ends
//   early, "invalid escape sequence" on a non-hex character, and
//   "invalid codepoint" when the value exceeds 0x10FFFF.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    enum uint limit = 0x10FFFF;
    uint val;
    // size_t counter: the previous `int` index was compared against a size_t
    foreach (k; 0 .. maxDigit)
    {
        enforce(!str.empty, "incomplete escape sequence");
        //accepts ascii only, so it's OK to index directly
        immutable current = str.front;
        uint digit;
        if ('0' <= current && current <= '9')
            digit = current - '0';
        else if ('a' <= current && current <= 'f')
            digit = current - 'a' + 10;
        else if ('A' <= current && current <= 'F')
            digit = current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        val = val * 16 + digit;
        // Saturate just above the limit. Without this, more than 8 digits
        // could wrap the 32-bit accumulator around and let an out-of-range
        // value pass the final check. Scanning continues so that malformed
        // digits are still reported first, as before.
        if (val > limit)
            val = limit + 1;
        str.popFront();
    }
    enforce(val <= limit, "invalid codepoint");
    return val;
}
6198
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;

    // inputs containing a non-hex character must be rejected
    foreach (input; ["000j", "000z", "FffG", "0Z"])
    {
        auto failure = collectException(parseUniHex(input, input.length));
        assert(failure.msg.canFind("invalid escape sequence"));
    }

    // well-formed inputs paired (by index) with the value they denote
    string[] digits = ["01", "ff", "00af", "10FFFF"];
    int[] expected = [1, 0xFF, 0xAF, 0x10FFFF];
    foreach (idx; 0 .. digits.length)
    {
        string text = digits[idx];
        assert(parseUniHex(text, text.length) == expected[idx]);
    }

    // syntactically fine but above the last valid code point
    string tooBig = "0011FFFF";
    auto rangeFailure = collectException(parseUniHex(tooBig, tooBig.length));
    assert(rangeFailure.msg.canFind("invalid codepoint"));
}
6215
// Returns `set` extended with every simple case folding of the members it
// shares with unicode.LC.
auto caseEnclose(CodepointSet set)
{
    // taken up front: `set` itself is modified inside the loop
    auto casedLetters = set & unicode.LC;
    foreach (dchar letter; casedLetters.byCodepoint)
        foreach (variant; simpleCaseFoldings(letter))
            set |= variant;
    return set;
}
6226
/+
    Look up the codepoint set registered under `name` (InBlock or binary
    property), optionally closing it over case and/or inverting it.
    Case closure is applied before the inversion.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    auto found = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        found = caseEnclose(found);
    return negated ? found.inverted : found;
}
6240
// Parser for '[...]' code point set expressions (regex-style character
// classes with the set operators ||, --, ~~ and &&).
//
// parseCharTerm is a state machine that consumes one run of characters,
// escapes and ranges together with the operator that follows it; parseSet
// combines those terms using a value stack and an operator stack.
// When casefold_ is set, every literal, escape and range that gets added is
// closed over simple case foldings.
// Malformed or truncated input is reported by throwing (via enforce).
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    // the parser itself is an input range over `range`, so helpers such as
    // parseUniHex and unicode.parsePropertySpec can consume from it directly
    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //Returns: the set collected so far and the operator that terminated the
    //term (Operator.None if the term ended at ']')
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        //adds ch, or all of its simple case foldings when case folding is on
        void addWithFlags(ref CodepointSet target, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    target |= v;
            }
            else
                target |= ch;
        }

        //adds the inclusive range [first, lastIncl], honoring case folding
        void addRangeWithFlags(ref CodepointSet target, uint first, uint lastIncl)
        {
            if (casefold_)
            {
                for (uint ch = first; ch <= lastIncl; ch++)
                    addWithFlags(target, ch);
            }
            else
                target.add(first, lastIncl + 1);
        }

        //maps the character of a doubled operator (e.g. "--") to the operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        //the state machine reads front right away, so report exhausted input
        //as a parse error instead of reading past the end
        enforce(!empty, "unexpected end of CodepointSet");

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // the pending literal must be case folded like in every
                    // other branch, not added verbatim
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                 case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    //Parses a complete (possibly nested) '[...]' expression starting at the
    //current position and returns the resulting set.
    //The range is left just past the closing ']'.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        //applies op to the operand(s) on top of the value stack;
        //returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        //pops and applies operators for as long as cond holds for the top one;
        //returns false on a syntax error or when the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            //input may run out while brackets are still open (e.g. right
            //after a twin operator); report it instead of reading front
            enforce(!empty, "unexpected end of character class");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                //an enclosing set is still open, so more input is required
                enforce(!empty, "unexpected end of character class");
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6687
/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
    This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    import std.exception : enforce;
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        `unicode.InBlockName`, to search a script
        use `unicode.ScriptName`.

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        // U+00E0 LATIN SMALL LETTER A WITH GRAVE lies outside of ASCII
        assert(!ascii['\u00e0']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['\u00e0']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.
        See_Also: $(S_LINK Unicode properties, table of properties).
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        // (U+0601 ARABIC SIGN SANAH belongs to both)
        assert(arabicBlock['\u0601']);
        assert(arabicScript['\u0601']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }

    //parse control code of form \cXXX, c assumed to be the current symbol
    //returns the letter following 'c' masked with 0x1f and leaves that letter
    //as the current symbol; throws if it is missing or not an ASCII letter
    static package(std) dchar parseControlCode(Parser)(ref Parser p)
    {
        with(p)
        {
            popFront();
            enforce(!empty, "Unfinished escape sequence");
            enforce(('a' <= front && front <= 'z')
                || ('A' <= front && front <= 'Z'),
            "Only letters are allowed after \\c");
            return front & 0x1f;
        }
    }

    //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
    //\ - assumed to be processed, p - is current
    //on success the whole spec (including the closing '}') is consumed;
    //throws on truncated input, on non-ASCII or over-long names and on
    //names that do not resolve to a non-empty set
    static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
        bool negated, bool casefold)
    {
        static import std.ascii;
        with(p)
        {
            enum MAX_PROPERTY = 128;
            char[MAX_PROPERTY] result;
            uint k = 0;
            popFront();
            enforce(!empty, "eof parsing unicode property spec");
            if (front == '{')
            {
                popFront();
                while (k < MAX_PROPERTY && !empty && front !='}'
                    && front !=':')
                {
                    //the name is stored as char: reject anything that would be
                    //truncated, same as the single char branch below does
                    enforce(front < 0x80, "invalid property name");
                    //'-', ' ' and '_' are insignificant in property names
                    if (front != '-' && front != ' ' && front != '_')
                        result[k++] = cast(char) std.ascii.toLower(front);
                    popFront();
                }
                enforce(k != MAX_PROPERTY, "invalid property name");
                //the loop also stops at end of input: check before reading front
                enforce(!empty && front == '}', "} expected ");
            }
            else
            {//single char properties e.g.: \pL, \pN ...
                enforce(front < 0x80, "invalid property name");
                result[k++] = cast(char) front;
            }
            auto s = getUnicodeSet(result[0 .. k], negated, casefold);
            enforce(!s.empty, "unrecognized unicode property spec");
            popFront();
            return s;
        }
    }

    /**
        Parse unicode codepoint set from given `range` using standard regex
        syntax '[...]'. The range is advanced skipping over regex set definition.
        `casefold` parameter determines if the set should be casefolded - that is
        include both lower and upper case versions for any letters in the set.
    */
    static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
    if (isInputRange!Range && is(ElementType!Range : dchar))
    {
        auto usParser = UnicodeSetParser!Range(range, casefold);
        auto set = usParser.parseSet();
        range = usParser.range;
        return set;
    }

    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        assert(pat == "hello");
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time friendly check whether `name` denotes any known set:
    // a pretty property name, a binary property, a script, or a block
    // written with the "In" prefix.
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // the length guard keeps the "In" probe from slicing past the end of
        // a very short name (same guard as in loadAny)
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Loads the set called `name`, trying properties, then scripts, then
    // "In"-prefixed blocks; throws if none of them matches.
    static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}
6943
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file

    // blocks are addressed through the "In" prefix
    auto hebrewBlock = asSet(blocks.Hebrew);
    assert(unicode("InHebrew") == hebrewBlock);

    // "separator" is the union of the three Z* categories
    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);

    // a dash right after the prefix is tolerated as well
    auto kharoshthiBlock = asSet(blocks.Kharoshthi);
    assert(unicode("In-Kharoshthi") == kharoshthiBlock);
}
6951
// NOTE(review): presumably the marker for "no entry" in the case tries
// produced by gen_uni - confirm against the generator.
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally

// control - '\r'
// Source text of `case` labels meant to be mixed into a `switch` over a code
// point (see genericDecodeGrapheme). The labels cover U+0000 .. U+001F and
// U+007F .. U+009F with the single exception of U+000D ('\r').
// The string ends right after the last label, so the statement that follows
// the mixin becomes the body of these cases.
enum controlSwitch = `
    case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
    case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
6959// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6960// kill unrolled switches
6961
// True when ch lies in U+1F1E6 .. U+1F1FF (both ends included).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstIndicator = '\U0001F1E6';
    enum dchar lastIndicator = '\U0001F1FF';
    return !(ch < firstIndicator || ch > lastIndicator);
}
6966
// Consumes one grapheme cluster from the front of an input range of code
// points.
//
// With getValue == true the consumed code points are collected into a
// Grapheme that is returned; with getValue == false nothing is returned and
// the range is merely advanced past the cluster.
// The range must not be empty (checked with an assert).
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        // what kind of code point(s) the cluster consists of so far
        enum GraphemeState {
            Start,
            CR,
            RI,
            L,
            V,
            LVT
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // accept the current code point: record it (if requested) and advance
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                // the first code point always belongs to the cluster
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    mixin(controlSwitch);
                        // a control code point is a cluster of its own
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                if (ch == '\n')
                    mixin(eat);
                // There is always a break after CR and after CR LF
                // (UAX #29, rule GB4), so no extend or spacing marks may be
                // attached here - same as for the other control code points.
                goto L_End;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        // absorb everything that extends the cluster built so far
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}
7090
7091public: // Public API continues
7092
7093/++
7094    Computes the length of grapheme cluster starting at `index`.
7095    Both the resulting length and the `index` are measured
7096    in $(S_LINK Code unit, code units).
7097
7098    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units to scan
7101        index = starting index into `input[]`
7102
7103    Returns:
7104        length of grapheme cluster
7105+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    // Decode a single cluster off the slice that starts at `index` without
    // building a Grapheme; the number of code units the decoder consumed
    // is the stride.
    auto rest = input[index .. $];
    immutable before = rest.length;
    genericDecodeGrapheme!false(rest);
    return before - rest.length;
}
7114
7115///
@safe unittest
{
    // a plain space is a cluster of its own
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    // the stride can be used directly to slice the cluster off the string
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    // (`enum` initializers force compile-time evaluation)
    enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}
7136
7137/++
7138    Reads one full grapheme cluster from an
7139    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.
7140
7141    For examples see the $(LREF Grapheme) below.
7142
7143    Note:
7144    This function modifies `inp` and thus `inp`
7145    must be an L-value.
7146+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    // Same decoder that graphemeStride uses, instantiated with
    // `getValue == true` so the consumed code points are collected and
    // returned; `inp` is advanced past the decoded cluster.
    return genericDecodeGrapheme!true(inp);
}
7152
// Checks cluster boundaries around combining marks and Hangul code points.
@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    string s = " \u0020\u0308 ";
    // each call consumes one cluster from the front of `s`
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    // the combining diaeresis stays attached to the preceding space
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    // leading combining marks are grouped together; \u1100 starts a new cluster
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    // \u11A8 keeps its combining mark but does not join the following \uAC01
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));
}
7170
7171/++
7172    $(P Iterate a string by $(LREF Grapheme).)
7173
7174    $(P Useful for doing string manipulation that needs to be aware
7175    of graphemes.)
7176
7177    See_Also:
7178        $(LREF byCodePoint)
7179+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        // code points that have not been decoded yet
        private R _range;
        // cluster currently exposed by `front`; a zero-length value marks
        // the end of the iteration
        private Grapheme _front;

        bool empty() @property
        {
            return !_front.length;
        }

        Grapheme front() @property
        {
            return _front;
        }

        void popFront()
        {
            // An exhausted source resets `_front` to the empty Grapheme,
            // which is exactly what `empty` tests for.
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto graphemes = Result!Range(range);
    // decode the first cluster up front so `front`/`empty` are valid at once
    graphemes.popFront();
    return graphemes;
}
7217
7218///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    // the e + diaeresis pair is the third grapheme
    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}
7233
// Test-only wrapper that exposes a string through the bare input range
// primitives; it deliberately has no `save`, so it is not a forward range.
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty()
    {
        return s.empty;
    }

    @property dchar front()
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
7244
// byGrapheme over empty input, all three string widths and a pure input range.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    // expected result of reversing "noe\u0308l" grapheme by grapheme
    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        // forward-range capability of the source must carry over
        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    // a source without `save` must yield a result without `save`
    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}
7274
7275/++
7276    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)
7277
7278    $(P Useful for converting the result to a string after doing operations
7279    on graphemes.)
7280
7281    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
7282+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        // source of graphemes; its front is the cluster being walked
        private Range _range;
        // position of the current code point inside `_range.front`
        private size_t i = 0;

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        void popFront()
        {
            // stay inside the current grapheme while it has code points left
            if (++i < _range.front.length)
                return;

            // otherwise move on to the start of the next grapheme
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
7324
7325/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;

    static if (!isNarrowString!Range)
    {
        // anything else that already yields dchar is handed back untouched
        return range;
    }
    else
    {
        // Narrow strings are wrapped so that the result exposes only the
        // range primitives (no `length`, no indexing by code unit).
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popFront()
            {
                _range.popFront;
            }

            void popBack()
            {
                _range.popBack;
            }

            @property auto save()
            {
                return Result(_range.save);
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7349
7350///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    // the combining mark stays on its base letter: "e\u0308" is kept intact
    assert(reverse == "le\u0308on"); // lëon
}
7368
7369@safe unittest
7370{
7371    import std.algorithm.comparison : equal;
7372    import std.range.primitives : walkLength;
7373    import std.range : retro;
7374    assert("".byGrapheme.byCodePoint.equal(""));
7375
7376    string text = "noe\u0308l";
7377    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));
7378
7379    auto gText = InputRangeString(text).byGrapheme;
7380    static assert(!isForwardRange!(typeof(gText)));
7381
7382    auto cpText = gText.byCodePoint;
7383    static assert(!isForwardRange!(typeof(cpText)));
7384
7385    assert(cpText.walkLength == text.walkLength);
7386
7387    auto plainCp = text.byCodePoint;
7388    static assert(isForwardRange!(typeof(plainCp)));
7389    assert(equal(plainCp, text));
7390    assert(equal(retro(plainCp.save), retro(text.save)));
7391    // Check that we still have length for dstring
7392    assert("����������"d.byCodePoint.length == 5);
7393}
7394
7395/++
7396    $(P A structure designed to effectively pack $(CHARACTERS)
7397    of a $(CLUSTER).
7398    )
7399
7400    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
7401    always refer to distinct objects. In most actual scenarios a `Grapheme`
7402    fits on the stack and avoids memory allocation overhead for all but quite
7403    long clusters.
7404    )
7405
7406    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
7407+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        // read from the heap buffer in "big" mode, from the inline one otherwise
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        // the slice holds the address of this object rather than a copy
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        // small mode keeps the length in the low 7 bits of `slen_`
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                // the flag bit is clear here, so `slen_` is the plain length
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // grow the heap buffer by `grow` code points; the byte count
                // is computed with overflow checks
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            // delegates to the single code point overload, one element at a time
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // valid means: decoding one cluster consumes every stored code point
        // NOTE(review): genericDecodeGrapheme is called unconditionally, so
        // presumably an empty Grapheme should not be queried - confirm.
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy in "big" mode gets its own heap buffer, so copies
    // never share storage.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Only "big" mode owns heap memory; small mode has nothing to release.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // size of the inline buffer: the whole 4-word struct minus the one byte
    // reserved for `slen_`
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // number of code points the inline buffer holds, at 3 bytes each
    enum small_cap = small_bytes/3;
    // top bit of `slen_` marks "big" mode, the low 7 bits hold the small length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // "big" mode: heap buffer, its capacity and length in code points
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // "small" mode: inline storage; `slen_` is the last byte of the union
        // and therefore lies within `padding_`, not within ptr_/cap_/len_
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the inline contents to a freshly allocated heap buffer with room
    // for `grow` code points and switches this object to "big" mode.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        // copy before touching ptr_/len_/cap_: they alias the inline buffer
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    // Marks this object as using the heap representation.
    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    // Length stored in `slen_` with the mode flag masked off.
    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // Non-zero when the heap representation is active.
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7649
7650static assert(Grapheme.sizeof == size_t.sizeof*4);
7651
7652
7653@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
7654{
7655    import std.algorithm.comparison : equal;
7656    Grapheme[3] data = [Grapheme("��"), Grapheme("��"), Grapheme("��")];
7657    assert(byGrapheme("������").equal(data[]));
7658}
7659
7660///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    // (so each call below also advances `bold` past the decoded cluster)
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}
7696
// Overwriting a code point in place can turn a valid cluster into an invalid one.
@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}
7706
7707@safe unittest
7708{
7709    import std.algorithm.comparison : equal;
7710    import std.algorithm.iteration : map;
7711    import std.conv : text;
7712    import std.range : iota;
7713
7714    // not valid clusters (but it just a test)
7715    auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
7716    assert(g[0] == 'a');
7717    assert(g[1] == 'b');
7718    assert(g[2] == 'c');
7719    assert(g[3] == 'd');
7720    assert(g[4] == 'e');
7721    g[3] = '��';
7722    assert(g[2] == 'c');
7723    assert(g[3] == '��', text(g[3], " vs ", '��'));
7724    assert(g[4] == 'e');
7725    assert(!g.valid);
7726
7727    g ~= '��';
7728    g ~= '~';
7729    assert(g[0] == 'a');
7730    assert(g[1] == 'b');
7731    assert(g[2] == 'c');
7732    assert(g[3] == '��');
7733    assert(g[4] == 'e');
7734    assert(g[5] == '��');
7735    assert(g[6] == '~');
7736    assert(!g.valid);
7737
7738    Grapheme copy = g;
7739    copy[0] = 'X';
7740    copy[1] = '-';
7741    assert(g[0] == 'a' && copy[0] == 'X');
7742    assert(g[1] == 'b' && copy[1] == '-');
7743    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
7744    copy = Grapheme("��������������������������");
7745    assert(equal(copy[0 .. 8], "����������������"), text(copy[0 .. 8]));
7746    copy ~= "xyz";
7747    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
7748    assert(!copy.valid);
7749
7750    Grapheme h;
7751    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
7752        h ~= v;
7753    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
7754}
7755
7756/++
7757    $(P Does basic case-insensitive comparison of `r1` and `r2`.
7758    This function uses simpler comparison rule thus achieving better performance
7759    than $(LREF icmp). However keep in mind the warning below.)
7760
7761    Params:
7762        r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7763        r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7764
7765    Returns:
7766        An `int` that is 0 if the strings match,
7767        &lt;0 if `r1` is lexicographically "less" than `r2`,
7768        &gt;0 if `r1` is lexicographically "greater" than `r2`
7769
7770    Warning:
7771    This function only handles 1:1 $(CODEPOINT) mapping
7772    and thus is not sufficient for certain alphabets
7773    like German, Greek and few others.
7774
7775    See_Also:
7776        $(LREF icmp)
7777        $(REF cmp, std,algorithm,comparison)
7778+/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // Fast path, compiled in only when both arguments can be indexed and
    // sliced: compare code unit by code unit while everything is ASCII.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                // Compile-time probe only, never executed: both ranges must
                // accept slicing with a size_t index. Kept identical to the
                // probe used by icmp.
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        // number of positions present in both arguments
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // first non-ASCII code unit: hand the rest to the general loop
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // common prefix matched: the shorter argument orders first
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the part that has already been compared as equal
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    // General path: decode one code point from each side per iteration;
    // invalid sequences decode to the replacement character.
    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            // both ASCII: no table lookup needed
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else//  not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                // only lhs is cased: compare its bucket's first entry with rhs
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with its bucket's first entry
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: equal if r2 is exhausted too, otherwise r1 is "less"
    return int(r2.empty) - 1;
}
7872
7873///
7874@safe @nogc pure nothrow unittest
7875{
7876    assert(sicmp("������������", "������������") == 0);
7877    // Greek also works as long as there is no 1:M mapping in sight
7878    assert(sicmp("����", "����") == 0);
7879    // things like the following won't get matched as equal
7880    // Greek small letter iota with dialytika and tonos
7881    assert(sicmp("��", "\u03B9\u0308\u0301") != 0);
7882
7883    // while icmp has no problem with that
7884    assert(icmp("��", "\u03B9\u0308\u0301") == 0);
7885    assert(icmp("����", "����") == 0);
7886}
7887
7888// overloads for the most common cases to reduce compile time
// Non-template entry points for plain character slices: each one forwards
// to the explicitly instantiated `sicmp` template for its element type.
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
7900
/*
    Helper of icmp: tries to match `lhs` against `rhs`, optionally followed
    by code points taken from the front of `rtail`, using the full case
    folding table.

    Returns: 0 on a match (in which case `rtail` may have been advanced past
    the extra code points of a multi-code-point entry); otherwise a non-zero
    code point to be used for ordering - `lhs` itself when it has no table
    entry, else the first entry of its bucket.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is a packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // [start, end) is the bucket of entries equivalent to lhs
    immutable start = hit - fTable[hit].n;
    immutable end = fTable[hit].size + start;
    assert(fTable[start].entry_len == 1);

    foreach (size_t pos; start .. end)
    {
        immutable len = fTable[pos].entry_len;
        if (len == 1)
        {
            // plain one-to-one entry
            if (fTable[pos].seq[0] == rhs)
                return 0;
            continue;
        }

        // a long chunk, like 'ss' for German: rhs has to match its first
        // code point and the tail has to start with the remaining ones
        dstring chunk = fTable[pos].seq[0 .. len];
        if (rhs == chunk[0] && rtail.skipOver(chunk[1 .. $]))
        {
            // rtail is modified if and only if this branch is reached
            return 0;
        }
    }

    // no match: hand back the remapped character for accurate diffs
    return fTable[start].seq[0];
}
7937
7938/++
7939    Does case insensitive comparison of `r1` and `r2`.
7940    Follows the rules of full case-folding mapping.
    This includes matching as equal German ß with "ss" and
7942    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
7943    The cost of `icmp` being pedantically correct is
7944    slightly worse performance.
7945
7946    Params:
7947        r1 = a forward range of characters
7948        r2 = a forward range of characters
7949
7950    Returns:
7951        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`
7954
7955    See_Also:
7956        $(LREF sicmp)
7957        $(REF cmp, std,algorithm,comparison)
7958+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // Fast path, compiled in only when both arguments can be indexed and
    // sliced: compare code unit by code unit while everything is ASCII.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        size_t pos = 0;
        // number of positions present in both arguments
        static if (isInfinite!S1)
            immutable limit = r2.length;
        else static if (isInfinite!S2)
            immutable limit = r1.length;
        else
            immutable limit = r1.length < r2.length ? r1.length : r2.length;

        while (pos < limit)
        {
            auto a = r1[pos];
            auto b = r2[pos];
            // first non-ASCII code unit: hand the rest to the general loop
            if ((a | b) >= 0x80) goto NonAsciiPath;
            if (a != b)
            {
                auto lowDiff = std.ascii.toLower(a) - std.ascii.toLower(b);
                if (lowDiff) return lowDiff;
            }
            ++pos;
        }

        // common prefix matched: the shorter argument orders first
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the part that has already been compared as equal
        r1 = r1[pos .. $];
        r2 = r2[pos .. $];
        // Fall through to standard case.
    }}

    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    while (!str1.empty)
    {
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        str1.popFront();
        str2.popFront();
        if (lhs == rhs)
            continue;

        // first try to match lhs to the <rhs, right-tail> sequence
        immutable viaRight = fullCasedCmp(lhs, rhs, str2);
        if (viaRight == 0)
            continue;

        // then rhs to the <lhs, left-tail> sequence
        immutable viaLeft = fullCasedCmp(rhs, lhs, str1);
        if (viaLeft == 0)
            continue;

        // both results hold remapped code points, which gives icmp
        // a stable ordering
        return viaRight - viaLeft;
    }

    // left side exhausted: equal only if the right side is exhausted too
    return str2.empty ? 0 : -1;
}
8036
8037///
8038@safe @nogc pure nothrow unittest
8039{
8040    assert(icmp("Ru��land", "Russland") == 0);
8041    assert(icmp("��� -> \u1F70\u03B9", "\u1F61\u03B9 -> ���") == 0);
8042}
8043
8044/**
8045 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
8046 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
8047 */
8048@safe @nogc nothrow pure unittest
8049{
8050    import std.utf : byDchar;
8051
8052    assert(icmp("Ru��land".byDchar, "Russland".byDchar) == 0);
8053    assert(icmp("��� -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ���".byDchar) == 0);
8054}
8055
8056// test different character types
8057@safe unittest
8058{
8059    assert(icmp("Ru��land", "Russland") == 0);
8060    assert(icmp("Ru��land"w, "Russland") == 0);
8061    assert(icmp("Ru��land", "Russland"w) == 0);
8062    assert(icmp("Ru��land"w, "Russland"w) == 0);
8063    assert(icmp("Ru��land"d, "Russland"w) == 0);
8064    assert(icmp("Ru��land"w, "Russland"d) == 0);
8065}
8066
// Non-template overloads for the most common cases (two slices of the same
// character type) to reduce compile time. Each overload only forwards to the
// `icmp` template, instantiated explicitly with the matching slice types.
@safe @nogc pure nothrow
{
    // UTF-8 slices
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    // UTF-16 slices
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    // UTF-32 slices
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8077
8078@safe unittest
8079{
8080    import std.algorithm.sorting : sort;
8081    import std.conv : to;
8082    import std.exception : assertCTFEable;
8083    assertCTFEable!(
8084    {
8085    static foreach (cfunc; AliasSeq!(icmp, sicmp))
8086    {{
8087        static foreach (S1; AliasSeq!(string, wstring, dstring))
8088        static foreach (S2; AliasSeq!(string, wstring, dstring))
8089        {
8090            assert(cfunc("".to!S1(), "".to!S2()) == 0);
8091            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
8092            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
8093            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
8094            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
8095            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
8096            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
8097            assert(cfunc("������������".to!S1(), "������������".to!S2()) == 0);
8098            // Check example:
8099            assert(cfunc("������������".to!S1(), "������������".to!S2()) == 0);
8100            assert(cfunc("����".to!S1(), "����".to!S2()) == 0);
8101        }
8102        // check that the order is properly agnostic to the case
8103        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
8104        sort!((a,b) => cfunc(a,b) < 0)(strs);
8105        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
8106    }}
8107    assert(icmp("��b", "ssa") > 0);
8108    // Check example:
8109    assert(icmp("Russland", "Ru��land") == 0);
8110    assert(icmp("��� -> \u1F70\u03B9", "\u1F61\u03B9 -> ���") == 0);
8111    assert(icmp("��"w, "\u03B9\u0308\u0301") == 0);
8112    assert(sicmp("��", "\u03B9\u0308\u0301") != 0);
8113    // https://issues.dlang.org/show_bug.cgi?id=11057
8114    assert( icmp("K", "L") < 0 );
8115    });
8116}
8117
// https://issues.dlang.org/show_bug.cgi?id=17372
// Checks that `icmp` can be used as a sort predicate on the ranges produced
// by `joiner` (i.e. on arguments that are not plain strings).
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    // The result is not inspected; the test only has to compile and run.
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
8126
// This is package(std) for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Returns a range over all $(CODEPOINTS) that share a simple case-folding
    bucket with `ch`. When `ch` has no entry in the simple case trie the
    range contains just `ch` itself.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    static struct Range
    {
    @safe pure nothrow:
        // Position in sTable, or uint.max to mark the single code point form.
        uint idx;
        union
        {
            dchar c; // single code point form; 0 means the range is exhausted
            uint len; // table form; number of entries still to be visited
        }

        // Single code point form.
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // Table form: `size` consecutive entries of sTable beginning at `start`.
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property bool isSmall() const { return idx == uint.max; }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (!isSmall)
                return sTable[idx].ch;
            return c;
        }

        void popFront()
        {
            if (!isSmall)
            {
                idx++;
                len--;
                return;
            }
            c = 0;
        }
    }

    immutable idx = simpleCaseTrie[ch];
    if (idx == EMPTY_CASE_TRIE)
        return Range(ch);
    // `n` is the position of this entry inside its bucket, so stepping back
    // by `n` lands on the first entry of the bucket.
    auto slot = sTable[idx];
    immutable first = idx - slot.n;
    return Range(first, slot.size);
}
8207
8208@safe unittest
8209{
8210    import std.algorithm.comparison : equal;
8211    import std.algorithm.searching : canFind;
8212    import std.array : array;
8213    import std.exception : assertCTFEable;
8214    assertCTFEable!((){
8215        auto r = simpleCaseFoldings('��').array;
8216        assert(r.length == 2);
8217        assert(r.canFind('��') && r.canFind('��'));
8218        auto sr = simpleCaseFoldings('~');
8219        assert(sr.equal("~"));
8220        //A with ring above - casefolds to the same bucket as Angstrom sign
8221        sr = simpleCaseFoldings('��');
8222        assert(sr.length == 3);
8223        assert(sr.canFind('��') && sr.canFind('��') && sr.canFind('\u212B'));
8224    });
8225}
8226
8227/++
8228    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
8229+/
8230ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
8231{
8232    return combiningClassTrie[ch];
8233}
8234
8235///
8236@safe unittest
8237{
8238    // shorten the code
8239    alias CC = combiningClass;
8240
8241    // combining tilda
8242    assert(CC('\u0303') == 230);
8243    // combining ring below
8244    assert(CC('\u0325') == 220);
8245    // the simple consequence is that  "tilda" should be
8246    // placed after a "ring below" in a sequence
8247}
8248
@safe pure nothrow @nogc unittest
{
    // every ASCII code point has combining class 0
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    // spot checks of a few non-zero classes
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8258
/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8270
8271/**
8272    Shorthand aliases for character decomposition type, passed as a
8273    template parameter to $(LREF decompose).
8274*/
8275enum {
8276    Canonical = UnicodeDecomposition.Canonical,
8277    Compatibility = UnicodeDecomposition.Compatibility
8278}
8279
8280/++
8281    Try to canonically compose 2 $(CHARACTERS).
8282    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
8283
8284    The assumption is that `first` comes before `second` in the original text,
8285    usually meaning that the first is a starter.
8286
8287    Note: Hangul syllables are not covered by this function.
8288    See `composeJamo` below.
8289+/
8290public dchar compose(dchar first, dchar second) pure nothrow @safe
8291{
8292    import std.algorithm.iteration : map;
8293    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
8294    import std.range : assumeSorted;
8295    immutable packed = compositionJumpTrie[first];
8296    if (packed == ushort.max)
8297        return dchar.init;
8298    // unpack offset and length
8299    immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;
8300    // TODO: optimize this micro binary search (no more then 4-5 steps)
8301    auto r = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted();
8302    immutable target = r.lowerBound(second).length;
8303    if (target == cnt)
8304        return dchar.init;
8305    immutable entry = compositionTable[idx+target];
8306    if (entry.rhs != second)
8307        return dchar.init;
8308    return entry.composed;
8309}
8310
8311///
8312@safe unittest
8313{
8314    assert(compose('A','\u0308') == '\u00C4');
8315    assert(compose('A', 'B') == dchar.init);
8316    assert(compose('C', '\u0301') == '\u0106');
8317    // note that the starter is the first one
8318    // thus the following doesn't compose
8319    assert(compose('\u0308', 'A') == dchar.init);
8320}
8321
8322/++
8323    Returns a full $(S_LINK Canonical decomposition, Canonical)
8324    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
8325    decomposition of $(CHARACTER) `ch`.
8326    If no decomposition is available returns a $(LREF Grapheme)
8327    with the `ch` itself.
8328
8329    Note:
8330    This function also decomposes hangul syllables
8331    as prescribed by the standard.
8332
8333    See_Also: $(LREF decomposeHangul) for a restricted version
8334    that takes into account only hangul syllables  but
8335    no other decompositions.
8336+/
8337public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
8338{
8339    import std.algorithm.searching : until;
8340    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
8341    static if (decompType == Canonical)
8342    {
8343        alias table = decompCanonTable;
8344        alias mapping = canonMappingTrie;
8345    }
8346    else static if (decompType == Compatibility)
8347    {
8348        alias table = decompCompatTable;
8349        alias mapping = compatMappingTrie;
8350    }
8351    immutable idx = mapping[ch];
8352    if (!idx) // not found, check hangul arithmetic decomposition
8353        return decomposeHangul(ch);
8354    auto decomp = table[idx..$].until(0);
8355    return Grapheme(decomp);
8356}
8357
8358///
8359@safe unittest
8360{
8361    import std.algorithm.comparison : equal;
8362
8363    assert(compose('A','\u0308') == '\u00C4');
8364    assert(compose('A', 'B') == dchar.init);
8365    assert(compose('C', '\u0301') == '\u0106');
8366    // note that the starter is the first one
8367    // thus the following doesn't compose
8368    assert(compose('\u0308', 'A') == dchar.init);
8369
8370    assert(decompose('��')[].equal("C\u0302"));
8371    assert(decompose('D')[].equal("D"));
8372    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
8373    assert(decompose!Compatibility('��')[].equal("1"));
8374}
8375
8376//----------------------------------------------------------------------------
8377// Hangul specific composition/decomposition
8378enum jamoSBase = 0xAC00;
8379enum jamoLBase = 0x1100;
8380enum jamoVBase = 0x1161;
8381enum jamoTBase = 0x11A7;
8382enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
8383enum jamoNCount = jamoVCount * jamoTCount;
8384enum jamoSCount = jamoLCount * jamoNCount;
8385
// Returns true when `ch` lies in the Hangul leading consonant (L) jamo range,
// i.e. in [jamoLBase, jamoLBase + jamoLCount).
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // upper bound first: it rejects the ~ 1M code points above the range
    if (ch >= jamoLBase+jamoLCount)
        return false;
    return ch >= jamoLBase;
}
8392
// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}
8400
// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return  ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}
8407
// Returns the zero-based index of `ch` relative to jamoSBase when it falls
// inside the precomposed syllable range, and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8413
// internal helper: composes <L, V> and <L, V, T> jamo runs of `seq` in place.
// The composed syllable is stored in the slot of the L jamo and the slots of
// the consumed V (and T) jamos are overwritten with dchar.init, leaving holes.
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // a composition can only begin with an L jamo followed by a V jamo
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }

        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);

        if (hasTrailing)
        {
            // <L, V, T> -> LVT syllable
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L, V> -> LV syllable
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8442
8443//----------------------------------------------------------------------------
8444public:
8445
8446/**
8447    Decomposes a Hangul syllable. If `ch` is not a composed syllable
8448    then this function returns $(LREF Grapheme) containing only `ch` as is.
8449*/
8450Grapheme decomposeHangul(dchar ch) @safe
8451{
8452    immutable idxS = cast(int) ch - jamoSBase;
8453    if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);
8454    immutable idxL = idxS / jamoNCount;
8455    immutable idxV = (idxS % jamoNCount) / jamoTCount;
8456    immutable idxT = idxS % jamoTCount;
8457
8458    immutable partL = jamoLBase + idxL;
8459    immutable partV = jamoVBase + idxV;
8460    if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition
8461        return Grapheme(partL, partV, jamoTBase + idxT);
8462    else // <L, V> decomposition
8463        return Grapheme(partL, partV);
8464}
8465
8466///
8467@safe unittest
8468{
8469    import std.algorithm.comparison : equal;
8470    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
8471}
8472
8473/++
8474    Try to compose hangul syllable out of a leading consonant (`lead`),
8475    a `vowel` and optional `trailing` consonant jamos.
8476
8477    On success returns the composed LV or LVT hangul syllable.
8478
8479    If any of `lead` and `vowel` are not a valid hangul jamo
8480    of the respective $(CHARACTER) class returns dchar.init.
8481+/
8482dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
8483{
8484    if (!isJamoL(lead))
8485        return dchar.init;
8486    immutable indexL = lead - jamoLBase;
8487    if (!isJamoV(vowel))
8488        return dchar.init;
8489    immutable indexV = vowel - jamoVBase;
8490    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
8491    immutable dchar syllable = jamoSBase + indexLV;
8492    return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
8493}
8494
8495///
8496@safe unittest
8497{
8498    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
8499    // leaving out T-vowel, or passing any codepoint
8500    // that is not trailing consonant composes an LV-syllable
8501    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
8502    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
8503    assert(composeJamo('\u1111', 'A') == dchar.init);
8504    assert(composeJamo('A', '\u1171') == dchar.init);
8505}
8506
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` with decomposition type `T` and compares against `r`
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8530
8531/**
8532    Enumeration type for normalization forms,
8533    passed as template parameter for functions like $(LREF normalize).
8534*/
8535enum NormalizationForm {
8536    NFC,
8537    NFD,
8538    NFKC,
8539    NFKD
8540}
8541
8542
enum {
    /**
        Shorthand aliases for values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8555
8556/++
8557    Returns `input` string normalized to the chosen form.
8558    Form C is used by default.
8559
8560    For more information on normalization forms see
8561    the $(S_LINK Normalization, normalization section).
8562
8563    Note:
8564    In cases where the string in question is already normalized,
8565    it is returned unmodified and no memory allocation happens.
8566+/
8567inout(C)[] normalize(NormalizationForm norm=NFC, C)(return scope inout(C)[] input)
8568{
8569    import std.algorithm.mutation : SwapStrategy;
8570    import std.algorithm.sorting : sort;
8571    import std.array : appender;
8572    import std.range : zip;
8573
8574    auto anchors = splitNormalized!norm(input);
8575    if (anchors[0] == input.length && anchors[1] == input.length)
8576        return input;
8577    dchar[] decomposed;
8578    decomposed.reserve(31);
8579    ubyte[] ccc;
8580    ccc.reserve(31);
8581    auto app = appender!(C[])();
8582    do
8583    {
8584        app.put(input[0 .. anchors[0]]);
8585        foreach (dchar ch; input[anchors[0]..anchors[1]])
8586            static if (norm == NFD || norm == NFC)
8587            {
8588                foreach (dchar c; decompose!Canonical(ch)[])
8589                    decomposed ~= c;
8590            }
8591            else // NFKD & NFKC
8592            {
8593                foreach (dchar c; decompose!Compatibility(ch)[])
8594                    decomposed ~= c;
8595            }
8596        ccc.length = decomposed.length;
8597        size_t firstNonStable = 0;
8598        ubyte lastClazz = 0;
8599
8600        foreach (idx, dchar ch; decomposed)
8601        {
8602            immutable clazz = combiningClass(ch);
8603            ccc[idx] = clazz;
8604            if (clazz == 0 && lastClazz != 0)
8605            {
8606                // found a stable code point after unstable ones
8607                sort!("a[0] < b[0]", SwapStrategy.stable)
8608                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
8609                firstNonStable = decomposed.length;
8610            }
8611            else if (clazz != 0 && lastClazz == 0)
8612            {
8613                // found first unstable code point after stable ones
8614                firstNonStable = idx;
8615            }
8616            lastClazz = clazz;
8617        }
8618        sort!("a[0] < b[0]", SwapStrategy.stable)
8619            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
8620        static if (norm == NFC || norm == NFKC)
8621        {
8622            import std.algorithm.searching : countUntil;
8623            auto first = countUntil(ccc, 0);
8624            if (first >= 0) // no starters?? no recomposition
8625            {
8626                for (;;)
8627                {
8628                    immutable second = recompose(first, decomposed, ccc);
8629                    if (second == decomposed.length)
8630                        break;
8631                    first = second;
8632                }
8633                // 2nd pass for hangul syllables
8634                hangulRecompose(decomposed);
8635            }
8636        }
8637        static if (norm == NFD || norm == NFKD)
8638            app.put(decomposed);
8639        else
8640        {
8641            import std.algorithm.mutation : remove;
8642            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
8643            app.put(decomposed[0 .. clean.length]);
8644        }
8645        // reset variables
8646        decomposed.length = 0;
8647        () @trusted {
8648            decomposed.assumeSafeAppend();
8649            ccc.length = 0;
8650            ccc.assumeSafeAppend();
8651        } ();
8652        input = input[anchors[1]..$];
8653        // and move on
8654        anchors = splitNormalized!norm(input);
8655    }while (anchors[0] != input.length);
8656    app.put(input[0 .. anchors[0]]);
8657    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
8658}
8659
8660///
8661@safe unittest
8662{
8663    // any encoding works
8664    wstring greet = "Hello world";
8665    assert(normalize(greet) is greet); // the same exact slice
8666
8667    // An example of a character with all 4 forms being different:
8668    // Greek upsilon with acute and hook symbol (code point 0x03D3)
8669    assert(normalize!NFC("��") == "\u03D3");
8670    assert(normalize!NFD("��") == "\u03D2\u0301");
8671    assert(normalize!NFKC("��") == "\u038E");
8672    assert(normalize!NFKD("��") == "\u03A5\u0301");
8673}
8674
8675@safe unittest
8676{
8677    import std.conv : text;
8678
8679    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
8680    assert(normalize!NFKD("2�����") == "210", normalize!NFKD("2�����"));
8681    assert(normalize!NFD("��ffin") == "A\u0308ffin");
8682
8683    // check example
8684
8685    // any encoding works
8686    wstring greet = "Hello world";
8687    assert(normalize(greet) is greet); // the same exact slice
8688
8689    // An example of a character with all 4 forms being different:
8690    // Greek upsilon with acute and hook symbol (code point 0x03D3)
8691    assert(normalize!NFC("��") == "\u03D3");
8692    assert(normalize!NFD("��") == "\u03D2\u0301");
8693    assert(normalize!NFKC("��") == "\u038E");
8694    assert(normalize!NFKD("��") == "\u03A5\u0301");
8695}
8696
// canonically recompose given slice of code points, works in-place and mutates data
// `start` is the index of the starter to compose onto, `ccc` holds the
// combining class of every element of `input`. Merged code points are replaced
// with dchar.init. Returns the index at which the scan stopped: either the
// next code point with combining class 0 that did not merge, or input.length.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    int accumCC = -1;// so that it's out of 0 .. 255 range
    // writefln("recomposing %( %04x %)", input);
    // input[start] is always a starter thus we start at i == start + 1
    size_t i = start+1;
    for (; ; )
    {
        if (i == input.length)
            break;
        immutable curCC = ccc[i];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        //------------------------
        // Applying to our case:
        // S is input[start]
        // accumCC is the maximum CCC of characters between C and S,
        //     as ccc are sorted
        // C is input[i]

        if (curCC > accumCC)
        {
            // C is not blocked from S: try to merge it into the starter
            immutable comp = compose(input[start], input[i]);
            if (comp != dchar.init)
            {
                input[start] = comp;
                input[i] = dchar.init;// put a sentinel
                // current was merged so its CCC shouldn't affect
                // composing with the next one
            }
            else
            {
                // if it was a starter then accumCC is now 0, end of loop
                accumCC = curCC;
                if (accumCC == 0)
                    break;
            }
        }
        else
        {
            // ditto here
            accumCC = curCC;
            if (accumCC == 0)
                break;
        }
        i++;
    }
    return i;
}
8750
// Scans `input` for the first code point that breaks normalization form
// `norm`. Returns a tuple of 2 indexes that delimit: the already normalized
// text, the piece that needs normalization, and the rest of the input that
// starts with a stable code point. Both indexes equal input.length when the
// whole input passes the check.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (pos, dchar ch; input)
    {
        static if (norm == NFC)
        {
            // fast path: code points below U+0300 are accepted as is
            if (ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(ch);
        // combining marks must appear in non-decreasing class order
        immutable bool outOfOrder = prevCC > curCC && curCC != 0;
        if (outOfOrder || notAllowedIn!norm(ch))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
8781
// Given the index `idx` of a code point of `input` that failed the quick
// check, finds the region around it that has to be re-normalized.
// Returns tuple(region_start, region_end):
//   region_start - index of the nearest stable code point (combining class 0
//                  and allowed in `norm`) before `idx`, or 0 if there is none;
//   region_end   - index of the first stable code point at or after `idx`,
//                  or input.length if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    // walk backwards from `idx`, one code point at a time
    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br.length is the end of `ch`; step back over its code units
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Drop the code point that was just examined. This has to be
        // popBack: popping the front would keep re-testing the same `back`
        // until the slice is empty and always yield region_start == 0.
        br.popBack();
    }
    // @@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8814
8815/**
8816    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
8817    form `norm`.
8818*/
8819public bool allowedIn(NormalizationForm norm)(dchar ch)
8820{
8821    return !notAllowedIn!norm(ch);
8822}
8823
8824///
8825@safe unittest
8826{
8827    // e.g. Cyrillic is always allowed, so is ASCII
8828    assert(allowedIn!NFC('��'));
8829    assert(allowedIn!NFD('��'));
8830    assert(allowedIn!NFKC('��'));
8831    assert(allowedIn!NFKD('��'));
8832    assert(allowedIn!NFC('Z'));
8833}
8834
// not user friendly name but more direct
// Looks `ch` up in the quick check trie of normalization form `norm`;
// returns true when `ch` is NOT always allowed in that form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // The condition must be 0: a bare string literal is a true condition
        // and would make this assertion pass silently.
        static assert(0, "Unknown normalization form " ~ norm.stringof);
    return qcTrie[ch];
}
8850
8851@safe unittest
8852{
8853    assert(allowedIn!NFC('��'));
8854    assert(allowedIn!NFD('��'));
8855    assert(allowedIn!NFKC('��'));
8856    assert(allowedIn!NFKD('��'));
8857    assert(allowedIn!NFC('Z'));
8858}
8859
8860}
8861
8862version (std_uni_bootstrap)
8863{
8864    // old version used for bootstrapping of gen_uni.d that generates
8865    // up to date optimal versions of all of isXXX functions
8866    @safe pure nothrow @nogc public bool isWhite(dchar c)
8867    {
8868        import std.ascii : isWhite;
8869        return isWhite(c) ||
8870               c == lineSep || c == paraSep ||
8871               c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
8872               (c >= '\u2000' && c <= '\u200A') ||
8873               c == '\u202F' || c == '\u205F' || c == '\u3000';
8874    }
8875}
8876else
8877{
8878
// trusted -> avoid bounds check
// Small non-template accessors for the generated case conversion data: the
// *Index functions look a code point up in the corresponding index trie and
// the *Tab functions read one entry of the corresponding table.
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // lower case
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    // title case
    ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    // upper case
    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
8898
8899public:
8900
8901/++
8902    Whether or not `c` is a Unicode whitespace $(CHARACTER).
8903    (general Unicode category: Part of C0(tab, vertical tab, form feed,
8904    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
8905+/
8906@safe pure nothrow @nogc
8907public bool isWhite(dchar c)
8908{
8909    import std.internal.unicode_tables : isWhiteGen; // generated file
8910    return isWhiteGen(c); // call pregenerated binary search
8911}
8912
8913/++
8914    Return whether `c` is a Unicode lowercase $(CHARACTER).
8915+/
8916@safe pure nothrow @nogc
8917bool isLower(dchar c)
8918{
8919    import std.ascii : isLower, isASCII;
8920    if (isASCII(c))
8921        return isLower(c);
8922    return lowerCaseTrie[c];
8923}
8924
@safe unittest
{
    import std.ascii : isLower;
    // inside this block plain `isLower` is std.ascii's, `.isLower` is this module's;
    // both must agree over the ASCII range
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    // Cyrillic samples, written as escapes so the test survives re-encoding
    assert(.isLower('\u044F'));  // CYRILLIC SMALL LETTER YA
    assert(.isLower('\u0439'));  // CYRILLIC SMALL LETTER SHORT I
    assert(!.isLower('\u0416')); // CYRILLIC CAPITAL LETTER ZHE
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    // every member of the lowerCase set is lower and not upper
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
8944
8945
/++
    Tests whether `c` is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    static import std.ascii;
    // anything outside ASCII is answered by the uppercase trie
    if (!std.ascii.isASCII(c))
        return upperCaseTrie[c];
    return std.ascii.isUpper(c);
}
8957
@safe unittest
{
    // static import keeps the unqualified `isUpper` bound to this module
    static import std.ascii;
    // the ASCII range must agree with std.ascii.isUpper
    // (previously this loop re-checked isLower and never exercised isUpper)
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    // Cyrillic samples, written as escapes so the test survives re-encoding
    assert(!isUpper('\u044F')); // CYRILLIC SMALL LETTER YA
    assert(isUpper('\u0416'));  // CYRILLIC CAPITAL LETTER ZHE
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the upperCase set is upper and not lower
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !isLower(v));
}
8976
8977
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Simple (1:1) title-case mapping of a single code point; `c` itself is
// returned when the simple index has no entry for it.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // fast path: below 0xAA only 'a' .. 'z' are changed
    if (c < 0xAA)
    {
        if (c >= 'a' && c <= 'z')
            return c - 32;
        return c;
    }
    immutable slot = toTitleSimpleIndex(c);
    if (slot == ushort.max) // no simple mapping recorded
        return c;
    return toTitleTab(slot);
}
8999
// Argument bundles for the generic case converters in this file, whose first
// three template parameters are (indexFn, maxIdx, tableFn): the index lookup
// function, the bound below which an index is treated as a 1:1 mapping,
// and the table accessor.
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
9002
// Generic toUpper/toLower on a whole string or random access range.
// A converted copy is built only once a code point with a case mapping is
// found; otherwise the input is returned (strings as is, other ranges via `array`).
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias Char = ElementEncodingType!S;

    // Phase 1: look for the first code point that has a mapping.
    // `offset` counts the code units that precede it.
    size_t offset = 0;
    auto scan = s.byDchar;
    while (!scan.empty)
    {
        auto first = scan.front;
        if (indexFn(first) == ushort.max)
        {
            offset += first.codeLength!Char;
            scan.popFront();
            continue;
        }

        // Phase 2: copy the untouched prefix, then convert everything from
        // `offset` to the end and return.
        auto sink = appender!(Char[])();
        sink.reserve(s.length);
        sink.put(s[0 .. offset]);
        foreach (dchar c; s[offset .. $].byDchar)
        {
            if (c.isASCII)
            {
                sink.put(asciiConvert(c));
                continue;
            }
            immutable ushort idx = indexFn(c);
            if (idx == ushort.max)
            {
                // no mapping, keep the code point
                sink.put(c);
            }
            else if (idx < maxIdx)
            {
                // 1:1 mapping
                dchar mapped = tableFn(idx);
                sink.put(mapped);
            }
            else
            {
                // 1:m mapping: the first entry packs length (top byte) + code point
                auto val = tableFn(idx);
                immutable uint len = val >> 24;
                sink.put(cast(dchar)(val & 0xFF_FFFF));
                foreach (j; idx+1 .. idx+len)
                    sink.put(tableFn(j));
            }
        }
        return sink.data;
    }

    // nothing had a mapping
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}
9058
// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;
    // a short slice of a much larger buffer
    auto text = "abcdefghij".replicate(300)[0 .. 10];

    toUpper(text);

    // the slice itself is left as it was
    assert(text == "abcdefghij");
}
9070
// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // three Hangul syllables (U+BAAC U+C2A4 U+D130) followed by "/A" resp. "/a",
    // lowered at compile time; escapes are used so the literal survives re-encoding
    static assert("\uBAAC\uC2A4\uD130/A".toLower.length == "\uBAAC\uC2A4\uD130/a".toLower.length);
}
9076
9077
// Generic lazy toUpper/toLower: wraps a range of dchar and returns a range
// yielding the case-converted code points.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            // done once nothing is pending and the source is drained
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // already expanded: serve from the stash
            if (pending != 0)
                return stash[pending - 1];

            dchar c = source.front;
            if (c.isASCII)
            {
                stash[0] = asciiConvert(c);
                pending = 1;
            }
            else
            {
                const idx = indexFn(c);
                if (idx == ushort.max)
                {
                    // no mapping
                    stash[0] = c;
                    pending = 1;
                }
                else if (idx < maxIdx)
                {
                    // 1:1 mapping
                    stash[0] = tableFn(idx);
                    pending = 1;
                }
                else
                {
                    // 1:m mapping: unpack length + codepoint
                    immutable val = tableFn(idx);
                    pending = val >> 24;
                    if (pending == 0)
                        pending = 1;
                    assert(pending <= stash.length);
                    // stored back to front, so stash[pending - 1] is always next
                    stash[pending - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. pending)
                        stash[pending - j - 1] = tableFn(idx + j);
                }
            }
            return stash[pending - 1];
        }

        void popFront()
        {
            // make sure the current source element has been expanded
            if (pending == 0)
                front;
            assert(pending);
            if (--pending == 0)
                source.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                return copy;
            }
        }

      private:
        Range source;            // underlying range of dchar
        uint pending;            // converted code points not yet popped
        dchar[3] stash = void;   // expansion of the current source element
    }

    return ToCaserImpl(str);
}
9162
/*********************
 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */
auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower code units are decoded first
        return asLowerCase(str.byDchar);
    }
}
9198
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower code units are decoded first
        return asUpperCase(str.byDchar);
    }
}
9217
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // the documentation above is shared by both functions, so show both
    assert("hEllo".asUpperCase.equal("HELLO"));
    assert("hEllo".asLowerCase.equal("hello"));
}
9225
// explicitly undocumented
// Overload for types convertible to a string: forwards to the
// instantiation for the underlying string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9233
// explicitly undocumented
// Overload for types convertible to a string: forwards to the
// instantiation for the underlying string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9241
@safe unittest
{
    // non-copyable wrapper that converts to string through `alias this`
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // runs `func` on the wrapper and on the plain string and compares the results
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        // For ranges, compare contents instead of object identity.
        static if (is(typeof(equal(viaWrapper, viaString))))
            return equal(viaWrapper, viaString);
        else
            return viaWrapper == viaString;
    }

    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9271
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    // second entries: "abc"/"ABC" followed by CYRILLIC EF, IE, ZHE (small resp. capital),
    // written as escapes so the literals survive re-encoding
    string[] lower = ["123", "abc\u0444\u0435\u0436", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABC\u0424\u0415\u0416", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wchar and dchar inputs; the results are asserted rather than discarded
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9311
// Generic lazy capitalizer: the first code point of the wrapped range is
// upper-cased, everything after it is delegated to asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            if (lower)
                return lwr.empty;
            return nLeft == 0 && r.empty;
        }

        @property auto front()
        {
            // past the first character: the lower-casing range takes over
            if (lower)
                return lwr.front;

            // first character already expanded
            if (nLeft != 0)
                return buf[nLeft - 1];

            immutable dchar c = r.front;
            const idx = indexFnUpper(c);
            if (idx == ushort.max)
            {
                // no mapping
                buf[0] = c;
                nLeft = 1;
            }
            else if (idx < maxIdxUpper)
            {
                // 1:1 mapping
                buf[0] = tableFnUpper(idx);
                nLeft = 1;
            }
            else
            {
                // 1:m mapping: unpack length + codepoint
                immutable val = tableFnUpper(idx);
                nLeft = val >> 24;
                if (nLeft == 0)
                    nLeft = 1;
                assert(nLeft <= buf.length);
                // stored back to front, so buf[nLeft - 1] is always next
                buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                foreach (j; 1 .. nLeft)
                    buf[nLeft - j - 1] = tableFnUpper(idx + j);
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
            {
                lwr.popFront();
                return;
            }

            // make sure the first character has been expanded
            if (nLeft == 0)
                front;
            assert(nLeft);
            if (--nLeft == 0)
            {
                // first character fully consumed: switch to lower-casing
                r.popFront();
                lwr = r.asLowerCase();
                lower = true;
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.r = r.save;
                copy.lwr = lwr.save;
                return copy;
            }
        }

      private:
        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;
        uint nLeft = 0;
    }

    return ToCapitalizerImpl(str);
}
9402
/*********************
 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, meaning convert the first
 * character to upper case and subsequent characters to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */
auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchar
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower code units are decoded first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9439
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // first letter upper-cased, the remainder lower-cased
    assert(equal("hEllo".asCapitalized, "Hello"));
}
9447
// Overload for types convertible to a string: forwards to the
// instantiation for the underlying string type.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9454
@safe pure nothrow @nogc unittest
{
    // `front` has to be callable under all of the attributes on this block
    auto capitalized = asCapitalized("hEllo");
    assert(capitalized.front == 'H');
}
9460
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // [input, expected] pairs
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        // three CYRILLIC SMALL LETTER ZHE -> capital ZHE + two small ZHE
        // (escapes are used so the literals survive re-encoding)
        ["\u0436\u0436\u0436", "\u0416\u0436\u0436"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wchar and dchar inputs; the results are asserted rather than discarded
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9508
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes `c` as UTF-8 into `buf` starting at `idx` and returns the index
// just past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    if (c <= 0x7F)
    {
        // single code unit
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    if (c <= 0x7FF)
    {
        // lead byte + 1 continuation byte
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    if (c <= 0xFFFF)
    {
        // lead byte + 2 continuation bytes
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    if (c <= 0x10FFFF)
    {
        // lead byte + 3 continuation bytes
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // values above 0x10FFFF are not handled
    assert(0);
}
9542
@safe unittest
{
    char[] buffer = "abcd".dup;
    // an ASCII character overwrites one code unit ...
    size_t next = encodeTo(buffer, 0, 'X');
    assert(buffer == "Xbcd");

    // ... and U+00A9 overwrites the following two
    next = encodeTo(buffer, next, cast(dchar)'\u00A9');
    assert(buffer == "X\xC2\xA9d");
}
9553
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes `c` as UTF-16 into `buf` starting at `idx` and returns the index
// just past the last code unit written.
// Throws: UTFException when `c` is a surrogate code point.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    // values above 0x10FFFF are not handled
    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // supplementary plane: high surrogate followed by low surrogate
        immutable uint v = c - 0x10000;
        buf[idx] = cast(wchar)(((v >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)((v & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9575
// UTF-32 variant: stores `c` at `buf[idx]` and returns the next index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9582
// Converts the case of `s` inside its own buffer for as long as every cased
// code point fits into the space already consumed; otherwise hands over to
// the allocating toCaseInPlaceAlloc. On the in-buffer path `s` is re-sliced
// to the resulting length.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);

    // In-buffer move of str[from .. to] to a new start index `dest`;
    // returns the index just past the moved data.
    // While dest == from nothing has to be copied at all; copying starts
    // once a re-cased char came out smaller than the original.
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        if (dest == from)
            return to;
        foreach (C unit; str[from .. to])
            str[dest++] = unit;
        return dest;
    }

    size_t readPos = 0;      // next code unit to decode
    size_t writePos = 0;     // end of the converted output
    size_t pendingFrom = 0;  // start of the unchanged run not yet moved
    while (readPos != s.length)
    {
        size_t cpStart = readPos;
        immutable ch = decode(s, readPos);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
            continue;

        // bring the unchanged run in front of this code point into place
        writePos = moveTo(s, writePos, pendingFrom, cpStart);
        pendingFrom = readPos;

        // 1:m codepoint mapping, slow codepath
        if (caseIndex >= maxIdx)
            return slowToCase(s, cpStart, writePos);

        // 1:1 codepoint mapping
        immutable cased = tableFn(caseIndex);
        immutable casedLen = codeLength!C(cased);
        // no place to fit cased char: switch to slow codepath, where we allocate
        if (casedLen + writePos > readPos)
            return slowToCase(s, cpStart, writePos);

        writePos = encodeTo(s, writePos, cased);
        assert(writePos <= readPos);
    }
    // trailing unchanged run
    if (pendingFrom != s.length)
        writePos = moveTo(s, writePos, pendingFrom, s.length);
    s = s[0 .. writePos];
}
9648
// helper to precalculate the size (in code units of C) of the
// case-converted form of a string
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;
        size_t total = 0;       // code units counted so far
        size_t countedUpTo = 0; // end of the last code point that had a mapping
        size_t pos = 0;
        while (pos != str.length)
        {
            immutable begin = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue;

            // the unchanged run in front of this code point keeps its length
            total += begin - countedUpTo;
            countedUpTo = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping
                immutable cased = tableFn(caseIndex);
                total += codeLength!C(cased);
            }
            else
            {
                // 1:m mapping: the first entry packs length + code point
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar cased = val & 0xFF_FFFF;
                total += codeLength!C(cased);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }
        // trailing unchanged run (zero when the last code point had a mapping)
        return total + (str.length - countedUpTo);
    }
}
9689
@safe unittest
{
    alias toLowerLength = toCaseLength!(LowerTriple);
    assert(toLowerLength("abcd") == 4);
    // five Cyrillic capitals (U+0410 .. U+0414, two UTF-8 code units each) plus
    // "456"; escapes are used so the literal survives re-encoding
    assert(toLowerLength("\u0410\u0411\u0412\u0413\u0414456") == 10+3);
}
9696
// slower code path that preallocates and then copies
// case-converted stuff to the new string
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    // `s[0 .. destIdx]` is taken over as is; conversion continues with the
    // code point at `curIdx`. On return `s` refers to the new array.
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);

        // exact final size: what is already in place + converted remainder
        auto trueLength = destIdx + caseLength(s[curIdx..$]);
        C[] grown = new C[trueLength];
        grown[0 .. destIdx] = s[0 .. destIdx];

        size_t copiedUpTo = curIdx; // start of the unchanged run not yet copied
        while (curIdx != s.length)
        {
            immutable startIdx = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // skip over
                continue;

            // copy the unchanged run in front of this code point
            auto toCopy = startIdx - copiedUpTo;
            grown[destIdx .. destIdx+toCopy] = s[copiedUpTo .. startIdx];
            copiedUpTo = curIdx;
            destIdx += toCopy;

            if (caseIndex < maxIdx)  // 1:1 codepoint mapping
            {
                immutable cased = tableFn(caseIndex);
                destIdx = encodeTo(grown, destIdx, cased);
            }
            else  // 1:m codepoint mapping
            {
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                immutable uint len = val >> 24;
                destIdx = encodeTo(grown, destIdx, cast(dchar)(val & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(grown, destIdx, tableFn(j));
            }
        }
        // trailing unchanged run
        if (copiedUpTo != s.length)
        {
            auto toCopy = s.length - copiedUpTo;
            grown[destIdx .. destIdx+toCopy] = s[copiedUpTo..$];
            destIdx += toCopy;
        }
        assert(grown.length == destIdx);
        s = grown;
    }
}
9753
/++
    Converts `s` to lowercase (by performing Unicode lowercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any uppercase characters, then `s` is unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // LowerTriple supplies the index function, bound and table accessor
    toCaseInPlace!LowerTriple(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
9775
/++
    Converts `s` to uppercase  (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // UpperTriple supplies the index function, bound and table accessor
    toCaseInPlace!UpperTriple(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
9797
/++
    If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
    is returned. Otherwise `c` is returned.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // fast path: below 0xAA only 'A' .. 'Z' are changed
    if (c < 0xAA)
    {
        if (c >= 'A' && c <= 'Z')
            return c + 32;
        return c;
    }
    immutable slot = toLowerSimpleIndex(c);
    if (slot == ushort.max) // no simple mapping recorded
        return c;
    return toLowerTab(slot);
}
9824
/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    // ASCII characters are converted with std.ascii's toLower
    import std.ascii : asciiToLower = toLower;
    return toCase!(LowerTriple, asciiToLower)(s);
}
9843
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    // ASCII characters are converted with std.ascii's toLower
    import std.ascii : asciiToLower = toLower;
    return toCase!(LowerTriple, asciiToLower)(s);
}
9851
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        // struct that converts to string through `alias this`
        static struct String
        {
            string data;
            alias data this;
        }

        // only has to compile; it is never called
        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}
9878
9879
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    // the ASCII range must agree with std.ascii
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    // non-ASCII samples, written as escapes so they survive re-encoding
    assert(toLower('\u042F') == '\u044F'); // CYRILLIC CAPITAL/SMALL LETTER YA
    assert(toLower('\u0150') == '\u0151'); // LATIN CAPITAL/SMALL LETTER O WITH DOUBLE ACUTE
    // an upper-case code point either stays as is or becomes lower case
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("\u0410\u042F") == "\u0430\u044F"); // CYRILLIC A, YA

    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
9898
// https://issues.dlang.org/show_bug.cgi?id=9629
@safe unittest
{
    // U+00FE LATIN SMALL LETTER THORN sits at index 6; escapes are used so the
    // literals survive re-encoding
    wchar[] test = "hello \u00FE world"w.dup;
    auto piece = test[6 .. 7];
    toUpperInPlace(piece);
    // the conversion through the slice is visible in the original buffer:
    // U+00DE LATIN CAPITAL LETTER THORN
    assert(test == "hello \u00DE world");
}
9907
9908
@safe unittest
{
    import std.algorithm.comparison : cmp;
    // each section checks toLower against an expected value and then
    // toLowerInPlace against toLower
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // 1:m mapping: one code point becomes two
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = '\u0130'; // LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
9959
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    // Exercise toLower on code-unit ranges rather than on plain strings.
    auto asciiUnits = "FoL".byCodeUnit;
    assert(cmp(asciiUnits.toLower, "fol") == 0);

    auto mixedUnits = "A\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixedUnits.toLower, "a\u0461b\u0461d") == 0);
}
9969
/++
    Returns the uppercase equivalent of `c` if `c` is a Unicode lowercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be passed to $(REF map, std,algorithm,iteration) to obtain
    an algorithm that converts a range of characters to upper case without
    allocating memory. To get a string out of it, send the result to an
    $(REF appender, std,array) with $(REF copy, std,algorithm,mutation).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // Anything at or above 0xAA goes through the generated tables.
    if (c >= 0xAA)
    {
        const size_t tableIndex = toUpperSimpleIndex(c);
        // ushort.max is the "no mapping" marker.
        if (tableIndex == ushort.max)
            return c;
        return toUpperTab(tableIndex);
    }

    // Fast path below 0xAA: only 'a' .. 'z' change.
    if (c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}
10003
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // Uppercase lazily, then collect the result in an appender.
    auto sink = appender!(char[])();
    auto shouted = "hello".map!toUpper;
    shouted.copy(sink);
    assert(sink.data == "HELLO");
}
10015
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // The single-character overload must agree with std.ascii on ASCII input.
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));

    // Non-ASCII literals are written as escapes so that the test does not
    // depend on how this source file is encoded.
    assert(toUpper('\u044F') == '\u042F'); // CYRILLIC SMALL LETTER YA -> CAPITAL LETTER YA
    assert(toUpper('\u03B4') == '\u0394'); // GREEK SMALL LETTER DELTA -> CAPITAL LETTER DELTA

    // Every lowercase code point either stays as it is or maps to an
    // uppercase or titlecase letter.
    auto title = unicode.Titlecase_Letter;
    foreach (ch; unicode.lowerCase.byCodepoint)
    {
        dchar up = ch.toUpper();
        assert(up == ch || isUpper(up) || title[up],
            format("%x -> %x", ch, up));
    }
}
10032
/++
    Converts all characters of `s` to uppercase (by performing Unicode
    uppercase mapping) and returns the result as a newly allocated array.
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    // std.ascii.toUpper is handed to toCase alongside the Unicode
    // UpperTriple; both are forwarded unchanged.
    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
10051
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    // Same delegation as the string overload, for sliceable random-access
    // character ranges that are not strings.
    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
10059
// Non-template overloads for the most common string types, provided to
// reduce compile time. Each one forwards to the matching template instance.
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(return scope wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(return scope dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // A struct that forwards to `string` through `alias this` must be
        // accepted by `toUpper`. The nested function is never invoked, so
        // this only checks that the call compiles.

        static struct String
        {
            string data;
            alias data this;
        }

        void compileOnly()
        {
            auto raised = toUpper(String(""));
        }
    }
}
10086
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // In every case the allocating and the in-place conversions must agree,
    // and the allocating one must hand back a different array.

    // Plain ASCII.
    {
        string source = "FoL";
        string converted = toUpper(source);
        char[] buffer = source.dup;
        toUpperInPlace(buffer);
        assert(buffer == converted, buffer);
        assert(cmp(converted, "FOL") == 0);
        assert(converted !is source);
    }

    // Latin Extended-A mixed with ASCII.
    {
        string source = "a\u0100B\u0101d";
        string converted = toUpper(source);
        char[] buffer = source.dup;
        toUpperInPlace(buffer);
        assert(buffer == converted);
        assert(cmp(converted, "A\u0100B\u0100D") == 0);
        assert(converted !is source);
    }

    // Cyrillic extended letters mixed with ASCII.
    {
        string source = "a\u0460B\u0461d";
        string converted = toUpper(source);
        char[] buffer = source.dup;
        toUpperInPlace(buffer);
        assert(buffer == converted);
        assert(cmp(converted, "A\u0460B\u0460D") == 0);
        assert(converted !is source);
    }
}
10115
@safe unittest
{
    // Checks both the allocating and the in-place conversions of `s`
    // against the expected upper- and lowercase forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // Three specifiers: source, actual result, expected result.
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `s` is passed as the first argument so that all three specifiers
        // of `diff` have a matching argument when a failure is reported.
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        // Non-ASCII characters are written as escapes so that the test does
        // not depend on how this source file is encoded.
        S easy = "123";
        S good = "abC\u0424\u0435\u0436"; // ASCII plus Cyrillic EF, ie, zhe
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abc\u0444\u0435\u0436", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABC\u0424\u0415\u0416", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // For these inputs the in-place conversions must keep using the
        // original array.
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10172
// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto asciiUnits = "FoL".byCodeUnit;
    assert(cmp(asciiUnits.toUpper, "FOL") == 0);

    auto mixedUnits = "a\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixedUnits.toUpper, "A\u0460B\u0460D") == 0);
}
10183
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // Code points from 0xAA upwards are answered by the trie.
    if (c >= 0xAA)
        return alphaTrie[c];

    // Fast path: below 0xAA only the ASCII letters count.
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
10208
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // Every member of the set must be accepted ...
    foreach (cp; alphabetic.byCodepoint)
    {
        assert(isAlpha(cp));
    }

    // ... and below 0x4000 the predicate must match set membership exactly.
    foreach (cp; 0 .. 0x4000)
    {
        assert(isAlpha(cp) == (cp in alphabetic));
    }
}
10217
10218
/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a mark, `false` otherwise
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    // No ASCII shortcut here: every code point is looked up in the trie.
    return markTrie[c];
}
10228
@safe unittest
{
    auto marks = unicode("Mark");

    // Every member of the set must be accepted ...
    foreach (cp; marks.byCodepoint)
    {
        assert(isMark(cp));
    }

    // ... and below 0x4000 the predicate must match set membership exactly.
    foreach (cp; 0 .. 0x4000)
    {
        assert(isMark(cp) == (cp in marks));
    }
}
10237
/++
    Tells whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // Non-ASCII code points are answered by the trie.
    if (c > 0x7F)
        return numberTrie[c];

    // ASCII fast path: only the decimal digits qualify.
    return '0' <= c && c <= '9';
}
10255
@safe unittest
{
    auto numbers = unicode("N");

    // Every member of the set must be accepted ...
    foreach (cp; numbers.byCodepoint)
    {
        assert(isNumber(cp));
    }

    // ... and below 0x4000 the predicate must match set membership exactly.
    foreach (cp; 0 .. 0x4000)
    {
        assert(isNumber(cp) == (cp in numbers));
    }
}
10264
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // Outside ASCII, combine the two Unicode predicates.
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ASCII fast path: defer to std.ascii.
    return std.ascii.isAlphaNum(c);
}
10290
@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // Members of either set must be accepted.
    foreach (cp; numbers.byCodepoint)
    {
        assert(isAlphaNum(cp));
    }
    foreach (cp; alphabetic.byCodepoint)
    {
        assert(isAlphaNum(cp));
    }

    // Below 0x4000 the predicate must equal membership in the union.
    foreach (cp; 0 .. 0x4000)
    {
        assert(isAlphaNum(cp) == ((cp in numbers) || (cp in alphabetic)));
    }
}
10307
/++
    Tells whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // Non-ASCII code points are answered by the trie.
    if (c > 0x7F)
        return punctuationTrie[c];

    // ASCII fast path: defer to std.ascii.
    return std.ascii.isPunctuation(c);
}
10327
@safe unittest
{
    // Hand-picked samples from ASCII and Latin-1.
    static immutable dchar[] samples =
        ['\u0021', '\u0028', '\u0029', '\u002D', '\u005F', '\u00AB', '\u00BB'];
    foreach (sample; samples)
    {
        assert(isPunctuation(sample));
    }

    // Every member of the "P" set must be accepted.
    foreach (cp; unicode("P").byCodepoint)
    {
        assert(isPunctuation(cp));
    }
}
10340
/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a symbol, `false` otherwise
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
   // No ASCII shortcut here: every code point is looked up in the trie.
   return symbolTrie[c];
}
10350
@safe unittest
{
    import std.format : format;

    // Hand-picked samples from ASCII and Latin-1.
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));

    // Every member of the "S" set must be accepted; report the offending
    // code point in hex otherwise.
    auto symbols = unicode("S");
    foreach (cp; symbols.byCodepoint)
    {
        assert(isSymbol(cp), format("%04x", cp));
    }
}
10361
/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a space character, `false` otherwise
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // The check itself lives in the generated tables module.
    import std.internal.unicode_tables : isSpaceGen; // generated file
    return isSpaceGen(c);
}
10374
@safe unittest
{
    auto zs = unicode.Zs;

    assert(isSpace('\u0020'));

    // Every member of Zs must be accepted ...
    foreach (cp; zs.byCodepoint)
    {
        assert(isSpace(cp));
    }

    // ... and below 0x1000 the predicate must match set membership exactly.
    foreach (cp; 0 .. 0x1000)
    {
        assert(zs[cp] == isSpace(cp));
    }
}
10384
10385
/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a graphical character, `false` otherwise
+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // No ASCII shortcut here: every code point is looked up in the trie.
    return graphicalTrie[c];
}
10396
10397
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");

    // Every member of the set must be accepted; report the offending code
    // point in hex otherwise.
    foreach (cp; graphical.byCodepoint)
    {
        assert(isGraphical(cp), format("%4x", cp));
    }

    // Below 0x4000 the predicate must match set membership exactly.
    foreach (cp; 0 .. 0x4000)
    {
        assert(isGraphical(cp) == (cp in graphical));
    }
}
10407
10408
/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a control character, `false` otherwise
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // The check itself lives in the generated tables module.
    import std.internal.unicode_tables : isControlGen; // generated file
    return isControlGen(c);
}
10419
@safe unittest
{
    auto controls = unicode.Cc;

    // Spot checks on both sides of the boundary.
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    // Every member of Cc must be accepted ...
    foreach (cp; controls.byCodepoint)
    {
        assert(isControl(cp));
    }

    // ... and below 0x1000 the predicate must match set membership exactly.
    foreach (cp; 0 .. 0x1000)
    {
        assert(controls[cp] == isControl(cp));
    }
}
10431
10432
/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a formatting character, `false` otherwise
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    // The check itself lives in the generated tables module.
    import std.internal.unicode_tables : isFormatGen; // generated file
    return isFormatGen(c);
}
10443
10444
@safe unittest
{
    assert(isFormat('\u00AD'));

    // Every member of the "Format" set must be accepted.
    auto formats = unicode("Format");
    foreach (cp; formats.byCodepoint)
    {
        assert(isFormat(cp));
    }
}
10451
// The code point ranges for private use and surrogates are not likely to
// change in the near future; if need be, they can be generated from Unicode
// data as well.
10454
/++
    Tells whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // Private Use Area in the Basic Multilingual Plane.
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;

    // Supplementary Private Use Area-A (plane 15).
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;

    // Supplementary Private Use Area-B (plane 16).
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
10466
/++
    Tells whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // Inclusive bounds of the surrogate range.
    enum dchar firstSurrogate = 0xD800;
    enum dchar lastSurrogate = 0xDFFF;

    return c >= firstSurrogate && c <= lastSurrogate;
}
10476
/++
    Tells whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // 0xD800 .. 0xDBFF is exactly the set of values whose bits above the
    // low ten equal those of 0xD800.
    return (c & 0xFFFF_FC00) == 0xD800;
}
10485
/++
    Tells whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // 0xDC00 .. 0xDFFF is exactly the set of values whose bits above the
    // low ten equal those of 0xDC00.
    return (c & 0xFFFF_FC00) == 0xDC00;
}
10494
/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is a non-character, `false` otherwise
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // No ASCII shortcut here: every code point is looked up in the trie.
    return nonCharacterTrie[c];
}
10505
@safe unittest
{
    // Every member of the "Cn" set must be accepted.
    auto unassigned = unicode("Cn");
    foreach (cp; unassigned.byCodepoint)
    {
        assert(isNonCharacter(cp));
    }
}
10512
10513private:
10514// load static data from pre-generated tables into usable datastructures
10515
10516
// Decompresses `compressed` into code point intervals and builds a
// `CodepointSet` from them.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10521
// Builds a `const(CodepointTrie!T)` from the `offsets`, `sizes` and `data`
// fields of the table entry `e`.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    return const(CodepointTrie!T)(e.offsets, e.sizes, e.data);
}
10526
// Accessors for the pre-generated lookup tables. Each one wraps a table
// entry with `asTrie`, keeps the result in a function-local
// `static immutable` and returns it.
@safe pure nothrow @nogc @property
{
    import std.internal.unicode_tables; // generated file

    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.

    // character classification tables
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    //normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    //grapheme breaking algorithm tables
    auto mcTrie()
    {
        import std.internal.unicode_grapheme : mcTrieEntries;
        static immutable trie = asTrie(mcTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    //case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    //simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10644
10645}// version (!std_uni_bootstrap)
10646