1/*
2    Implementation of std.regex IR, an intermediate representation
3    of a regular expression pattern.
4
5    This is a common ground between frontend regex component (parser)
6    and backend components - generators, matchers and other "filters".
7*/
8module std.regex.internal.ir;
9
10package(std.regex):
11
12import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
13
14debug(std_regex_parser) import std.stdio;
15// just a common trait, may be moved elsewhere
16alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
17
18enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
19
20// heuristic value determines maximum CodepointSet length suitable for linear search
21enum maxCharsetUsed = 6;
22
23// another variable to tweak behavior of caching generated Tries for character classes
24enum maxCachedMatchers = 8;
25
26alias Trie = CodepointSetTrie!(13, 8);
27alias makeTrie = codepointSetTrie!(13, 8);
28
29CharMatcher[CodepointSet] matcherCache;
30
31//accessor with caching
32@trusted CharMatcher getMatcher(CodepointSet set)
33{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
34    if (__ctfe || maxCachedMatchers == 0)
35        return CharMatcher(set);
36    else
37    {
38        auto p = set in matcherCache;
39        if (p)
40            return *p;
41        if (matcherCache.length == maxCachedMatchers)
42        {
43            // flush enmatchers in trieCache
44            matcherCache = null;
45        }
46        return (matcherCache[set] = CharMatcher(set));
47    }
48}
49
50@trusted auto memoizeExpr(string expr)()
51{
52    if (__ctfe)
53        return mixin(expr);
54    alias T = typeof(mixin(expr));
55    static T slot;
56    static bool initialized;
57    if (!initialized)
58    {
59        slot =  mixin(expr);
60        initialized = true;
61    }
62    return slot;
63}
64
65//property for \w character class
66@property CodepointSet wordCharacter()
67{
68    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
69        | unicode.Me | unicode.Nd | unicode.Pc")();
70}
71
72@property CharMatcher wordMatcher()
73{
74    return memoizeExpr!("CharMatcher(wordCharacter)")();
75}
76
77// some special Unicode white space characters
78private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
79
80// Characters that need escaping in string posed as regular expressions
81alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
82    ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
83
84//Regular expression engine/parser options:
85// global - search  all nonoverlapping matches in input
86// casefold - case insensitive matching, do casefolding on match in unicode mode
87// freeform - ignore whitespace in pattern, to match space use [ ] or \s
88// multiline - switch  ^, $ detect start and end of linesinstead of just start and end of input
89enum RegexOption: uint {
90    global = 0x1,
91    casefold = 0x2,
92    freeform = 0x4,
93    nonunicode = 0x8,
94    multiline = 0x10,
95    singleline = 0x20
96}
97//do not reorder this list
98alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
99static assert( RegexOption.max < 0x80);
100// flags that allow guide execution of engine
101enum RegexInfo : uint { oneShot = 0x80 }
102
103// IR bit pattern: 0b1_xxxxx_yy
104// where yy indicates class of instruction, xxxxx for actual operation code
105//     00: atom, a normal instruction
106//     01: open, opening of a group, has length of contained IR in the low bits
107//     10: close, closing of a group, has length of contained IR in the low bits
108//     11 unused
109//
110// Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
111// Possible changes:
112//* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
113//* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
114//  (like lookaround), or make it easier to identify hotspots.
115
116enum IR:uint {
117    Char              = 0b1_00000_00, //a character
118    Any               = 0b1_00001_00, //any character
119    CodepointSet      = 0b1_00010_00, //a most generic CodepointSet [...]
120    Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
121    //match with any of a consecutive OrChar's in this sequence
122    //(used for case insensitive match)
123    //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
124    //the drawback of this representation is that it is difficult
125    // to detect a jump in the middle of it
126    OrChar             = 0b1_00100_00,
127    Nop                = 0b1_00101_00, //no operation (padding)
128    End                = 0b1_00110_00, //end of program
129    Bol                = 0b1_00111_00, //beginning of a line ^
130    Eol                = 0b1_01000_00, //end of a line $
131    Wordboundary       = 0b1_01001_00, //boundary of a word
132    Notwordboundary    = 0b1_01010_00, //not a word boundary
133    Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
134    GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
135    GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
136    Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
137    GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
138    Bof                = 0b1_10000_00, //begining of "file" (string) ^
139    Eof                = 0b1_10001_00, //end of "file" (string) $
140    //... any additional atoms here
141
142    OrStart            = 0b1_00000_01, //start of alternation group  (length)
143    OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
144    //with this instruction order
145    //bit mask 0b1_00001_00 could be used to test/set greediness
146    InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
147    InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
148    InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
149    InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
150    InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
151    InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
152    RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
153    RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
154    RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
155    RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
156
157    //
158    LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
159    LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
160    NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
161    NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
162    LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
163    LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
164    NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
165    NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
166}
167
168//a shorthand for IR length - full length of specific opcode evaluated at compile time
169template IRL(IR code)
170{
171    enum uint IRL =  lengthOfIR(code);
172}
173static assert(IRL!(IR.LookaheadStart) == 3);
174
175//how many parameters follow the IR, should be optimized fixing some IR bits
176int immediateParamsIR(IR i){
177    switch (i)
178    {
179    case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
180        return 1;  // merge table index
181    case IR.InfiniteBloomEnd:
182        return 2;  // bloom filter index + merge table index
183    case IR.RepeatEnd, IR.RepeatQEnd:
184        return 4;
185    case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
186        return 2;  // start-end of captures used
187    default:
188        return 0;
189    }
190}
191
192//full length of IR instruction inlcuding all parameters that might follow it
193int lengthOfIR(IR i)
194{
195    return 1 + immediateParamsIR(i);
196}
197
198//full length of the paired IR instruction inlcuding all parameters that might follow it
199int lengthOfPairedIR(IR i)
200{
201    return 1 + immediateParamsIR(pairedIR(i));
202}
203
204//if the operation has a merge point (this relies on the order of the ops)
205bool hasMerge(IR i)
206{
207    return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
208}
209
210//is an IR that opens a "group"
211bool isStartIR(IR i)
212{
213    return (i&0b11)==0b01;
214}
215
216//is an IR that ends a "group"
217bool isEndIR(IR i)
218{
219    return (i&0b11)==0b10;
220}
221
222//is a standalone IR
223bool isAtomIR(IR i)
224{
225    return (i&0b11)==0b00;
226}
227
228//makes respective pair out of IR i, swapping start/end bits of instruction
229IR pairedIR(IR i)
230{
231    assert(isStartIR(i) || isEndIR(i));
232    return cast(IR)(i ^ 0b11);
233}
234
235//encoded IR instruction
236struct Bytecode
237{
238    uint raw;
239    //natural constraints
240    enum maxSequence = 2+4;
241    enum maxData = 1 << 22;
242    enum maxRaw = 1 << 31;
243
244    this(IR code, uint data)
245    {
246        assert(data < (1 << 22) && code < 256);
247        raw = code << 24 | data;
248    }
249
250    this(IR code, uint data, uint seq)
251    {
252        assert(data < (1 << 22) && code < 256 );
253        assert(seq >= 2 && seq < maxSequence);
254        raw = code << 24 | (seq - 2)<<22 | data;
255    }
256
257    //store raw data
258    static Bytecode fromRaw(uint data)
259    {
260        Bytecode t;
261        t.raw = data;
262        return t;
263    }
264
265    //bit twiddling helpers
266    //0-arg template due to @@@BUG@@@ 10985
267    @property uint data()() const { return raw & 0x003f_ffff; }
268
269    @property void data()(uint val)
270    {
271        raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
272    }
273
274    //ditto
275    //0-arg template due to @@@BUG@@@ 10985
276    @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
277
278    //ditto
279    //0-arg template due to @@@BUG@@@ 10985
280    @property IR code()() const { return cast(IR)(raw >> 24); }
281
282    //ditto
283    @property bool hotspot() const { return hasMerge(code); }
284
285    //test the class of this instruction
286    @property bool isAtom() const { return isAtomIR(code); }
287
288    //ditto
289    @property bool isStart() const { return isStartIR(code); }
290
291    //ditto
292    @property bool isEnd() const { return isEndIR(code); }
293
294    //number of arguments for this instruction
295    @property int args() const { return immediateParamsIR(code); }
296
297    //mark this GroupStart or GroupEnd as referenced in backreference
298    void setBackrefence()
299    {
300        assert(code == IR.GroupStart || code == IR.GroupEnd);
301        raw = raw | 1 << 23;
302    }
303
304    //is referenced
305    @property bool backreference() const
306    {
307        assert(code == IR.GroupStart || code == IR.GroupEnd);
308        return cast(bool)(raw & 1 << 23);
309    }
310
311    //mark as local reference (for backrefs in lookarounds)
312    void setLocalRef()
313    {
314        assert(code == IR.Backref);
315        raw = raw | 1 << 23;
316    }
317
318    //is a local ref
319    @property bool localRef() const
320    {
321        assert(code == IR.Backref);
322        return cast(bool)(raw & 1 << 23);
323    }
324
325    //human readable name of instruction
326    @trusted @property string mnemonic()() const
327    {//@@@BUG@@@ to is @system
328        import std.conv : to;
329        return to!string(code);
330    }
331
332    //full length of instruction
333    @property uint length() const
334    {
335        return lengthOfIR(code);
336    }
337
338    //full length of respective start/end of this instruction
339    @property uint pairedLength() const
340    {
341        return lengthOfPairedIR(code);
342    }
343
344    //returns bytecode of paired instruction (assuming this one is start or end)
345    @property Bytecode paired() const
346    {//depends on bit and struct layout order
347        assert(isStart || isEnd);
348        return Bytecode.fromRaw(raw ^ 0b11 << 24);
349    }
350
351    //gets an index into IR block of the respective pair
352    uint indexOfPair(uint pc) const
353    {
354        assert(isStart || isEnd);
355        return isStart ? pc + data + length  : pc - data - lengthOfPairedIR(code);
356    }
357}
358
359static assert(Bytecode.sizeof == 4);
360
361
362//index entry structure for name --> number of submatch
363struct NamedGroup
364{
365    string name;
366    uint group;
367}
368
369//holds pair of start-end markers for a submatch
370struct Group(DataIndex)
371{
372    DataIndex begin, end;
373    @trusted string toString()() const
374    {
375        import std.array : appender;
376        import std.format : formattedWrite;
377        auto a = appender!string();
378        formattedWrite(a, "%s..%s", begin, end);
379        return a.data;
380    }
381}
382
383//debugging tool, prints out instruction along with opcodes
384@trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
385{
386    import std.array : appender;
387    import std.format : formattedWrite;
388    auto output = appender!string();
389    formattedWrite(output,"%s", irb[pc].mnemonic);
390    switch (irb[pc].code)
391    {
392    case IR.Char:
393        formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
394        break;
395    case IR.OrChar:
396        formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
397        break;
398    case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
399    IR.Option, IR.GotoEndOr, IR.OrStart:
400        //forward-jump instructions
401        uint len = irb[pc].data;
402        formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
403        break;
404    case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
405        uint len = irb[pc].data;
406        formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
407            pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
408        break;
409    case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
410        uint len = irb[pc].data;
411        formattedWrite(output, " pc=>%u", pc-len);
412        break;
413    case  IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
414        uint len = irb[pc].data;
415        formattedWrite(output, " pc=>%u", pc-len);
416        break;
417    case IR.GroupStart, IR.GroupEnd:
418        uint n = irb[pc].data;
419        string name;
420        foreach (v;dict)
421            if (v.group == n)
422            {
423                name = "'"~v.name~"'";
424                break;
425            }
426        formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
427                name, n);
428        break;
429    case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
430        uint len = irb[pc].data;
431        uint start = irb[pc+1].raw, end = irb[pc+2].raw;
432        formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
433        break;
434    case IR.Backref: case IR.CodepointSet: case IR.Trie:
435        uint n = irb[pc].data;
436        formattedWrite(output, " %u",  n);
437        if (irb[pc].code == IR.Backref)
438            formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
439        break;
440    default://all data-free instructions
441    }
442    if (irb[pc].hotspot)
443        formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
444    return output.data;
445}
446
447//disassemble the whole chunk
448@trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
449{
450    import std.stdio : writeln;
451    for (uint pc=0; pc<slice.length; pc += slice[pc].length)
452        writeln("\t", disassemble(slice, pc, dict));
453}
454
455/++
456    $(D Regex) object holds regular expression pattern in compiled form.
457    Instances of this object are constructed via calls to $(D regex).
458    This is an intended form for caching and storage of frequently
459    used regular expressions.
460+/
461struct Regex(Char)
462{
463    //temporary workaround for identifier lookup
464    CodepointSet[] charsets; //
465    Bytecode[] ir;      //compiled bytecode of pattern
466
467
468    @safe @property bool empty() const nothrow {  return ir is null; }
469
470    @safe @property auto namedCaptures()
471    {
472        static struct NamedGroupRange
473        {
474        private:
475            NamedGroup[] groups;
476            size_t start;
477            size_t end;
478        public:
479            this(NamedGroup[] g, size_t s, size_t e)
480            {
481                assert(s <= e);
482                assert(e <= g.length);
483                groups = g;
484                start = s;
485                end = e;
486            }
487
488            @property string front() { return groups[start].name; }
489            @property string back() { return groups[end-1].name; }
490            @property bool empty() { return start >= end; }
491            @property size_t length() { return end - start; }
492            alias opDollar = length;
493            @property NamedGroupRange save()
494            {
495                return NamedGroupRange(groups, start, end);
496            }
497            void popFront() { assert(!empty); start++; }
498            void popBack() { assert(!empty); end--; }
499            string opIndex()(size_t i)
500            {
501                assert(start + i < end,
502                       "Requested named group is out of range.");
503                return groups[start+i].name;
504            }
505            NamedGroupRange opSlice(size_t low, size_t high) {
506                assert(low <= high);
507                assert(start + high <= end);
508                return NamedGroupRange(groups, start + low, start + high);
509            }
510            NamedGroupRange opSlice() { return this.save; }
511        }
512        return NamedGroupRange(dict, 0, dict.length);
513    }
514
515package(std.regex):
516    import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
517    NamedGroup[] dict;                     // maps name -> user group number
518    uint ngroup;                           // number of internal groups
519    uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
520    uint hotspotTableSize;                 // number of entries in merge table
521    uint threadCount;                      // upper bound on number of Thompson VM threads
522    uint flags;                            // global regex flags
523    public const(CharMatcher)[]  matchers; // tables that represent character sets
524    public const(BitTable)[] filters;      // bloom filters for conditional loops
525    uint[] backrefed;                      // bit array of backreferenced submatches
526    Kickstart!Char kickstart;
527
528    //bit access helper
529    uint isBackref(uint n)
530    {
531        if (n/32 >= backrefed.length)
532            return 0;
533        return backrefed[n / 32] & (1 << (n & 31));
534    }
535
536    //check if searching is not needed
537    void checkIfOneShot()
538    {
539    L_CheckLoop:
540        for (uint i = 0; i < ir.length; i += ir[i].length)
541        {
542            switch (ir[i].code)
543            {
544                case IR.Bof:
545                    flags |= RegexInfo.oneShot;
546                    break L_CheckLoop;
547                case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
548                IR.Wordboundary, IR.Notwordboundary:
549                    break;
550                default:
551                    break L_CheckLoop;
552            }
553        }
554    }
555
556    //print out disassembly a program's IR
557    @trusted debug(std_regex_parser) void print() const
558    {//@@@BUG@@@ write is system
559        for (uint i = 0; i < ir.length; i += ir[i].length)
560        {
561            writefln("%d\t%s ", i, disassemble(ir, i, dict));
562        }
563        writeln("Total merge table size: ", hotspotTableSize);
564        writeln("Max counter nesting depth: ", maxCounterDepth);
565    }
566
567}
568
569//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!)
570/*public*/ struct StaticRegex(Char)
571{
572package(std.regex):
573    import std.regex.internal.backtracking : BacktrackingMatcher;
574    alias Matcher = BacktrackingMatcher!(true);
575    alias MatchFn = bool function(ref Matcher!Char) @trusted;
576    MatchFn nativeFn;
577public:
578    Regex!Char _regex;
579    alias _regex this;
580    this(Regex!Char re, MatchFn fn)
581    {
582        _regex = re;
583        nativeFn = fn;
584
585    }
586
587}
588
589// The stuff below this point is temporarrily part of IR module
590// but may need better place in the future (all internals)
591package(std.regex):
592
593//Simple UTF-string abstraction compatible with stream interface
594struct Input(Char)
595if (is(Char :dchar))
596{
597    import std.utf : decode;
598    alias DataIndex = size_t;
599    enum bool isLoopback = false;
600    alias String = const(Char)[];
601    String _origin;
602    size_t _index;
603
604    //constructs Input object out of plain string
605    this(String input, size_t idx = 0)
606    {
607        _origin = input;
608        _index = idx;
609    }
610
611    //codepoint at current stream position
612    pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
613    {
614        pos = _index;
615        // DMD's inliner hates multiple return functions
616        // but can live with single statement if/else bodies
617        bool n = !(_index == _origin.length);
618        if (n)
619            res = decode(_origin, _index);
620        return n;
621    }
622    @property bool atEnd(){
623        return _index == _origin.length;
624    }
625    bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos)
626    {
627        size_t idx = kick.search(_origin, _index);
628        _index = idx;
629        return nextChar(res, pos);
630    }
631
632    //index of at End position
633    @property size_t lastIndex(){   return _origin.length; }
634
635    //support for backtracker engine, might not be present
636    void reset(size_t index){   _index = index;  }
637
638    String opSlice(size_t start, size_t end){   return _origin[start .. end]; }
639
640    auto loopBack(size_t index){   return BackLooper!Input(this, index); }
641}
642
643struct BackLooperImpl(Input)
644{
645    import std.utf : strideBack;
646    alias DataIndex = size_t;
647    alias String = Input.String;
648    enum bool isLoopback = true;
649    String _origin;
650    size_t _index;
651    this(Input input, size_t index)
652    {
653        _origin = input._origin;
654        _index = index;
655    }
656    @trusted bool nextChar(ref dchar res,ref size_t pos)
657    {
658        pos = _index;
659        if (_index == 0)
660            return false;
661
662        res = _origin[0.._index].back;
663        _index -= strideBack(_origin, _index);
664
665        return true;
666    }
667    @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
668    auto loopBack(size_t index){   return Input(_origin, index); }
669
670    //support for backtracker engine, might not be present
671    //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
672    void reset(size_t index){   _index = index;  }
673
674    String opSlice(size_t start, size_t end){   return _origin[end .. start]; }
675    //index of at End position
676    @property size_t lastIndex(){   return 0; }
677}
678
679template BackLooper(E)
680{
681    static if (is(E : BackLooperImpl!U, U))
682    {
683        alias BackLooper = U;
684    }
685    else
686    {
687        alias BackLooper = BackLooperImpl!E;
688    }
689}
690
691//both helpers below are internal, on its own are quite "explosive"
692//unsafe, no initialization of elements
693@system T[] mallocArray(T)(size_t len)
694{
695    import core.stdc.stdlib : malloc;
696    return (cast(T*) malloc(len * T.sizeof))[0 .. len];
697}
698
699//very unsafe, no initialization
700@system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
701{
702    auto ret = (cast(T*) chunk.ptr)[0 .. len];
703    chunk = chunk[len * T.sizeof .. $];
704    return ret;
705}
706
707//
708@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
709{//equal is @system?
710    import std.algorithm.comparison : equal;
711    import std.algorithm.iteration : map;
712    import std.conv : text;
713    import std.range : assumeSorted;
714
715    auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
716    enforce(fnd < dict.length && equal(dict[fnd].name, name),
717        text("no submatch named ", name));
718    return dict[fnd].group;
719}
720
721//whether ch is one of unicode newline sequences
722//0-arg template due to @@@BUG@@@ 10985
723bool endOfLine()(dchar front, bool seenCr)
724{
725    return ((front == '\n') ^ seenCr) || front == '\r'
726    || front == NEL || front == LS || front == PS;
727}
728
729//
730//0-arg template due to @@@BUG@@@ 10985
731bool startOfLine()(dchar back, bool seenNl)
732{
733    return ((back == '\r') ^ seenNl) || back == '\n'
734    || back == NEL || back == LS || back == PS;
735}
736
737///Exception object thrown in case of errors during regex compilation.
738public class RegexException : Exception
739{
740    mixin basicExceptionCtors;
741}
742
743// simple 128-entry bit-table used with a hash function
744struct BitTable {
745    uint[4] filter;
746
747    this(CodepointSet set){
748        foreach (iv; set.byInterval)
749        {
750            foreach (v; iv.a .. iv.b)
751                add(v);
752        }
753    }
754
755    void add()(dchar ch){
756        immutable i = index(ch);
757        filter[i >> 5]  |=  1<<(i & 31);
758    }
759    // non-zero -> might be present, 0 -> absent
760    bool opIndex()(dchar ch) const{
761        immutable i = index(ch);
762        return (filter[i >> 5]>>(i & 31)) & 1;
763    }
764
765    static uint index()(dchar ch){
766        return ((ch >> 7) ^ ch) & 0x7F;
767    }
768}
769
770struct CharMatcher {
771    BitTable ascii; // fast path for ASCII
772    Trie trie;      // slow path for Unicode
773
774    this(CodepointSet set)
775    {
776        auto asciiSet = set & unicode.ASCII;
777        ascii = BitTable(asciiSet);
778        trie = makeTrie(set);
779    }
780
781    bool opIndex()(dchar ch) const
782    {
783        if (ch < 0x80)
784            return ascii[ch];
785        else
786            return trie[ch];
787    }
788}
789