1/* 2 Implementation of std.regex IR, an intermediate representation 3 of a regular expression pattern. 4 5 This is a common ground between frontend regex component (parser) 6 and backend components - generators, matchers and other "filters". 7*/ 8module std.regex.internal.ir; 9 10package(std.regex): 11 12import std.exception, std.meta, std.range.primitives, std.traits, std.uni; 13 14debug(std_regex_parser) import std.stdio; 15// just a common trait, may be moved elsewhere 16alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range); 17 18enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; 19 20// heuristic value determines maximum CodepointSet length suitable for linear search 21enum maxCharsetUsed = 6; 22 23// another variable to tweak behavior of caching generated Tries for character classes 24enum maxCachedMatchers = 8; 25 26alias Trie = CodepointSetTrie!(13, 8); 27alias makeTrie = codepointSetTrie!(13, 8); 28 29CharMatcher[CodepointSet] matcherCache; 30 31//accessor with caching 32@trusted CharMatcher getMatcher(CodepointSet set) 33{// @@@BUG@@@ 6357 almost all properties of AA are not @safe 34 if (__ctfe || maxCachedMatchers == 0) 35 return CharMatcher(set); 36 else 37 { 38 auto p = set in matcherCache; 39 if (p) 40 return *p; 41 if (matcherCache.length == maxCachedMatchers) 42 { 43 // flush enmatchers in trieCache 44 matcherCache = null; 45 } 46 return (matcherCache[set] = CharMatcher(set)); 47 } 48} 49 50@trusted auto memoizeExpr(string expr)() 51{ 52 if (__ctfe) 53 return mixin(expr); 54 alias T = typeof(mixin(expr)); 55 static T slot; 56 static bool initialized; 57 if (!initialized) 58 { 59 slot = mixin(expr); 60 initialized = true; 61 } 62 return slot; 63} 64 65//property for \w character class 66@property CodepointSet wordCharacter() 67{ 68 return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc 69 | unicode.Me | unicode.Nd | unicode.Pc")(); 70} 71 72@property CharMatcher wordMatcher() 73{ 74 return memoizeExpr!("CharMatcher(wordCharacter)")(); 75} 76 77// some special Unicode white space characters 78private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; 79 80// Characters that need escaping in string posed as regular expressions 81alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', 82 ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); 83 84//Regular expression engine/parser options: 85// global - search all nonoverlapping matches in input 86// casefold - case insensitive matching, do casefolding on match in unicode mode 87// freeform - ignore whitespace in pattern, to match space use [ ] or \s 88// multiline - switch ^, $ detect start and end of linesinstead of just start and end of input 89enum RegexOption: uint { 90 global = 0x1, 91 casefold = 0x2, 92 freeform = 0x4, 93 nonunicode = 0x8, 94 multiline = 0x10, 95 singleline = 0x20 96} 97//do not reorder this list 98alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's'); 99static assert( RegexOption.max < 0x80); 100// flags that allow guide execution of engine 101enum RegexInfo : uint { oneShot = 0x80 } 102 103// IR bit pattern: 0b1_xxxxx_yy 104// where yy indicates class of instruction, xxxxx for actual operation code 105// 00: atom, a normal instruction 106// 01: open, opening of a group, has length of contained IR in the low bits 107// 10: close, closing of a group, has length of contained IR in the low bits 108// 11 unused 109// 110// Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version 111// Possible changes: 112//* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2}) 113//* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops 114// (like lookaround), or make it easier to identify hotspots. 115 116enum IR:uint { 117 Char = 0b1_00000_00, //a character 118 Any = 0b1_00001_00, //any character 119 CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...] 120 Trie = 0b1_00011_00, //CodepointSet implemented as Trie 121 //match with any of a consecutive OrChar's in this sequence 122 //(used for case insensitive match) 123 //OrChar holds in upper two bits of data total number of OrChars in this _sequence_ 124 //the drawback of this representation is that it is difficult 125 // to detect a jump in the middle of it 126 OrChar = 0b1_00100_00, 127 Nop = 0b1_00101_00, //no operation (padding) 128 End = 0b1_00110_00, //end of program 129 Bol = 0b1_00111_00, //beginning of a line ^ 130 Eol = 0b1_01000_00, //end of a line $ 131 Wordboundary = 0b1_01001_00, //boundary of a word 132 Notwordboundary = 0b1_01010_00, //not a word boundary 133 Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index) 134 GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit)) 135 GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit)) 136 Option = 0b1_01110_00, //start of an option within an alternation x | y (length) 137 GotoEndOr = 0b1_01111_00, //end of an option (length of the rest) 138 Bof = 0b1_10000_00, //begining of "file" (string) ^ 139 Eof = 0b1_10001_00, //end of "file" (string) $ 140 //... any additional atoms here 141 142 OrStart = 0b1_00000_01, //start of alternation group (length) 143 OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex) 144 //with this instruction order 145 //bit mask 0b1_00001_00 could be used to test/set greediness 146 InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length) 147 InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex) 148 InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length) 149 InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex) 150 InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length) 151 InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex) 152 RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length) 153 RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep) 154 RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length) 155 RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep) 156 157 // 158 LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length) 159 LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length) 160 NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length) 161 NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length) 162 LookbehindStart = 0b1_01000_01, //start of a lookbehind (length) 163 LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length) 164 NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length) 165 NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length) 166} 167 168//a shorthand for IR length - full length of specific opcode evaluated at compile time 169template IRL(IR code) 170{ 171 enum uint IRL = lengthOfIR(code); 172} 173static assert(IRL!(IR.LookaheadStart) == 3); 174 175//how many parameters follow the IR, should be optimized fixing some IR bits 176int immediateParamsIR(IR i){ 177 switch (i) 178 { 179 case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: 180 return 1; // merge table index 181 case IR.InfiniteBloomEnd: 182 return 2; // bloom filter index + merge table index 183 case IR.RepeatEnd, IR.RepeatQEnd: 184 return 4; 185 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 186 return 2; // start-end of captures used 187 default: 188 return 0; 189 } 190} 191 192//full length of IR instruction inlcuding all parameters that might follow it 193int lengthOfIR(IR i) 194{ 195 return 1 + immediateParamsIR(i); 196} 197 198//full length of the paired IR instruction inlcuding all parameters that might follow it 199int lengthOfPairedIR(IR i) 200{ 201 return 1 + immediateParamsIR(pairedIR(i)); 202} 203 204//if the operation has a merge point (this relies on the order of the ops) 205bool hasMerge(IR i) 206{ 207 return (i&0b11)==0b10 && i <= IR.RepeatQEnd; 208} 209 210//is an IR that opens a "group" 211bool isStartIR(IR i) 212{ 213 return (i&0b11)==0b01; 214} 215 216//is an IR that ends a "group" 217bool isEndIR(IR i) 218{ 219 return (i&0b11)==0b10; 220} 221 222//is a standalone IR 223bool isAtomIR(IR i) 224{ 225 return (i&0b11)==0b00; 226} 227 228//makes respective pair out of IR i, swapping start/end bits of instruction 229IR pairedIR(IR i) 230{ 231 assert(isStartIR(i) || isEndIR(i)); 232 return cast(IR)(i ^ 0b11); 233} 234 235//encoded IR instruction 236struct Bytecode 237{ 238 uint raw; 239 //natural constraints 240 enum maxSequence = 2+4; 241 enum maxData = 1 << 22; 242 enum maxRaw = 1 << 31; 243 244 this(IR code, uint data) 245 { 246 assert(data < (1 << 22) && code < 256); 247 raw = code << 24 | data; 248 } 249 250 this(IR code, uint data, uint seq) 251 { 252 assert(data < (1 << 22) && code < 256 ); 253 assert(seq >= 2 && seq < maxSequence); 254 raw = code << 24 | (seq - 2)<<22 | data; 255 } 256 257 //store raw data 258 static Bytecode fromRaw(uint data) 259 { 260 Bytecode t; 261 t.raw = data; 262 return t; 263 } 264 265 //bit twiddling helpers 266 //0-arg template due to @@@BUG@@@ 10985 267 @property uint data()() const { return raw & 0x003f_ffff; } 268 269 @property void data()(uint val) 270 { 271 raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff); 272 } 273 274 //ditto 275 //0-arg template due to @@@BUG@@@ 10985 276 @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); } 277 278 //ditto 279 //0-arg template due to @@@BUG@@@ 10985 280 @property IR code()() const { return cast(IR)(raw >> 24); } 281 282 //ditto 283 @property bool hotspot() const { return hasMerge(code); } 284 285 //test the class of this instruction 286 @property bool isAtom() const { return isAtomIR(code); } 287 288 //ditto 289 @property bool isStart() const { return isStartIR(code); } 290 291 //ditto 292 @property bool isEnd() const { return isEndIR(code); } 293 294 //number of arguments for this instruction 295 @property int args() const { return immediateParamsIR(code); } 296 297 //mark this GroupStart or GroupEnd as referenced in backreference 298 void setBackrefence() 299 { 300 assert(code == IR.GroupStart || code == IR.GroupEnd); 301 raw = raw | 1 << 23; 302 } 303 304 //is referenced 305 @property bool backreference() const 306 { 307 assert(code == IR.GroupStart || code == IR.GroupEnd); 308 return cast(bool)(raw & 1 << 23); 309 } 310 311 //mark as local reference (for backrefs in lookarounds) 312 void setLocalRef() 313 { 314 assert(code == IR.Backref); 315 raw = raw | 1 << 23; 316 } 317 318 //is a local ref 319 @property bool localRef() const 320 { 321 assert(code == IR.Backref); 322 return cast(bool)(raw & 1 << 23); 323 } 324 325 //human readable name of instruction 326 @trusted @property string mnemonic()() const 327 {//@@@BUG@@@ to is @system 328 import std.conv : to; 329 return to!string(code); 330 } 331 332 //full length of instruction 333 @property uint length() const 334 { 335 return lengthOfIR(code); 336 } 337 338 //full length of respective start/end of this instruction 339 @property uint pairedLength() const 340 { 341 return lengthOfPairedIR(code); 342 } 343 344 //returns bytecode of paired instruction (assuming this one is start or end) 345 @property Bytecode paired() const 346 {//depends on bit and struct layout order 347 assert(isStart || isEnd); 348 return Bytecode.fromRaw(raw ^ 0b11 << 24); 349 } 350 351 //gets an index into IR block of the respective pair 352 uint indexOfPair(uint pc) const 353 { 354 assert(isStart || isEnd); 355 return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code); 356 } 357} 358 359static assert(Bytecode.sizeof == 4); 360 361 362//index entry structure for name --> number of submatch 363struct NamedGroup 364{ 365 string name; 366 uint group; 367} 368 369//holds pair of start-end markers for a submatch 370struct Group(DataIndex) 371{ 372 DataIndex begin, end; 373 @trusted string toString()() const 374 { 375 import std.array : appender; 376 import std.format : formattedWrite; 377 auto a = appender!string(); 378 formattedWrite(a, "%s..%s", begin, end); 379 return a.data; 380 } 381} 382 383//debugging tool, prints out instruction along with opcodes 384@trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) 385{ 386 import std.array : appender; 387 import std.format : formattedWrite; 388 auto output = appender!string(); 389 formattedWrite(output,"%s", irb[pc].mnemonic); 390 switch (irb[pc].code) 391 { 392 case IR.Char: 393 formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data); 394 break; 395 case IR.OrChar: 396 formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence); 397 break; 398 case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart, 399 IR.Option, IR.GotoEndOr, IR.OrStart: 400 //forward-jump instructions 401 uint len = irb[pc].data; 402 formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart)); 403 break; 404 case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions 405 uint len = irb[pc].data; 406 formattedWrite(output, " pc=>%u min=%u max=%u step=%u", 407 pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw); 408 break; 409 case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto 410 uint len = irb[pc].data; 411 formattedWrite(output, " pc=>%u", pc-len); 412 break; 413 case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto 414 uint len = irb[pc].data; 415 formattedWrite(output, " pc=>%u", pc-len); 416 break; 417 case IR.GroupStart, IR.GroupEnd: 418 uint n = irb[pc].data; 419 string name; 420 foreach (v;dict) 421 if (v.group == n) 422 { 423 name = "'"~v.name~"'"; 424 break; 425 } 426 formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""), 427 name, n); 428 break; 429 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 430 uint len = irb[pc].data; 431 uint start = irb[pc+1].raw, end = irb[pc+2].raw; 432 formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end); 433 break; 434 case IR.Backref: case IR.CodepointSet: case IR.Trie: 435 uint n = irb[pc].data; 436 formattedWrite(output, " %u", n); 437 if (irb[pc].code == IR.Backref) 438 formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global"); 439 break; 440 default://all data-free instructions 441 } 442 if (irb[pc].hotspot) 443 formattedWrite(output, " Hotspot %u", irb[pc+1].raw); 444 return output.data; 445} 446 447//disassemble the whole chunk 448@trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[]) 449{ 450 import std.stdio : writeln; 451 for (uint pc=0; pc<slice.length; pc += slice[pc].length) 452 writeln("\t", disassemble(slice, pc, dict)); 453} 454 455/++ 456 $(D Regex) object holds regular expression pattern in compiled form. 457 Instances of this object are constructed via calls to $(D regex). 458 This is an intended form for caching and storage of frequently 459 used regular expressions. 460+/ 461struct Regex(Char) 462{ 463 //temporary workaround for identifier lookup 464 CodepointSet[] charsets; // 465 Bytecode[] ir; //compiled bytecode of pattern 466 467 468 @safe @property bool empty() const nothrow { return ir is null; } 469 470 @safe @property auto namedCaptures() 471 { 472 static struct NamedGroupRange 473 { 474 private: 475 NamedGroup[] groups; 476 size_t start; 477 size_t end; 478 public: 479 this(NamedGroup[] g, size_t s, size_t e) 480 { 481 assert(s <= e); 482 assert(e <= g.length); 483 groups = g; 484 start = s; 485 end = e; 486 } 487 488 @property string front() { return groups[start].name; } 489 @property string back() { return groups[end-1].name; } 490 @property bool empty() { return start >= end; } 491 @property size_t length() { return end - start; } 492 alias opDollar = length; 493 @property NamedGroupRange save() 494 { 495 return NamedGroupRange(groups, start, end); 496 } 497 void popFront() { assert(!empty); start++; } 498 void popBack() { assert(!empty); end--; } 499 string opIndex()(size_t i) 500 { 501 assert(start + i < end, 502 "Requested named group is out of range."); 503 return groups[start+i].name; 504 } 505 NamedGroupRange opSlice(size_t low, size_t high) { 506 assert(low <= high); 507 assert(start + high <= end); 508 return NamedGroupRange(groups, start + low, start + high); 509 } 510 NamedGroupRange opSlice() { return this.save; } 511 } 512 return NamedGroupRange(dict, 0, dict.length); 513 } 514 515package(std.regex): 516 import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency 517 NamedGroup[] dict; // maps name -> user group number 518 uint ngroup; // number of internal groups 519 uint maxCounterDepth; // max depth of nested {n,m} repetitions 520 uint hotspotTableSize; // number of entries in merge table 521 uint threadCount; // upper bound on number of Thompson VM threads 522 uint flags; // global regex flags 523 public const(CharMatcher)[] matchers; // tables that represent character sets 524 public const(BitTable)[] filters; // bloom filters for conditional loops 525 uint[] backrefed; // bit array of backreferenced submatches 526 Kickstart!Char kickstart; 527 528 //bit access helper 529 uint isBackref(uint n) 530 { 531 if (n/32 >= backrefed.length) 532 return 0; 533 return backrefed[n / 32] & (1 << (n & 31)); 534 } 535 536 //check if searching is not needed 537 void checkIfOneShot() 538 { 539 L_CheckLoop: 540 for (uint i = 0; i < ir.length; i += ir[i].length) 541 { 542 switch (ir[i].code) 543 { 544 case IR.Bof: 545 flags |= RegexInfo.oneShot; 546 break L_CheckLoop; 547 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof, 548 IR.Wordboundary, IR.Notwordboundary: 549 break; 550 default: 551 break L_CheckLoop; 552 } 553 } 554 } 555 556 //print out disassembly a program's IR 557 @trusted debug(std_regex_parser) void print() const 558 {//@@@BUG@@@ write is system 559 for (uint i = 0; i < ir.length; i += ir[i].length) 560 { 561 writefln("%d\t%s ", i, disassemble(ir, i, dict)); 562 } 563 writeln("Total merge table size: ", hotspotTableSize); 564 writeln("Max counter nesting depth: ", maxCounterDepth); 565 } 566 567} 568 569//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!) 570/*public*/ struct StaticRegex(Char) 571{ 572package(std.regex): 573 import std.regex.internal.backtracking : BacktrackingMatcher; 574 alias Matcher = BacktrackingMatcher!(true); 575 alias MatchFn = bool function(ref Matcher!Char) @trusted; 576 MatchFn nativeFn; 577public: 578 Regex!Char _regex; 579 alias _regex this; 580 this(Regex!Char re, MatchFn fn) 581 { 582 _regex = re; 583 nativeFn = fn; 584 585 } 586 587} 588 589// The stuff below this point is temporarrily part of IR module 590// but may need better place in the future (all internals) 591package(std.regex): 592 593//Simple UTF-string abstraction compatible with stream interface 594struct Input(Char) 595if (is(Char :dchar)) 596{ 597 import std.utf : decode; 598 alias DataIndex = size_t; 599 enum bool isLoopback = false; 600 alias String = const(Char)[]; 601 String _origin; 602 size_t _index; 603 604 //constructs Input object out of plain string 605 this(String input, size_t idx = 0) 606 { 607 _origin = input; 608 _index = idx; 609 } 610 611 //codepoint at current stream position 612 pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos) 613 { 614 pos = _index; 615 // DMD's inliner hates multiple return functions 616 // but can live with single statement if/else bodies 617 bool n = !(_index == _origin.length); 618 if (n) 619 res = decode(_origin, _index); 620 return n; 621 } 622 @property bool atEnd(){ 623 return _index == _origin.length; 624 } 625 bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos) 626 { 627 size_t idx = kick.search(_origin, _index); 628 _index = idx; 629 return nextChar(res, pos); 630 } 631 632 //index of at End position 633 @property size_t lastIndex(){ return _origin.length; } 634 635 //support for backtracker engine, might not be present 636 void reset(size_t index){ _index = index; } 637 638 String opSlice(size_t start, size_t end){ return _origin[start .. end]; } 639 640 auto loopBack(size_t index){ return BackLooper!Input(this, index); } 641} 642 643struct BackLooperImpl(Input) 644{ 645 import std.utf : strideBack; 646 alias DataIndex = size_t; 647 alias String = Input.String; 648 enum bool isLoopback = true; 649 String _origin; 650 size_t _index; 651 this(Input input, size_t index) 652 { 653 _origin = input._origin; 654 _index = index; 655 } 656 @trusted bool nextChar(ref dchar res,ref size_t pos) 657 { 658 pos = _index; 659 if (_index == 0) 660 return false; 661 662 res = _origin[0.._index].back; 663 _index -= strideBack(_origin, _index); 664 665 return true; 666 } 667 @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); } 668 auto loopBack(size_t index){ return Input(_origin, index); } 669 670 //support for backtracker engine, might not be present 671 //void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; } 672 void reset(size_t index){ _index = index; } 673 674 String opSlice(size_t start, size_t end){ return _origin[end .. start]; } 675 //index of at End position 676 @property size_t lastIndex(){ return 0; } 677} 678 679template BackLooper(E) 680{ 681 static if (is(E : BackLooperImpl!U, U)) 682 { 683 alias BackLooper = U; 684 } 685 else 686 { 687 alias BackLooper = BackLooperImpl!E; 688 } 689} 690 691//both helpers below are internal, on its own are quite "explosive" 692//unsafe, no initialization of elements 693@system T[] mallocArray(T)(size_t len) 694{ 695 import core.stdc.stdlib : malloc; 696 return (cast(T*) malloc(len * T.sizeof))[0 .. len]; 697} 698 699//very unsafe, no initialization 700@system T[] arrayInChunk(T)(size_t len, ref void[] chunk) 701{ 702 auto ret = (cast(T*) chunk.ptr)[0 .. len]; 703 chunk = chunk[len * T.sizeof .. $]; 704 return ret; 705} 706 707// 708@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name) 709{//equal is @system? 710 import std.algorithm.comparison : equal; 711 import std.algorithm.iteration : map; 712 import std.conv : text; 713 import std.range : assumeSorted; 714 715 auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length; 716 enforce(fnd < dict.length && equal(dict[fnd].name, name), 717 text("no submatch named ", name)); 718 return dict[fnd].group; 719} 720 721//whether ch is one of unicode newline sequences 722//0-arg template due to @@@BUG@@@ 10985 723bool endOfLine()(dchar front, bool seenCr) 724{ 725 return ((front == '\n') ^ seenCr) || front == '\r' 726 || front == NEL || front == LS || front == PS; 727} 728 729// 730//0-arg template due to @@@BUG@@@ 10985 731bool startOfLine()(dchar back, bool seenNl) 732{ 733 return ((back == '\r') ^ seenNl) || back == '\n' 734 || back == NEL || back == LS || back == PS; 735} 736 737///Exception object thrown in case of errors during regex compilation. 738public class RegexException : Exception 739{ 740 mixin basicExceptionCtors; 741} 742 743// simple 128-entry bit-table used with a hash function 744struct BitTable { 745 uint[4] filter; 746 747 this(CodepointSet set){ 748 foreach (iv; set.byInterval) 749 { 750 foreach (v; iv.a .. iv.b) 751 add(v); 752 } 753 } 754 755 void add()(dchar ch){ 756 immutable i = index(ch); 757 filter[i >> 5] |= 1<<(i & 31); 758 } 759 // non-zero -> might be present, 0 -> absent 760 bool opIndex()(dchar ch) const{ 761 immutable i = index(ch); 762 return (filter[i >> 5]>>(i & 31)) & 1; 763 } 764 765 static uint index()(dchar ch){ 766 return ((ch >> 7) ^ ch) & 0x7F; 767 } 768} 769 770struct CharMatcher { 771 BitTable ascii; // fast path for ASCII 772 Trie trie; // slow path for Unicode 773 774 this(CodepointSet set) 775 { 776 auto asciiSet = set & unicode.ASCII; 777 ascii = BitTable(asciiSet); 778 trie = makeTrie(set); 779 } 780 781 bool opIndex()(dchar ch) const 782 { 783 if (ch < 0x80) 784 return ascii[ch]; 785 else 786 return trie[ch]; 787 } 788} 789