//Written in the D programming language /* Regular expression pattern parser. */ module std.regex.internal.parser; import std.regex.internal.ir; import std.range.primitives, std.uni, std.meta, std.traits, std.typecons, std.exception; static import std.ascii; // package relevant info from parser into a regex object auto makeRegex(S, CG)(Parser!(S, CG) p) { import std.regex.internal.backtracking : BacktrackingMatcher; import std.regex.internal.thompson : ThompsonMatcher; import std.algorithm.searching : canFind; alias Char = BasicElementOf!S; Regex!Char re; auto g = p.g; with(re) { ir = g.ir; dict = g.dict; ngroup = g.ngroup; maxCounterDepth = g.counterDepth; flags = p.re_flags; charsets = g.charsets; matchers = g.matchers; backrefed = g.backrefed; re.pattern = p.origin.idup; re.postprocess(); // check if we have backreferences, if so - use backtracking if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); else if (re.backrefed.canFind!"a != 0") factory = new RuntimeFactory!(BacktrackingMatcher, Char); else factory = new RuntimeFactory!(ThompsonMatcher, Char); debug(std_regex_parser) { __ctfe || print(); } //@@@BUG@@@ (not reduced) //somehow just using validate _collides_ with std.utf.validate (!) version (assert) re.validateRe(); } return re; } // helper for unittest auto makeRegex(S)(S arg) if (isSomeString!S) { return makeRegex(Parser!(S, CodeGen)(arg, "")); } @system unittest { import std.algorithm.comparison : equal; auto re = makeRegex(`(?P\w+) = (?P\d+)`); auto nc = re.namedCaptures; static assert(isRandomAccessRange!(typeof(nc))); assert(!nc.empty); assert(nc.length == 2); assert(nc.equal(["name", "var"])); assert(nc[0] == "name"); assert(nc[1..$].equal(["var"])); re = makeRegex(`(\w+) (?P\w+) (\w+)`); nc = re.namedCaptures; assert(nc.length == 1); assert(nc[0] == "named"); assert(nc.front == "named"); assert(nc.back == "named"); re = makeRegex(`(\w+) (\w+)`); nc = re.namedCaptures; assert(nc.empty); re = makeRegex(`(?P\d{4})/(?P\d{2})/(?P\d{2})/`); nc = re.namedCaptures; auto cp = nc.save; assert(nc.equal(cp)); nc.popFront(); assert(nc.equal(cp[1..$])); nc.popBack(); assert(nc.equal(cp[1 .. $ - 1])); } @trusted void reverseBytecode()(Bytecode[] code) { Bytecode[] rev = new Bytecode[code.length]; uint revPc = cast(uint) rev.length; Stack!(Tuple!(uint, uint, uint)) stack; uint start = 0; uint end = cast(uint) code.length; for (;;) { for (uint pc = start; pc < end; ) { immutable len = code[pc].length; if (code[pc].code == IR.GotoEndOr) break; //pick next alternation branch if (code[pc].isAtom) { rev[revPc - len .. revPc] = code[pc .. pc + len]; revPc -= len; pc += len; } else if (code[pc].isStart || code[pc].isEnd) { //skip over other embedded lookbehinds they are reversed if (code[pc].code == IR.LookbehindStart || code[pc].code == IR.NeglookbehindStart) { immutable blockLen = len + code[pc].data + code[pc].pairedLength; rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; pc += blockLen; revPc -= blockLen; continue; } immutable second = code[pc].indexOfPair(pc); immutable secLen = code[second].length; rev[revPc - secLen .. revPc] = code[second .. second + secLen]; revPc -= secLen; if (code[pc].code == IR.OrStart) { //we pass len bytes forward, but secLen in reverse immutable revStart = revPc - (second + len - secLen - pc); uint r = revStart; uint i = pc + IRL!(IR.OrStart); while (code[i].code == IR.Option) { if (code[i - 1].code != IR.OrStart) { assert(code[i - 1].code == IR.GotoEndOr); rev[r - 1] = code[i - 1]; } rev[r] = code[i]; auto newStart = i + IRL!(IR.Option); auto newEnd = newStart + code[i].data; auto newRpc = r + code[i].data + IRL!(IR.Option); if (code[newEnd].code != IR.OrEnd) { newRpc--; } stack.push(tuple(newStart, newEnd, newRpc)); r += code[i].data + IRL!(IR.Option); i += code[i].data + IRL!(IR.Option); } pc = i; revPc = revStart; assert(code[pc].code == IR.OrEnd); } else pc += len; } } if (stack.empty) break; start = stack.top[0]; end = stack.top[1]; revPc = stack.top[2]; stack.pop(); } code[] = rev[]; } struct CodeGen { Bytecode[] ir; // resulting bytecode Stack!(uint) fixupStack; // stack of opened start instructions NamedGroup[] dict; // maps name -> user group number Stack!(uint) groupStack; // stack of current number of group uint nesting = 0; // group nesting level and repetitions step uint lookaroundNest = 0; // nesting of lookaround uint counterDepth = 0; // current depth of nested counted repetitions CodepointSet[] charsets; // sets for char classes const(CharMatcher)[] matchers; // matchers for char classes uint[] backrefed; // bitarray for groups refered by backref uint ngroup; // final number of groups (of all patterns) void start(uint length) { if (!__ctfe) ir.reserve((length*5+2)/4); fixupStack.push(0); groupStack.push(1);//0 - whole match } //mark referenced groups for latter processing void markBackref(uint n) { if (n/32 >= backrefed.length) backrefed.length = n/32 + 1; backrefed[n / 32] |= 1 << (n & 31); } bool isOpenGroup(uint n) { import std.algorithm.searching : canFind; // walk the fixup stack and see if there are groups labeled 'n' // fixup '0' is reserved for alternations return fixupStack.data[1..$]. canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)(); } void put(Bytecode code) { enforce(ir.length < maxCompiledLength, "maximum compiled pattern length is exceeded"); ir ~= code; } void putRaw(uint number) { enforce(ir.length < maxCompiledLength, "maximum compiled pattern length is exceeded"); ir ~= Bytecode.fromRaw(number); } //try to generate optimal IR code for this CodepointSet @trusted void charsetToIr(CodepointSet set) {//@@@BUG@@@ writeln is @system uint chars = cast(uint) set.length; if (chars < Bytecode.maxSequence) { switch (chars) { case 1: put(Bytecode(IR.Char, set.byCodepoint.front)); break; case 0: throw new RegexException("empty CodepointSet not allowed"); default: foreach (ch; set.byCodepoint) put(Bytecode(IR.OrChar, ch, chars)); } } else { import std.algorithm.searching : countUntil; const ivals = set.byInterval; immutable n = charsets.countUntil(set); if (n >= 0) { if (ivals.length*2 > maxCharsetUsed) put(Bytecode(IR.Trie, cast(uint) n)); else put(Bytecode(IR.CodepointSet, cast(uint) n)); return; } if (ivals.length*2 > maxCharsetUsed) { auto t = getMatcher(set); put(Bytecode(IR.Trie, cast(uint) matchers.length)); matchers ~= t; debug(std_regex_allocation) writeln("Trie generated"); } else { put(Bytecode(IR.CodepointSet, cast(uint) charsets.length)); matchers ~= CharMatcher.init; } charsets ~= set; assert(charsets.length == matchers.length); } } void genLogicGroup() { nesting++; pushFixup(length); put(Bytecode(IR.Nop, 0)); } void genGroup() { nesting++; pushFixup(length); immutable nglob = groupStack.top++; enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); put(Bytecode(IR.GroupStart, nglob)); } void genNamedGroup(string name) { import std.array : insertInPlace; import std.range : assumeSorted; nesting++; pushFixup(length); immutable nglob = groupStack.top++; enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); auto t = NamedGroup(name, nglob); auto d = assumeSorted!"a.name < b.name"(dict); immutable ind = d.lowerBound(t).length; insertInPlace(dict, ind, t); put(Bytecode(IR.GroupStart, nglob)); } //generate code for start of lookaround: (?= (?! (?<= (? fix && ir[fix].code == IR.Option) { ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix); put(Bytecode(IR.GotoEndOr, 0)); fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option put(Bytecode(IR.Option, 0)); return; } uint len, orStart; //start a new option if (fixupStack.length == 1) {//only root entry, effectively no fixup len = cast(uint) ir.length + IRL!(IR.GotoEndOr); orStart = 0; } else {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length len = cast(uint) ir.length - fix - (ir[fix].length - 1); orStart = fix + ir[fix].length; } insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); assert(ir[orStart].code == IR.OrStart); put(Bytecode(IR.GotoEndOr, 0)); fixupStack.push(orStart); //fixup for StartOR fixupStack.push(cast(uint) ir.length); //for second Option put(Bytecode(IR.Option, 0)); } // finalizes IR.Option, fix points to the first option of sequence void finishAlternation(uint fix) { enforce(ir[fix].code == IR.Option, "no matching ')'"); ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart)); fix = fixupStack.pop(); enforce(ir[fix].code == IR.OrStart, "no matching ')'"); ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart)); put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart))); uint pc = fix + IRL!(IR.OrStart); while (ir[pc].code == IR.Option) { pc = pc + ir[pc].data; if (ir[pc].code != IR.GotoEndOr) break; ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); pc += IRL!(IR.GotoEndOr); } put(Bytecode.fromRaw(0)); } // returns: (flag - repetition possible?, fixup of the start of this "group") Tuple!(bool, uint) onClose() { nesting--; uint fix = popFixup(); switch (ir[fix].code) { case IR.GroupStart: put(Bytecode(IR.GroupEnd, ir[fix].data)); return tuple(true, fix); case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: assert(lookaroundNest); fixLookaround(fix); return tuple(false, 0u); case IR.Option: //| xxx ) //two fixups: last option + full OR finishAlternation(fix); fix = topFixup; switch (ir[fix].code) { case IR.GroupStart: popFixup(); put(Bytecode(IR.GroupEnd, ir[fix].data)); return tuple(true, fix); case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: assert(lookaroundNest); fix = popFixup(); fixLookaround(fix); return tuple(false, 0u); default://(?:xxx) popFixup(); return tuple(true, fix); } default://(?:xxx) return tuple(true, fix); } } uint popFixup(){ return fixupStack.pop(); } void pushFixup(uint val){ return fixupStack.push(val); } @property uint topFixup(){ return fixupStack.top; } @property size_t fixupLength(){ return fixupStack.data.length; } @property uint length(){ return cast(uint) ir.length; } } // safety limits enum maxGroupNumber = 2^^19; enum maxLookaroundDepth = 16; // *Bytecode.sizeof, i.e. 1Mb of bytecode alone enum maxCompiledLength = 2^^18; // amounts to up to 4 Mb of auxilary table for matching enum maxCumulativeRepetitionLength = 2^^20; // marker to indicate infinite repetition enum infinite = ~0u; struct Parser(R, Generator) if (isForwardRange!R && is(ElementType!R : dchar)) { dchar front; bool empty; R pat, origin; //keep full pattern for pretty printing error messages uint re_flags = 0; //global flags e.g. multiline + internal ones Generator g; @trusted this(S)(R pattern, S flags) if (isSomeString!S) { pat = origin = pattern; //reserve slightly more then avg as sampled from unittests parseFlags(flags); front = ' ';//a safe default for freeform parsing popFront(); g.start(cast(uint) pat.length); try { parseRegex(); } catch (Exception e) { error(e.msg);//also adds pattern location } g.endPattern(1); } void _popFront() { if (pat.empty) { empty = true; } else { front = pat.front; pat.popFront(); } } void skipSpace() { while (!empty && isWhite(front)) _popFront(); } void popFront() { _popFront(); if (re_flags & RegexOption.freeform) skipSpace(); } auto save(){ return this; } //parsing number with basic overflow check uint parseDecimal() { uint r = 0; while (std.ascii.isDigit(front)) { if (r >= (uint.max/10)) error("Overflow in decimal number"); r = 10*r + cast(uint)(front-'0'); popFront(); if (empty) break; } return r; } // @trusted void parseFlags(S)(S flags) {//@@@BUG@@@ text is @system import std.conv : text; foreach (ch; flags)//flags are ASCII anyway { L_FlagSwitch: switch (ch) { foreach (i, op; __traits(allMembers, RegexOption)) { case RegexOptionNames[i]: if (re_flags & mixin("RegexOption."~op)) throw new RegexException(text("redundant flag specified: ",ch)); re_flags |= mixin("RegexOption."~op); break L_FlagSwitch; } default: throw new RegexException(text("unknown regex flag '",ch,"'")); } } } //parse and store IR for regex pattern @trusted void parseRegex() { uint fix;//fixup pointer while (!empty) { debug(std_regex_parser) __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); switch (front) { case '(': popFront(); if (front == '?') { popFront(); switch (front) { case '#': for (;;) { popFront(); enforce(!empty, "Unexpected end of pattern"); if (front == ')') { popFront(); break; } } break; case ':': g.genLogicGroup(); popFront(); break; case '=': g.genLookaround(IR.LookaheadStart); popFront(); break; case '!': g.genLookaround(IR.NeglookaheadStart); popFront(); break; case 'P': popFront(); enforce(front == '<', "Expected '<' in named group"); string name; popFront(); if (empty || !(isAlpha(front) || front == '_')) error("Expected alpha starting a named group"); name ~= front; popFront(); while (!empty && (isAlpha(front) || front == '_' || std.ascii.isDigit(front))) { name ~= front; popFront(); } enforce(front == '>', "Expected '>' closing named group"); popFront(); g.genNamedGroup(name); break; case '<': popFront(); if (front == '=') g.genLookaround(IR.LookbehindStart); else if (front == '!') g.genLookaround(IR.NeglookbehindStart); else error("'!' or '=' expected after '<'"); popFront(); break; default: uint enableFlags, disableFlags; bool enable = true; do { switch (front) { case 's': if (enable) enableFlags |= RegexOption.singleline; else disableFlags |= RegexOption.singleline; break; case 'x': if (enable) enableFlags |= RegexOption.freeform; else disableFlags |= RegexOption.freeform; break; case 'i': if (enable) enableFlags |= RegexOption.casefold; else disableFlags |= RegexOption.casefold; break; case 'm': if (enable) enableFlags |= RegexOption.multiline; else disableFlags |= RegexOption.multiline; break; case '-': if (!enable) error(" unexpected second '-' in flags"); enable = false; break; default: error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); } popFront(); }while (front != ')'); popFront(); re_flags |= enableFlags; re_flags &= ~disableFlags; } } else { g.genGroup(); } break; case ')': enforce(g.nesting, "Unmatched ')'"); popFront(); auto pair = g.onClose(); if (pair[0]) parseQuantifier(pair[1]); break; case '|': popFront(); g.fixAlternation(); break; default://no groups or whatever immutable start = g.length; parseAtom(); parseQuantifier(start); } } if (g.fixupLength != 1) { fix = g.popFixup(); g.finishAlternation(fix); enforce(g.fixupLength == 1, "no matching ')'"); } } //parse and store IR for atom-quantifier pair @trusted void parseQuantifier(uint offset) {//copy is @system if (empty) return g.fixRepetition(offset); uint min, max; switch (front) { case '*': min = 0; max = infinite; break; case '?': min = 0; max = 1; break; case '+': min = 1; max = infinite; break; case '{': popFront(); enforce(!empty, "Unexpected end of regex pattern"); enforce(std.ascii.isDigit(front), "First number required in repetition"); min = parseDecimal(); if (front == '}') max = min; else if (front == ',') { popFront(); if (std.ascii.isDigit(front)) max = parseDecimal(); else if (front == '}') max = infinite; else error("Unexpected symbol in regex pattern"); skipSpace(); enforce(front == '}', "Unmatched '{' in regex pattern"); } else error("Unexpected symbol in regex pattern"); enforce(min <= max, "Illegal {n,m} quantifier"); break; default: g.fixRepetition(offset); return; } bool greedy = true; //check only if we managed to get new symbol popFront(); if (!empty && front == '?') { greedy = false; popFront(); } g.fixRepetition(offset, min, max, greedy); } //parse and store IR for atom void parseAtom() { if (empty) return; switch (front) { case '*', '?', '+', '|', '{', '}': return error("'*', '+', '?', '{', '}' not allowed in atom"); case '.': if (re_flags & RegexOption.singleline) g.put(Bytecode(IR.Any, 0)); else { CodepointSet set; g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); } popFront(); break; case '[': parseCharset(); break; case '\\': _popFront(); enforce(!empty, "Unfinished escape sequence"); parseEscape(); break; case '^': if (re_flags & RegexOption.multiline) g.put(Bytecode(IR.Bol, 0)); else g.put(Bytecode(IR.Bof, 0)); popFront(); break; case '$': if (re_flags & RegexOption.multiline) g.put(Bytecode(IR.Eol, 0)); else g.put(Bytecode(IR.Eof, 0)); popFront(); break; default: if (re_flags & RegexOption.casefold) { auto range = simpleCaseFoldings(front); assert(range.length <= 5); if (range.length == 1) g.put(Bytecode(IR.Char, range.front)); else foreach (v; range) g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); } else g.put(Bytecode(IR.Char, front)); popFront(); } } //parse and store IR for CodepointSet void parseCharset() { const save = re_flags; re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did bool casefold = cast(bool)(re_flags & RegexOption.casefold); g.charsetToIr(unicode.parseSet(this, casefold)); re_flags = save; // Last next() in parseCharset is executed w/o freeform flag if (re_flags & RegexOption.freeform) skipSpace(); } //parse and generate IR for escape stand alone escape sequence @trusted void parseEscape() {//accesses array of appender import std.algorithm.iteration : sum; switch (front) { case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break; case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break; case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break; case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break; case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break; case 'd': popFront(); g.charsetToIr(unicode.Nd); break; case 'D': popFront(); g.charsetToIr(unicode.Nd.inverted); break; case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break; case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break; case 's': popFront(); g.charsetToIr(unicode.White_Space); break; case 'S': popFront(); g.charsetToIr(unicode.White_Space.inverted); break; case 'w': popFront(); g.charsetToIr(wordCharacter); break; case 'W': popFront(); g.charsetToIr(wordCharacter.inverted); break; case 'p': case 'P': bool casefold = cast(bool)(re_flags & RegexOption.casefold); auto set = unicode.parsePropertySpec(this, front == 'P', casefold); g.charsetToIr(set); break; case 'x': immutable code = parseUniHex(pat, 2); popFront(); g.put(Bytecode(IR.Char,code)); break; case 'u': case 'U': immutable code = parseUniHex(pat, front == 'u' ? 4 : 8); popFront(); g.put(Bytecode(IR.Char, code)); break; case 'c': //control codes Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this)); popFront(); g.put(code); break; case '0': popFront(); g.put(Bytecode(IR.Char, 0));//NUL character break; case '1': .. case '9': uint nref = cast(uint) front - '0'; immutable maxBackref = sum(g.groupStack.data); enforce(nref < maxBackref, "Backref to unseen group"); //perl's disambiguation rule i.e. //get next digit only if there is such group number popFront(); while (nref < maxBackref && !empty && std.ascii.isDigit(front)) { nref = nref * 10 + front - '0'; popFront(); } if (nref >= maxBackref) nref /= 10; enforce(!g.isOpenGroup(nref), "Backref to open group"); uint localLimit = maxBackref - g.groupStack.top; if (nref >= localLimit) { g.put(Bytecode(IR.Backref, nref-localLimit)); g.ir[$-1].setLocalRef(); } else g.put(Bytecode(IR.Backref, nref)); g.markBackref(nref); break; default: if (front == '\\' && !pat.empty) { if (pat.front >= privateUseStart && pat.front <= privateUseEnd) enforce(false, "invalid escape sequence"); } if (front >= privateUseStart && front <= privateUseEnd) { g.endPattern(front - privateUseStart + 1); break; } auto op = Bytecode(IR.Char, front); popFront(); g.put(op); } } // @trusted void error(string msg) { import std.array : appender; import std.format.write : formattedWrite; auto app = appender!string(); formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`", msg, origin[0..$-pat.length], pat); throw new RegexException(app.data); } alias Char = BasicElementOf!R; @property program() { return makeRegex(this); } } /+ Postproces the IR, then optimize. +/ @trusted void postprocess(Char)(ref Regex!Char zis) {//@@@BUG@@@ write is @system with(zis) { struct FixedStack(T) { T[] arr; uint _top; //this(T[] storage){ arr = storage; _top = -1; } @property ref T top(){ assert(!empty); return arr[_top]; } void push(T x){ arr[++_top] = x; } T pop() { assert(!empty); return arr[_top--]; } @property bool empty(){ return _top == -1; } } auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1); counterRange.push(1); ulong cumRange = 0; for (uint i = 0; i < ir.length; i += ir[i].length) { if (ir[i].hotspot) { assert(i + 1 < ir.length, "unexpected end of IR while looking for hotspot"); ir[i+1] = Bytecode.fromRaw(hotspotTableSize); hotspotTableSize += counterRange.top; } switch (ir[i].code) { case IR.RepeatStart, IR.RepeatQStart: uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart)); assert(ir[repEnd].code == ir[i].paired.code); immutable max = ir[repEnd + 4].raw; ir[repEnd+2].raw = counterRange.top; ir[repEnd+3].raw *= counterRange.top; ir[repEnd+4].raw *= counterRange.top; ulong cntRange = cast(ulong)(max)*counterRange.top; cumRange += cntRange; enforce(cumRange < maxCumulativeRepetitionLength, "repetition length limit is exceeded"); counterRange.push(cast(uint) cntRange + counterRange.top); threadCount += counterRange.top; break; case IR.RepeatEnd, IR.RepeatQEnd: threadCount += counterRange.top; counterRange.pop(); break; case IR.GroupStart: if (isBackref(ir[i].data)) ir[i].setBackrefence(); threadCount += counterRange.top; break; case IR.GroupEnd: if (isBackref(ir[i].data)) ir[i].setBackrefence(); threadCount += counterRange.top; break; default: threadCount += counterRange.top; } } checkIfOneShot(); if (!(flags & RegexInfo.oneShot)) kickstart = Kickstart!Char(zis, new uint[](256)); debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); optimize(zis); } } void fixupBytecode()(Bytecode[] ir) { Stack!uint fixups; with(IR) for (uint i=0; i