/*
    Generators - components that generate strings for a given regex pattern.

    For the moment undocumented, and subject to change.
*/
module std.regex.internal.generator;

/*
    Useful utility for self-testing: an infinite range of string samples
    that _have_ to match the given compiled regex.

    Each sample is produced by walking the regex bytecode once and emitting
    a character for every character-consuming instruction, making random
    choices at alternations and loops.

    Caveats: supports only a simple subset of bytecode. Generation of a sample
    stops at the first instruction that is not handled (anchors, word
    boundaries, lookarounds and anything else not listed in compose()).
*/
@trusted private struct SampleGenerator(Char)
{
    import std.array : appender, Appender;
    import std.format.write : formattedWrite;
    import std.random : Xorshift;
    import std.regex.internal.ir : Regex, IR, IRL;
    import std.utf : isValidDchar, byChar;

    Regex!Char re;          // compiled pattern whose bytecode is walked
    Appender!(Char[]) app;  // buffer holding the current sample
    uint limit, seed;       // soft length limit (in code units) and the seed given at construction
    Xorshift gen;           // source of randomness for all choices

    //generator for pattern r, with soft maximum of threshold elements
    //and a given random seed
    this(ref Regex!Char r, uint threshold, uint randomSeed)
    {
        re = r;
        limit = threshold;
        seed = randomSeed;
        // Actually seed the engine; otherwise every generator would start
        // from the same default state regardless of randomSeed.
        gen.seed(randomSeed);
        app = appender!(Char[])();
        compose();
    }

    // Returns the next pseudo-random value reduced modulo x, i.e. in [0, x),
    // and advances the engine. x must be non-zero.
    uint rand(uint x)
    {
        uint r = gen.front % x;
        gen.popFront();
        return r;
    }

    // Builds one sample into app by interpreting re.ir starting at index 0.
    void compose()
    {
        // pc         - index of the current instruction in re.ir
        // counter    - running repetition counter for counted repeats
        // dataLenOld - buffer length seen at the previous infinite-loop end;
        //              used to leave loops whose body produced no output
        uint pc = 0, counter = 0, dataLenOld = uint.max;
        for (;;)
        {
            switch (re.ir[pc].code)
            {
                case IR.Char:
                    formattedWrite(app, "%s", cast(dchar) re.ir[pc].data);
                    pc += IRL!(IR.Char);
                    break;

                case IR.OrChar:
                    // pick one of the `len` alternative characters at random
                    uint len = re.ir[pc].sequence;
                    formattedWrite(app, "%s", cast(dchar) re.ir[pc + rand(len)].data);
                    pc += len;
                    break;

                case IR.CodepointSet:
                case IR.Trie:
                    // pick a random interval of the set, then a random
                    // offset inside that interval
                    auto set = re.charsets[re.ir[pc].data];
                    auto x = rand(cast(uint) set.byInterval.length);
                    auto y = rand(set.byInterval[x].b - set.byInterval[x].a);
                    formattedWrite(app, "%s", cast(dchar)(set.byInterval[x].a + y));
                    pc += IRL!(IR.CodepointSet);
                    break;

                case IR.Any:
                    // retry until the value is a valid dchar other than CR/LF
                    // NOTE(review): the bound 0x11_000 equals 0x11000, which
                    // limits samples to code points below U+11000; possibly
                    // meant to be 0x11_0000 - confirm before changing.
                    uint x;
                    do
                    {
                        x = rand(0x11_000);
                    }
                    while (x == '\r' || x == '\n' || !isValidDchar(x));
                    formattedWrite(app, "%s", cast(dchar) x);
                    pc += IRL!(IR.Any);
                    break;

                case IR.GotoEndOr:
                    // end of the chosen alternative: jump to the closing OrEnd
                    pc += IRL!(IR.GotoEndOr) + re.ir[pc].data;
                    assert(re.ir[pc].code == IR.OrEnd);
                    goto case;
                case IR.OrEnd:
                    pc += IRL!(IR.OrEnd);
                    break;

                case IR.OrStart:
                    pc += IRL!(IR.OrStart);
                    goto case;
                case IR.Option:
                    uint next = pc + re.ir[pc].data + IRL!(IR.Option);
                    uint nOpt = 0;
                    //queue next Option
                    while (re.ir[next].code == IR.Option)
                    {
                        nOpt++;
                        next += re.ir[next].data + IRL!(IR.Option);
                    }
                    nOpt++;
                    // choose which of the counted options to enter,
                    // then skip over the ones before it
                    nOpt = rand(nOpt);
                    for (; nOpt; nOpt--)
                    {
                        pc += re.ir[pc].data + IRL!(IR.Option);
                    }
                    assert(re.ir[pc].code == IR.Option);
                    pc += IRL!(IR.Option);
                    break;

                case IR.RepeatStart:
                case IR.RepeatQStart:
                    // jump straight to the matching end, which decides
                    // whether to run the body
                    pc += IRL!(IR.RepeatStart) + re.ir[pc].data;
                    goto case IR.RepeatEnd;
                case IR.RepeatEnd:
                case IR.RepeatQEnd:
                    uint len = re.ir[pc].data;
                    uint step = re.ir[pc + 2].raw;
                    uint min = re.ir[pc + 3].raw;
                    if (counter < min)
                    {
                        // below the minimum: must run the body again
                        counter += step;
                        pc -= len;
                        break;
                    }
                    uint max = re.ir[pc + 4].raw;
                    // between min and max: keep looping with probability 2/3
                    // while the sample is still shorter than the soft limit
                    if (counter < max && app.data.length < limit && rand(3) > 0)
                    {
                        pc -= len;
                        counter += step;
                    }
                    else
                    {
                        counter = counter % step;
                        pc += IRL!(IR.RepeatEnd);
                    }
                    break;

                case IR.InfiniteStart, IR.InfiniteBloomStart, IR.InfiniteQStart:
                    pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
                    goto case IR.InfiniteEnd; //both Q and non-Q
                case IR.InfiniteEnd, IR.InfiniteBloomEnd, IR.InfiniteQEnd:
                    uint len = re.ir[pc].data;
                    if (app.data.length == dataLenOld)
                    {
                        // no output since the last visit: leave the loop
                        pc += IRL!(IR.InfiniteEnd);
                        break;
                    }
                    dataLenOld = cast(uint) app.data.length;
                    if (app.data.length < limit && rand(3) > 0)
                        pc = pc - len;
                    else
                        pc = pc + re.ir[pc].length;
                    break;

                case IR.GroupStart, IR.GroupEnd:
                    pc += IRL!(IR.GroupStart);
                    break;

                // unsupported instructions and everything else end the sample
                case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
                case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
                default:
                    return;
            }
        }
    }

    // Current sample. The returned slice refers to the internal buffer,
    // which is reused by popFront(); copy it if it must outlive the next call.
    @property Char[] front()
    {
        return app.data;
    }

    // The range never ends.
    enum empty = false;

    // Discards the current sample and generates the next one.
    void popFront()
    {
        app.shrinkTo(0);
        compose();
    }
}

@system unittest
{
    import std.range, std.regex;
    auto re = regex(`P[a-z]{3,}q`);
    auto gen = SampleGenerator!char(re, 20, 3141592);
    static assert(isInputRange!(typeof(gen)));
    //@@@BUG@@@ somehow gen.take(1_000) doesn't work
    foreach (v; take(gen, 1_000))
        assert(v.match(re));

    // equal seeds give equal sample sequences
    auto g1 = SampleGenerator!char(re, 20, 42);
    auto g2 = SampleGenerator!char(re, 20, 42);
    foreach (i; 0 .. 100)
    {
        assert(g1.front == g2.front);
        g1.popFront();
        g2.popFront();
    }

    // the generator is usable with other code unit types
    auto wre = regex(`ab+c`w);
    auto wgen = SampleGenerator!wchar(wre, 10, 7);
    static assert(is(typeof(wgen.front) == wchar[]));
    foreach (v; take(wgen, 100))
        assert(v.match(wre));
}