regcomp.pl revision 1.2
1#!/usr/bin/perl -w
2#
3#
4# Regenerate (overwriting only if changed):
5#
6#    pod/perldebguts.pod
7#    regnodes.h
8#
9# from information stored in
10#
11#    regcomp.sym
12#    regexp.h
13#
14# pod/perldebguts.pod is not completely regenerated.  Only the table of
15# regexp nodes is replaced; other parts remain unchanged.
16#
17# Accepts the standard regen_lib -q and -v args.
18#
19# This script is normally invoked from regen.pl.
20
21BEGIN {
22    # Get function prototypes
23    require 'regen/regen_lib.pl';
24}
25use strict;
26
27# NOTE I don't think anyone actually knows what all of these properties mean,
28# and I suspect some of them are outright unused. This is a first attempt to
29# clean up the generation so maybe one day we can move to something more self
30# documenting. (One might argue that an array of hashes of properties would
31# be easier to use.)
32#
33# Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
34
35# General thoughts:
36# 1. We use a single continuum to represent both opcodes and states,
37#    and in regexec.c we switch on the combined set.
38# 2. Opcodes have more information associated to them, states are simpler,
39#    basically just an identifier/number that can be used to switch within
40#    the state machine.
41# 3. Some opcode are order dependent.
42# 4. Output files often use "tricks" to reduce diff effects. Some of what
43#    we do below is more clumsy looking than it could be because of this.
44
45# Op/state properties:
46#
47# Property      In      Descr
48# ----------------------------------------------------------------------------
49# name          Both    Name of op/state
50# id            Both    integer value for this opcode/state
51# optype        Both    Either 'op' or 'state'
52# line_num          Both    line_num number of the input file for this item.
53# type          Op      Type of node (aka regkind)
54# code          Op      what code is associated with this node (???)
55# args          Op      what type of args the node has (which regnode struct)
56# flags         Op      (???)
57# longj         Op      Whether this node is a longjump
58# comment       Both    Comment about node, if any
59# pod_comment   Both    Special comments for pod output (preceding lines in def)
60
61# Global State
62my @all;    # all opcodes/state
63my %all;    # hash of all opcode/state names
64
65my @ops;    # array of just opcodes
66my @states; # array of just states
67
68my $longest_name_length= 0; # track lengths of names for nicer reports
69my (%type_alias);           # map the type (??)
70
71# register a newly constructed node into our state tables.
72# ensures that we have no name collisions (on name anyway),
73# and issues the "id" for the node.
74sub register_node {
75    my ($node)= @_;
76
77    if ( $all{ $node->{name} } ) {
78        die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
79            . "previously defined on line $all{ $node->{name} }{line_num}\n";
80    } elsif (!$node->{optype}) {
81        die "must have an optype in node ", Dumper($node);
82    } elsif ($node->{optype} eq "op") {
83        push @ops, $node;
84    } elsif ($node->{optype} eq "state") {
85        push @states, $node;
86    } else {
87        die "Uknown optype '$node->{optype}' in ", Dumper($node);
88    }
89    $node->{id}= 0 + @all;
90    push @all, $node;
91    $all{ $node->{name} }= $node;
92}
93
94# Parse and add an opcode definition to the global state.
95# An opcode definition looks like this:
96#
97#                             +- args
98#                             | +- flags
99#                             | | +- longjmp
100# Name        Type       code | | | ; comment
101# --------------------------------------------------------------------------
102# IFMATCH     BRANCHJ,    off 1 . 2 ; Succeeds if the following matches.
103# UNLESSM     BRANCHJ,    off 1 . 2 ; Fails if the following matches.
104# SUSPEND     BRANCHJ,    off 1 V 1 ; "Independent" sub-RE.
105# IFTHEN      BRANCHJ,    off 1 V 1 ; Switch, should be preceded by switcher.
106# GROUPP      GROUPP,     num 1     ; Whether the group matched.
107#
108# Not every opcode definition has all of these. We should maybe make this
109# nicer/easier to read in the future. Also note that the above is tab
110# sensitive.
111
112sub parse_opcode_def {
113    my ( $text, $line_num, $pod_comment )= @_;
114    my $node= {
115        line_num    => $line_num,
116        pod_comment => $pod_comment,
117        optype      => "op",
118    };
119
120    # first split the line into three, the initial NAME, a middle part
121    # that we call "desc" which contains various (not well documented) things,
122    # and a comment section.
123    @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
124        or die "Failed to match $_";
125
126    # the content of the "desc" field from the first step is extracted here:
127    @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc};
128
129    $node->{$_} //= "" for qw(type code args flags longj);
130
131    register_node($node); # has to be before the type_alias code below
132
133    if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
134
135        #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
136        #     ." is not an existing regop, and will be aliased to $node->{name}\n"
137        #    if -t STDERR;
138        $type_alias{ $node->{type} }= $node->{name};
139    }
140
141    $longest_name_length= length $node->{name}
142        if length $node->{name} > $longest_name_length;
143}
144
145# parse out a state definition and add the resulting data
146# into the global state. may create multiple new states from
147# a single definition (this is part of the point).
148# Format for states:
149# REGOP \t typelist [ \t typelist]
150# typelist= namelist
151#         = namelist:FAIL
152#         = name:count
153# Eg:
154# WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
155# BRANCH          next:FAIL
156# CURLYM          A,B:FAIL
157#
158# The CURLYM definition would create the states:
159# CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
160sub parse_state_def {
161    my ( $text, $line_num, $pod_comment )= @_;
162    my ( $type, @lists )= split /\s+/, $text;
163    die "No list? $type" if !@lists;
164    foreach my $list (@lists) {
165        my ( $names, $special )= split /:/, $list, 2;
166        $special ||= "";
167        foreach my $name ( split /,/, $names ) {
168            my $real=
169                $name eq 'resume'
170                ? "resume_$type"
171                : "${type}_$name";
172            my @suffix;
173            if ( !$special ) {
174                @suffix= ("");
175            }
176            elsif ( $special =~ /\d/ ) {
177                @suffix= ( 1 .. $special );
178            }
179            elsif ( $special eq 'FAIL' ) {
180                @suffix= ( "", "_fail" );
181            }
182            else {
183                die "unknown :type ':$special'";
184            }
185            foreach my $suffix (@suffix) {
186                my $node= {
187                    name        => "$real$suffix",
188                    optype      => "state",
189                    type        => $type || "",
190                    comment     => "state for $type",
191                    line_num    => $line_num,
192                };
193                register_node($node);
194            }
195        }
196    }
197}
198
199sub process_flags {
200    my ( $flag, $varname, $comment )= @_;
201    $comment= '' unless defined $comment;
202
203    my @selected;
204    my $bitmap= '';
205    for my $node (@ops) {
206        my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
207
208        # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
209        # ops in the C code.
210        my $current= do {
211            no warnings;
212            ord substr $bitmap, ( $node->{id} >> 3 );
213        };
214        substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
215            chr( $current | ( $set << ( $node->{id} & 7 ) ) );
216
217        push @selected, $node->{name} if $set;
218    }
219    my $out_string= join ', ', @selected, 0;
220    $out_string =~ s/(.{1,70},) /$1\n    /g;
221
222    my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
223
224    return $comment . <<"EOP";
225#define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
226
227#ifndef DOINIT
228EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
229#else
230EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
231    $out_string
232};
233#endif /* DOINIT */
234
235#ifndef DOINIT
236EXTCONST U8 PL_${varname}_bitmask[];
237#else
238EXTCONST U8 PL_${varname}_bitmask[] = {
239    $out_mask
240};
241#endif /* DOINIT */
242EOP
243}
244
245sub read_definition {
246    my ( $file )= @_;
247    my ( $seen_sep, $pod_comment )= "";
248    open my $in_fh, "<", $file
249        or die "Failed to open '$file' for reading: $!";
250    while (<$in_fh>) {
251
252        # Special pod comments
253        if (/^#\* ?/) { $pod_comment .= "# $'"; }
254
255        # Truly blank lines possibly surrounding pod comments
256        elsif (/^\s*$/) { $pod_comment .= "\n" }
257
258        next if /\A\s*#/ || /\A\s*\z/;
259
260        s/\s*\z//;
261        if (/^-+\s*$/) {
262            $seen_sep= 1;
263            next;
264        }
265
266        if ($seen_sep) {
267            parse_state_def( $_, $., $pod_comment );
268        }
269        else {
270            parse_opcode_def( $_, $., $pod_comment );
271        }
272        $pod_comment= "";
273    }
274    close $in_fh;
275    die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
276        " in file!"
277        if @all > 256;
278}
279
280# use fixed width to keep the diffs between regcomp.pl recompiles
281# as small as possible.
282my ( $width, $rwidth, $twidth )= ( 22, 12, 9 );
283
284sub print_state_defs {
285    my ($out)= @_;
286    printf $out <<EOP,
287/* Regops and State definitions */
288
289#define %*s\t%d
290#define %*s\t%d
291
292EOP
293        -$width,
294        REGNODE_MAX => $#ops,
295        -$width, REGMATCH_STATE_MAX => $#all;
296
297    my %rev_type_alias= reverse %type_alias;
298    for my $node (@ops) {
299        printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
300            -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment};
301        if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
302            printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
303                -$width, $alias, $node->{id}, $node->{id}, "type alias";
304        }
305    }
306
307    print $out "\t/* ------------ States ------------- */\n";
308    for my $node (@states) {
309        printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n",
310            -$width, $node->{name}, $node->{id} - $#ops, $node->{comment};
311    }
312}
313
314sub print_regkind {
315    my ($out)= @_;
316    print $out <<EOP;
317
318/* PL_regkind[] What type of regop or state is this. */
319
320#ifndef DOINIT
321EXTCONST U8 PL_regkind[];
322#else
323EXTCONST U8 PL_regkind[] = {
324EOP
325    use Data::Dumper;
326    foreach my $node (@all) {
327        print Dumper($node) if !defined $node->{type} or !defined( $node->{name} );
328        printf $out "\t%*s\t/* %*s */\n",
329            -1 - $twidth, "$node->{type},", -$width, $node->{name};
330        print $out "\t/* ------------ States ------------- */\n"
331            if $node->{id} == $#ops and $node->{id} != $#all;
332    }
333
334    print $out <<EOP;
335};
336#endif
337EOP
338}
339
340sub wrap_ifdef_print {
341    my $out= shift;
342    my $token= shift;
343    print $out <<EOP;
344
345#ifdef $token
346EOP
347    $_->($out) for @_;
348    print $out <<EOP;
349#endif /* $token */
350
351EOP
352}
353
354sub print_regarglen {
355    my ($out)= @_;
356    print $out <<EOP;
357
358/* regarglen[] - How large is the argument part of the node (in regnodes) */
359
360static const U8 regarglen[] = {
361EOP
362
363    foreach my $node (@ops) {
364        my $size= 0;
365        $size= "EXTRA_SIZE(struct regnode_$node->{args})" if $node->{args};
366
367        printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name};
368    }
369
370    print $out <<EOP;
371};
372EOP
373}
374
375sub print_reg_off_by_arg {
376    my ($out)= @_;
377    print $out <<EOP;
378
379/* reg_off_by_arg[] - Which argument holds the offset to the next node */
380
381static const char reg_off_by_arg[] = {
382EOP
383
384    foreach my $node (@ops) {
385        my $size= $node->{longj} || 0;
386
387        printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name};
388    }
389
390    print $out <<EOP;
391};
392
393EOP
394}
395
396sub print_reg_name {
397    my ($out)= @_;
398    print $out <<EOP;
399
400/* reg_name[] - Opcode/state names in string form, for debugging */
401
402#ifndef DOINIT
403EXTCONST char * PL_reg_name[];
404#else
405EXTCONST char * const PL_reg_name[] = {
406EOP
407
408    my $ofs= 0;
409    my $sym= "";
410    foreach my $node (@all) {
411        my $size= $node->{longj} || 0;
412
413        printf $out "\t%*s\t/* $sym%#04x */\n",
414            -3 - $width, qq("$node->{name}",), $node->{id} - $ofs;
415        if ( $node->{id} == $#ops and @ops != @all ) {
416            print $out "\t/* ------------ States ------------- */\n";
417            $ofs= $#ops;
418            $sym= 'REGNODE_MAX +';
419        }
420    }
421
422    print $out <<EOP;
423};
424#endif /* DOINIT */
425
426EOP
427}
428
429sub print_reg_extflags_name {
430    my ($out)= @_;
431    print $out <<EOP;
432/* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
433
434#ifndef DOINIT
435EXTCONST char * PL_reg_extflags_name[];
436#else
437EXTCONST char * const PL_reg_extflags_name[] = {
438EOP
439
440    my %rxfv;
441    my %definitions;    # Remember what the symbol definitions are
442    my $val= 0;
443    my %reverse;
444    my $REG_EXTFLAGS_NAME_SIZE= 0;
445    foreach my $file ( "op_reg_common.h", "regexp.h" ) {
446        open my $in_fh, "<", $file or die "Can't read '$file': $!";
447        while (<$in_fh>) {
448
449            # optional leading '_'.  Return symbol in $1, and strip it from
450            # comment of line
451            if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
452                chomp;
453                my $define= $1;
454                my $orig= $_;
455                s{ /\* .*? \*/ }{ }x;    # Replace comments by a blank
456
457                # Replace any prior defined symbols by their values
458                foreach my $key ( keys %definitions ) {
459                    s/\b$key\b/$definitions{$key}/g;
460                }
461
462                # Remove the U suffix from unsigned int literals
463                s/\b([0-9]+)U\b/$1/g;
464
465                my $newval= eval $_;     # Get numeric definition
466
467                $definitions{$define}= $newval;
468
469                next unless $_ =~ /<</;    # Bit defines use left shift
470                if ( $val & $newval ) {
471                    my @names= ( $define, $reverse{$newval} );
472                    s/PMf_// for @names;
473                    if ( $names[0] ne $names[1] ) {
474                        die sprintf
475                            "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
476                            $newval, $orig, $_;
477                    }
478                    next;
479                }
480                $val |= $newval;
481                $rxfv{$define}= $newval;
482                $reverse{$newval}= $define;
483            }
484        }
485    }
486    my %vrxf= reverse %rxfv;
487    printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
488        $val;
489    my %multibits;
490    for ( 0 .. 31 ) {
491        my $power_of_2= 2**$_;
492        my $n= $vrxf{$power_of_2};
493        my $extra= "";
494        if ( !$n ) {
495
496            # Here, there was no name that matched exactly the bit.  It could be
497            # either that it is unused, or the name matches multiple bits.
498            if ( !( $val & $power_of_2 ) ) {
499                $n= "UNUSED_BIT_$_";
500            }
501            else {
502
503                # Here, must be because it matches multiple bits.  Look through
504                # all possibilities until find one that matches this one.  Use
505                # that name, and all the bits it matches
506                foreach my $name ( keys %rxfv ) {
507                    if ( $rxfv{$name} & $power_of_2 ) {
508                        $n= $name . ( $multibits{$name}++ );
509                        $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
510                            $rxfv{$name}
511                            if $power_of_2 != $rxfv{$name};
512                        last;
513                    }
514                }
515            }
516        }
517        s/\bRXf_(PMf_)?// for $n, $extra;
518        printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
519        $REG_EXTFLAGS_NAME_SIZE++;
520    }
521
522    print $out <<EOP;
523};
524#endif /* DOINIT */
525
526#ifdef DEBUGGING
527#  define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
528#endif
529EOP
530
531}
532
533sub print_reg_intflags_name {
534    my ($out)= @_;
535    print $out <<EOP;
536
537/* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
538
539#ifndef DOINIT
540EXTCONST char * PL_reg_intflags_name[];
541#else
542EXTCONST char * const PL_reg_intflags_name[] = {
543EOP
544
545    my %rxfv;
546    my %definitions;    # Remember what the symbol definitions are
547    my $val= 0;
548    my %reverse;
549    my $REG_INTFLAGS_NAME_SIZE= 0;
550    foreach my $file ("regcomp.h") {
551        open my $fh, "<", $file or die "Can't read $file: $!";
552        while (<$fh>) {
553
554            # optional leading '_'.  Return symbol in $1, and strip it from
555            # comment of line
556            if (
557                m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
558                )
559            {
560                chomp;
561                my $define= $1;
562                my $abbr= $2;
563                my $hex= $3;
564                my $comment= $4;
565                my $val= hex($hex);
566                $comment= $comment ? " - $comment" : "";
567
568                printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
569                    $val, $define, $comment;
570                $REG_INTFLAGS_NAME_SIZE++;
571            }
572        }
573    }
574
575    print $out <<EOP;
576};
577#endif /* DOINIT */
578
579EOP
580    print $out <<EOQ;
581#ifdef DEBUGGING
582#  define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
583#endif
584
585EOQ
586}
587
588sub print_process_flags {
589    my ($out)= @_;
590
591    print $out process_flags( 'V', 'varies', <<'EOC');
592/* The following have no fixed length. U8 so we can do strchr() on it. */
593EOC
594
595    print $out process_flags( 'S', 'simple', <<'EOC');
596
597/* The following always have a length of 1. U8 we can do strchr() on it. */
598/* (Note that length 1 means "one character" under UTF8, not "one octet".) */
599EOC
600
601}
602
603sub do_perldebguts {
604    my $guts= open_new( 'pod/perldebguts.pod', '>' );
605
606    my $node;
607    my $code;
608    my $name_fmt= '<' x  ( $longest_name_length - 1 );
609    my $descr_fmt= '<' x ( 58 - $longest_name_length );
610    eval <<EOD or die $@;
611format GuTS =
612 ^*~~
613 \$node->{pod_comment}
614 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
615 \$node->{name}, \$code, \$node->{comment}//''
616.
6171;
618EOD
619
620    my $old_fh= select($guts);
621    $~= "GuTS";
622
623    open my $oldguts, "pod/perldebguts.pod"
624        or die "$0 cannot open pod/perldebguts.pod for reading: $!";
625    while (<$oldguts>) {
626        print;
627        last if /=for regcomp.pl begin/;
628    }
629
630    print <<'END_OF_DESCR';
631
632 # TYPE arg-description [num-args] [longjump-len] DESCRIPTION
633END_OF_DESCR
634    for my $n (@ops) {
635        $node= $n;
636        $code= "$node->{code} " . ( $node->{args} || "" );
637        $code .= " $node->{longj}" if $node->{longj};
638        if ( $node->{pod_comment} ||= "" ) {
639
640            # Trim multiple blanks
641            $node->{pod_comment} =~ s/^\n\n+/\n/;
642            $node->{pod_comment} =~ s/\n\n+$/\n\n/;
643        }
644        write;
645    }
646    print "\n";
647
648    while (<$oldguts>) {
649        last if /=for regcomp.pl end/;
650    }
651    do { print } while <$oldguts>;
652    select $old_fh;
653    close_and_rename($guts);
654}
655
656read_definition("regcomp.sym");
657my $out= open_new( 'regnodes.h', '>',
658    { by => 'regen/regcomp.pl', from => 'regcomp.sym' } );
659print_state_defs($out);
660print_regkind($out);
661wrap_ifdef_print(
662    $out,
663    "REG_COMP_C",
664    \&print_regarglen,
665    \&print_reg_off_by_arg
666);
667print_reg_name($out);
668print_reg_extflags_name($out);
669print_reg_intflags_name($out);
670print_process_flags($out);
671read_only_bottom_close_and_rename($out);
672
673do_perldebguts();
674