1#!/usr/bin/perl -w
2#
3#
4# Regenerate (overwriting only if changed):
5#
6#    pod/perldebguts.pod
7#    regnodes.h
8#
9# from information stored in
10#
11#    regcomp.sym
12#    op_reg_common.h
13#    regexp.h
14#
15# pod/perldebguts.pod is not completely regenerated.  Only the table of
16# regexp nodes is replaced; other parts remain unchanged.
17#
18# Accepts the standard regen_lib -q and -v args.
19#
20# This script is normally invoked from regen.pl.
21#
22# F<regcomp.sym> defines the opcodes and states used in the regex
23# engine, it also includes documentation on the opcodes. This script
24# parses those definitions out and turns them into typedefs, defines,
25# and data structures, and maybe even code which the regex engine can
26# use to operate.
27#
28# F<regexp.h> and op_reg_common.h contain defines C<RXf_xxx> and
29# C<PREGf_xxx> that are used in flags in our code. These defines are
30# parsed out and data structures are created to allow the debug mode of
31# the regex engine to show things such as which flags were set during
32# compilation. In some cases we transform the C code in the header files
33# into perl code which we execute to C<eval()> the contents. For instance
34# in a situation like this:
35#
36#   #define RXf_X 0x1   /* the X mode */
37#   #define RXf_Y 0x2   /* the Y mode */
38#   #define RXf_Z (X|Y) /* the Z mode */
39#
40# this script might end up eval()ing something like C<0x1> and then
41# C<0x2> and then C<(0x1|0x2)> the results of which it then might use in
42# constructing a data structure, or pod in perldebguts, or a comment in
43# C<regnodes.h>. It also would separate out the "X", "Y", and "Z" and
44# use them, and would also use the data in the line comment if present.
45#
46# If you compile a regex under perl -Mre=Debug,ALL you can see much
47# of the content that this file generates and parses out of its input
48# files.
49
50BEGIN {
51    # Get function prototypes
52    require './regen/regen_lib.pl';
53    require './regen/HeaderParser.pm';
54}
55
56use strict;
57
58# NOTE I don't think anyone actually knows what all of these properties mean,
59# and I suspect some of them are outright unused. This is a first attempt to
60# clean up the generation so maybe one day we can move to something more self
61# documenting. (One might argue that an array of hashes of properties would
62# be easier to use.)
63#
64# Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
65
66# General thoughts:
67# 1. We use a single continuum to represent both opcodes and states,
68#    and in regexec.c we switch on the combined set.
69# 2. Opcodes have more information associated to them, states are simpler,
70#    basically just an identifier/number that can be used to switch within
71#    the state machine.
72# 3. Some opcode are order dependent.
73# 4. Output files often use "tricks" to reduce diff effects. Some of what
74#    we do below is more clumsy looking than it could be because of this.
75
76# Op/state properties:
77#
78# Property      In      Descr
79# ----------------------------------------------------------------------------
80# name          Both    Name of op/state
81# id            Both    integer value for this opcode/state
82# optype        Both    Either 'op' or 'state'
# line_num      Both    line number in the input file where this item is defined.
84# type          Op      Type of node (aka regnode_kind)
85# code          Op      Meta about the node, used to detect variable length nodes
86# suffix        Op      which regnode struct this uses, so if this is '1', it
87#                       uses 'struct regnode_1'
88# flags         Op      S for simple; V for varies
89# longj         Op      Boolean as to if this node is a longjump
# comment       Both    Comment about node, if any.  Placed in perldebguts
#                       as its description
92# pod_comment   Both    Special comments for pod output (preceding lines in def)
93#                       Such lines begin with '#*'
94
95# Global State
# These tables are filled in by register_node() and parse_opcode_def() while
# the definition file is read, and are then consulted by the print_*() subs.
my @all;    # all opcodes/state, in registration order (index == node id)
my %all;    # hash of all opcode/state names, mapping name => node hashref

my @ops;    # array of just opcodes
my @states; # array of just states

my $longest_name_length= 0; # track lengths of names for nicer reports
my (%type_alias);           # maps a type that is not itself a node name to
                            # the name of the first opcode seen using it
104
# Register a newly constructed node into our global state tables.
#
# Takes a hashref which must already have 'name', 'optype' (either "op" or
# "state") and 'line_num' filled in.  Ensures that we have no name
# collisions (on name anyway), files the node under @ops or @states, issues
# the "id" for the node (its index in @all) and records it in @all and %all.
#
# Dies if the name is already registered, if optype is missing or is not one
# we know about, or if 'longj' is true but is not 1.
sub register_node {
    my ($node)= @_;

    # Data::Dumper is otherwise only loaded by the DUMP debugging path of
    # this script, so pull it in on demand for the diagnostics below.
    my $dump= sub { require Data::Dumper; return Data::Dumper::Dumper(@_) };

    my $name= $node->{name};

    # Validate everything first, so that a bad node never ends up half
    # registered in the tables.
    if ( $all{$name} ) {
        die "Duplicate item '$name' in regcomp.sym line $node->{line_num} "
            . "previously defined on line $all{$name}{line_num}\n";
    }
    if ( !$node->{optype} ) {
        die "must have an optype in node ", $dump->($node);
    }
    if ( $node->{longj} && $node->{longj} != 1 ) {
        die "longj field must be in [01] if present in ", $dump->($node);
    }

    if ( $node->{optype} eq "op" ) {
        push @ops, $node;
    }
    elsif ( $node->{optype} eq "state" ) {
        push @states, $node;
    }
    else {
        die "Unknown optype '$node->{optype}' in ", $dump->($node);
    }

    # The id is simply the position of the node in the combined list.
    $node->{id}= 0 + @all;
    push @all, $node;
    $all{$name}= $node;

    return;
}
132
# Parse and add an opcode definition to the global state.
# What an opcode definition looks like is given in regcomp.sym.
#
# Not every opcode definition has all of the components. We should maybe make
# this nicer/easier to read in the future. Also note that the format is tab
# sensitive: the middle ("desc") part of a definition may not contain a tab.

# Special comments for an entry precede it, and begin with '#*' and are placed
# in the generated pod file just before the entry.
#
# Arguments:
#   $text        - the definition line, with trailing whitespace removed
#   $line_num    - line number of the definition in the input file
#   $pod_comment - accumulated '#*' comment text for this entry, if any
#
# Dies if the line does not look like an opcode definition, or if
# register_node() rejects the resulting node.

sub parse_opcode_def {
    my ( $text, $line_num, $pod_comment )= @_;
    my $node= {
        line_num    => $line_num,
        pod_comment => $pod_comment,
        optype      => "op",
    };

    # first split the line into three, the initial NAME, a middle part
    # that we call "desc" which contains various (not well documented) things,
    # and a comment section.  Match the text we were handed, not whatever
    # happens to be in $_ at the time of the call.
    @{$node}{qw(name desc comment)}=
        $text =~ /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
        or die "Failed to match $text";

    # the content of the "desc" field from the first step is extracted here:
    @{$node}{qw(type code suffix flags longj)}= split /[,\s]\s*/, $node->{desc};

    # Components missing from the definition become empty strings so that
    # later code can test them without tripping over undef.
    defined $node->{$_} or $node->{$_} = ""
        for qw(type code suffix flags longj);

    register_node($node); # has to be before the type_alias code below

    # A type which is neither a node name nor an already known alias is
    # remembered as an alias for the first opcode that uses it.
    if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {

        #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
        #     ." is not an existing regop, and will be aliased to $node->{name}\n"
        #    if -t STDERR;
        $type_alias{ $node->{type} }= $node->{name};
    }

    $longest_name_length= length $node->{name}
        if length $node->{name} > $longest_name_length;
}
176
# parse out a state definition and add the resulting data
# into the global state. may create multiple new states from
# a single definition (this is part of the point).
# Format for states:
# REGOP \t typelist [ \t typelist]
# typelist= namelist
#         = namelist:FAIL
#         = name:count
# Eg:
# WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
# BRANCH          next:FAIL
# CURLYM          A,B:FAIL
#
# The CURLYM definition would create the states:
# CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
#
# Arguments:
#   $text        - the definition line
#   $line_num    - line number of the definition in the input file
#   $pod_comment - accepted so the signature parallels parse_opcode_def(),
#                  but not stored on state nodes
#
# Dies if the line has no typelist, if the part after ':' is neither a count
# nor 'FAIL', or if register_node() rejects one of the generated states.
sub parse_state_def {
    my ( $text, $line_num, $pod_comment )= @_;
    my ( $type, @lists )= split /\s+/, $text;
    die "No list? $type" if !@lists;
    foreach my $list (@lists) {
        my ( $names, $special )= split /:/, $list, 2;
        $special ||= "";

        # The suffixes depend only on what follows the ':', so work them
        # out once per typelist rather than once per name.
        my @suffix;
        if ( !$special ) {
            @suffix= ("");
        }
        elsif ( $special =~ /\A\d+\z/ ) {

            # Only a pure number is a count; anything else that merely
            # contains a digit falls through to the error below.
            @suffix= ( 1 .. $special );
        }
        elsif ( $special eq 'FAIL' ) {
            @suffix= ( "", "_fail" );
        }
        else {
            die "unknown :type ':$special'";
        }

        foreach my $name ( split /,/, $names ) {

            # 'resume' is the one name that is prefixed rather than suffixed
            my $real=
                $name eq 'resume'
                ? "resume_$type"
                : "${type}_$name";
            foreach my $suffix (@suffix) {
                my $node= {
                    name        => "$real$suffix",
                    optype      => "state",
                    type        => $type || "",
                    comment     => "state for $type",
                    line_num    => $line_num,
                };
                register_node($node);
            }
        }
    }
}
230
# Build the C text for the tables describing which opcodes carry a flag.
#
# Arguments:
#   $flag    - the flag letter to look for in each opcode's 'flags' field
#   $varname - base name used for the generated macro and PL_* arrays
#   $comment - optional text placed in front of the generated code
#
# Returns a string containing the REGNODE_<VARNAME>() macro, a list of the
# opcode names that have the flag (terminated by 0), and a bitmap with one
# bit per opcode, indexed by opcode id.
sub process_flags {
    my ( $flag, $varname, $comment )= @_;
    $comment //= '';

    my @names;    # opcodes that have the flag set
    my @bytes;    # the bitmap, one integer per 8 opcodes

    foreach my $op (@ops) {
        my $id= $op->{id};
        my $has_flag= ( $op->{flags} && $op->{flags} eq $flag ) ? 1 : 0;

        # Done longhand rather than with vec(), mirroring the arithmetic
        # that the generated C macro performs.
        my $byte_idx= $id >> 3;
        $bytes[$byte_idx]=
            ( $bytes[$byte_idx] // 0 ) | ( $has_flag << ( $id & 7 ) );

        push @names, $op->{name} if $has_flag;
    }

    # The name list is 0 terminated, and folded so lines stay short.
    my $out_string= join ', ', @names, 0;
    $out_string =~ s/(.{1,70},) /$1\n    /g;

    my $out_mask= join ', ', map { sprintf "0x%02X", $_ // 0 } @bytes;

    my $code= <<"EOP";
#define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))

#ifndef DOINIT
EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
#else
EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
    $out_string
};
#endif /* DOINIT */

#ifndef DOINIT
EXTCONST U8 PL_${varname}_bitmask[];
#else
EXTCONST U8 PL_${varname}_bitmask[] = {
    $out_mask
};
#endif /* DOINIT */
EOP
    return $comment . $code;
}
276
# Print the bitmasks (and the macros that consult them) describing the
# opcodes whose type is EXACT.
#
# Bit N of each mask corresponds to the opcode whose id is N greater than
# the id of the EXACT opcode itself, which is how the generated macros index
# the masks.  That only works if EXACT is the first opcode of its type, so
# we die if it is not, and also if there are more such opcodes than fit in
# the 32 bit masks.
#
#   $out - filehandle to print the generated C to
sub print_process_EXACTish {
    my ($out)= @_;

    # Creates some bitmaps for EXACTish nodes.

    my @folded;
    my @req8;

    my $base;
    for my $node (@ops) {
        next unless $node->{type} eq 'EXACT';
        my $name = $node->{name};
        $base = $node->{id} if $name eq 'EXACT';
        die "Opcode $name has type EXACT but is defined before EXACT itself"
            unless defined $base;

        my $index = $node->{id} - $base;

        # This depends entirely on naming conventions in regcomp.sym
        $folded[$index] = $name =~ /^EXACTF/ ? 1 : 0;
        $req8[$index]   = $name =~ /8/       ? 1 : 0;
    }

    die "Can't cope with > 32 EXACTish nodes" if @folded > 32;

    # Opcodes of other types interleaved with the EXACT ones leave holes in
    # the arrays.  Those must become explicit 0 bits, or every later bit
    # would end up in the wrong position.
    my $exactf = sprintf "%X",
        oct( "0b" . join "", map { $_ // 0 } reverse @folded );
    my $req8 = sprintf "%X",
        oct( "0b" . join "", map { $_ // 0 } reverse @req8 );
    print $out <<EOP;

/* Is 'op', known to be of type EXACT, folding? */
#define isEXACTFish(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACTFish_bitmask & (1U << (op - EXACT))))

/* Do only UTF-8 target strings match 'op', known to be of type EXACT? */
#define isEXACT_REQ8(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACT_REQ8_bitmask & (1U << (op - EXACT))))

#ifndef DOINIT
EXTCONST U32 PL_EXACTFish_bitmask;
EXTCONST U32 PL_EXACT_REQ8_bitmask;
#else
EXTCONST U32 PL_EXACTFish_bitmask = 0x$exactf;
EXTCONST U32 PL_EXACT_REQ8_bitmask = 0x$req8;
#endif /* DOINIT */
EOP
}
319
# Read the definition file and register every opcode and state it defines.
#
# Lines before the separator (a line consisting only of dashes) are handed
# to parse_opcode_def(), lines after it to parse_state_def().  Comment and
# blank lines are skipped, except that '#*' comments (and the blank lines
# around them) are collected and passed along with the next definition.
#
#   $file - name of the file to read
#
# Dies if the file cannot be opened or closed, if a definition cannot be
# parsed, or if more than 256 opcodes and states end up being defined.
sub read_definition {
    my ( $file )= @_;
    my $seen_sep= 0;
    my $pod_comment= "";
    open my $in_fh, "<", $file
        or die "Failed to open '$file' for reading: $!";

    # NOTE: the current line is deliberately kept in $_ for the duration of
    # the loop body, including the calls to the parse subs.
    while (<$in_fh>) {

        # Special pod comments.  The capture keeps the rest of the line,
        # including its newline.
        if (/^#\* ?(.*)/s) { $pod_comment .= "# $1"; }

        # Truly blank lines possibly surrounding pod comments
        elsif (/^\s*$/) { $pod_comment .= "\n" }

        next if /\A\s*#/ || /\A\s*\z/;

        s/\s*\z//;
        if (/^-+\s*$/) {
            $seen_sep= 1;
            next;
        }

        if ($seen_sep) {
            parse_state_def( $_, $., $pod_comment );
        }
        else {
            parse_opcode_def( $_, $., $pod_comment );
        }

        # The collected comment belongs only to the definition just parsed
        $pod_comment= "";
    }
    close $in_fh
        or die "Failed to close '$file' after reading: $!";
    die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
        " in file!"
        if @all > 256;
}
354
# use fixed width to keep the diffs between regcomp.pl recompiles
# as small as possible.
# $base_name_width is the column width reserved for node names in the
# generated #defines and in the name table.
# NOTE(review): $rwidth and $twidth do not appear to be referenced anywhere
# else in this file - confirm before relying on or removing them.
my ( $base_name_width, $rwidth, $twidth )= ( 22, 12, 9 );
358
# Print the #defines giving the numeric value of every opcode and state,
# preceded by REGNODE_MAX and REGMATCH_STATE_MAX and by the macros used to
# combine an opcode with utf8ness information.
#
# Each opcode which is the target of a type alias also gets its #defines
# repeated under the alias name.
#
#   $out - filehandle to print the generated C to
#
# Dies, before printing anything, if some name is wider than the fixed
# column width the output is laid out with.
sub print_state_defs {
    my ($out)= @_;

    # The layout uses a fixed name column; refuse to generate anything if a
    # name does not fit.
    my $max_name_width = 0;
    for my $node (@ops, @states) {
        my $len = length $node->{name};
        $max_name_width = $len if $max_name_width < $len;
    }

    die "Do a white-space only commit to increase \$base_name_width to"
     .  " $max_name_width; then re-run"  if $base_name_width < $max_name_width;

    printf $out <<EOP,
/* Regops and State definitions */

#define %*s\t%d
#define %*s\t%d

EOP
        -$base_name_width,
        REGNODE_MAX => $#ops,
        -$base_name_width, REGMATCH_STATE_MAX => $#all;

    print $out <<EOT;
/* -- For regexec.c to switch on target being utf8 (t8) or not (tb, b='byte'); */
#define with_t_UTF8ness(op, t_utf8) (((op) << 1) + (cBOOL(t_utf8)))
/* -- same, but also with pattern (p8, pb) -- */
#define with_tp_UTF8ness(op, t_utf8, p_utf8)                        \\
\t\t(((op) << 2) + (cBOOL(t_utf8) << 1) + cBOOL(p_utf8))

/* The #defines below give both the basic regnode and the expanded version for
   switching on utf8ness */
EOT

    # Maps an opcode name back to the type name that is aliased to it
    my %rev_type_alias= reverse %type_alias;

    for my $node (@ops) {
        print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment});
        if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
            print_state_def_line($out, $alias, $node->{id}, $node->{comment});
        }
    }

    print $out "\t/* ------------ States ------------- */\n";
    for my $node (@states) {
        print_state_def_line($out, $node->{name}, $node->{id}, $node->{comment});
    }
}
411
# Print the group of #defines for one opcode or state: the base define
# with its (possibly wrapped) comment, the two variants used when switching
# on with_t_UTF8ness(), the four used with with_tp_UTF8ness(), and a
# trailing blank line.
#
#   $fh      - filehandle to print to
#   $name    - name to define
#   $id      - numeric value of the base define
#   $comment - description placed in the comment of the base define
sub print_state_def_line
{
    my ($fh, $name, $id, $comment) = @_;

    # The sub-names are like '_tb' or '_tb_p8' = max 6 chars wide
    my $name_col_width = $base_name_width + 6;

    # Widths of the three value columns: base ids go up to '255', the
    # doubled ones up to '511', and the quadrupled ones could reach '1023'
    # but are nowhere near needing the 4th column.
    my ($base_id_width, $mid_id_width, $full_id_width) = (3, 3, 3);

    # Name, padded to its column, then the id in the first value column
    # with the other two value columns left blank.
    my $line = sprintf "#define %s%s%*s%s%s/* ",
        $name,
        " " x ($name_col_width - length($name)),
        $base_id_width, $id,
        " " x $mid_id_width,
        " " x ($full_id_width + 2);

    # Continuation lines of a wrapped comment are indented to here
    my $hanging = length $line;
    $line .= sprintf "0x%02x", $id;

    my $columns = 78;

    # From the documentation: 'In fact, every resulting line will have length
    # of no more than "$columns - 1"'
    $line = wrap($columns + 1, "", " " x $hanging, "$line $comment");
    chomp $line;            # wrap always adds a trailing \n
    $line =~ s/ \s+ $ //x;  # trim, just in case.

    # The comment may have wrapped.  Measure what follows the final \n: when
    # there is room left the ' */' goes on that same line, otherwise it
    # gets a line of its own consisting of just blanks and the ' */'
    my $last_line_len = length($line) - rindex($line, "\n") - 1;
    if ($last_line_len <= $columns - 3) {
        $line .= " */\n";
    }
    else {
        $line .= "\n" . " " x ($hanging - 3) . "*/\n";
    }

    print $fh $line;

    # 4 is the width of the 0xHH that the base entry uses
    my $hex_gap = " " x (4 + 2);

    # The 2 subsidiary #defines used when switching on with_t_UTF8ness();
    # their values go in the middle value column.
    my @t_variants = qw(tb  t8);
    for my $i (0 .. $#t_variants) {
        my $with_name = "${name}_$t_variants[$i]";
        my $value     = $id * 2 + $i;
        printf $fh "#define %s%s%*s%s  /*%s0x%03x */\n",
            $with_name,
            " " x ($name_col_width - length($with_name) + $base_id_width),
            $mid_id_width, $value,
            " " x $full_id_width,
            $hex_gap, $value;
    }

    # The 4 subsidiary #defines used when switching on with_tp_UTF8ness();
    # their values go in the last value column.
    my @tp_variants = qw(tb_pb  tb_p8  t8_pb  t8_p8);
    for my $i (0 .. $#tp_variants) {
        my $with_name = "${name}_$tp_variants[$i]";
        my $value     = $id * 4 + $i;
        printf $fh "#define %s%s%*s  /*%s0x%03x */\n",
            $with_name,
            " " x ($name_col_width - length($with_name)
                    + $base_id_width + $mid_id_width),
            $full_id_width, $value,
            $hex_gap, $value;
    }

    print $fh "\n"; # Blank line separates groups for clarity
}
488
# Print one typedef per opcode, in name order, naming the regnode struct
# that the opcode uses: "struct regnode", with "_<suffix>" appended when the
# opcode has a suffix.
#
# As a side effect the name of the typedef is stored in each opcode's
# 'typedef' field.
#
#   $out - filehandle to print the generated C to
sub print_typedefs {
    my ($out)= @_;
    print $out <<EOP;

/* typedefs for regex nodes - one typedef per node type */

EOP

    # Size the struct name column from the longest suffix in use, rounded
    # up to a multiple of 5 with some room to spare.
    my $widest_suffix= 0;
    for my $op (@ops) {
        my $suffix= $op->{suffix} or next;
        $widest_suffix= length $suffix if $widest_suffix < length $suffix;
    }
    my $struct_width= $widest_suffix + length "struct regnode_";
    $struct_width= ( int( $struct_width / 5 ) + 2 ) * 5;

    my $prefix= "tregnode";
    for my $op ( sort { $a->{name} cmp $b->{name} } @ops ) {
        my $struct_name=
            $op->{suffix} ? "struct regnode_$op->{suffix}" : "struct regnode";
        $op->{typedef}= $prefix . "_" . $op->{name};
        printf $out "typedef %-*s %s;\n", $struct_width, $struct_name,
            $op->{typedef};
    }

    print $out <<EOP;

/* end typedefs */

EOP

}
521
522
523
524
# Print the PL_regnode_info[] table, which has one initializer per opcode
# and state, in id order.
#
# As a side effect every node gets 'arg_len', 'arg_len_varies' and
# 'off_by_arg' fields.  'arg_len' refers to the node's 'typedef' field when
# the node has a suffix, so that field must have been filled in beforehand.
#
#   $out - filehandle to print the generated C to
sub print_regnode_info {
    my ($out)= @_;
    print $out <<EOP;

/* PL_regnode_info[] - Opcode/state names in string form, for debugging */

#ifndef DOINIT
EXTCONST struct regnode_meta PL_regnode_info[];
#else
EXTCONST struct regnode_meta PL_regnode_info[] = {
EOP

    my @fields= qw(type arg_len arg_len_varies off_by_arg);
    for my $idx ( 0 .. $#all ) {
        my $node= $all[$idx];

        # Only nodes with a struct suffix have any extra size
        $node->{arg_len}=
            $node->{suffix} ? "EXTRA_SIZE($node->{typedef})" : 0;

        # A 'code' mentioning "str" marks a node as being of variable length
        $node->{arg_len_varies}=
            ( $node->{code} and $node->{code} =~ "str" ) ? 1 : 0;

        $node->{off_by_arg}= $node->{longj} || 0;

        # One designated initializer per field, the last one without a comma
        my @inits=
            map { sprintf "        .%s = %s", $_, $node->{$_} // 0 } @fields;

        print $out "    {\n";
        print $out "        /* #$idx $node->{optype} $node->{name} */\n";
        print $out join( ",\n", @inits ), "\n";
        print $out "    }", ( $idx == $#all ? "\n" : ",\n" );
    }

    print $out <<EOP;
};
#endif /* DOINIT */

EOP
}
568
569
# Print the PL_regnode_name[] table: the name of every opcode and state as
# a C string, in id order, each annotated with its number.  A marker comment
# separates the opcodes from the states, and from there on the annotation
# is written as an offset from REGNODE_MAX.
#
#   $out - filehandle to print the generated C to
sub print_regnode_name {
    my ($out)= @_;
    print $out <<EOP;

/* PL_regnode_name[] - Opcode/state names in string form, for debugging */

#ifndef DOINIT
EXTCONST char * PL_regnode_name[];
#else
EXTCONST char * const PL_regnode_name[] = {
EOP

    my $name_width= $base_name_width + 3;    # room for the quotes and comma
    my $last_op_id= $#ops;
    my $has_states= @ops != @all;

    my ( $ofs, $sym )= ( 0, "" );
    for my $node (@all) {
        my $id= $node->{id};
        printf $out "\t%-*s\t/* %s%#04x */\n",
            $name_width, qq("$node->{name}",), $sym, $id - $ofs;

        # After the last opcode, switch to numbering relative to it
        next unless $id == $last_op_id and $has_states;
        print $out "\t/* ------------ States ------------- */\n";
        $ofs= $last_op_id;
        $sym= 'REGNODE_MAX +';
    }

    print $out <<EOP;
};
#endif /* DOINIT */

EOP
}
600
# Print the PL_reg_extflags_name[] table, which gives a name to each of the
# 32 bits of the extflags, followed by a define of the table's size.
#
# The names come from the RXf_* #defines (optionally with a leading '_') in
# op_reg_common.h and regexp.h.  Every such define is evaluated, after
# substituting the values of the ones seen before it, but only definitions
# written with a left shift are treated as flag bits.
#
#   $out - filehandle to print the generated C to
#
# Dies if two differently named flags claim the same bits.
sub print_reg_extflags_name {
    my ($out)= @_;
    print $out <<EOP;
/* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */

#ifndef DOINIT
EXTCONST char * PL_reg_extflags_name[];
#else
EXTCONST char * const PL_reg_extflags_name[] = {
EOP

    my %rxfv;           # flag name => value, for the flags we will name
    my %definitions;    # Remember what the symbol definitions are
    my $val= 0;         # union of all the flag bits seen so far
    my %reverse;        # flag value => name
    my $REG_EXTFLAGS_NAME_SIZE= 0;
    my $hp= HeaderParser->new();
    foreach my $file ( "op_reg_common.h", "regexp.h" ) {
        $hp->read_file($file);
        foreach my $line_info (@{$hp->lines}) {
            next unless $line_info->{type}     eq "content"
                    and $line_info->{sub_type} eq "#define";
            my $line= $line_info->{line};
            $line=~s/\s*\\\n\s*/ /g;

            # Keep the complete definition, as it was before we start
            # taking it apart, for use in diagnostics.
            ( my $orig= $line ) =~ s/\s+\z//;

            # optional leading '_'.  Return symbol in $1, and strip it from
            # comment of line.  Currently doesn't handle comments running onto
            # next line
            if ($line=~s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
                chomp($line);
                my $define= $1;
                $line=~s{ /\* .*? \*/ }{ }x;    # Replace comments by a blank

                # Replace any prior defined symbols by their values
                foreach my $key ( keys %definitions ) {
                    $line=~s/\b$key\b/$definitions{$key}/g;
                }

                # Remove the U suffix from unsigned int literals
                $line=~s/\b([0-9]+)U\b/$1/g;

                my $newval= eval $line;     # Get numeric definition

                $definitions{$define}= $newval;

                next unless $line =~ /<</;    # Bit defines use left shift

                # Bits that are already taken are only acceptable when this
                # is the same flag again, give or take a PMf_ in the name.
                if ( $val & $newval ) {
                    my @names= ( $define, $reverse{$newval} );
                    s/PMf_// for @names;
                    if ( $names[0] ne $names[1] ) {
                        die sprintf
                            "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
                            $newval, $orig, $line;
                    }
                    next;
                }
                $val |= $newval;
                $rxfv{$define}= $newval;
                $reverse{$newval}= $define;
            }
        }
    }
    my %vrxf= reverse %rxfv;
    printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
        $val;
    my %multibits;    # how many bits of each multi-bit flag were named so far
    for ( 0 .. 31 ) {
        my $power_of_2= 2**$_;
        my $n= $vrxf{$power_of_2};
        my $extra= "";
        if ( !$n ) {

            # Here, there was no name that matched exactly the bit.  It could be
            # either that it is unused, or the name matches multiple bits.
            if ( !( $val & $power_of_2 ) ) {
                $n= "UNUSED_BIT_$_";
            }
            else {

                # Here, must be because it matches multiple bits.  Look through
                # all possibilities until find one that matches this one.  Use
                # that name, and all the bits it matches
                foreach my $name ( keys %rxfv ) {
                    if ( $rxfv{$name} & $power_of_2 ) {
                        $n= $name . ( $multibits{$name}++ );
                        $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
                            $rxfv{$name}
                            if $power_of_2 != $rxfv{$name};
                        last;
                    }
                }
            }
        }
        s/\bRXf_(PMf_)?// for $n, $extra;
        printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
        $REG_EXTFLAGS_NAME_SIZE++;
    }

    print $out <<EOP;
};
#endif /* DOINIT */

#ifdef DEBUGGING
#  define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
#endif
EOP

}
710
# Print the PL_reg_intflags_name[] table, which gives a name to each bit of
# the intflags up to the highest one defined, followed by a define of the
# table's size.
#
# The names come from the '#define PREGf_xxx 0x...' lines in regcomp.h; a
# comment following the value on the same line is copied to the output.
# Bits without a definition are listed as unused.
#
#   $out - filehandle to print the generated C to
#
# Dies if a PREGf value has more than one bit set, or if two definitions
# use the same bit.
sub print_reg_intflags_name {
    my ($out)= @_;
    print $out <<EOP;

/* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */

#ifndef DOINIT
EXTCONST char * PL_reg_intflags_name[];
#else
EXTCONST char * const PL_reg_intflags_name[] = {
EOP

    my $REG_INTFLAGS_NAME_SIZE= 0;
    my $hp= HeaderParser->new();
    foreach my $file ("regcomp.h") {
        $hp->read_file($file);
        my @bit_tuples;    # indexed by bit number
        foreach my $line_info (@{$hp->lines}) {
            next unless $line_info->{type}     eq "content"
                    and $line_info->{sub_type} eq "#define";
            my $line= $line_info->{line};
            $line=~s/\s*\\\n\s*/ /g;

            # Return the symbol in $1, its name without the PREGf_ prefix in
            # $2, the hex digits of its value in $3, and the text of any
            # comment following the value in $4
            if (
                $line =~ m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
            ){
                my $define= $1;
                my $abbr= $2;
                my $hex= $3;
                my $comment= $4;
                my $val= hex($hex);
                my $bin= sprintf "%b", $val;
                if ($bin=~/1.*?1/) { die "Not expecting multiple bits in PREGf" }

                # With a single bit set, its number follows from the length
                my $bit= length($bin) - 1 ;
                $comment= $comment ? " - $comment" : "";
                if ($bit_tuples[$bit]) {
                    die "Duplicate PREGf bit '$bit': $define $val ($hex)";
                }
                $bit_tuples[$bit]= [ $bit, $val, $abbr, $define, $comment ];
            }
        }
        foreach my $i (0..$#bit_tuples) {
            my $bit_tuple= $bit_tuples[$i];
            if (!$bit_tuple) {

                # A hole below the highest bit that is in use
                $bit_tuple= [ $i, 1<<$i, "", "", "*UNUSED*" ];
            }
            my ($bit, $val, $abbr, $define, $comment)= @$bit_tuple;
            printf $out qq(\t%-30s/* (1<<%2d) - 0x%08x - %s%s */\n),
                qq("$abbr",), $bit, $val, $define, $comment;
        }
        $REG_INTFLAGS_NAME_SIZE=0+@bit_tuples;
    }

    print $out <<EOP;
};
#endif /* DOINIT */

EOP
    print $out <<EOQ;
#ifdef DEBUGGING
#  define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
#endif

EOQ
}
784
# Print the tables generated by process_flags() for the 'V' (varies) and
# 'S' (simple) opcode flags, each introduced by its explanatory C comment.
#
#   $out - filehandle to print the generated C to
sub print_process_flags {
    my ($out)= @_;

    my $varies_comment= <<'EOC';
/* The following have no fixed length. U8 so we can do strchr() on it. */
EOC

    my $simple_comment= <<'EOC';

/* The following always have a length of 1. U8 we can do strchr() on it. */
/* (Note that length 1 means "one character" under UTF8, not "one octet".) */
EOC

    print $out process_flags( 'V', 'varies', $varies_comment );
    print $out process_flags( 'S', 'simple', $simple_comment );
}
799
# Regenerate the table of opcodes in pod/perldebguts.pod.
#
# The existing file is copied through unchanged up to and including the line
# containing "=for regcomp.pl begin", then a freshly formatted table with
# one entry per opcode in @ops is written, the old content up to the line
# containing "=for regcomp.pl end" is dropped, and the rest of the file is
# copied through again.
#
# Takes no arguments.  Dies if the format cannot be compiled or if the
# existing pod file cannot be opened or closed.
sub do_perldebguts {
    my $guts= open_new( 'pod/perldebguts.pod', '>' );

    # $node and $code are what the format below reads, so they have to be
    # declared before the format is compiled, and are set for each entry
    # before it is written.
    my $node;
    my $code;

    # Field widths for the format, derived from the longest opcode name
    my $name_fmt= '<' x  ( $longest_name_length - 1 );
    my $descr_fmt= '<' x ( 58 - $longest_name_length );

    # The format is built as a string because the widths are only known at
    # run time.  The trailing "1;" makes a successful eval return true.
    eval <<EOD or die $@;
format GuTS =
 ^*~~
 \$node->{pod_comment}
 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
 \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : ''
.
1;
EOD

    # From here on plain print and write go to the new file, using the
    # format defined above.
    my $old_fh= select($guts);
    $~= "GuTS";

    open my $oldguts, '<', 'pod/perldebguts.pod'
        or die "$0 cannot open pod/perldebguts.pod for reading: $!";

    # Copy everything up to and including the begin marker
    while (<$oldguts>) {
        print;
        last if /=for regcomp.pl begin/;
    }

    print <<'END_OF_DESCR';

 # TYPE arg-description [regnode-struct-suffix] [longjump-len] DESCRIPTION
END_OF_DESCR
    for my $n (@ops) {
        $node= $n;

        # The middle column: the code, the struct suffix if any, and the
        # longjump field when it is set
        $code= "$node->{code} " . ( $node->{suffix} || "" );
        $code .= " $node->{longj}" if $node->{longj};
        if ( $node->{pod_comment} ||= "" ) {

            # Trim multiple blanks
            $node->{pod_comment} =~ s/^\n\n+/\n/;
            $node->{pod_comment} =~ s/\n\n+$/\n\n/;
        }
        write;
    }
    print "\n";

    # Skip the old table, stopping on the end marker ...
    while (<$oldguts>) {
        last if /=for regcomp.pl end/;
    }

    # ... which is still in $_ and so is the first line printed here,
    # followed by the remainder of the old file.
    do { print } while <$oldguts>; #win32 can't unlink an open FH
    close $oldguts or die "Error closing pod/perldebguts.pod: $!";
    select $old_fh;
    close_and_rename($guts);
}
853
# Main program: read the definitions, then generate regnodes.h and update
# pod/perldebguts.pod.

# Preprocessor condition that the whole of the generated header is wrapped in
my $confine_to_core = 'defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)';
read_definition("regcomp.sym");

# Debugging aid: with DUMP set in the environment, just show what was
# parsed and exit (with a non-zero status) without generating anything.
if ($ENV{DUMP}) {
    require Data::Dumper;
    print Data::Dumper::Dumper(\@all);
    exit(1);
}
my $out= open_new( 'regnodes.h', '>',
    {
        by      => 'regen/regcomp.pl',
        from    => [ 'regcomp.sym', 'op_reg_common.h', 'regexp.h' ],
    },
);
print $out "#if $confine_to_core\n\n";

# The order of these calls determines the layout of the generated file.
# print_typedefs() also has to come before print_regnode_info(), which uses
# the typedef names that it stores in the nodes.
print_typedefs($out);
print_state_defs($out);

print_regnode_name($out);
print_regnode_info($out);


print_reg_extflags_name($out);
print_reg_intflags_name($out);
print_process_flags($out);
print_process_EXACTish($out);
print $out "\n#endif /* $confine_to_core */\n";
read_only_bottom_close_and_rename($out);

do_perldebguts();
883