scanprov revision 1.2
1102644Snectar#!/usr/bin/perl -w
2102644Snectar$|=1;
3102644Snectar################################################################################
4102644Snectar#
5102644Snectar#  scanprov -- scan Perl headers for macros, and add known exceptions, and
6102644Snectar#              functions we weren't able to otherwise find.  Thus the purpose
7102644Snectar#              of this file has been expanded beyond what its name says.
8102644Snectar#
9102644Snectar#  Besides the normal options, 'mode=clean' is understood as 'write', but
10102644Snectar#  first remove any scanprov lines added in previous runs of this.
11102644Snectar#
12102644Snectar#  The lines added have a code to signify they are added by us:
13102644Snectar#   F means it is a function in embed.fnc that the normal routines didn't find
14102644Snectar#   K means it is a macro in config.h, hence is provided, and documented
15102644Snectar#   M means it is a provided by D:P macro
16102644Snectar#   X means it is a known exceptional item
17102644Snectar#   Z means it is an unprovided macro without documentation
18102644Snectar#
19102644Snectar#  The regeneration routines do not know the prototypes for the macros scanned
20102644Snectar#  for, which is gotten from documentation in the source.  (If they were
21102644Snectar#  documented, they would be put in parts/apidoc.fnc, and test cases generated
22102644Snectar#  for them in mktodo.pl).  Therefore these are all undocumented, except for
23102644Snectar#  things from config.h which are all documented there, and many of which are
24102644Snectar#  just defined or not defined, and hence can't be tested.  Thus looking for
25102644Snectar#  them here is the most convenient option, which is why it's done here.
26102644Snectar#
27102644Snectar#  The scope of this program has also expanded to look in almost all header
28102644Snectar#  files for almost all macros that aren't documented nor provided.  This
29102644Snectar#  allows ppport.h --api-info=/foo/ to return when a given element actually
30102644Snectar#  came into existence, which can be a time saver for developers of the perl
31102644Snectar#  core.
32102644Snectar#
33102644Snectar#  It would be best if people would add documentation to them in the perl
34102644Snectar#  source, and then this portion of this function would be minimized.
35102644Snectar#
36102644Snectar#  On Linux nm and other uses by D:P, these are the remaining unused capital
37102644Snectar#  flags: HJLOQY
38102644Snectar#
39102644Snectar################################################################################
40102644Snectar#
41102644Snectar#  Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
42102644Snectar#  Version 2.x, Copyright (C) 2001, Paul Marquess.
43102644Snectar#  Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
44102644Snectar#
45102644Snectar#  This program is free software; you can redistribute it and/or
46102644Snectar#  modify it under the same terms as Perl itself.
47102644Snectar#
48102644Snectar################################################################################
49102644Snectar
50102644Snectaruse strict;
51102644Snectaruse Getopt::Long;
52102644Snectar
53102644Snectarrequire './parts/ppptools.pl';
54102644Snectarrequire './parts/inc/inctools';
55102644Snectarrequire './devel/devtools.pl';
56102644Snectar
57102644Snectarour %opt = (
58102644Snectar  mode    => 'check',
59102644Snectar  install => '/tmp/perl/install/default',
60102644Snectar  blead   => 'bleadperl',
61102644Snectar  debug   => 0,
62102644Snectar 'debug-start' => "",
63102644Snectar);
64102644Snectar
65102644SnectarGetOptions(\%opt, qw( install=s mode=s blead=s debug=i debug-start=s)) or die;
66102644Snectar
67102644Snectarmy $clean = $opt{mode} eq 'clean';
68102644Snectarmy $write = $clean || $opt{mode} eq 'write';
69102644Snectarmy $debug = $opt{debug};
70102644Snectar
71102644Snectar# Get the list of known macros.  Functions are calculated separately below
72102644Snectarmy %embed = map { $_->{flags}{m} ? ( $_->{name} => 1 ) : () }
73102644Snectar            parse_embed(qw(parts/embed.fnc parts/apidoc.fnc));
74102644Snectar
75102644Snectar# @provided is set to everthing provided
7690926Snectarmy @provided = map { /^(\w+)/ ? $1 : () } `$^X ppport.h --list-provided`;
7790926Snectar
7890926Snectar# There are a few exceptions that have to be dealt with specially.  Add these
7990926Snectar# to the list of things to scan for.
8090926Snectarmy $hard_to_test_ref = known_but_hard_to_test_for();
8190926Snectarpush @provided, keys %$hard_to_test_ref;
8290926Snectar
8390926Snectarmy $base_dir = 'parts/base';
8490926Snectarmy $todo_dir = 'parts/todo';
8590926Snectar
8690926Snectar# The identifying text placed in every entry by this program
8790926Snectarmy $id_text = "added by $0";
8890926Snectar
8990926Snectarif ($write) {
9090926Snectar
9190926Snectar    # Get the list of files
9290926Snectar    my @files = all_files_in_dir($base_dir);
9390926Snectar
9490926Snectar    # If asked to, first strip out the results of previous incarnations of
9590926Snectar    # this script
9690926Snectar    if ($clean) {
9790926Snectar        print "Cleaning previous $0 runs\n";
9890926Snectar        foreach my $file (@files) {
9990926Snectar            open my $fh, "+<", $file or die "$file: $!\n";
10090926Snectar            my @lines = <$fh>;
10190926Snectar            my $orig_count = @lines;
10290926Snectar            @lines = grep { $_ !~ /$id_text/ } @lines;
10390926Snectar            next if @lines == $orig_count;  # No need to write if unchanged.
10490926Snectar            truncate $fh, 0;
10590926Snectar            seek $fh, 0, 0;
10690926Snectar            print $fh @lines;
10790926Snectar            close $fh or die "$file: $!\n";
10890926Snectar        }
10990926Snectar    }
11090926Snectar
11190926Snectar    # The file list is returned sorted, and so the min version is in the 0th
11290926Snectar    # element
11390926Snectar    my $file =  $files[0];
11490926Snectar    my $min_perl = $file;
11590926Snectar    $min_perl =~ s,.*/,,;    # The name is the integer of __MIN_PERL__
11690926Snectar
11790926Snectar    # There are a very few special cases that we may not find in scanning, but
11890926Snectar    # exist all the way back.  Add them now to avoid throwing later things
11990926Snectar    # off.
12090926Snectar    print "-- $file --\n";
12190926Snectar    open my $fh, "+<", $file or die "$file: $!\n";
12290926Snectar    my @lines = <$fh>;
12390926Snectar    my $count = @lines;
12490926Snectar    for (qw(RETVAL CALL THIS)) { # These are also in hard_to_test_for(),
12590926Snectar                                 # so can't be in blead, as they are skipped
12690926Snectar                                 # in testing, so no real need to check that
12790926Snectar                                 # they aren't dups.
12890926Snectar        my $line = format_output_line($_, 'X');
12990926Snectar        next if grep { /$line/ } @lines;
13090926Snectar        print "Adding $_ to $file\n";
13190926Snectar        push @lines, $line;
13290926Snectar    }
13390926Snectar    if ($count != @lines) {
13490926Snectar        @lines = sort symbol_order @lines;
13590926Snectar        truncate $fh, 0;
13690926Snectar        seek $fh, 0, 0;
13790926Snectar        print $fh @lines;
13890926Snectar    }
13990926Snectar    close $fh;
14090926Snectar
14190926Snectar    # Now we're going to add the hard to test symbols.  The hash has been
14290926Snectar    # manually populated and commited, with the version number ppport supports
14390926Snectar    # them to.
14490926Snectar    #
14590926Snectar    # This is a hash ref with the keys being all symbols found in all the
14690926Snectar    # files in the directory, and the values being the perl versions of each
14790926Snectar    # symbol.
14890926Snectar    my $todo = parse_todo($todo_dir);
14990926Snectar
15090926Snectar    # The keys of $hard_to_test_ref are the symbols, and the values are
15190926Snectar    # subhashes, with each 'version' key being its proper perl version.
15290926Snectar    # Below, we invert %hard_to_test, so that the keys are the version, and
15390926Snectar    # the values are the symbols that go in that version
15490926Snectar    my %add_by_version;
15590926Snectar    for my $hard (keys %$hard_to_test_ref) {
15690926Snectar
15790926Snectar        # But if someone ups the min version we support, we don't want to add
15890926Snectar        # something less than that.
15990926Snectar        my $version = int_parse_version($hard_to_test_ref->{$hard});
16090926Snectar        $version = $min_perl if $version < $min_perl;
16190926Snectar        $version = format_version_line($version);
16290926Snectar
16390926Snectar        push @{$add_by_version{$version}}, $hard
16490926Snectar                unless grep { $todo->{$_}->{version} eq $hard } keys %$todo;
16590926Snectar    }
16690926Snectar
16790926Snectar    # Only a few files will have exceptions that apply to them.  Rewrite each
16890926Snectar    foreach my $version (keys %add_by_version) {
16990926Snectar        my $file = "$todo_dir/" . int_parse_version($version);
17090926Snectar        print "-- Adding known exceptions to $file --\n";
17190926Snectar        open my $fh, "+<", $file or die "$file: $!\n";
17290926Snectar        my @lines = <$fh>;
17390926Snectar        my $count = @lines;
17490926Snectar        push @lines, format_version_line($version) . "\n" unless @lines;
17590926Snectar        foreach my $symbol (@{$add_by_version{$version}}) {
17690926Snectar            my $line = format_output_line($symbol, 'X');
17790926Snectar            unless (grep { /$line/ } @lines) {;
17890926Snectar                print "adding $symbol\n";
17990926Snectar                push @lines, $line unless grep { /$line/ } @lines;
18090926Snectar            }
18190926Snectar        }
18290926Snectar        if (@lines != $count) {
18390926Snectar            @lines = sort symbol_order @lines;
18490926Snectar            truncate $fh, 0;
18590926Snectar            seek $fh, 0, 0;
18690926Snectar            print $fh @lines;
18790926Snectar        }
18890926Snectar        close $fh;
18990926Snectar    }
19090926Snectar}
19190926Snectar
19290926Snectar# Now that we've added the exceptions to a few files, we can parse
19390926Snectar# and deal with all of them.
19490926Snectarmy $perls_ref = get_and_sort_perls(\%opt);
19590926Snectar
19690926Snectardie "Couldn't find any perls" unless @$perls_ref > 1;
19790926Snectar
19890926Snectarfind_first_mentions($perls_ref,   # perls to look in
19990926Snectar                    \@provided,   # List of symbol names to look for
20090926Snectar                    '*.h',        # Look in all hdrs.
20190926Snectar                    1,            # Strip comments
20290926Snectar                   'M'
20390926Snectar                   );
20490926Snectar
20590926Snectar# Now look for functions that we didn't test in mktodo.pl, generally because
20690926Snectar# these were hidden behind #ifdef's.
20790926Snectarmy $base_ref = parse_todo($base_dir);
20890926Snectarmy @functions = parse_embed(qw(parts/embed.fnc));
20990926Snectar
21090926Snectar# We could just gather data for the publicly available ones, but having this
21190926Snectar# information available for everything is useful.
21290926Snectar#@functions = grep { exists $_->{flags}{A} } @functions;
21390926Snectar
21490926Snectar# The ones we don't have info on are the ones in embed.fnc that aren't in the
21590926Snectar# base files.  Certain of these will only be in the Perl_foo form.
21690926Snectarmy @missing = map { exists $base_ref->{$_->{name}}
21790926Snectar                    ? ()
21890926Snectar                    : ((exists $_->{flags}{p} && exists $_->{flags}{o})
21990926Snectar                       ? ((exists $base_ref->{$_->{"Perl_$_->{name}"}}
22090926Snectar                           ? ()
22190926Snectar                           : "Perl_$_->{name}"))
22290926Snectar                       : $_->{name})
22390926Snectar                  } @functions;
22490926Snectar
22590926Snectar# These symbols will be found in the autogen'd files, and they may be
22690926Snectar# commented out in them.
22790926Snectarfind_first_mentions($perls_ref,
22890926Snectar                    \@missing,
22990926Snectar                    [ 'embed.h', 'proto.h' ],
23090926Snectar                    0,          # Don't strip comments
23190926Snectar                   'F'
23290926Snectar                   );
23390926Snectar
23490926Snectarsub symbol_order    # Sort based on first word on line
23590926Snectar{
23690926Snectar    my $stripped_a = $a =~ s/ ^ \s* //rx;
23790926Snectar    $stripped_a =~ s/ \s.* //x;
23890926Snectar
23990926Snectar    my $stripped_b = $b =~ s/ ^ \s* //rx;
24090926Snectar    $stripped_b =~ s/ \s.* //x;
24190926Snectar
24290926Snectar    return dictionary_order($stripped_a, $stripped_b);
24390926Snectar}
24490926Snectar
24590926Snectarsub format_output_line
24690926Snectar{
24790926Snectar    my $sym = shift;
24890926Snectar    my $code = shift;
24990926Snectar
25090926Snectar    return sprintf "%-30s # $code $id_text\n", $sym;
25190926Snectar}
25290926Snectar
25390926Snectarsub find_first_mentions
25490926Snectar{
25590926Snectar    my $perls_ref =    shift;   # List of perls to look in
25690926Snectar    my $look_for_ref = shift;   # List of symbol names to look for
25790926Snectar    my $hdrs =         shift;   # Glob of hdrs to look in
25890926Snectar    my $strip_comments = shift;
25990926Snectar    my $code           = shift; # Mark entries as having this type
26090926Snectar
26190926Snectar    use feature 'state';
26290926Snectar    state $first_perl = 1;
26390926Snectar
26490926Snectar    $hdrs = [ $hdrs ] unless ref $hdrs;
26590926Snectar
26690926Snectar    my %remaining;
26790926Snectar    $remaining{$_} = $code for @$look_for_ref;
26890926Snectar
26990926Snectar    my %v;
27090926Snectar
27190926Snectar    # We look in descending order of perl versions.  Each time through the
27290926Snectar    # loop %remaining is narrowed.
27390926Snectar    for my $p (@$perls_ref) {
27490926Snectar        print "checking perl $p->{version}...\n";
27590926Snectar
27690926Snectar        # Get the hdr files associated with this version
27790926Snectar        my $archlib = `$p->{path} -MConfig -l -e 'print \$Config{archlib}'`;
27890926Snectar        chomp $archlib;
27990926Snectar        local @ARGV;
28078527Sassar        push @ARGV, glob "$archlib/CORE/$_" for @$hdrs;
28178527Sassar
28290926Snectar        # %sym's keys are every single thing that looks like an identifier
28390926Snectar        # (beginning with a non-digit \w, followed by \w*) that occurs in any
28490926Snectar        # header, regardless of where (outside of comments).  For macros, it
28578527Sassar        # can't end in an underscore, nor be like 'AbCd', which are marks for
28678527Sassar        # internal.
28778527Sassar        my %sym;
28878527Sassar
28978527Sassar        local $/ = undef;
29078527Sassar        while (<<>>) {  # Read in the whole next file as one string.
29178527Sassar
29278527Sassar            # This would override function definitions with macro ones
29378527Sassar            next if $code eq 'M' && $ARGV =~ m! / embed\.h $ !x;
29478527Sassar
29578527Sassar            my $is_config_h = $ARGV =~ m! / config\.h $ !x;
29678527Sassar
29778527Sassar            my $contents = $_;
29878527Sassar
29978527Sassar            # Strip initial '/*' in config.h /*#define... lines.  This just
30078527Sassar            # means the item isn't available on the platform this program is
30178527Sassar            # being run on.
30278527Sassar            $contents =~ s! ^ /\* \s* (?=\#\s*define\s) !!mx if $is_config_h;
30378527Sassar
30478527Sassar            # Strip comments, from perl faq
30578527Sassar            if ($strip_comments) {
30678527Sassar                $contents =~ s#/\*[^*]*\*+([^/*][^*]*\*+)*/|("(\\.|[^"\\])*"|'(\\.|[^'\\])*'|.[^/"'\\]*)#defined $2 ? $2 : ""#gse;
30778527Sassar            }
30878527Sassar
30978527Sassar            # For macros, we look for #defines
31078527Sassar            if ($code eq 'M') {
31178527Sassar                my %defines;
31278527Sassar
31378527Sassar                while ($contents =~ m/ ^ \s* \# \s* define \s+
31478527Sassar
31578527Sassar                                       # A symbol not ending in underscore
31678527Sassar                                       ( [A-Za-z][_A-Za-z0-9]*[A-Za-z0-9] )
31778527Sassar                                     /mxg)
31878527Sassar                {
31978527Sassar                    my $this_define = $1;
32078527Sassar
32178527Sassar                    # These are internal and not of external interest, so just
32278527Sassar                    # noise if we were to index them
32378527Sassar                    next if $this_define =~ / ^ PERL_ARGS_ASSERT /x;
32478527Sassar
32578527Sassar                    # Names like AbCd are internal
32678527Sassar                    next if $this_define =~ /[[:upper:]][[:lower:]][[:upper:]][[:lower:]]/;
32778527Sassar
32878527Sassar                    $defines{$this_define}++;
32978527Sassar                }
33078527Sassar                $sym{$_}++ for keys %defines;
33178527Sassar
33278527Sassar                # For functions, etc we get all the symbols for the latest
33378527Sassar                # perl passed in, but for macros, it is just the ones for the
33478527Sassar                # known documented ones, and we have to find the rest.  This
33578527Sassar                # allows us to keep the logic for that in just one place:
33678527Sassar                # here.
33778527Sassar                if ($first_perl) {
33878527Sassar
33978527Sassar                    # config.h symbols are documented; the rest aren't, so use
34078527Sassar                    # different flags so downstream processing knows which are
34178527Sassar                    # which.
34278527Sassar                    if ($is_config_h) {
34378527Sassar                        foreach my $define (keys %defines) {
34478527Sassar                            $remaining{$define} = 'K';
34578527Sassar                        }
34678527Sassar                    }
34778527Sassar                    else {
34878527Sassar                        foreach my $define (keys %defines) {
34978527Sassar                            # Don't override input 'M' symbols.
35078527Sassar                            $remaining{$define} = 'Z'
35178527Sassar                                            unless defined $remaining{$define};
35278527Sassar                        }
35378527Sassar                    }
35478527Sassar                }
35578527Sassar            }
35678527Sassar            else {  # Look for potential function names; remember comments
35772445Sassar                    # have been stripped off.
35872445Sassar                $sym{$_}++ for /(\b[^\W\d]\w*)/g;
35972445Sassar            }
36072445Sassar        }
36172445Sassar
36272445Sassar        # %remaining is narrowed to include only those identifier-like things
36372445Sassar        # that are mentioned in one of the input hdrs in this release.  (If it
36472445Sassar        # isn't even mentioned, it won't exist in the release.)  For those not
36572445Sassar        # mentioned, a key is added of the identifier-like thing in %v.  It is
36672445Sassar        # a subkey of this release's "todo" release, which is the next higher
36772445Sassar        # one.  If we are at version n, we have already done version n+1 and
36872445Sassar        # the provided element was mentioned there, and now it no longer is.
36972445Sassar        # We take that to mean that to mean that the element became provided
37072445Sassar        # for in n+1.
37172445Sassar        foreach my $symbol (keys %remaining) {
37272445Sassar            next if defined $sym{$symbol};  # Still exists in this release
37372445Sassar
37472445Sassar            # Gone in this release, must have come into existence in the next
37572445Sassar            # higher one.
37672445Sassar            $v{$p->{todo}}{$symbol} = delete $remaining{$symbol};
37772445Sassar        }
37872445Sassar
37972445Sassar        $first_perl = 0;
38072445Sassar    }
38172445Sassar
38272445Sassar    # After all releases, assume that anything still defined came into
38372445Sassar    # existence in that earliest release.
38472445Sassar    $v{$perls_ref->[-1]{file}}{$_} = $remaining{$_} for keys %remaining;
38572445Sassar
38672445Sassar    # Read in the parts/base files.  The hash ref has keys being all symbols
38772445Sassar    # found in all the files in base/, which are all we are concerned with
38872445Sassar    # became defined in.
38972445Sassar    my $base_ref = parse_todo($base_dir);
39072445Sassar
39172445Sassar
39272445Sassar    # Now add the results from above.  At this point, The keys of %v are the 7
39372445Sassar    # digit BCD version numbers, and their subkeys are the symbols provided by
39472445Sassar    # D:P that are first mentioned in this version, like this:
39572445Sassar    #   '5009002' => {
39672445Sassar    #                  'MY_CXT_CLONE' => 1,
39772445Sassar    #                  'SV_NOSTEAL' => 1,
39872445Sassar    #                  'UTF8_MAXBYTES' => 1
39972445Sassar    #                },
40072445Sassar
40172445Sassar    for my $version (keys %v) {
40272445Sassar
40372445Sassar        # Things listed in blead (the most recent file) are special.  They are
40472445Sassar        # there by default because we haven't found them anywhere, so they
40572445Sassar        # don't really exist as far as we can determine, so shouldn't be
40672445Sassar        # listed as existing.
40772445Sassar        next if $version > $perls_ref->[0]->{file};
40872445Sassar
40972445Sassar        # @new becomes the symbols for $version not already in the file for it
41072445Sassar        my @new = sort symbol_order grep { !exists $base_ref->{$_} }
41172445Sassar                                                                keys %{$v{$version}};
41272445Sassar        @new or next; # Nothing new, skip writing
41372445Sassar
41472445Sassar        my $file = $version;
41572445Sassar        $file =~ s/\.//g;
41672445Sassar        $file = "$base_dir/$file";
41772445Sassar        -e $file or die "non-existent: $file\n";
41872445Sassar        print "-- $file --\n";
41972445Sassar        if ($write) {
42072445Sassar            open my $fh, "+<", $file or die "$file: $!\n";
42172445Sassar            my @lines = <$fh>;
42272445Sassar            my $count = @lines;
42372445Sassar            for my $new (@new) {
42472445Sassar                my $line = format_output_line($new, $v{$version}{$new});
42572445Sassar                next if grep { /$line/ } @lines;
42672445Sassar                print "adding $new\n";
42772445Sassar                push @lines, $line;
42872445Sassar            }
42972445Sassar            if (@lines != $count) {
43072445Sassar                @lines = sort symbol_order @lines;
43172445Sassar                truncate $fh, 0;
43272445Sassar                seek $fh, 0, 0;
43372445Sassar                print $fh @lines;
43472445Sassar            }
43572445Sassar            close $fh;
43672445Sassar        }
43772445Sassar    }
43872445Sassar}
43972445Sassar