1#!/usr/bin/perl -w
2use strict;
3
4# Walk through a perl script and create a masked file which is
5# similar but which masks comments, quotes, patterns, and non-code
6# lines so that it is easy to parse with regular expressions.
7#
8# usage:
9# perlmask [-cn]  myfile.pl >myfile.new
10# perlmask [-cn] <myfile.pl >myfile.new
11#
12# In the masked file,
13#  -comments and pod will be masked (or removed)
14#  -here-doc text lines will be masked (or removed)
15#  -quotes and patterns, qw quotes, and here doc << operators will be
16#   replaced by the letters 'Q', 'q', or 'h'
17#
18# The result is a file in which all braces, parens, and square brackets
19# are balanced, and it can be parsed relatively easily by regular
20# expressions.
21#
22# -cn is an optional 'compression' flag.  By default the masked file will have
23# the same number of characters as the input file, with the difference being
24# that certain characters will be changed (masked).
25#
26# If character position correspondence is not required, the size of the masked
27# file can be significantly reduced by increasing the 'compression' level as
28# follows:
29#
30# -c0 all mask file line numbers and character positions agree with
31#     original file (DEFAULT)
32# -c1 line numbers agree and character positions agree within lines of code
33# -c2 line numbers agree but character positions do not
34# -c3 no correspondence between line numbers or character positions
35#
36# Try each of these on a file of significant size to see how they work.
37# The default, -c0, is required if you are working with character positions
38# that span multiple lines.  The other levels may be useful if you
39# do not need this level of correspondence.
40#
41# This file is one of the examples distributed with perltidy and demonstrates
42# using a callback object with Perl::Tidy to walk through a perl file and find
43# all of its tokens.  It can be useful for simple perl code parsing tasks.  It
44# might even be helpful in debugging.  Or you may want to modify it to suit
45# your own purposes.
46#
47use Getopt::Std;
48use IO::File;
49$| = 1;
50use vars qw($opt_c $opt_h);
51my $usage = <<EOM;
52   usage: perlmask [ -cn ] filename >outfile
53EOM
54getopts('c:h') or die "$usage";
55if ($opt_h) { die $usage }
56unless ( defined($opt_c) ) { $opt_c = 0 }
57if (@ARGV > 1) { die $usage }
58
59my $source=$ARGV[0];   # an undefined filename will become stdin
60
61# strings to hold the files (arrays could be used to)
62my ( $masked_file, $original_file );
63
64PerlMask::perlmask(
65    _source         => $source,
66    _rmasked_file   => \$masked_file,
67    _roriginal_file => \$original_file,    # optional
68    _compression    => $opt_c              # optional, default=0
69);
70
71# Now we have the masked and original files in strings of equal length.
72# We could search for specific text in the masked file here.  But here
73# we'll just print the masked file:
74if ($masked_file) { print $masked_file; }
75
76#####################################################################
77#
78# The PerlMask package is an interface to perltidy which accepts a
79# source filehandle and returns a 'masked' version of the source as
80# a string or array.  It can also optionally return the original file
81# as a string or array.
82#
83# It works by making a a callback object with a write_line() method to
84# receive tokenized lines from perltidy.  This write_line method
85# selectively replaces tokens with either their original text or with a
86# benign masking character (such as '#' or 'Q').
87#
88# Usage:
89#
90#   PerlMask::perlmask(
91#       _source         => $fh,             # required source
92#       _rmasked_file   => \$masked_file,   # required ref to ARRAY or SCALAR
93#       _roriginal_file => \$original_file, # optional ref to ARRAY or SCALAR
94#       _compression    => $opt_c           # optional
95#   );
96#
97# _source is any source that perltidy will accept, including a
98# filehandle or reference to SCALAR or ARRAY
99#
100# The compression flag may have these values:
101#  0 all mask file line numbers and character positions agree with
102#    original file (DEFAULT)
103#  1 line numbers agree and character positions agree within lines of code
104#  2 line numbers agree but character positions do not
105#  3 no correspondence between line numbers or character positions
106#
107#####################################################################
108
109package PerlMask;
110use Carp;
111use Perl::Tidy;
112
113sub perlmask {
114
115    my %args = ( _compression => 0, @_ );
116    my $rfile = $args{_rmasked_file};
117    unless ( defined($rfile) ) {
118        croak
119          "Missing required parameter '_rmasked_file' in call to perlmask\n";
120    }
121    my $ref=ref($rfile);
122    unless ( $ref =~ /^(SCALAR|ARRAY)$/ ) {
123            croak <<EOM;
124Expecting _rmasked_file = ref to SCALAR or ARRAY in perlmask but got : ($ref)
125EOM
126    }
127
128    # run perltidy, which will call $formatter's write_line() for each line
129    my $err=perltidy(
130        'source'    => $args{_source},
131        'formatter' => bless( \%args, __PACKAGE__ ),    # callback object
132        'argv'        => "-npro -se",    # -npro : ignore .perltidyrc,
133                                         # -se   : errors to STDOUT
134    );
135    if ($err) {
136        die "Error calling perltidy\n";
137    }
138}
139
140sub print_line {
141
142    # called from write_line to dispatch one line (either masked or original)..
143    # here we'll either append it to a string or array, as appropriate
144    my ( $rfile, $line ) = @_;
145    if ( defined($rfile) ) {
146        if ( ref($rfile) eq 'SCALAR' ) {
147            $$rfile .= $line . "\n";
148        }
149        elsif ( ref($rfile) eq 'ARRAY' ) {
150            push @{$rfile}, $line . "\n";
151        }
152    }
153}
154
155sub write_line {
156
157    # This is called from perltidy line-by-line
158    my ( $self, $line_of_tokens ) = @_;
159    my $rmasked_file   = $self->{_rmasked_file};
160    my $roriginal_file = $self->{_roriginal_file};
161    my $opt_c          = $self->{_compression};
162
163    my $line_type         = $line_of_tokens->{_line_type};
164    my $input_line_number = $line_of_tokens->{_line_number};
165    my $input_line        = $line_of_tokens->{_line_text};
166    my $rtoken_type       = $line_of_tokens->{_rtoken_type};
167    my $rtokens           = $line_of_tokens->{_rtokens};
168    chomp $input_line;
169
170    # mask non-CODE lines
171    if ( $line_type ne 'CODE' ) {
172        return if ( $opt_c == 3 );
173        my $len = length($input_line);
174        if ( $opt_c == 0 && $len > 0 ) {
175            print_line( $roriginal_file, $input_line ) if $roriginal_file;
176            print_line( $rmasked_file, '#' x $len );
177        }
178        else {
179            print_line( $roriginal_file, $input_line ) if $roriginal_file;
180            print_line( $rmasked_file, "" );
181        }
182        return;
183    }
184
185    # we'll build the masked line token by token
186    my $masked_line = "";
187
188    # add leading spaces if not in a higher compression mode
189    if ( $opt_c <= 1 ) {
190
191        # Find leading whitespace.  But be careful..we don't want the
192        # whitespace if it is part of quoted text, because it will
193        # already be contained in a token.
194        if ( $input_line =~ /^(\s+)/ && !$line_of_tokens->{_starting_in_quote} )
195        {
196            $masked_line = $1;
197        }
198    }
199
200    # loop over tokens to construct one masked line
201    for ( my $j = 0 ; $j < @$rtoken_type ; $j++ ) {
202
203        # Mask certain token types by replacing them with their type code:
204        # type  definition
205        # ----  ----------
206        # Q     quote or pattern
207        # q     qw quote
208        # h     << here doc operator
209        # #     comment
210        #
211        # This choice will produce a mask file that has balanced
212        # container tokens and does not cause parsing problems.
213        if ( $$rtoken_type[$j] =~ /^[Qqh]$/ ) {
214            if ( $opt_c <= 1 ) {
215                $masked_line .= $$rtoken_type[$j] x length( $$rtokens[$j] );
216            }
217            else {
218                $masked_line .= $$rtoken_type[$j];
219            }
220        }
221
222        # Mask a comment
223        elsif ( $$rtoken_type[$j] eq '#' ) {
224            if ( $opt_c == 0 ) {
225                $masked_line .= '#' x length( $$rtokens[$j] );
226            }
227        }
228
229        # All other tokens go out verbatim
230        else {
231            $masked_line .= $$rtokens[$j];
232        }
233    }
234    print_line( $roriginal_file, $input_line ) if $roriginal_file;
235    print_line( $rmasked_file, $masked_line );
236
237    # self-check lengths; this error should never happen
238    if ( $opt_c == 0 && length($masked_line) != length($input_line) ) {
239        my $lmask  = length($masked_line);
240        my $linput = length($input_line);
241        print STDERR
242"$input_line_number: length ERROR, masked length=$lmask but input length=$linput\n";
243    }
244}
245
246# called once after the last line of a file
247sub finish_formatting {
248    my $self = shift;
249    return;
250}
251