1#!/usr/bin/perl -w 2use strict; 3 4# Walk through a perl script and create a masked file which is 5# similar but which masks comments, quotes, patterns, and non-code 6# lines so that it is easy to parse with regular expressions. 7# 8# usage: 9# perlmask [-cn] myfile.pl >myfile.new 10# perlmask [-cn] <myfile.pl >myfile.new 11# 12# In the masked file, 13# -comments and pod will be masked (or removed) 14# -here-doc text lines will be masked (or removed) 15# -quotes and patterns, qw quotes, and here doc << operators will be 16# replaced by the letters 'Q', 'q', or 'h' 17# 18# The result is a file in which all braces, parens, and square brackets 19# are balanced, and it can be parsed relatively easily by regular 20# expressions. 21# 22# -cn is an optional 'compression' flag. By default the masked file will have 23# the same number of characters as the input file, with the difference being 24# that certain characters will be changed (masked). 25# 26# If character position correspondence is not required, the size of the masked 27# file can be significantly reduced by increasing the 'compression' level as 28# follows: 29# 30# -c0 all mask file line numbers and character positions agree with 31# original file (DEFAULT) 32# -c1 line numbers agree and character positions agree within lines of code 33# -c2 line numbers agree but character positions do not 34# -c3 no correspondence between line numbers or character positions 35# 36# Try each of these on a file of significant size to see how they work. 37# The default, -c0, is required if you are working with character positions 38# that span multiple lines. The other levels may be useful if you 39# do not need this level of correspondence. 40# 41# This file is one of the examples distributed with perltidy and demonstrates 42# using a callback object with Perl::Tidy to walk through a perl file and find 43# all of its tokens. It can be useful for simple perl code parsing tasks. It 44# might even be helpful in debugging. Or you may want to modify it to suit 45# your own purposes. 46# 47use Getopt::Std; 48use IO::File; 49$| = 1; 50use vars qw($opt_c $opt_h); 51my $usage = <<EOM; 52 usage: perlmask [ -cn ] filename >outfile 53EOM 54getopts('c:h') or die "$usage"; 55if ($opt_h) { die $usage } 56unless ( defined($opt_c) ) { $opt_c = 0 } 57if (@ARGV > 1) { die $usage } 58 59my $source=$ARGV[0]; # an undefined filename will become stdin 60 61# strings to hold the files (arrays could be used to) 62my ( $masked_file, $original_file ); 63 64PerlMask::perlmask( 65 _source => $source, 66 _rmasked_file => \$masked_file, 67 _roriginal_file => \$original_file, # optional 68 _compression => $opt_c # optional, default=0 69); 70 71# Now we have the masked and original files in strings of equal length. 72# We could search for specific text in the masked file here. But here 73# we'll just print the masked file: 74if ($masked_file) { print $masked_file; } 75 76##################################################################### 77# 78# The PerlMask package is an interface to perltidy which accepts a 79# source filehandle and returns a 'masked' version of the source as 80# a string or array. It can also optionally return the original file 81# as a string or array. 82# 83# It works by making a a callback object with a write_line() method to 84# receive tokenized lines from perltidy. This write_line method 85# selectively replaces tokens with either their original text or with a 86# benign masking character (such as '#' or 'Q'). 87# 88# Usage: 89# 90# PerlMask::perlmask( 91# _source => $fh, # required source 92# _rmasked_file => \$masked_file, # required ref to ARRAY or SCALAR 93# _roriginal_file => \$original_file, # optional ref to ARRAY or SCALAR 94# _compression => $opt_c # optional 95# ); 96# 97# _source is any source that perltidy will accept, including a 98# filehandle or reference to SCALAR or ARRAY 99# 100# The compression flag may have these values: 101# 0 all mask file line numbers and character positions agree with 102# original file (DEFAULT) 103# 1 line numbers agree and character positions agree within lines of code 104# 2 line numbers agree but character positions do not 105# 3 no correspondence between line numbers or character positions 106# 107##################################################################### 108 109package PerlMask; 110use Carp; 111use Perl::Tidy; 112 113sub perlmask { 114 115 my %args = ( _compression => 0, @_ ); 116 my $rfile = $args{_rmasked_file}; 117 unless ( defined($rfile) ) { 118 croak 119 "Missing required parameter '_rmasked_file' in call to perlmask\n"; 120 } 121 my $ref=ref($rfile); 122 unless ( $ref =~ /^(SCALAR|ARRAY)$/ ) { 123 croak <<EOM; 124Expecting _rmasked_file = ref to SCALAR or ARRAY in perlmask but got : ($ref) 125EOM 126 } 127 128 # run perltidy, which will call $formatter's write_line() for each line 129 my $err=perltidy( 130 'source' => $args{_source}, 131 'formatter' => bless( \%args, __PACKAGE__ ), # callback object 132 'argv' => "-npro -se", # -npro : ignore .perltidyrc, 133 # -se : errors to STDOUT 134 ); 135 if ($err) { 136 die "Error calling perltidy\n"; 137 } 138} 139 140sub print_line { 141 142 # called from write_line to dispatch one line (either masked or original).. 143 # here we'll either append it to a string or array, as appropriate 144 my ( $rfile, $line ) = @_; 145 if ( defined($rfile) ) { 146 if ( ref($rfile) eq 'SCALAR' ) { 147 $$rfile .= $line . "\n"; 148 } 149 elsif ( ref($rfile) eq 'ARRAY' ) { 150 push @{$rfile}, $line . "\n"; 151 } 152 } 153} 154 155sub write_line { 156 157 # This is called from perltidy line-by-line 158 my ( $self, $line_of_tokens ) = @_; 159 my $rmasked_file = $self->{_rmasked_file}; 160 my $roriginal_file = $self->{_roriginal_file}; 161 my $opt_c = $self->{_compression}; 162 163 my $line_type = $line_of_tokens->{_line_type}; 164 my $input_line_number = $line_of_tokens->{_line_number}; 165 my $input_line = $line_of_tokens->{_line_text}; 166 my $rtoken_type = $line_of_tokens->{_rtoken_type}; 167 my $rtokens = $line_of_tokens->{_rtokens}; 168 chomp $input_line; 169 170 # mask non-CODE lines 171 if ( $line_type ne 'CODE' ) { 172 return if ( $opt_c == 3 ); 173 my $len = length($input_line); 174 if ( $opt_c == 0 && $len > 0 ) { 175 print_line( $roriginal_file, $input_line ) if $roriginal_file; 176 print_line( $rmasked_file, '#' x $len ); 177 } 178 else { 179 print_line( $roriginal_file, $input_line ) if $roriginal_file; 180 print_line( $rmasked_file, "" ); 181 } 182 return; 183 } 184 185 # we'll build the masked line token by token 186 my $masked_line = ""; 187 188 # add leading spaces if not in a higher compression mode 189 if ( $opt_c <= 1 ) { 190 191 # Find leading whitespace. But be careful..we don't want the 192 # whitespace if it is part of quoted text, because it will 193 # already be contained in a token. 194 if ( $input_line =~ /^(\s+)/ && !$line_of_tokens->{_starting_in_quote} ) 195 { 196 $masked_line = $1; 197 } 198 } 199 200 # loop over tokens to construct one masked line 201 for ( my $j = 0 ; $j < @$rtoken_type ; $j++ ) { 202 203 # Mask certain token types by replacing them with their type code: 204 # type definition 205 # ---- ---------- 206 # Q quote or pattern 207 # q qw quote 208 # h << here doc operator 209 # # comment 210 # 211 # This choice will produce a mask file that has balanced 212 # container tokens and does not cause parsing problems. 213 if ( $$rtoken_type[$j] =~ /^[Qqh]$/ ) { 214 if ( $opt_c <= 1 ) { 215 $masked_line .= $$rtoken_type[$j] x length( $$rtokens[$j] ); 216 } 217 else { 218 $masked_line .= $$rtoken_type[$j]; 219 } 220 } 221 222 # Mask a comment 223 elsif ( $$rtoken_type[$j] eq '#' ) { 224 if ( $opt_c == 0 ) { 225 $masked_line .= '#' x length( $$rtokens[$j] ); 226 } 227 } 228 229 # All other tokens go out verbatim 230 else { 231 $masked_line .= $$rtokens[$j]; 232 } 233 } 234 print_line( $roriginal_file, $input_line ) if $roriginal_file; 235 print_line( $rmasked_file, $masked_line ); 236 237 # self-check lengths; this error should never happen 238 if ( $opt_c == 0 && length($masked_line) != length($input_line) ) { 239 my $lmask = length($masked_line); 240 my $linput = length($input_line); 241 print STDERR 242"$input_line_number: length ERROR, masked length=$lmask but input length=$linput\n"; 243 } 244} 245 246# called once after the last line of a file 247sub finish_formatting { 248 my $self = shift; 249 return; 250} 251