1#!/usr/bin/perl -w 2use strict; 3# 4# Convert a perl script into an xml file 5# 6# usage: 7# perlxmltok myfile.pl >myfile.xml 8# perlxmltok <myfile.pl >myfile.xml 9# 10# The script is broken at the line and token level. 11# 12# This file is one of the examples distributed with perltidy and demonstrates 13# using a callback object with Perl::Tidy to walk through a perl file and 14# process its tokens. It may or may not have any actual usefulness. You can 15# modify it to suit your own purposes; see sub get_line(). 16# 17use Perl::Tidy; 18use IO::File; 19use Getopt::Std; 20use vars qw($opt_h); 21my $file; 22my $usage = <<EOM; 23 usage: perlxmltok filename >outfile 24EOM 25getopts('h') or die "$usage"; 26if ($opt_h) {die $usage} 27if ( @ARGV == 1 ) { 28 $file = $ARGV[0]; 29} 30else { die $usage } 31my $source; 32my $fh; 33if ($file) { 34 $fh = IO::File->new( $file, 'r' ); 35 unless ($fh) { die "cannot open '$file': $!\n" } 36 $source = $fh; 37} 38else { 39 $source = '-'; 40} 41my $formatter = Perl::Tidy::XmlWriter->new($file); 42my $dest; 43 44# start perltidy, which will start calling our write_line() 45perltidy( 46 'formatter' => $formatter, # callback object 47 'source' => $source, 48 'destination' => \$dest, # not really needed 49 'argv' => "-npro -se", # dont need .perltidyrc 50 # errors to STDOUT 51); 52$fh->close() if $fh; 53 54##################################################################### 55# 56# The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml 57# 58##################################################################### 59 60package Perl::Tidy::XmlWriter; 61 62# class variables 63use vars qw{ 64 %token_short_names 65 %short_to_long_names 66 $rOpts 67 $missing_html_entities 68}; 69 70# replace unsafe characters with HTML entity representation if HTML::Entities 71# is available 72{ eval "use HTML::Entities"; $missing_html_entities = $@; } 73 74sub new { 75 76 my ( $class, $input_file ) = @_; 77 my $self = bless { }, $class; 78 79 $self->print( <<"HEADER"); 80<?xml version = "1.0"?> 81HEADER 82 83 unless ( !$input_file || $input_file eq '-' || ref($input_file) ) { 84 85 $self->print( <<"COMMENT"); 86<!-- created by perltidy from file: $input_file --> 87COMMENT 88 } 89 90 $self->print("<file>\n"); 91 return $self; 92} 93 94sub print { 95 my ( $self, $line ) = @_; 96 print $line; 97} 98 99sub write_line { 100 101 # This routine will be called once perl line by perltidy 102 my $self = shift; 103 my ($line_of_tokens) = @_; 104 my $line_type = $line_of_tokens->{_line_type}; 105 my $input_line = $line_of_tokens->{_line_text}; 106 my $line_number = $line_of_tokens->{_line_number}; 107 chomp $input_line; 108 $self->print(" <line type='$line_type'>\n"); 109 $self->print(" <text>\n"); 110 111 $input_line = my_encode_entities($input_line); 112 $self->print("$input_line\n"); 113 $self->print(" </text>\n"); 114 115 # markup line of code.. 116 if ( $line_type eq 'CODE' ) { 117 my $xml_line; 118 my $rtoken_type = $line_of_tokens->{_rtoken_type}; 119 my $rtokens = $line_of_tokens->{_rtokens}; 120 121 if ( $input_line =~ /(^\s*)/ ) { 122 $xml_line = $1; 123 } 124 else { 125 $xml_line = ""; 126 } 127 my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type ); 128 $xml_line .= join '', @$rmarked_tokens; 129 130 $self->print(" <tokens>\n"); 131 $self->print("$xml_line\n"); 132 $self->print(" </tokens>\n"); 133 } 134 135 $self->print(" </line>\n"); 136} 137 138BEGIN { 139 140 # This is the official list of tokens which may be identified by the 141 # user. Long names are used as getopt keys. Short names are 142 # convenient short abbreviations for specifying input. Short names 143 # somewhat resemble token type characters, but are often different 144 # because they may only be alphanumeric, to allow command line 145 # input. Also, note that because of case insensitivity of xml, 146 # this table must be in a single case only (I've chosen to use all 147 # lower case). 148 # When adding NEW_TOKENS: update this hash table 149 # short names => long names 150 %short_to_long_names = ( 151 'n' => 'numeric', 152 'p' => 'paren', 153 'q' => 'quote', 154 's' => 'structure', 155 'c' => 'comment', 156 'b' => 'blank', 157 'v' => 'v-string', 158 'cm' => 'comma', 159 'w' => 'bareword', 160 'co' => 'colon', 161 'pu' => 'punctuation', 162 'i' => 'identifier', 163 'j' => 'label', 164 'h' => 'here-doc-target', 165 'hh' => 'here-doc-text', 166 'k' => 'keyword', 167 'sc' => 'semicolon', 168 'm' => 'subroutine', 169 'pd' => 'pod-text', 170 ); 171 172 # Now we have to map actual token types into one of the above short 173 # names; any token types not mapped will get 'punctuation' 174 # properties. 175 176 # The values of this hash table correspond to the keys of the 177 # previous hash table. 178 # The keys of this hash table are token types and can be seen 179 # by running with --dump-token-types (-dtt). 180 181 # When adding NEW_TOKENS: update this hash table 182 # $type => $short_name 183 %token_short_names = ( 184 '#' => 'c', 185 'n' => 'n', 186 'v' => 'v', 187 'b' => 'b', 188 'k' => 'k', 189 'F' => 'k', 190 'Q' => 'q', 191 'q' => 'q', 192 'J' => 'j', 193 'j' => 'j', 194 'h' => 'h', 195 'H' => 'hh', 196 'w' => 'w', 197 ',' => 'cm', 198 '=>' => 'cm', 199 ';' => 'sc', 200 ':' => 'co', 201 'f' => 'sc', 202 '(' => 'p', 203 ')' => 'p', 204 'M' => 'm', 205 'P' => 'pd', 206 ); 207 208 # These token types will all be called identifiers for now 209 # FIXME: need to separate user defined modules as separate type 210 my @identifier = qw" i t U C Y Z G :: "; 211 @token_short_names{@identifier} = ('i') x scalar(@identifier); 212 213 # These token types will be called 'structure' 214 my @structure = qw" { } "; 215 @token_short_names{@structure} = ('s') x scalar(@structure); 216 217} 218 219sub markup_tokens { 220 my $self = shift; 221 my ( $rtokens, $rtoken_type ) = @_; 222 my ( @marked_tokens, $j, $string, $type, $token ); 223 224 for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) { 225 $type = $$rtoken_type[$j]; 226 $token = $$rtokens[$j]; 227 228 #------------------------------------------------------- 229 # Patch : intercept a sub name here and split it 230 # into keyword 'sub' and sub name 231 if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) { 232 $token = $self->markup_xml_element( $1, 'k' ); 233 push @marked_tokens, $token; 234 $token = $2; 235 $type = 'M'; 236 } 237 238 # Patch : intercept a package name here and split it 239 # into keyword 'package' and name 240 if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) { 241 $token = $self->markup_xml_element( $1, 'k' ); 242 push @marked_tokens, $token; 243 $token = $2; 244 $type = 'i'; 245 } 246 #------------------------------------------------------- 247 248 $token = $self->markup_xml_element( $token, $type ); 249 push @marked_tokens, $token; 250 } 251 return \@marked_tokens; 252} 253 254sub my_encode_entities { 255 my ($token) = @_; 256 257 # escape any characters not allowed in XML content. 258 # ??s/�/'/; 259 if ($missing_html_entities) { 260 $token =~ s/\&/&/g; 261 $token =~ s/\</</g; 262 $token =~ s/\>/>/g; 263 $token =~ s/\"/"/g; 264 } 265 else { 266 HTML::Entities::encode_entities($token); 267 } 268 return $token; 269} 270 271sub markup_xml_element { 272 my $self = shift; 273 my ( $token, $type ) = @_; 274 if ($token) { $token = my_encode_entities($token) } 275 276 # get the short abbreviation for this token type 277 my $short_name = $token_short_names{$type}; 278 if ( !defined($short_name) ) { 279 $short_name = "pu"; # punctuation is default 280 } 281 $token = qq(<$short_name>) . $token . qq(</$short_name>); 282 return $token; 283} 284 285sub finish_formatting { 286 287 # called after last line 288 my $self = shift; 289 $self->print("</file>\n"); 290 return; 291} 292