1#!/usr/bin/perl -w 2use strict; 3# 4# Convert a perl script into an xml file 5# 6# usage: 7# perlxmltok myfile.pl >myfile.xml 8# perlxmltok <myfile.pl >myfile.xml 9# 10# The script is broken at the line and token level. 11# 12# This file is one of the examples distributed with perltidy and demonstrates 13# using a callback object with Perl::Tidy to walk through a perl file and 14# process its tokens. It may or may not have any actual usefulness. You can 15# modify it to suit your own purposes; see sub get_line(). 16# 17use Perl::Tidy; 18use IO::File; 19use Getopt::Std; 20use vars qw($opt_h); 21my $file; 22my $usage = <<EOM; 23 usage: perlxmltok filename >outfile 24EOM 25getopts('h') or die "$usage"; 26if ($opt_h) {die $usage} 27if ( @ARGV == 1 ) { 28 $file = $ARGV[0]; 29} 30else { die $usage } 31my $source; 32my $fh; 33if ($file) { 34 $fh = IO::File->new( $file, 'r' ); 35 unless ($fh) { die "cannot open '$file': $!\n" } 36 $source = $fh; 37} 38else { 39 $source = '-'; 40} 41my $formatter = Perl::Tidy::XmlWriter->new($file); 42my $dest; 43 44# start perltidy, which will start calling our write_line() 45my $err = perltidy( 46 'formatter' => $formatter, # callback object 47 'source' => $source, 48 'destination' => \$dest, # not really needed 49 'argv' => "-npro -se", # dont need .perltidyrc 50 # errors to STDOUT 51); 52if ($err) { 53 die "Error calling perltidy\n"; 54} 55$fh->close() if $fh; 56 57##################################################################### 58# 59# The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml 60# 61##################################################################### 62 63package Perl::Tidy::XmlWriter; 64 65# class variables 66use vars qw{ 67 %token_short_names 68 %short_to_long_names 69 $rOpts 70 $missing_html_entities 71}; 72 73# replace unsafe characters with HTML entity representation if HTML::Entities 74# is available 75{ eval "use HTML::Entities"; $missing_html_entities = $@; } 76 77sub new { 78 79 my ( $class, $input_file ) = @_; 80 my $self = bless { }, $class; 81 82 $self->print( <<"HEADER"); 83<?xml version = "1.0"?> 84HEADER 85 86 unless ( !$input_file || $input_file eq '-' || ref($input_file) ) { 87 88 $self->print( <<"COMMENT"); 89<!-- created by perltidy from file: $input_file --> 90COMMENT 91 } 92 93 $self->print("<file>\n"); 94 return $self; 95} 96 97sub print { 98 my ( $self, $line ) = @_; 99 print $line; 100} 101 102sub write_line { 103 104 # This routine will be called once perl line by perltidy 105 my $self = shift; 106 my ($line_of_tokens) = @_; 107 my $line_type = $line_of_tokens->{_line_type}; 108 my $input_line = $line_of_tokens->{_line_text}; 109 my $line_number = $line_of_tokens->{_line_number}; 110 chomp $input_line; 111 $self->print(" <line type='$line_type'>\n"); 112 $self->print(" <text>\n"); 113 114 $input_line = my_encode_entities($input_line); 115 $self->print("$input_line\n"); 116 $self->print(" </text>\n"); 117 118 # markup line of code.. 119 if ( $line_type eq 'CODE' ) { 120 my $xml_line; 121 my $rtoken_type = $line_of_tokens->{_rtoken_type}; 122 my $rtokens = $line_of_tokens->{_rtokens}; 123 124 if ( $input_line =~ /(^\s*)/ ) { 125 $xml_line = $1; 126 } 127 else { 128 $xml_line = ""; 129 } 130 my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type ); 131 $xml_line .= join '', @$rmarked_tokens; 132 133 $self->print(" <tokens>\n"); 134 $self->print("$xml_line\n"); 135 $self->print(" </tokens>\n"); 136 } 137 138 $self->print(" </line>\n"); 139} 140 141BEGIN { 142 143 # This is the official list of tokens which may be identified by the 144 # user. Long names are used as getopt keys. Short names are 145 # convenient short abbreviations for specifying input. Short names 146 # somewhat resemble token type characters, but are often different 147 # because they may only be alphanumeric, to allow command line 148 # input. Also, note that because of case insensitivity of xml, 149 # this table must be in a single case only (I've chosen to use all 150 # lower case). 151 # When adding NEW_TOKENS: update this hash table 152 # short names => long names 153 %short_to_long_names = ( 154 'n' => 'numeric', 155 'p' => 'paren', 156 'q' => 'quote', 157 's' => 'structure', 158 'c' => 'comment', 159 'b' => 'blank', 160 'v' => 'v-string', 161 'cm' => 'comma', 162 'w' => 'bareword', 163 'co' => 'colon', 164 'pu' => 'punctuation', 165 'i' => 'identifier', 166 'j' => 'label', 167 'h' => 'here-doc-target', 168 'hh' => 'here-doc-text', 169 'k' => 'keyword', 170 'sc' => 'semicolon', 171 'm' => 'subroutine', 172 'pd' => 'pod-text', 173 ); 174 175 # Now we have to map actual token types into one of the above short 176 # names; any token types not mapped will get 'punctuation' 177 # properties. 178 179 # The values of this hash table correspond to the keys of the 180 # previous hash table. 181 # The keys of this hash table are token types and can be seen 182 # by running with --dump-token-types (-dtt). 183 184 # When adding NEW_TOKENS: update this hash table 185 # $type => $short_name 186 %token_short_names = ( 187 '#' => 'c', 188 'n' => 'n', 189 'v' => 'v', 190 'b' => 'b', 191 'k' => 'k', 192 'F' => 'k', 193 'Q' => 'q', 194 'q' => 'q', 195 'J' => 'j', 196 'j' => 'j', 197 'h' => 'h', 198 'H' => 'hh', 199 'w' => 'w', 200 ',' => 'cm', 201 '=>' => 'cm', 202 ';' => 'sc', 203 ':' => 'co', 204 'f' => 'sc', 205 '(' => 'p', 206 ')' => 'p', 207 'M' => 'm', 208 'P' => 'pd', 209 ); 210 211 # These token types will all be called identifiers for now 212 # FIXME: need to separate user defined modules as separate type 213 my @identifier = qw" i t U C Y Z G :: "; 214 @token_short_names{@identifier} = ('i') x scalar(@identifier); 215 216 # These token types will be called 'structure' 217 my @structure = qw" { } "; 218 @token_short_names{@structure} = ('s') x scalar(@structure); 219 220} 221 222sub markup_tokens { 223 my $self = shift; 224 my ( $rtokens, $rtoken_type ) = @_; 225 my ( @marked_tokens, $j, $string, $type, $token ); 226 227 for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) { 228 $type = $$rtoken_type[$j]; 229 $token = $$rtokens[$j]; 230 231 #------------------------------------------------------- 232 # Patch : intercept a sub name here and split it 233 # into keyword 'sub' and sub name 234 if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) { 235 $token = $self->markup_xml_element( $1, 'k' ); 236 push @marked_tokens, $token; 237 $token = $2; 238 $type = 'M'; 239 } 240 241 # Patch : intercept a package name here and split it 242 # into keyword 'package' and name 243 if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) { 244 $token = $self->markup_xml_element( $1, 'k' ); 245 push @marked_tokens, $token; 246 $token = $2; 247 $type = 'i'; 248 } 249 #------------------------------------------------------- 250 251 $token = $self->markup_xml_element( $token, $type ); 252 push @marked_tokens, $token; 253 } 254 return \@marked_tokens; 255} 256 257sub my_encode_entities { 258 my ($token) = @_; 259 260 # escape any characters not allowed in XML content. 261 # ??s/�/'/; 262 if ($missing_html_entities) { 263 $token =~ s/\&/&/g; 264 $token =~ s/\</</g; 265 $token =~ s/\>/>/g; 266 $token =~ s/\"/"/g; 267 } 268 else { 269 HTML::Entities::encode_entities($token); 270 } 271 return $token; 272} 273 274sub markup_xml_element { 275 my $self = shift; 276 my ( $token, $type ) = @_; 277 if ($token) { $token = my_encode_entities($token) } 278 279 # get the short abbreviation for this token type 280 my $short_name = $token_short_names{$type}; 281 if ( !defined($short_name) ) { 282 $short_name = "pu"; # punctuation is default 283 } 284 $token = qq(<$short_name>) . $token . qq(</$short_name>); 285 return $token; 286} 287 288sub finish_formatting { 289 290 # called after last line 291 my $self = shift; 292 $self->print("</file>\n"); 293 return; 294} 295