1#!/usr/bin/perl -w
2use strict;
3#
4# Convert a perl script into an xml file
5#
6# usage:
7# perlxmltok myfile.pl >myfile.xml
8# perlxmltok <myfile.pl >myfile.xml
9#
10# The script is broken at the line and token level.
11#
12# This file is one of the examples distributed with perltidy and demonstrates
13# using a callback object with Perl::Tidy to walk through a perl file and
14# process its tokens.  It may or may not have any actual usefulness.  You can
15# modify it to suit your own purposes; see sub get_line().
16#
17use Perl::Tidy;
18use IO::File;
19use Getopt::Std;
20use vars qw($opt_h);
21my $file;
22my $usage = <<EOM;
23   usage: perlxmltok filename >outfile
24EOM
25getopts('h') or die "$usage";
26if ($opt_h) {die $usage}
27if ( @ARGV == 1 ) {
28    $file = $ARGV[0];
29}
30else { die $usage }
31my $source;
32my $fh;
33if ($file) {
34    $fh = IO::File->new( $file, 'r' );
35    unless ($fh) { die "cannot open '$file': $!\n" }
36    $source = $fh;
37}
38else {
39    $source = '-';
40}
41my $formatter = Perl::Tidy::XmlWriter->new($file);
42my $dest;
43
44# start perltidy, which will start calling our write_line()
45my $err = perltidy(
46    'formatter'   => $formatter,    # callback object
47    'source'      => $source,
48    'destination' => \$dest,        # not really needed
49    'argv'        => "-npro -se",   # dont need .perltidyrc
50                                    # errors to STDOUT
51);
52if ($err) {
53    die "Error calling perltidy\n";
54}
55$fh->close() if $fh;
56
57#####################################################################
58#
59# The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml
60#
61#####################################################################
62
63package Perl::Tidy::XmlWriter;
64
65# class variables
66use vars qw{
67  %token_short_names
68  %short_to_long_names
69  $rOpts
70  $missing_html_entities
71};
72
73# replace unsafe characters with HTML entity representation if HTML::Entities
74# is available
75{ eval "use HTML::Entities"; $missing_html_entities = $@; }
76
77sub new {
78
79    my ( $class, $input_file ) = @_;
80    my $self = bless { }, $class;
81
82    $self->print( <<"HEADER");
83<?xml version = "1.0"?>
84HEADER
85
86    unless ( !$input_file || $input_file eq '-' || ref($input_file) ) {
87
88        $self->print( <<"COMMENT");
89<!-- created by perltidy from file: $input_file -->
90COMMENT
91    }
92
93    $self->print("<file>\n");
94    return $self;
95}
96
97sub print {
98    my ( $self, $line ) = @_;
99    print $line;
100}
101
102sub write_line {
103
104    # This routine will be called once perl line by perltidy
105    my $self = shift;
106    my ($line_of_tokens) = @_;
107    my $line_type        = $line_of_tokens->{_line_type};
108    my $input_line       = $line_of_tokens->{_line_text};
109    my $line_number      = $line_of_tokens->{_line_number};
110    chomp $input_line;
111    $self->print(" <line type='$line_type'>\n");
112    $self->print("  <text>\n");
113
114    $input_line = my_encode_entities($input_line);
115    $self->print("$input_line\n");
116    $self->print("  </text>\n");
117
118    # markup line of code..
119    if ( $line_type eq 'CODE' ) {
120        my $xml_line;
121        my $rtoken_type = $line_of_tokens->{_rtoken_type};
122        my $rtokens     = $line_of_tokens->{_rtokens};
123
124        if ( $input_line =~ /(^\s*)/ ) {
125            $xml_line = $1;
126        }
127        else {
128            $xml_line = "";
129        }
130        my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type );
131        $xml_line .= join '', @$rmarked_tokens;
132
133        $self->print("  <tokens>\n");
134        $self->print("$xml_line\n");
135        $self->print("  </tokens>\n");
136    }
137
138    $self->print(" </line>\n");
139}
140
141BEGIN {
142
143    # This is the official list of tokens which may be identified by the
144    # user.  Long names are used as getopt keys.  Short names are
145    # convenient short abbreviations for specifying input.  Short names
146    # somewhat resemble token type characters, but are often different
147    # because they may only be alphanumeric, to allow command line
148    # input.  Also, note that because of case insensitivity of xml,
149    # this table must be in a single case only (I've chosen to use all
150    # lower case).
151    # When adding NEW_TOKENS: update this hash table
152    # short names => long names
153    %short_to_long_names = (
154        'n'  => 'numeric',
155        'p'  => 'paren',
156        'q'  => 'quote',
157        's'  => 'structure',
158        'c'  => 'comment',
159        'b'  => 'blank',
160        'v'  => 'v-string',
161        'cm' => 'comma',
162        'w'  => 'bareword',
163        'co' => 'colon',
164        'pu' => 'punctuation',
165        'i'  => 'identifier',
166        'j'  => 'label',
167        'h'  => 'here-doc-target',
168        'hh' => 'here-doc-text',
169        'k'  => 'keyword',
170        'sc' => 'semicolon',
171        'm'  => 'subroutine',
172        'pd' => 'pod-text',
173    );
174
175    # Now we have to map actual token types into one of the above short
176    # names; any token types not mapped will get 'punctuation'
177    # properties.
178
179    # The values of this hash table correspond to the keys of the
180    # previous hash table.
181    # The keys of this hash table are token types and can be seen
182    # by running with --dump-token-types (-dtt).
183
184    # When adding NEW_TOKENS: update this hash table
185    # $type => $short_name
186    %token_short_names = (
187        '#'  => 'c',
188        'n'  => 'n',
189        'v'  => 'v',
190        'b'  => 'b',
191        'k'  => 'k',
192        'F'  => 'k',
193        'Q'  => 'q',
194        'q'  => 'q',
195        'J'  => 'j',
196        'j'  => 'j',
197        'h'  => 'h',
198        'H'  => 'hh',
199        'w'  => 'w',
200        ','  => 'cm',
201        '=>' => 'cm',
202        ';'  => 'sc',
203        ':'  => 'co',
204        'f'  => 'sc',
205        '('  => 'p',
206        ')'  => 'p',
207        'M'  => 'm',
208        'P'  => 'pd',
209    );
210
211    # These token types will all be called identifiers for now
212    # FIXME: need to separate user defined modules as separate type
213    my @identifier = qw" i t U C Y Z G :: ";
214    @token_short_names{@identifier} = ('i') x scalar(@identifier);
215
216    # These token types will be called 'structure'
217    my @structure = qw" { } ";
218    @token_short_names{@structure} = ('s') x scalar(@structure);
219
220}
221
222sub markup_tokens {
223    my $self = shift;
224    my ( $rtokens, $rtoken_type ) = @_;
225    my ( @marked_tokens, $j, $string, $type, $token );
226
227    for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) {
228        $type  = $$rtoken_type[$j];
229        $token = $$rtokens[$j];
230
231        #-------------------------------------------------------
232        # Patch : intercept a sub name here and split it
233        # into keyword 'sub' and sub name
234        if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) {
235            $token = $self->markup_xml_element( $1, 'k' );
236            push @marked_tokens, $token;
237            $token = $2;
238            $type  = 'M';
239        }
240
241        # Patch : intercept a package name here and split it
242        # into keyword 'package' and name
243        if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) {
244            $token = $self->markup_xml_element( $1, 'k' );
245            push @marked_tokens, $token;
246            $token = $2;
247            $type  = 'i';
248        }
249        #-------------------------------------------------------
250
251        $token = $self->markup_xml_element( $token, $type );
252        push @marked_tokens, $token;
253    }
254    return \@marked_tokens;
255}
256
257sub my_encode_entities {
258    my ($token) = @_;
259
260    # escape any characters not allowed in XML content.
261    # ??s/�/&apos;/;
262    if ($missing_html_entities) {
263        $token =~ s/\&/&amp;/g;
264        $token =~ s/\</&lt;/g;
265        $token =~ s/\>/&gt;/g;
266        $token =~ s/\"/&quot;/g;
267    }
268    else {
269        HTML::Entities::encode_entities($token);
270    }
271    return $token;
272}
273
274sub markup_xml_element {
275    my $self = shift;
276    my ( $token, $type ) = @_;
277    if ($token) { $token = my_encode_entities($token) }
278
279    # get the short abbreviation for this token type
280    my $short_name = $token_short_names{$type};
281    if ( !defined($short_name) ) {
282        $short_name = "pu";    # punctuation is default
283    }
284    $token = qq(<$short_name>) . $token . qq(</$short_name>);
285    return $token;
286}
287
288sub finish_formatting {
289
290    # called after last line
291    my $self = shift;
292    $self->print("</file>\n");
293    return;
294}
295