1# ====================================================================== 2# 3# Copyright (C) 2000-2007 Paul Kulchenko (paulclinger@yahoo.com) 4# Copyright (C) 2008 Martin Kutter (martin.kutter@fen-net.de) 5# SOAP::Lite is free software; you can redistribute it 6# and/or modify it under the same terms as Perl itself. 7# 8# $Id: Lite.pm 414 2012-07-15 09:18:42Z kutterma $ 9# 10# ====================================================================== 11 12package XML::Parser::Lite; 13 14use strict; 15use warnings; 16 17our $VERSION = 0.715; 18 19sub new { 20 my $class = shift; 21 22 return $class if ref $class; 23 my $self = bless {} => $class; 24 25 my %parameters = @_; 26 $self->setHandlers(); # clear first 27 $self->setHandlers(%{$parameters{Handlers} || {}}); 28 29 return $self; 30} 31 32sub setHandlers { 33 my $self = shift; 34 35 # allow symbolic refs, avoid "subroutine redefined" warnings 36 no strict 'refs'; 37 no warnings qw(redefine); 38 # clear all handlers if called without parameters 39 if (not @_) { 40 for (qw(Start End Char Final Init Comment Doctype XMLDecl)) { 41 *$_ = sub {} 42 } 43 } 44 45 # we could use each here, too... 46 while (@_) { 47 my($name, $func) = splice(@_, 0, 2); 48 *$name = defined $func 49 ? $func 50 : sub {} 51 } 52 return $self; 53} 54 55sub _regexp { 56 my $patch = shift || ''; 57 my $package = __PACKAGE__; 58 59 # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html 60 61 # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", 62 # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998. 63 # Copyright (c) 1998, Robert D. Cameron. 64 # The following code may be freely used and distributed provided that 65 # this copyright and citation notice remains intact and that modifications 66 # or additions are clearly identified. 67 68 # Modifications may be tracked on SOAP::Lite's SVN at 69 # https://soaplite.svn.sourceforge.net/svnroot/soaplite/ 70 # 71 use re 'eval'; 72 my $TextSE = "[^<]+"; 73 my $UntilHyphen = "[^-]*-"; 74 my $Until2Hyphens = "([^-]*)-(?:[^-]$[^-]*-)*-"; 75 #my $CommentCE = "$Until2Hyphens(?{${package}::comment(\$2)})>?"; 76 my $CommentCE = "(.+)--(?{${package}::comment(\$2)})>?"; 77# my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; 78# my $CommentCE = "$Until2Hyphens>?"; 79 my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; 80 my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>"; 81 my $S = "[ \\n\\t\\r]+"; 82 my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; 83 my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; 84 my $Name = "(?:$NameStrt)(?:$NameChar)*"; 85 my $QuoteSE = "\"[^\"]*\"|'[^']*'"; 86 my $DT_IdentSE = "$Name(?:$S(?:$Name|$QuoteSE))*"; 87# my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; 88 my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; 89 my $S1 = "[\\n\\r\\t ]"; 90 my $UntilQMs = "[^?]*\\?"; 91 my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*"; 92 my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail>))|%$Name;|$S"; 93 my $DocTypeCE = "$S($DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?)>(?{${package}::_doctype(\$3)})"; 94# my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; 95# my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S"; 96# my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?"; 97 my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; 98# my $PI_CE = "$Name(?:$PI_Tail)?"; 99 my $PI_CE = "($Name(?:$PI_Tail))>(?{${package}::_xmldecl(\$5)})"; 100 # these expressions were modified for backtracking and events 101# my $EndTagCE = "($Name)(?{${package}::_end(\$2)})(?:$S)?>"; 102 my $EndTagCE = "($Name)(?{${package}::_end(\$6)})(?:$S)?>"; 103 my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'"; 104# my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::_start( \$3,\@{\$^R||[]})})(?{\${7} and ${package}::_end(\$3)})"; 105 my $ElemTagCE = "($Name)" 106 . "(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)" 107 . "(?{[\@{\$^R||[]},\$8=>defined\$9?\$9:\$10]}))*(?:$S)?(/)?>" 108 . "(?{${package}::_start(\$7,\@{\$^R||[]})})(?{\$11 and ${package}::_end(\$7)})"; 109 110 my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; 111 112 # Next expression is under "black magic". 113 # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE', 114 # but it doesn't work under Perl 5.005 and only magic with 115 # (?:....)?? solved the problem. 116 # I would appreciate if someone let me know what is the right thing to do 117 # and what's the reason for all this magic. 118 # Seems like a problem related to (?:....)? rather than to ?{} feature. 119 # Tests are in t/31-xmlparserlite.t if you decide to play with it. 120 #"(?{[]})(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE"; 121 "(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE"; 122} 123 124setHandlers(); 125 126# Try 5.6 and 5.10 regex first 127my $REGEXP = _regexp('??'); 128 129sub _parse_re { 130 use re "eval"; 131 undef $^R; 132 no strict 'refs'; 133 1 while $_[0] =~ m{$REGEXP}go 134}; 135 136# fixup regex if it does not work... 137{ 138 if (not eval { _parse_re('<soap:foo xmlns:soap="foo">bar</soap:foo>'); 1; } ) { 139 $REGEXP = _regexp(); 140 local $^W; 141 *_parse_re = sub { 142 use re "eval"; 143 undef $^R; 144 1 while $_[0] =~ m{$REGEXP}go 145 }; 146 } 147} 148 149sub parse { 150 _init(); 151 _parse_re($_[1]); 152 _final(); 153} 154 155my(@stack, $level); 156 157sub _init { 158 @stack = (); 159 $level = 0; 160 Init(__PACKAGE__, @_); 161} 162 163sub _final { 164 die "not properly closed tag '$stack[-1]'\n" if @stack; 165 die "no element found\n" unless $level; 166 Final(__PACKAGE__, @_) 167} 168 169sub _start { 170 die "multiple roots, wrong element '$_[0]'\n" if $level++ && !@stack; 171 push(@stack, $_[0]); 172 my $r=Start(__PACKAGE__, @_); 173 return ref($r) eq 'ARRAY' ? $r : undef; 174} 175 176sub _char { 177 Char(__PACKAGE__, $_[0]), return if @stack; 178 179 # check for junk before or after element 180 # can't use split or regexp due to limitations in ?{} implementation, 181 # will iterate with loop, but we'll do it no more than two times, so 182 # it shouldn't affect performance 183 for (my $i=0; $i < length $_[0]; $i++) { 184 die "junk '$_[0]' @{[$level ? 'after' : 'before']} XML element\n" 185 if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there 186 } 187} 188 189sub _end { 190 no warnings qw(uninitialized); 191 pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n"; 192 my $r=End(__PACKAGE__, $_[0]); 193 return ref($r) eq 'ARRAY' ? $r : undef; 194} 195 196sub comment { 197 my $r=Comment(__PACKAGE__, $_[0]); 198 return ref($r) eq 'ARRAY' ? $r : undef; 199} 200 201sub end { 202 pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n"; 203 my $r=End(__PACKAGE__, $_[0]); 204 return ref($r) eq 'ARRAY' ? $r : undef; 205} 206 207sub _doctype { 208 my $r=Doctype(__PACKAGE__, $_[0]); 209 return ref($r) eq 'ARRAY' ? $r : undef; 210} 211 212sub _xmldecl { 213 XMLDecl(__PACKAGE__, $_[0]); 214} 215 216 217 218# ====================================================================== 2191; 220 221__END__ 222 223=head1 NAME 224 225XML::Parser::Lite - Lightweight regexp-based XML parser 226 227=head1 SYNOPSIS 228 229 use XML::Parser::Lite; 230 231 $p1 = new XML::Parser::Lite; 232 $p1->setHandlers( 233 Start => sub { shift; print "start: @_\n" }, 234 Char => sub { shift; print "char: @_\n" }, 235 End => sub { shift; print "end: @_\n" }, 236 ); 237 $p1->parse('<foo id="me">Hello World!</foo>'); 238 239 $p2 = new XML::Parser::Lite 240 Handlers => { 241 Start => sub { shift; print "start: @_\n" }, 242 Char => sub { shift; print "char: @_\n" }, 243 End => sub { shift; print "end: @_\n" }, 244 } 245 ; 246 $p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>'); 247 248=head1 DESCRIPTION 249 250This Perl implements an XML parser with a interface similar to 251XML::Parser. Though not all callbacks are supported, you should be able to 252use it in the same way you use XML::Parser. Due to using experimantal regexp 253features it'll work only on Perl 5.6 and above and may behave differently on 254different platforms. 255 256Note that you cannot use regular expressions or split in callbacks. This is 257due to a limitation of perl's regular expression implementation (which is 258not re-entrant). 259 260=head1 SUBROUTINES/METHODS 261 262=head2 new 263 264Constructor. 265 266As (almost) all SOAP::Lite constructors, new() returns the object called on 267when called as object method. This means that the following effectifely is 268a no-op if $obj is a object: 269 270 $obj = $obj->new(); 271 272New accepts a single named parameter, C<Handlers> with a hash ref as value: 273 274 my $parser = XML::Parser::Lite->new( 275 Handlers => { 276 Start => sub { shift; print "start: @_\n" }, 277 Char => sub { shift; print "char: @_\n" }, 278 End => sub { shift; print "end: @_\n" }, 279 } 280 ); 281 282The handlers given will be passed to setHandlers. 283 284=head2 setHandlers 285 286Sets (or resets) the parsing handlers. Accepts a hash with the handler names 287and handler code references as parameters. Passing C<undef> instead of a 288code reference replaces the handler by a no-op. 289 290The following handlers can be set: 291 292 Init 293 Start 294 Char 295 End 296 Final 297 298All other handlers are ignored. 299 300Calling setHandlers without parameters resets all handlers to no-ops. 301 302=head2 parse 303 304Parses the XML given. In contrast to L<XML::Parser|XML::Parser>'s parse 305method, parse() only parses strings. 306 307=head1 Handler methods 308 309=head2 Init 310 311Called before parsing starts. You should perform any necessary initializations 312in Init. 313 314=head2 Start 315 316Called at the start of each XML node. See L<XML::Parser> for details. 317 318=head2 Char 319 320Called for each character sequence. May be called multiple times for the 321characters contained in an XML node (even for every single character). 322Your implementation has to make sure that it captures all characters. 323 324=head2 End 325 326Called at the end of each XML node. See L<XML::Parser> for details 327 328=head2 Comment 329 330See L<XML::Parser> for details 331 332=head2 XMLDecl 333 334See L<XML::Parser> for details 335 336=head2 Doctype 337 338See L<XML::Parser> for details 339 340=head2 Final 341 342Called at the end of the parsing process. You should perform any neccessary 343cleanup here. 344 345=head1 SEE ALSO 346 347 XML::Parser 348 349=head1 COPYRIGHT 350 351Copyright (C) 2000-2007 Paul Kulchenko. All rights reserved. 352 353Copyright (C) 2008- Martin Kutter. All rights reserved. 354 355This library is free software; you can redistribute it and/or modify 356it under the same terms as Perl itself. 357 358This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html 359Copyright (c) 1998, Robert D. Cameron. 360 361=head1 AUTHOR 362 363Paul Kulchenko (paulclinger@yahoo.com) 364 365Martin Kutter (martin.kutter@fen-net.de) 366 367Additional handlers supplied by Adam Leggett. 368 369=cut 370 371 372 373 374