1# ====================================================================== 2# 3# Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com) 4# SOAP::Lite is free software; you can redistribute it 5# and/or modify it under the same terms as Perl itself. 6# 7# $Id: Lite.pm,v 1.2 2005/02/22 01:47:42 byrnereese Exp $ 8# 9# ====================================================================== 10 11package XML::Parser::Lite; 12 13use strict; 14use vars qw($VERSION); 15#$VERSION = sprintf("%d.%s", map {s/_//g; $_} q$Name: $ =~ /-(\d+)_([\d_]+)/); 16$VERSION = '0.65_3'; 17 18sub new { 19 my $self = shift; 20 my $class = ref($self) || $self; 21 return $self if ref $self; 22 23 $self = bless {} => $class; 24 my %parameters = @_; 25 $self->setHandlers(); # clear first 26 $self->setHandlers(%{$parameters{Handlers} || {}}); 27 return $self; 28} 29 30sub setHandlers { 31 my $self = shift; 32 no strict 'refs'; local $^W; 33 # clear all handlers if called without parameters 34 unless (@_) { foreach (qw(Start End Char Final Init)) { *$_ = sub {} } } 35 while (@_) { my($name => $func) = splice(@_, 0, 2); *$name = defined $func ? $func : sub {} } 36 return $self; 37} 38 39sub regexp { 40 my $patch = shift || ''; 41 my $package = __PACKAGE__; 42 43 # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html 44 45 # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", 46 # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998. 47 # Copyright (c) 1998, Robert D. Cameron. 48 # The following code may be freely used and distributed provided that 49 # this copyright and citation notice remains intact and that modifications 50 # or additions are clearly identified. 51 52 my $TextSE = "[^<]+"; 53 my $UntilHyphen = "[^-]*-"; 54 my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; 55 my $CommentCE = "$Until2Hyphens>?"; 56 my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; 57 my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>"; 58 my $S = "[ \\n\\t\\r]+"; 59 my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; 60 my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; 61 my $Name = "(?:$NameStrt)(?:$NameChar)*"; 62 my $QuoteSE = "\"[^\"]*\"|'[^']*'"; 63 my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; 64 my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; 65 my $S1 = "[\\n\\r\\t ]"; 66 my $UntilQMs = "[^?]*\\?+"; 67 my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; 68 my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S"; 69 my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?"; 70 my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; 71 my $PI_CE = "$Name(?:$PI_Tail)?"; 72 73 # these expressions were modified for backtracking and events 74 my $EndTagCE = "($Name)(?{${package}::end(\$2)})(?:$S)?>"; 75 my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'"; 76 my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::start(\$3,\@{\$^R||[]})})(?{\${7} and ${package}::end(\$3)})"; 77 my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; 78 79 # Next expression is under "black magic". 80 # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE', 81 # but it doesn't work under Perl 5.005 and only magic with 82 # (?:....)?? solved the problem. 83 # I would appreciate if someone let me know what is the right thing to do 84 # and what's the reason for all this magic. 85 # Seems like a problem related to (?:....)? rather than to ?{} feature. 86 # Tests are in t/31-xmlparserlite.t if you decide to play with it. 87 "(?:($TextSE)(?{${package}::char(\$1)}))$patch|$MarkupSPE"; 88} 89 90sub compile { local $^W; 91 # try regexp as it should be, apply patch if doesn't work 92 foreach (regexp(), regexp('??')) { 93 eval qq{sub parse_re { use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1} or die; 94 last if eval { parse_re('<foo>bar</foo>'); 1 } 95 }; 96 97 *compile = sub {}; 98} 99 100setHandlers(); 101compile(); 102 103sub parse { 104 init(); 105 parse_re($_[1]); 106 final(); 107} 108 109my(@stack, $level); 110 111sub init { 112 @stack = (); $level = 0; 113 Init(__PACKAGE__, @_); 114} 115 116sub final { 117 die "not properly closed tag '$stack[-1]'\n" if @stack; 118 die "no element found\n" unless $level; 119 Final(__PACKAGE__, @_) 120} 121 122sub start { 123 die "multiple roots, wrong element '$_[0]'\n" if $level++ && !@stack; 124 push(@stack, $_[0]); 125 Start(__PACKAGE__, @_); 126} 127 128sub char { 129 Char(__PACKAGE__, $_[0]), return if @stack; 130 131 # check for junk before or after element 132 # can't use split or regexp due to limitations in ?{} implementation, 133 # will iterate with loop, but we'll do it no more than two times, so 134 # it shouldn't affect performance 135 for (my $i=0; $i < length $_[0]; $i++) { 136 die "junk '$_[0]' @{[$level ? 'after' : 'before']} XML element\n" 137 if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there 138 } 139} 140 141sub end { 142 pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n"; 143 End(__PACKAGE__, $_[0]); 144} 145 146# ====================================================================== 147 1481; 149 150__END__ 151 152=head1 NAME 153 154XML::Parser::Lite - Lightweight regexp-based XML parser 155 156=head1 SYNOPSIS 157 158 use XML::Parser::Lite; 159 160 $p1 = new XML::Parser::Lite; 161 $p1->setHandlers( 162 Start => sub { shift; print "start: @_\n" }, 163 Char => sub { shift; print "char: @_\n" }, 164 End => sub { shift; print "end: @_\n" }, 165 ); 166 $p1->parse('<foo id="me">Hello World!</foo>'); 167 168 $p2 = new XML::Parser::Lite 169 Handlers => { 170 Start => sub { shift; print "start: @_\n" }, 171 Char => sub { shift; print "char: @_\n" }, 172 End => sub { shift; print "end: @_\n" }, 173 } 174 ; 175 $p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>'); 176 177=head1 DESCRIPTION 178 179This Perl module gives you access to XML parser with interface similar to 180XML::Parser interface. Though only basic calls are supported (init, final, 181start, char, and end) you should be able to use it in the same way you use 182XML::Parser. Due to using experimantal regexp features it'll work only on 183Perl 5.6 and may behave differently on different platforms. 184 185=head1 SEE ALSO 186 187 XML::Parser 188 189=head1 COPYRIGHT 190 191Copyright (C) 2000-2001 Paul Kulchenko. All rights reserved. 192 193This library is free software; you can redistribute it and/or modify 194it under the same terms as Perl itself. 195 196This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html 197Copyright (c) 1998, Robert D. Cameron. 198 199=head1 AUTHOR 200 201Paul Kulchenko (paulclinger@yahoo.com) 202 203=cut 204