1# ======================================================================
2#
3# Copyright (C) 2000-2007 Paul Kulchenko (paulclinger@yahoo.com)
4# Copyright (C) 2008 Martin Kutter (martin.kutter@fen-net.de)
5# SOAP::Lite is free software; you can redistribute it
6# and/or modify it under the same terms as Perl itself.
7#
8# $Id: Lite.pm 414 2012-07-15 09:18:42Z kutterma $
9#
10# ======================================================================
11
12package XML::Parser::Lite;
13
14use strict;
15use warnings;
16
17our $VERSION = 0.715;
18
19sub new {
20    my $class = shift;
21
22    return $class if ref $class;
23    my $self = bless {} => $class;
24
25    my %parameters = @_;
26    $self->setHandlers(); # clear first
27    $self->setHandlers(%{$parameters{Handlers} || {}});
28
29    return $self;
30}
31
32sub setHandlers {
33    my $self = shift;
34
35    # allow symbolic refs, avoid "subroutine redefined" warnings
36    no strict 'refs';
37    no warnings qw(redefine);
38    # clear all handlers if called without parameters
39    if (not @_) {
40        for (qw(Start End Char Final Init Comment Doctype XMLDecl)) {
41            *$_ = sub {}
42        }
43    }
44
45    # we could use each here, too...
46    while (@_) {
47        my($name, $func) = splice(@_, 0, 2);
48        *$name = defined $func
49            ? $func
50            : sub {}
51    }
52    return $self;
53}
54
55sub _regexp {
56    my $patch = shift || '';
57    my $package = __PACKAGE__;
58
59    # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
60
61    # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
62    # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998.
63    # Copyright (c) 1998, Robert D. Cameron.
64    # The following code may be freely used and distributed provided that
65    # this copyright and citation notice remains intact and that modifications
66    # or additions are clearly identified.
67
68    # Modifications may be tracked on SOAP::Lite's SVN at
69    # https://soaplite.svn.sourceforge.net/svnroot/soaplite/
70    #
71    use re 'eval';
72    my $TextSE = "[^<]+";
73    my $UntilHyphen = "[^-]*-";
74    my $Until2Hyphens = "([^-]*)-(?:[^-]$[^-]*-)*-";
75    #my $CommentCE = "$Until2Hyphens(?{${package}::comment(\$2)})>?";
76    my $CommentCE = "(.+)--(?{${package}::comment(\$2)})>?";
77#    my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
78#    my $CommentCE = "$Until2Hyphens>?";
79    my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
80    my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
81    my $S = "[ \\n\\t\\r]+";
82    my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
83    my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
84    my $Name = "(?:$NameStrt)(?:$NameChar)*";
85    my $QuoteSE = "\"[^\"]*\"|'[^']*'";
86    my $DT_IdentSE = "$Name(?:$S(?:$Name|$QuoteSE))*";
87#    my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
88    my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
89    my $S1 = "[\\n\\r\\t ]";
90    my $UntilQMs = "[^?]*\\?";
91    my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*";
92    my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail>))|%$Name;|$S";
93    my $DocTypeCE = "$S($DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?)>(?{${package}::_doctype(\$3)})";
94#    my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
95#    my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
96#    my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
97    my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
98#    my $PI_CE = "$Name(?:$PI_Tail)?";
99    my $PI_CE = "($Name(?:$PI_Tail))>(?{${package}::_xmldecl(\$5)})";
100    # these expressions were modified for backtracking and events
101#    my $EndTagCE = "($Name)(?{${package}::_end(\$2)})(?:$S)?>";
102    my $EndTagCE = "($Name)(?{${package}::_end(\$6)})(?:$S)?>";
103    my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'";
104#    my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::_start( \$3,\@{\$^R||[]})})(?{\${7} and ${package}::_end(\$3)})";
105    my $ElemTagCE = "($Name)"
106        . "(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)"
107        . "(?{[\@{\$^R||[]},\$8=>defined\$9?\$9:\$10]}))*(?:$S)?(/)?>"
108        . "(?{${package}::_start(\$7,\@{\$^R||[]})})(?{\$11 and ${package}::_end(\$7)})";
109
110    my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
111
112    # Next expression is under "black magic".
113    # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
114    # but it doesn't work under Perl 5.005 and only magic with
115    # (?:....)?? solved the problem.
116    # I would appreciate if someone let me know what is the right thing to do
117    # and what's the reason for all this magic.
118    # Seems like a problem related to (?:....)? rather than to ?{} feature.
119    # Tests are in t/31-xmlparserlite.t if you decide to play with it.
120    #"(?{[]})(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE";
121    "(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE";
122}
123
124setHandlers();
125
126# Try 5.6 and 5.10 regex first
127my $REGEXP = _regexp('??');
128
129sub _parse_re {
130    use re "eval";
131    undef $^R;
132    no strict 'refs';
133    1 while $_[0] =~ m{$REGEXP}go
134};
135
136# fixup regex if it does not work...
137{
138    if (not eval { _parse_re('<soap:foo xmlns:soap="foo">bar</soap:foo>'); 1; } ) {
139        $REGEXP = _regexp();
140        local $^W;
141        *_parse_re = sub {
142                use re "eval";
143                undef $^R;
144                1 while $_[0] =~ m{$REGEXP}go
145            };
146    }
147}
148
149sub parse {
150    _init();
151    _parse_re($_[1]);
152    _final();
153}
154
155my(@stack, $level);
156
157sub _init {
158    @stack = ();
159    $level = 0;
160    Init(__PACKAGE__, @_);
161}
162
163sub _final {
164    die "not properly closed tag '$stack[-1]'\n" if @stack;
165    die "no element found\n" unless $level;
166    Final(__PACKAGE__, @_)
167}
168
169sub _start {
170    die "multiple roots, wrong element '$_[0]'\n" if $level++ && !@stack;
171    push(@stack, $_[0]);
172    my $r=Start(__PACKAGE__, @_);
173    return ref($r) eq 'ARRAY' ? $r : undef;
174}
175
176sub _char {
177    Char(__PACKAGE__, $_[0]), return if @stack;
178
179    # check for junk before or after element
180    # can't use split or regexp due to limitations in ?{} implementation,
181    # will iterate with loop, but we'll do it no more than two times, so
182    # it shouldn't affect performance
183    for (my $i=0; $i < length $_[0]; $i++) {
184        die "junk '$_[0]' @{[$level ? 'after' : 'before']} XML element\n"
185        if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there
186    }
187}
188
189sub _end {
190    no warnings qw(uninitialized);
191    pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n";
192    my $r=End(__PACKAGE__, $_[0]);
193    return ref($r) eq 'ARRAY' ? $r : undef;
194}
195
196sub comment {
197    my $r=Comment(__PACKAGE__, $_[0]);
198    return ref($r) eq 'ARRAY' ? $r : undef;
199}
200
201sub end {
202    pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n";
203    my $r=End(__PACKAGE__, $_[0]);
204    return ref($r) eq 'ARRAY' ? $r : undef;
205}
206
207sub _doctype {
208    my $r=Doctype(__PACKAGE__, $_[0]);
209    return ref($r) eq 'ARRAY' ? $r : undef;
210}
211
212sub _xmldecl {
213    XMLDecl(__PACKAGE__, $_[0]);
214}
215
216
217
218# ======================================================================
2191;
220
221__END__
222
223=head1 NAME
224
225XML::Parser::Lite - Lightweight regexp-based XML parser
226
227=head1 SYNOPSIS
228
229  use XML::Parser::Lite;
230
231  $p1 = new XML::Parser::Lite;
232  $p1->setHandlers(
233    Start => sub { shift; print "start: @_\n" },
234    Char => sub { shift; print "char: @_\n" },
235    End => sub { shift; print "end: @_\n" },
236  );
237  $p1->parse('<foo id="me">Hello World!</foo>');
238
239  $p2 = new XML::Parser::Lite
240    Handlers => {
241      Start => sub { shift; print "start: @_\n" },
242      Char => sub { shift; print "char: @_\n" },
243      End => sub { shift; print "end: @_\n" },
244    }
245  ;
246  $p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>');
247
248=head1 DESCRIPTION
249
250This Perl implements an XML parser with a interface similar to
251XML::Parser. Though not all callbacks are supported, you should be able to
252use it in the same way you use XML::Parser. Due to using experimantal regexp
253features it'll work only on Perl 5.6 and above and may behave differently on
254different platforms.
255
256Note that you cannot use regular expressions or split in callbacks. This is
257due to a limitation of perl's regular expression implementation (which is
258not re-entrant).
259
260=head1 SUBROUTINES/METHODS
261
262=head2 new
263
264Constructor.
265
266As (almost) all SOAP::Lite constructors, new() returns the object called on
267when called as object method. This means that the following effectifely is
268a no-op if $obj is a object:
269
270 $obj = $obj->new();
271
272New accepts a single named parameter, C<Handlers> with a hash ref as value:
273
274 my $parser = XML::Parser::Lite->new(
275    Handlers => {
276        Start => sub { shift; print "start: @_\n" },
277        Char => sub { shift; print "char: @_\n" },
278        End => sub { shift; print "end: @_\n" },
279    }
280 );
281
282The handlers given will be passed to setHandlers.
283
284=head2 setHandlers
285
286Sets (or resets) the parsing handlers. Accepts a hash with the handler names
287and handler code references as parameters. Passing C<undef> instead of a
288code reference replaces the handler by a no-op.
289
290The following handlers can be set:
291
292 Init
293 Start
294 Char
295 End
296 Final
297
298All other handlers are ignored.
299
300Calling setHandlers without parameters resets all handlers to no-ops.
301
302=head2 parse
303
304Parses the XML given. In contrast to L<XML::Parser|XML::Parser>'s parse
305method, parse() only parses strings.
306
307=head1 Handler methods
308
309=head2 Init
310
311Called before parsing starts. You should perform any necessary initializations
312in Init.
313
314=head2 Start
315
316Called at the start of each XML node. See L<XML::Parser> for details.
317
318=head2 Char
319
320Called for each character sequence. May be called multiple times for the
321characters contained in an XML node (even for every single character).
322Your implementation has to make sure that it captures all characters.
323
324=head2 End
325
326Called at the end of each XML node. See L<XML::Parser> for details
327
328=head2 Comment
329
330See L<XML::Parser> for details
331
332=head2 XMLDecl
333
334See L<XML::Parser> for details
335
336=head2 Doctype
337
338See L<XML::Parser> for details
339
340=head2 Final
341
342Called at the end of the parsing process. You should perform any neccessary
343cleanup here.
344
345=head1 SEE ALSO
346
347 XML::Parser
348
349=head1 COPYRIGHT
350
351Copyright (C) 2000-2007 Paul Kulchenko. All rights reserved.
352
353Copyright (C) 2008- Martin Kutter. All rights reserved.
354
355This library is free software; you can redistribute it and/or modify
356it under the same terms as Perl itself.
357
358This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
359Copyright (c) 1998, Robert D. Cameron.
360
361=head1 AUTHOR
362
363Paul Kulchenko (paulclinger@yahoo.com)
364
365Martin Kutter (martin.kutter@fen-net.de)
366
367Additional handlers supplied by Adam Leggett.
368
369=cut
370
371
372
373
374