1# ======================================================================
2#
3# Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com)
4# SOAP::Lite is free software; you can redistribute it
5# and/or modify it under the same terms as Perl itself.
6#
7# $Id: Lite.pm,v 1.2 2005/02/22 01:47:42 byrnereese Exp $
8#
9# ======================================================================
10
11package XML::Parser::Lite;
12
13use strict;
14use vars qw($VERSION);
15#$VERSION = sprintf("%d.%s", map {s/_//g; $_} q$Name:  $ =~ /-(\d+)_([\d_]+)/);
16$VERSION = '0.65_3';
17
18sub new {
19  my $self = shift;
20  my $class = ref($self) || $self;
21  return $self if ref $self;
22
23  $self = bless {} => $class;
24  my %parameters = @_;
25  $self->setHandlers(); # clear first
26  $self->setHandlers(%{$parameters{Handlers} || {}});
27  return $self;
28}
29
30sub setHandlers {
31  my $self = shift;
32  no strict 'refs'; local $^W;
33  # clear all handlers if called without parameters
34  unless (@_) { foreach (qw(Start End Char Final Init)) { *$_ = sub {} } }
35  while (@_) { my($name => $func) = splice(@_, 0, 2); *$name = defined $func ? $func : sub {} }
36  return $self;
37}
38
39sub regexp {
40  my $patch = shift || '';
41  my $package = __PACKAGE__;
42
43  # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
44
45  # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
46  # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998.
47  # Copyright (c) 1998, Robert D. Cameron.
48  # The following code may be freely used and distributed provided that
49  # this copyright and citation notice remains intact and that modifications
50  # or additions are clearly identified.
51
52  my $TextSE = "[^<]+";
53  my $UntilHyphen = "[^-]*-";
54  my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
55  my $CommentCE = "$Until2Hyphens>?";
56  my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
57  my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
58  my $S = "[ \\n\\t\\r]+";
59  my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
60  my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
61  my $Name = "(?:$NameStrt)(?:$NameChar)*";
62  my $QuoteSE = "\"[^\"]*\"|'[^']*'";
63  my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
64  my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
65  my $S1 = "[\\n\\r\\t ]";
66  my $UntilQMs = "[^?]*\\?+";
67  my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
68  my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
69  my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
70  my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
71  my $PI_CE = "$Name(?:$PI_Tail)?";
72
73  # these expressions were modified for backtracking and events
74  my $EndTagCE = "($Name)(?{${package}::end(\$2)})(?:$S)?>";
75  my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'";
76  my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::start(\$3,\@{\$^R||[]})})(?{\${7} and ${package}::end(\$3)})";
77  my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
78
79  # Next expression is under "black magic".
80  # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
81  # but it doesn't work under Perl 5.005 and only magic with
82  # (?:....)?? solved the problem.
83  # I would appreciate if someone let me know what is the right thing to do
84  # and what's the reason for all this magic.
85  # Seems like a problem related to (?:....)? rather than to ?{} feature.
86  # Tests are in t/31-xmlparserlite.t if you decide to play with it.
87  "(?:($TextSE)(?{${package}::char(\$1)}))$patch|$MarkupSPE";
88}
89
90sub compile { local $^W;
91  # try regexp as it should be, apply patch if doesn't work
92  foreach (regexp(), regexp('??')) {
93    eval qq{sub parse_re { use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1} or die;
94    last if eval { parse_re('<foo>bar</foo>'); 1 }
95  };
96
97  *compile = sub {};
98}
99
100setHandlers();
101compile();
102
103sub parse {
104  init();
105  parse_re($_[1]);
106  final();
107}
108
109my(@stack, $level);
110
111sub init {
112  @stack = (); $level = 0;
113  Init(__PACKAGE__, @_);
114}
115
116sub final {
117  die "not properly closed tag '$stack[-1]'\n" if @stack;
118  die "no element found\n" unless $level;
119  Final(__PACKAGE__, @_)
120}
121
122sub start {
123  die "multiple roots, wrong element '$_[0]'\n" if $level++ && !@stack;
124  push(@stack, $_[0]);
125  Start(__PACKAGE__, @_);
126}
127
128sub char {
129  Char(__PACKAGE__, $_[0]), return if @stack;
130
131  # check for junk before or after element
132  # can't use split or regexp due to limitations in ?{} implementation,
133  # will iterate with loop, but we'll do it no more than two times, so
134  # it shouldn't affect performance
135  for (my $i=0; $i < length $_[0]; $i++) {
136    die "junk '$_[0]' @{[$level ? 'after' : 'before']} XML element\n"
137      if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there
138  }
139}
140
141sub end {
142  pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n";
143  End(__PACKAGE__, $_[0]);
144}
145
146# ======================================================================
147
1481;
149
150__END__
151
152=head1 NAME
153
154XML::Parser::Lite - Lightweight regexp-based XML parser
155
156=head1 SYNOPSIS
157
158  use XML::Parser::Lite;
159
160  $p1 = new XML::Parser::Lite;
161  $p1->setHandlers(
162    Start => sub { shift; print "start: @_\n" },
163    Char => sub { shift; print "char: @_\n" },
164    End => sub { shift; print "end: @_\n" },
165  );
166  $p1->parse('<foo id="me">Hello World!</foo>');
167
168  $p2 = new XML::Parser::Lite
169    Handlers => {
170      Start => sub { shift; print "start: @_\n" },
171      Char => sub { shift; print "char: @_\n" },
172      End => sub { shift; print "end: @_\n" },
173    }
174  ;
175  $p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>');
176
177=head1 DESCRIPTION
178
179This Perl module gives you access to XML parser with interface similar to
180XML::Parser interface. Though only basic calls are supported (init, final,
181start, char, and end) you should be able to use it in the same way you use
182XML::Parser. Due to using experimantal regexp features it'll work only on
183Perl 5.6 and may behave differently on different platforms.
184
185=head1 SEE ALSO
186
187 XML::Parser
188
189=head1 COPYRIGHT
190
191Copyright (C) 2000-2001 Paul Kulchenko. All rights reserved.
192
193This library is free software; you can redistribute it and/or modify
194it under the same terms as Perl itself.
195
196This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
197Copyright (c) 1998, Robert D. Cameron.
198
199=head1 AUTHOR
200
201Paul Kulchenko (paulclinger@yahoo.com)
202
203=cut
204