1# $Id: PurePerl.pm,v 1.1.1.1 2004/05/20 17:59:56 jpetri Exp $
2
3package XML::SAX::PurePerl;
4
5use strict;
6use vars qw/$VERSION/;
7
8$VERSION = '0.90';
9
10use XML::SAX::PurePerl::Productions qw($Any $CharMinusDash $SingleChar);
11use XML::SAX::PurePerl::Reader;
12use XML::SAX::PurePerl::EncodingDetect ();
13use XML::SAX::Exception;
14use XML::SAX::PurePerl::DocType ();
15use XML::SAX::PurePerl::DTDDecls ();
16use XML::SAX::PurePerl::XMLDecl ();
17use XML::SAX::DocumentLocator ();
18use XML::SAX::Base ();
19use XML::SAX qw(Namespaces);
20use XML::NamespaceSupport ();
21use IO::File;
22
23if ($] < 5.006) {
24    require XML::SAX::PurePerl::NoUnicodeExt;
25}
26else {
27    require XML::SAX::PurePerl::UnicodeExt;
28}
29
30use vars qw(@ISA);
31@ISA = ('XML::SAX::Base');
32
33my %int_ents = (
34        amp => '&',
35        lt => '<',
36        gt => '>',
37        quot => '"',
38        apos => "'",
39        );
40
41my $xmlns_ns = "http://www.w3.org/2000/xmlns/";
42my $xml_ns = "http://www.w3.org/XML/1998/namespace";
43
44use Carp;
45sub _parse_characterstream {
46    my $self = shift;
47    my ($fh) = @_;
48    confess("CharacterStream is not yet correctly implemented");
49    my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
50    return $self->_parse($reader);
51}
52
53sub _parse_bytestream {
54    my $self = shift;
55    my ($fh) = @_;
56    my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
57    return $self->_parse($reader);
58}
59
60sub _parse_string {
61    my $self = shift;
62    my ($str) = @_;
63    my $reader = XML::SAX::PurePerl::Reader::String->new($str);
64    return $self->_parse($reader);
65}
66
67sub _parse_systemid {
68    my $self = shift;
69    my ($uri) = @_;
70    my $reader = XML::SAX::PurePerl::Reader::URI->new($uri);
71    return $self->_parse($reader);
72}
73
74sub _parse {
75    my ($self, $reader) = @_;
76
77    $reader->public_id($self->{ParseOptions}{Source}{PublicId});
78    $reader->system_id($self->{ParseOptions}{Source}{SystemId});
79    $reader->next;
80
81    $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1});
82
83    $self->set_document_locator(
84        XML::SAX::DocumentLocator->new(
85            sub { $reader->public_id },
86            sub { $reader->system_id },
87            sub { $reader->line },
88            sub { $reader->column },
89        ),
90    );
91
92    $self->start_document({});
93
94    if (defined $self->{ParseOptions}{Source}{Encoding}) {
95        $reader->set_encoding($self->{ParseOptions}{Source}{Encoding});
96    }
97    else {
98        $self->encoding_detect($reader);
99    }
100
101    # parse a document
102    $self->document($reader);
103
104    return $self->end_document({});
105}
106
107sub parser_error {
108    my $self = shift;
109    my ($error, $reader) = @_;
110
111# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n");
112    my $exception = XML::SAX::Exception::Parse->new(
113                Message => $error,
114                ColumnNumber => $reader->column,
115                LineNumber => $reader->line,
116                PublicId => $reader->public_id,
117                SystemId => $reader->system_id,
118            );
119
120    $self->fatal_error($exception);
121    $exception->throw;
122}
123
124sub document {
125    my ($self, $reader) = @_;
126
127    # document ::= prolog element Misc*
128
129    $self->prolog($reader);
130    $self->element($reader) ||
131        $self->parser_error("Document requires an element", $reader);
132
133    while(!$reader->eof) {
134        $self->Misc($reader) ||
135                $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader);
136    }
137}
138
139sub prolog {
140    my ($self, $reader) = @_;
141
142    $self->XMLDecl($reader);
143
144    # consume all misc bits
145    1 while($self->Misc($reader));
146
147    if ($self->doctypedecl($reader)) {
148        while (!$reader->eof) {
149            $self->Misc($reader) || last;
150        }
151    }
152}
153
154sub element {
155    my ($self, $reader) = @_;
156
157    if ($reader->match_char('<')) {
158        my $name = $self->Name($reader) ||
159                $self->parser_error("Invalid element name", $reader);
160
161        my %attribs;
162
163        while( my ($k, $v) = $self->Attribute($reader) ) {
164            $attribs{$k} = $v;
165        }
166
167        $self->skip_whitespace($reader);
168
169        my $content;
170        unless ($reader->match_sequence('/', '>')) {
171            $reader->match_char('>') ||
172                $self->parser_error("No close element tag", $reader);
173
174            # only push onto _el_stack if not an empty element
175            push @{$self->{_el_stack}}, $name;
176            $content++;
177        }
178
179        # Namespace processing
180        $self->{NSHelper}->push_context;
181        my @new_ns;
182#        my %attrs = @attribs;
183#        while (my ($k,$v) = each %attrs) {
184        if ($self->get_feature(Namespaces)) {
185            while ( my ($k, $v) = each %attribs ) {
186                if ($k =~ m/^xmlns(:(.*))?$/) {
187                    my $prefix = $2 || '';
188                    $self->{NSHelper}->declare_prefix($prefix, $v);
189                    my $ns =
190                        {
191                            Prefix       => $prefix,
192                            NamespaceURI => $v,
193                        };
194                    push @new_ns, $ns;
195                    $self->SUPER::start_prefix_mapping($ns);
196                }
197            }
198        }
199
200        # Create element object and fire event
201        my %attrib_hash;
202        while (my ($name, $value) = each %attribs ) {
203            # TODO normalise value here
204            my ($ns, $prefix, $lname);
205            if ($self->get_feature(Namespaces)) {
206                ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name);
207            }
208            $ns ||= ''; $prefix ||= ''; $lname ||= '';
209            $attrib_hash{"{$ns}$lname"} = {
210                Name => $name,
211                LocalName => $lname,
212                Prefix => $prefix,
213                NamespaceURI => $ns,
214                Value => $value,
215            };
216        }
217
218        %attribs = (); # lose the memory since we recurse deep
219
220        my ($ns, $prefix, $lname);
221        if ($self->get_feature(Namespaces)) {
222            ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name);
223        }
224        $ns ||= ''; $prefix ||= ''; $lname ||= '';
225
226        my $el =
227        {
228            Name => $name,
229            LocalName => $lname,
230            Prefix => $prefix,
231            NamespaceURI => $ns,
232            Attributes => \%attrib_hash,
233        };
234        $self->start_element($el);
235
236        # warn("($name\n");
237
238        if ($content) {
239            $self->content($reader);
240
241            $reader->match_sequence('<', '/') || $self->parser_error("No close tag marker", $reader);
242            my $end_name = $self->Name($reader);
243            $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader);
244            $self->skip_whitespace($reader);
245            $reader->match_char('>') || $self->parser_error("No close '>' on end tag", $reader);
246        }
247
248        my %end_el = %$el;
249        delete $end_el{Attributes};
250        $self->end_element(\%end_el);
251
252        for my $ns (@new_ns) {
253            $self->end_prefix_mapping($ns);
254        }
255        $self->{NSHelper}->pop_context;
256
257        return 1;
258    }
259
260    return 0;
261}
262
263sub content {
264    my ($self, $reader) = @_;
265
266    $self->CharData($reader);
267
268    while (1) {
269        if ($reader->match_sequence('<', '/')) {
270            $reader->buffer('</');
271            return 1;
272        }
273        elsif ( $self->Reference($reader) ||
274                $self->CDSect($reader) ||
275                $self->PI($reader) ||
276                $self->Comment($reader) ||
277                $self->element($reader)
278               )
279        {
280            $self->CharData($reader);
281            next;
282        }
283        else {
284            last;
285        }
286    }
287
288    return 1;
289}
290
291sub CDSect {
292    my ($self, $reader) = @_;
293
294    if ($reader->match_sequence('<', '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
295        $self->start_cdata({});
296        my $chars = '';
297        while (1) {
298            if ($reader->eof) {
299                $self->parser_error("EOF looking for CDATA section end", $reader);
300            }
301            $reader->consume_not(']');
302            $chars .= $reader->consumed;
303            if ($reader->match_char(']')) {
304                if ($reader->match_sequence(']', '>')) {
305                    # end of CDATA section
306
307                    $self->characters({Data => $chars});
308                    last;
309                }
310                $chars .= ']';
311            }
312        }
313        $self->end_cdata({});
314        return 1;
315    }
316
317    return 0;
318}
319
320sub CharData {
321    my ($self, $reader) = @_;
322
323    my $chars = '';
324    while (1) {
325        $reader->consume_not('<', '&', ']');
326        $chars .= $reader->consumed;
327        if ($reader->match_char(']')) {
328            if ($reader->match_sequence(']', '>')) {
329                $self->parser_error("String ']]>' not allowed in character data", $reader);
330            }
331            else {
332                $chars .= ']';
333            }
334            next;
335        }
336        last;
337    }
338
339    $self->characters({ Data => $chars }) if length($chars);
340}
341
342sub Misc {
343    my ($self, $reader) = @_;
344    if ($self->Comment($reader)) {
345        return 1;
346    }
347    elsif ($self->PI($reader)) {
348        return 1;
349    }
350    elsif ($self->skip_whitespace($reader)) {
351        return 1;
352    }
353
354    return 0;
355}
356
357sub Reference {
358    my ($self, $reader) = @_;
359
360    if (!$reader->match_char('&')) {
361        return 0;
362    }
363
364    if ($reader->match_char('#')) {
365        # CharRef
366        my $char;
367        my $ref;
368        if ($reader->match_char('x')) {
369            $reader->consume(qr/[0-9a-fA-F]/) ||
370                $self->parser_error("Hex character reference contains illegal characters", $reader);
371            $ref = $reader->consumed;
372            $char = chr_ref(hex($ref));
373            $ref = "x$ref";
374        }
375        else {
376            $reader->consume(qr/[0-9]/) ||
377                $self->parser_error("Decimal character reference contains illegal characters", $reader);
378            $ref = $reader->consumed;
379            $char = chr_ref($ref);
380        }
381        $reader->match_char(';') ||
382                $self->parser_error("No semi-colon found after character reference", $reader);
383        if ($char !~ $SingleChar) { # match a single character
384            $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader);
385        }
386        $self->characters({ Data => $char });
387        return 1;
388    }
389    else {
390        # EntityRef
391        my $name = $self->Name($reader);
392        $reader->match_char(';') ||
393                $self->parser_error("No semi-colon found after entity name", $reader);
394
395        # expand it
396        if ($self->_is_entity($name)) {
397
398            if ($self->_is_external($name)) {
399                my $value = $self->_get_entity($name);
400                my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value);
401                $self->encoding_detect($ent_reader);
402                $self->extParsedEnt($ent_reader);
403            }
404            else {
405                my $value = $self->_stringify_entity($name);
406                my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value);
407                $self->content($ent_reader);
408            }
409            return 1;
410        }
411        elsif (_is_internal($name)) {
412            $self->characters({ Data => $int_ents{$name} });
413            return 1;
414        }
415        else {
416            $self->parser_error("Undeclared entity", $reader);
417        }
418    }
419}
420
421sub AttReference {
422    # a reference in an attribute value.
423    my ($self, $reader) = @_;
424
425    if ($reader->match_char('#')) {
426        # CharRef
427        my $char;
428        my $ref;
429        if ($reader->match_char('x')) {
430            $reader->consume(qr/[0-9a-fA-F]/) ||
431                $self->parser_error("Hex character reference contains illegal characters", $reader);
432            $ref = $reader->consumed;
433            $char = chr_ref(hex($ref));
434            $ref = "x$ref";
435        }
436        else {
437            $reader->consume(qr/[0-9]/) ||
438                $self->parser_error("Decimal character reference contains illegal characters", $reader);
439            $ref = $reader->consumed;
440            $char = chr_ref($ref);
441        }
442        $reader->match_char(';') ||
443                $self->parser_error("No semi-colon found after character reference", $reader);
444        if ($char !~ $SingleChar) { # match a single character
445            $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader);
446        }
447        return $char;
448    }
449    else {
450        # EntityRef
451        my $name = $self->Name($reader);
452        $reader->match_char(';') ||
453                $self->parser_error("No semi-colon found after entity name", $reader);
454
455        # expand it
456        if ($self->_is_entity($name)) {
457            if ($self->_is_external($name)) {
458                $self->parser_error("No external entity references allowed in attribute values", $reader);
459            }
460            else {
461                my $value = $self->_stringify_entity($name);
462                return $value;
463            }
464        }
465        elsif (_is_internal($name)) {
466            return $int_ents{$name};
467        }
468        else {
469            $self->parser_error("Undeclared entity '$name'", $reader);
470        }
471    }
472
473}
474
475sub extParsedEnt {
476    my ($self, $reader) = @_;
477
478    $self->TextDecl($reader);
479    $self->content($reader);
480}
481
482sub _is_internal {
483    my $e = shift;
484    return 1 if $e eq 'amp' || $e eq 'lt' || $e eq 'gt' || $e eq 'quot' || $e eq 'apos';
485    return 0;
486}
487
488sub _is_external {
489    my ($self, $name) = @_;
490# TODO: Fix this to use $reader to store the entities perhaps.
491    if ($self->{ParseOptions}{external_entities}{$name}) {
492        return 1;
493    }
494    return ;
495}
496
497sub _is_entity {
498    my ($self, $name) = @_;
499# TODO: ditto above
500    if (exists $self->{ParseOptions}{entities}{$name}) {
501        return 1;
502    }
503    return 0;
504}
505
506sub _stringify_entity {
507    my ($self, $name) = @_;
508# TODO: ditto above
509    if (exists $self->{ParseOptions}{expanded_entity}{$name}) {
510        return $self->{ParseOptions}{expanded_entity}{$name};
511    }
512    # expand
513    my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name});
514    $reader->consume(qr/./);
515    return $self->{ParseOptions}{expanded_entity}{$name} = $reader->consumed;
516}
517
518sub _get_entity {
519    my ($self, $name) = @_;
520# TODO: ditto above
521    return $self->{ParseOptions}{entities}{$name};
522}
523
524sub skip_whitespace {
525    my ($self, $reader) = @_;
526
527    my $found = 0;
528    while (1) {
529        if ($reader->match_char("\x20") ||
530            $reader->match_char("\x0A") ||
531            $reader->match_char("\x0D") ||
532            $reader->match_char("\x09"))
533        {
534            $found++;
535        }
536        else {
537            last;
538        }
539    }
540    return $found;
541}
542
543sub Attribute {
544    my ($self, $reader) = @_;
545
546    $self->skip_whitespace($reader) || return;
547    if ($reader->match_sequence('/', '>')) {
548        $reader->buffer("/>");
549        return;
550    }
551    if ($reader->match_char(">")) {
552        $reader->buffer(">");
553        return;
554    }
555    if (my $name = $self->Name($reader)) {
556        $self->skip_whitespace($reader);
557        $reader->match_char('=') ||
558                $self->parser_error("No '=' in Attribute", $reader);
559        $self->skip_whitespace($reader);
560        my $value = $self->AttValue($reader);
561
562        if (!$self->cdata_attrib($name)) {
563            $value =~ s/^\x20*//; # discard leading spaces
564            $value =~ s/\x20*$//; # discard trailing spaces
565            $value =~ s/ {1,}/ /g; # all >1 space to single space
566        }
567
568        return $name, $value;
569    }
570
571    return;
572}
573
574sub cdata_attrib {
575    # TODO implement this!
576    return 0;
577}
578
579sub AttValue {
580    my ($self, $reader) = @_;
581
582    my $quote = '"';
583    if (!$reader->match_char($quote)) {
584        $quote = "'";
585        $reader->match_char($quote) ||
586                $self->parser_error("Not a quote character", $reader);
587    }
588
589    my $value = '';
590
591    while (1) {
592        if ($reader->consume_not('<', '&', $quote)) {
593            my $to_append = $reader->consumed;
594            $to_append =~ s/[\x09\x0A\x0D]/\x20/g; # Attrib value normalize
595            $value .= $to_append;
596        }
597        elsif ($reader->match_char('&')) {
598            $value .= $self->AttReference($reader);
599        }
600        elsif ($reader->match_char($quote)) {
601            # end of attrib
602            last;
603        }
604        else {
605            $self->parser_error("Invalid character in attribute value", $reader);
606        }
607    }
608
609    return $value;
610}
611
612sub Comment {
613    my ($self, $reader) = @_;
614
615    if ($reader->match_sequence('<', '!', '-', '-')) {
616        my $comment_str = '';
617        while (1) {
618            if ($reader->match_char('-')) {
619                if ($reader->match_char('-')) {
620                    $reader->match_char('>') ||
621                        $self->parser_error("Invalid string in comment field", $reader);
622                    last;
623                }
624                $comment_str .= '-';
625                $reader->consume($CharMinusDash) ||
626                    $self->parser_error("Invalid string in comment field", $reader);
627                $comment_str .= $reader->consumed;
628            }
629            elsif ($reader->consume($CharMinusDash)) {
630                $comment_str .= $reader->consumed;
631            }
632            else {
633                $self->parser_error("Invalid string in comment field", $reader);
634            }
635        }
636
637        $self->comment({ Data => $comment_str });
638
639        return 1;
640    }
641    return 0;
642}
643
644sub PI {
645    my ($self, $reader) = @_;
646    if ($reader->match_sequence('<', '?')) {
647        my ($target, $data);
648        $target = $self->Name($reader) ||
649            $self->parser_error("PI has no target", $reader);
650        if ($self->skip_whitespace($reader)) {
651            while (1) {
652                if ($reader->match_sequence('?', '>')) {
653                    last;
654                }
655                elsif ($reader->match_re($Any)) {
656                    $data .= $reader->matched;
657                }
658                else {
659                    last;
660                }
661            }
662        }
663        else {
664            $reader->match_sequence('?', '>') ||
665                $self->parser_error("PI closing sequence not found", $reader);
666        }
667        $self->processing_instruction({ Target => $target, Data => $data });
668
669        return 1;
670    }
671    return 0;
672}
673
674sub Name {
675    my ($self, $reader) = @_;
676
677    return $reader->consume_name();
678}
679
680sub quote {
681    my ($self, $reader) = @_;
682    my $quote = '"';
683
684    if (!$reader->match_char($quote)) {
685        $quote = "'";
686        $reader->match_char($quote) ||
687            $self->parser_error("Invalid quote token", $reader);
688    }
689    return $quote;
690}
691
6921;
693__END__
694
695=head1 NAME
696
697XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface
698
699=head1 SYNOPSIS
700
701  use XML::Handler::Foo;
702  use XML::SAX::PurePerl;
703  my $handler = XML::Handler::Foo->new();
704  my $parser = XML::SAX::PurePerl->new(Handler => $handler);
705  $parser->parse_uri("myfile.xml");
706
707=head1 DESCRIPTION
708
709This module implements an XML parser in pure perl. It is written around the
710upcoming perl 5.8's unicode support and support for multiple document
711encodings (using the PerlIO layer), however it has been ported to work with
712ASCII/UTF8 documents under lower perl versions.
713
714The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in
715the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a
716better location soon.
717
718Please refer to the SAX2 documentation for how to use this module - it is merely a
719front end to SAX2, and implements nothing that is not in that spec (or at least tries
720not to - please email me if you find errors in this implementation).
721
722=head1 BUGS
723
724XML::SAX::PurePerl is B<slow>. Very slow. I suggest you use something else
725in fact. However it is great as a fallback parser for XML::SAX, where the
726user might not be able to install an XS based parser or C library.
727
728Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations,
729though the code is in place to start doing this. Also parsing parameter entity
730references is causing me much confusion, since it's not exactly what I would call
731trivial, or well documented in the XML grammar. XML documents with internal subsets
732are likely to fail.
733
734I am however trying to work towards full conformance using the Oasis test suite.
735
736=head1 AUTHOR
737
738Matt Sergeant, matt@sergeant.org. Copyright 2001.
739
740Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com.
741
742=head1 LICENSE
743
744This is free software. You may use it or redistribute it under the same terms as
745Perl 5.7.2 itself.
746
747=cut
748
749