1# $Id: PurePerl.pm,v 1.1.1.1 2004/05/20 17:59:56 jpetri Exp $ 2 3package XML::SAX::PurePerl; 4 5use strict; 6use vars qw/$VERSION/; 7 8$VERSION = '0.90'; 9 10use XML::SAX::PurePerl::Productions qw($Any $CharMinusDash $SingleChar); 11use XML::SAX::PurePerl::Reader; 12use XML::SAX::PurePerl::EncodingDetect (); 13use XML::SAX::Exception; 14use XML::SAX::PurePerl::DocType (); 15use XML::SAX::PurePerl::DTDDecls (); 16use XML::SAX::PurePerl::XMLDecl (); 17use XML::SAX::DocumentLocator (); 18use XML::SAX::Base (); 19use XML::SAX qw(Namespaces); 20use XML::NamespaceSupport (); 21use IO::File; 22 23if ($] < 5.006) { 24 require XML::SAX::PurePerl::NoUnicodeExt; 25} 26else { 27 require XML::SAX::PurePerl::UnicodeExt; 28} 29 30use vars qw(@ISA); 31@ISA = ('XML::SAX::Base'); 32 33my %int_ents = ( 34 amp => '&', 35 lt => '<', 36 gt => '>', 37 quot => '"', 38 apos => "'", 39 ); 40 41my $xmlns_ns = "http://www.w3.org/2000/xmlns/"; 42my $xml_ns = "http://www.w3.org/XML/1998/namespace"; 43 44use Carp; 45sub _parse_characterstream { 46 my $self = shift; 47 my ($fh) = @_; 48 confess("CharacterStream is not yet correctly implemented"); 49 my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); 50 return $self->_parse($reader); 51} 52 53sub _parse_bytestream { 54 my $self = shift; 55 my ($fh) = @_; 56 my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); 57 return $self->_parse($reader); 58} 59 60sub _parse_string { 61 my $self = shift; 62 my ($str) = @_; 63 my $reader = XML::SAX::PurePerl::Reader::String->new($str); 64 return $self->_parse($reader); 65} 66 67sub _parse_systemid { 68 my $self = shift; 69 my ($uri) = @_; 70 my $reader = XML::SAX::PurePerl::Reader::URI->new($uri); 71 return $self->_parse($reader); 72} 73 74sub _parse { 75 my ($self, $reader) = @_; 76 77 $reader->public_id($self->{ParseOptions}{Source}{PublicId}); 78 $reader->system_id($self->{ParseOptions}{Source}{SystemId}); 79 $reader->next; 80 81 $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1}); 82 83 $self->set_document_locator( 84 XML::SAX::DocumentLocator->new( 85 sub { $reader->public_id }, 86 sub { $reader->system_id }, 87 sub { $reader->line }, 88 sub { $reader->column }, 89 ), 90 ); 91 92 $self->start_document({}); 93 94 if (defined $self->{ParseOptions}{Source}{Encoding}) { 95 $reader->set_encoding($self->{ParseOptions}{Source}{Encoding}); 96 } 97 else { 98 $self->encoding_detect($reader); 99 } 100 101 # parse a document 102 $self->document($reader); 103 104 return $self->end_document({}); 105} 106 107sub parser_error { 108 my $self = shift; 109 my ($error, $reader) = @_; 110 111# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n"); 112 my $exception = XML::SAX::Exception::Parse->new( 113 Message => $error, 114 ColumnNumber => $reader->column, 115 LineNumber => $reader->line, 116 PublicId => $reader->public_id, 117 SystemId => $reader->system_id, 118 ); 119 120 $self->fatal_error($exception); 121 $exception->throw; 122} 123 124sub document { 125 my ($self, $reader) = @_; 126 127 # document ::= prolog element Misc* 128 129 $self->prolog($reader); 130 $self->element($reader) || 131 $self->parser_error("Document requires an element", $reader); 132 133 while(!$reader->eof) { 134 $self->Misc($reader) || 135 $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader); 136 } 137} 138 139sub prolog { 140 my ($self, $reader) = @_; 141 142 $self->XMLDecl($reader); 143 144 # consume all misc bits 145 1 while($self->Misc($reader)); 146 147 if ($self->doctypedecl($reader)) { 148 while (!$reader->eof) { 149 $self->Misc($reader) || last; 150 } 151 } 152} 153 154sub element { 155 my ($self, $reader) = @_; 156 157 if ($reader->match_char('<')) { 158 my $name = $self->Name($reader) || 159 $self->parser_error("Invalid element name", $reader); 160 161 my %attribs; 162 163 while( my ($k, $v) = $self->Attribute($reader) ) { 164 $attribs{$k} = $v; 165 } 166 167 $self->skip_whitespace($reader); 168 169 my $content; 170 unless ($reader->match_sequence('/', '>')) { 171 $reader->match_char('>') || 172 $self->parser_error("No close element tag", $reader); 173 174 # only push onto _el_stack if not an empty element 175 push @{$self->{_el_stack}}, $name; 176 $content++; 177 } 178 179 # Namespace processing 180 $self->{NSHelper}->push_context; 181 my @new_ns; 182# my %attrs = @attribs; 183# while (my ($k,$v) = each %attrs) { 184 if ($self->get_feature(Namespaces)) { 185 while ( my ($k, $v) = each %attribs ) { 186 if ($k =~ m/^xmlns(:(.*))?$/) { 187 my $prefix = $2 || ''; 188 $self->{NSHelper}->declare_prefix($prefix, $v); 189 my $ns = 190 { 191 Prefix => $prefix, 192 NamespaceURI => $v, 193 }; 194 push @new_ns, $ns; 195 $self->SUPER::start_prefix_mapping($ns); 196 } 197 } 198 } 199 200 # Create element object and fire event 201 my %attrib_hash; 202 while (my ($name, $value) = each %attribs ) { 203 # TODO normalise value here 204 my ($ns, $prefix, $lname); 205 if ($self->get_feature(Namespaces)) { 206 ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name); 207 } 208 $ns ||= ''; $prefix ||= ''; $lname ||= ''; 209 $attrib_hash{"{$ns}$lname"} = { 210 Name => $name, 211 LocalName => $lname, 212 Prefix => $prefix, 213 NamespaceURI => $ns, 214 Value => $value, 215 }; 216 } 217 218 %attribs = (); # lose the memory since we recurse deep 219 220 my ($ns, $prefix, $lname); 221 if ($self->get_feature(Namespaces)) { 222 ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name); 223 } 224 $ns ||= ''; $prefix ||= ''; $lname ||= ''; 225 226 my $el = 227 { 228 Name => $name, 229 LocalName => $lname, 230 Prefix => $prefix, 231 NamespaceURI => $ns, 232 Attributes => \%attrib_hash, 233 }; 234 $self->start_element($el); 235 236 # warn("($name\n"); 237 238 if ($content) { 239 $self->content($reader); 240 241 $reader->match_sequence('<', '/') || $self->parser_error("No close tag marker", $reader); 242 my $end_name = $self->Name($reader); 243 $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader); 244 $self->skip_whitespace($reader); 245 $reader->match_char('>') || $self->parser_error("No close '>' on end tag", $reader); 246 } 247 248 my %end_el = %$el; 249 delete $end_el{Attributes}; 250 $self->end_element(\%end_el); 251 252 for my $ns (@new_ns) { 253 $self->end_prefix_mapping($ns); 254 } 255 $self->{NSHelper}->pop_context; 256 257 return 1; 258 } 259 260 return 0; 261} 262 263sub content { 264 my ($self, $reader) = @_; 265 266 $self->CharData($reader); 267 268 while (1) { 269 if ($reader->match_sequence('<', '/')) { 270 $reader->buffer('</'); 271 return 1; 272 } 273 elsif ( $self->Reference($reader) || 274 $self->CDSect($reader) || 275 $self->PI($reader) || 276 $self->Comment($reader) || 277 $self->element($reader) 278 ) 279 { 280 $self->CharData($reader); 281 next; 282 } 283 else { 284 last; 285 } 286 } 287 288 return 1; 289} 290 291sub CDSect { 292 my ($self, $reader) = @_; 293 294 if ($reader->match_sequence('<', '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) { 295 $self->start_cdata({}); 296 my $chars = ''; 297 while (1) { 298 if ($reader->eof) { 299 $self->parser_error("EOF looking for CDATA section end", $reader); 300 } 301 $reader->consume_not(']'); 302 $chars .= $reader->consumed; 303 if ($reader->match_char(']')) { 304 if ($reader->match_sequence(']', '>')) { 305 # end of CDATA section 306 307 $self->characters({Data => $chars}); 308 last; 309 } 310 $chars .= ']'; 311 } 312 } 313 $self->end_cdata({}); 314 return 1; 315 } 316 317 return 0; 318} 319 320sub CharData { 321 my ($self, $reader) = @_; 322 323 my $chars = ''; 324 while (1) { 325 $reader->consume_not('<', '&', ']'); 326 $chars .= $reader->consumed; 327 if ($reader->match_char(']')) { 328 if ($reader->match_sequence(']', '>')) { 329 $self->parser_error("String ']]>' not allowed in character data", $reader); 330 } 331 else { 332 $chars .= ']'; 333 } 334 next; 335 } 336 last; 337 } 338 339 $self->characters({ Data => $chars }) if length($chars); 340} 341 342sub Misc { 343 my ($self, $reader) = @_; 344 if ($self->Comment($reader)) { 345 return 1; 346 } 347 elsif ($self->PI($reader)) { 348 return 1; 349 } 350 elsif ($self->skip_whitespace($reader)) { 351 return 1; 352 } 353 354 return 0; 355} 356 357sub Reference { 358 my ($self, $reader) = @_; 359 360 if (!$reader->match_char('&')) { 361 return 0; 362 } 363 364 if ($reader->match_char('#')) { 365 # CharRef 366 my $char; 367 my $ref; 368 if ($reader->match_char('x')) { 369 $reader->consume(qr/[0-9a-fA-F]/) || 370 $self->parser_error("Hex character reference contains illegal characters", $reader); 371 $ref = $reader->consumed; 372 $char = chr_ref(hex($ref)); 373 $ref = "x$ref"; 374 } 375 else { 376 $reader->consume(qr/[0-9]/) || 377 $self->parser_error("Decimal character reference contains illegal characters", $reader); 378 $ref = $reader->consumed; 379 $char = chr_ref($ref); 380 } 381 $reader->match_char(';') || 382 $self->parser_error("No semi-colon found after character reference", $reader); 383 if ($char !~ $SingleChar) { # match a single character 384 $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader); 385 } 386 $self->characters({ Data => $char }); 387 return 1; 388 } 389 else { 390 # EntityRef 391 my $name = $self->Name($reader); 392 $reader->match_char(';') || 393 $self->parser_error("No semi-colon found after entity name", $reader); 394 395 # expand it 396 if ($self->_is_entity($name)) { 397 398 if ($self->_is_external($name)) { 399 my $value = $self->_get_entity($name); 400 my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value); 401 $self->encoding_detect($ent_reader); 402 $self->extParsedEnt($ent_reader); 403 } 404 else { 405 my $value = $self->_stringify_entity($name); 406 my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value); 407 $self->content($ent_reader); 408 } 409 return 1; 410 } 411 elsif (_is_internal($name)) { 412 $self->characters({ Data => $int_ents{$name} }); 413 return 1; 414 } 415 else { 416 $self->parser_error("Undeclared entity", $reader); 417 } 418 } 419} 420 421sub AttReference { 422 # a reference in an attribute value. 423 my ($self, $reader) = @_; 424 425 if ($reader->match_char('#')) { 426 # CharRef 427 my $char; 428 my $ref; 429 if ($reader->match_char('x')) { 430 $reader->consume(qr/[0-9a-fA-F]/) || 431 $self->parser_error("Hex character reference contains illegal characters", $reader); 432 $ref = $reader->consumed; 433 $char = chr_ref(hex($ref)); 434 $ref = "x$ref"; 435 } 436 else { 437 $reader->consume(qr/[0-9]/) || 438 $self->parser_error("Decimal character reference contains illegal characters", $reader); 439 $ref = $reader->consumed; 440 $char = chr_ref($ref); 441 } 442 $reader->match_char(';') || 443 $self->parser_error("No semi-colon found after character reference", $reader); 444 if ($char !~ $SingleChar) { # match a single character 445 $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader); 446 } 447 return $char; 448 } 449 else { 450 # EntityRef 451 my $name = $self->Name($reader); 452 $reader->match_char(';') || 453 $self->parser_error("No semi-colon found after entity name", $reader); 454 455 # expand it 456 if ($self->_is_entity($name)) { 457 if ($self->_is_external($name)) { 458 $self->parser_error("No external entity references allowed in attribute values", $reader); 459 } 460 else { 461 my $value = $self->_stringify_entity($name); 462 return $value; 463 } 464 } 465 elsif (_is_internal($name)) { 466 return $int_ents{$name}; 467 } 468 else { 469 $self->parser_error("Undeclared entity '$name'", $reader); 470 } 471 } 472 473} 474 475sub extParsedEnt { 476 my ($self, $reader) = @_; 477 478 $self->TextDecl($reader); 479 $self->content($reader); 480} 481 482sub _is_internal { 483 my $e = shift; 484 return 1 if $e eq 'amp' || $e eq 'lt' || $e eq 'gt' || $e eq 'quot' || $e eq 'apos'; 485 return 0; 486} 487 488sub _is_external { 489 my ($self, $name) = @_; 490# TODO: Fix this to use $reader to store the entities perhaps. 491 if ($self->{ParseOptions}{external_entities}{$name}) { 492 return 1; 493 } 494 return ; 495} 496 497sub _is_entity { 498 my ($self, $name) = @_; 499# TODO: ditto above 500 if (exists $self->{ParseOptions}{entities}{$name}) { 501 return 1; 502 } 503 return 0; 504} 505 506sub _stringify_entity { 507 my ($self, $name) = @_; 508# TODO: ditto above 509 if (exists $self->{ParseOptions}{expanded_entity}{$name}) { 510 return $self->{ParseOptions}{expanded_entity}{$name}; 511 } 512 # expand 513 my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name}); 514 $reader->consume(qr/./); 515 return $self->{ParseOptions}{expanded_entity}{$name} = $reader->consumed; 516} 517 518sub _get_entity { 519 my ($self, $name) = @_; 520# TODO: ditto above 521 return $self->{ParseOptions}{entities}{$name}; 522} 523 524sub skip_whitespace { 525 my ($self, $reader) = @_; 526 527 my $found = 0; 528 while (1) { 529 if ($reader->match_char("\x20") || 530 $reader->match_char("\x0A") || 531 $reader->match_char("\x0D") || 532 $reader->match_char("\x09")) 533 { 534 $found++; 535 } 536 else { 537 last; 538 } 539 } 540 return $found; 541} 542 543sub Attribute { 544 my ($self, $reader) = @_; 545 546 $self->skip_whitespace($reader) || return; 547 if ($reader->match_sequence('/', '>')) { 548 $reader->buffer("/>"); 549 return; 550 } 551 if ($reader->match_char(">")) { 552 $reader->buffer(">"); 553 return; 554 } 555 if (my $name = $self->Name($reader)) { 556 $self->skip_whitespace($reader); 557 $reader->match_char('=') || 558 $self->parser_error("No '=' in Attribute", $reader); 559 $self->skip_whitespace($reader); 560 my $value = $self->AttValue($reader); 561 562 if (!$self->cdata_attrib($name)) { 563 $value =~ s/^\x20*//; # discard leading spaces 564 $value =~ s/\x20*$//; # discard trailing spaces 565 $value =~ s/ {1,}/ /g; # all >1 space to single space 566 } 567 568 return $name, $value; 569 } 570 571 return; 572} 573 574sub cdata_attrib { 575 # TODO implement this! 576 return 0; 577} 578 579sub AttValue { 580 my ($self, $reader) = @_; 581 582 my $quote = '"'; 583 if (!$reader->match_char($quote)) { 584 $quote = "'"; 585 $reader->match_char($quote) || 586 $self->parser_error("Not a quote character", $reader); 587 } 588 589 my $value = ''; 590 591 while (1) { 592 if ($reader->consume_not('<', '&', $quote)) { 593 my $to_append = $reader->consumed; 594 $to_append =~ s/[\x09\x0A\x0D]/\x20/g; # Attrib value normalize 595 $value .= $to_append; 596 } 597 elsif ($reader->match_char('&')) { 598 $value .= $self->AttReference($reader); 599 } 600 elsif ($reader->match_char($quote)) { 601 # end of attrib 602 last; 603 } 604 else { 605 $self->parser_error("Invalid character in attribute value", $reader); 606 } 607 } 608 609 return $value; 610} 611 612sub Comment { 613 my ($self, $reader) = @_; 614 615 if ($reader->match_sequence('<', '!', '-', '-')) { 616 my $comment_str = ''; 617 while (1) { 618 if ($reader->match_char('-')) { 619 if ($reader->match_char('-')) { 620 $reader->match_char('>') || 621 $self->parser_error("Invalid string in comment field", $reader); 622 last; 623 } 624 $comment_str .= '-'; 625 $reader->consume($CharMinusDash) || 626 $self->parser_error("Invalid string in comment field", $reader); 627 $comment_str .= $reader->consumed; 628 } 629 elsif ($reader->consume($CharMinusDash)) { 630 $comment_str .= $reader->consumed; 631 } 632 else { 633 $self->parser_error("Invalid string in comment field", $reader); 634 } 635 } 636 637 $self->comment({ Data => $comment_str }); 638 639 return 1; 640 } 641 return 0; 642} 643 644sub PI { 645 my ($self, $reader) = @_; 646 if ($reader->match_sequence('<', '?')) { 647 my ($target, $data); 648 $target = $self->Name($reader) || 649 $self->parser_error("PI has no target", $reader); 650 if ($self->skip_whitespace($reader)) { 651 while (1) { 652 if ($reader->match_sequence('?', '>')) { 653 last; 654 } 655 elsif ($reader->match_re($Any)) { 656 $data .= $reader->matched; 657 } 658 else { 659 last; 660 } 661 } 662 } 663 else { 664 $reader->match_sequence('?', '>') || 665 $self->parser_error("PI closing sequence not found", $reader); 666 } 667 $self->processing_instruction({ Target => $target, Data => $data }); 668 669 return 1; 670 } 671 return 0; 672} 673 674sub Name { 675 my ($self, $reader) = @_; 676 677 return $reader->consume_name(); 678} 679 680sub quote { 681 my ($self, $reader) = @_; 682 my $quote = '"'; 683 684 if (!$reader->match_char($quote)) { 685 $quote = "'"; 686 $reader->match_char($quote) || 687 $self->parser_error("Invalid quote token", $reader); 688 } 689 return $quote; 690} 691 6921; 693__END__ 694 695=head1 NAME 696 697XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface 698 699=head1 SYNOPSIS 700 701 use XML::Handler::Foo; 702 use XML::SAX::PurePerl; 703 my $handler = XML::Handler::Foo->new(); 704 my $parser = XML::SAX::PurePerl->new(Handler => $handler); 705 $parser->parse_uri("myfile.xml"); 706 707=head1 DESCRIPTION 708 709This module implements an XML parser in pure perl. It is written around the 710upcoming perl 5.8's unicode support and support for multiple document 711encodings (using the PerlIO layer), however it has been ported to work with 712ASCII/UTF8 documents under lower perl versions. 713 714The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in 715the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a 716better location soon. 717 718Please refer to the SAX2 documentation for how to use this module - it is merely a 719front end to SAX2, and implements nothing that is not in that spec (or at least tries 720not to - please email me if you find errors in this implementation). 721 722=head1 BUGS 723 724XML::SAX::PurePerl is B<slow>. Very slow. I suggest you use something else 725in fact. However it is great as a fallback parser for XML::SAX, where the 726user might not be able to install an XS based parser or C library. 727 728Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations, 729though the code is in place to start doing this. Also parsing parameter entity 730references is causing me much confusion, since it's not exactly what I would call 731trivial, or well documented in the XML grammar. XML documents with internal subsets 732are likely to fail. 733 734I am however trying to work towards full conformance using the Oasis test suite. 735 736=head1 AUTHOR 737 738Matt Sergeant, matt@sergeant.org. Copyright 2001. 739 740Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com. 741 742=head1 LICENSE 743 744This is free software. You may use it or redistribute it under the same terms as 745Perl 5.7.2 itself. 746 747=cut 748 749