1# htmlparse.tcl -- 2# 3# This file implements a simple HTML parsing library in Tcl. 4# It may take advantage of parsers coded in C in the future. 5# 6# The functionality here is a subset of the 7# 8# Simple HTML display library by Stephen Uhler (stephen.uhler@sun.com) 9# Copyright (c) 1995 by Sun Microsystems 10# Version 0.3 Fri Sep 1 10:47:17 PDT 1995 11# 12# The main restriction is that all Tk-related code in the above 13# was left out of the code here. It is expected that this code 14# will go into a 'tklib' in the future. 15# 16# Copyright (c) 2001 by ActiveState Tool Corp. 17# See the file license.terms. 18 19package require Tcl 8.2 20package require struct::stack 21package require cmdline 1.1 22 23namespace eval ::htmlparse { 24 namespace export \ 25 parse \ 26 debugCallback \ 27 mapEscapes \ 28 2tree \ 29 removeVisualFluff \ 30 removeFormDefs 31 32 # Table of escape characters. Maps from their names to the actual 33 # character. See http://htmlhelp.org/reference/html40/entities/ 34 35 variable namedEntities 36 37 # I. Latin-1 Entities (HTML 4.01) 38 array set namedEntities { 39 nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4 40 yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9 41 ordf \xaa laquo \xab not \xac shy \xad reg \xae 42 macr \xaf deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3 43 acute \xb4 micro \xb5 para \xb6 middot \xb7 cedil \xb8 44 sup1 \xb9 ordm \xba raquo \xbb frac14 \xbc frac12 \xbd 45 frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2 46 Atilde \xc3 Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7 47 Egrave \xc8 Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc 48 Iacute \xcd Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1 49 Ograve \xd2 Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6 50 times \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb 51 Uuml \xdc Yacute \xdd THORN \xde szlig \xdf agrave \xe0 52 aacute \xe1 acirc \xe2 atilde \xe3 auml \xe4 aring \xe5 53 aelig \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea 54 euml \xeb igrave \xec iacute \xed icirc \xee iuml \xef 55 eth \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4 56 otilde \xf5 ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9 57 uacute \xfa ucirc \xfb uuml \xfc yacute \xfd thorn \xfe 58 yuml \xff 59 } 60 61 # II. Entities for Symbols and Greek Letters (HTML 4.01) 62 array set namedEntities { 63 fnof \u192 Alpha \u391 Beta \u392 Gamma \u393 Delta \u394 64 Epsilon \u395 Zeta \u396 Eta \u397 Theta \u398 Iota \u399 65 Kappa \u39A Lambda \u39B Mu \u39C Nu \u39D Xi \u39E 66 Omicron \u39F Pi \u3A0 Rho \u3A1 Sigma \u3A3 Tau \u3A4 67 Upsilon \u3A5 Phi \u3A6 Chi \u3A7 Psi \u3A8 Omega \u3A9 68 alpha \u3B1 beta \u3B2 gamma \u3B3 delta \u3B4 epsilon \u3B5 69 zeta \u3B6 eta \u3B7 theta \u3B8 iota \u3B9 kappa \u3BA 70 lambda \u3BB mu \u3BC nu \u3BD xi \u3BE omicron \u3BF 71 pi \u3C0 rho \u3C1 sigmaf \u3C2 sigma \u3C3 tau \u3C4 72 upsilon \u3C5 phi \u3C6 chi \u3C7 psi \u3C8 omega \u3C9 73 thetasym \u3D1 upsih \u3D2 piv \u3D6 bull \u2022 74 hellip \u2026 prime \u2032 Prime \u2033 oline \u203E 75 frasl \u2044 weierp \u2118 image \u2111 real \u211C 76 trade \u2122 alefsym \u2135 larr \u2190 uarr \u2191 77 rarr \u2192 darr \u2193 harr \u2194 crarr \u21B5 78 lArr \u21D0 uArr \u21D1 rArr \u21D2 dArr \u21D3 hArr \u21D4 79 forall \u2200 part \u2202 exist \u2203 empty \u2205 80 nabla \u2207 isin \u2208 notin \u2209 ni \u220B prod \u220F 81 sum \u2211 minus \u2212 lowast \u2217 radic \u221A 82 prop \u221D infin \u221E ang \u2220 and \u2227 or \u2228 83 cap \u2229 cup \u222A int \u222B there4 \u2234 sim \u223C 84 cong \u2245 asymp \u2248 ne \u2260 equiv \u2261 le \u2264 85 ge \u2265 sub \u2282 sup \u2283 nsub \u2284 sube \u2286 86 supe \u2287 oplus \u2295 otimes \u2297 perp \u22A5 87 sdot \u22C5 lceil \u2308 rceil \u2309 lfloor \u230A 88 rfloor \u230B lang \u2329 rang \u232A loz \u25CA 89 spades \u2660 clubs \u2663 hearts \u2665 diams \u2666 90 } 91 92 # III. Special Entities (HTML 4.01) 93 array set namedEntities { 94 quot \x22 amp \x26 lt \x3C gt \x3E OElig \u152 oelig \u153 95 Scaron \u160 scaron \u161 Yuml \u178 circ \u2C6 96 tilde \u2DC ensp \u2002 emsp \u2003 thinsp \u2009 97 zwnj \u200C zwj \u200D lrm \u200E rlm \u200F ndash \u2013 98 mdash \u2014 lsquo \u2018 rsquo \u2019 sbquo \u201A 99 ldquo \u201C rdquo \u201D bdquo \u201E dagger \u2020 100 Dagger \u2021 permil \u2030 lsaquo \u2039 rsaquo \u203A 101 euro \u20AC 102 } 103 104 # IV. Special Entities (XHTML, XML) 105 array set namedEntities { 106 apos \u0027 107 } 108 109 # Internal cache for the foreach variable-lists and the 110 # substitution strings used to split a HTML string into 111 # incrementally handleable scripts. This should reduce the 112 # time compute this information for repeated calls with the same 113 # split-factor. The array is indexed by a combination of the 114 # numerical split factor and the length of the command prefix and 115 # maps this to a 2-element list containing variable- and 116 # subst-string. 117 118 variable splitdata 119 array set splitdata {} 120 121} 122 123# htmlparse::parse -- 124# 125# This command is the basic parser for HTML. It takes a HTML 126# string, parses it and invokes a command prefix for every tag 127# encountered. It is not necessary for the HTML to be valid for 128# this parser to function. It is the responsibility of the 129# command invoked for every tag to check this. Another 130# responsibility of the invoked command is the handling of tag 131# attributes and character entities (escaped characters). The 132# parser provides the un-interpreted tag attributes to the 133# invoked command to aid in the former, and the package at large 134# provides a helper command, '::htmlparse::mapEscapes', to aid 135# in the handling of the latter. The parser *does* ignore 136# leading DOCTYPE declarations and all valid HTML comments it 137# encounters. 138# 139# All information beyond the HTML string itself is specified via 140# options, these are explained below. 141# 142# To help understanding the options some more background 143# information about the parser. 144# 145# It is capable to detect incomplete tags in the HTML string 146# given to it. Under normal circumstances this will cause the 147# parser to throw an error, but if the option '-incvar' is used 148# to specify a global (or namespace) variable the parser will 149# store the incomplete part of the input into this variable 150# instead. This will aid greatly in the handling of 151# incrementally arriving HTML as the parser will handle whatever 152# he can and defer the handling of the incomplete part until 153# more data has arrived. 154# 155# Another feature of the parser are its two possible modes of 156# operation. The normal mode is activated if the option '-queue' 157# is not present on the command line invoking the parser. If it 158# is present the parser will go into the incremental mode instead. 159# 160# The main difference is that a parser in normal mode will 161# immediately invoke the command prefix for each tag it 162# encounters. In incremental mode however the parser will 163# generate a number of scripts which invoke the command prefix 164# for groups of tags in the HTML string and then store these 165# scripts in the specified queue. It is then the responsibility 166# of the caller of the parser to ensure the execution of the 167# scripts in the queue. 168# 169# Note: The queue objecct given to the parser has to provide the 170# same interface as the queue defined in tcllib -> struct. This 171# does for example mean that all queues created via that part of 172# tcllib can be immediately used here. Still, the queue doesn't 173# have to come from tcllib -> struct as long as the same 174# interface is provided. 175# 176# In both modes the parser will return an empty string to the 177# caller. 178# 179# To a parser in incremental mode the option '-split' can be 180# given and will specify the size of the groups he creates. In 181# other words, -split 5 means that each of the generated scripts 182# will invoke the command prefix for 5 consecutive tags in the 183# HTML string. A parser in normal mode will ignore this option 184# and its value. 185# 186# The option '-vroot' specifies a virtual root tag. A parser in 187# normal mode will invoke the command prefix for it immediately 188# before and after he processes the tags in the HTML, thus 189# simulating that the HTML string is enclosed in a <vroot> 190# </vroot> combination. In incremental mode however the parser 191# is unable to provide the closing virtual root as he never 192# knows when the input is complete. In this case the first 193# script generated by each invocation of the parser will contain 194# an invocation of the command prefix for the virtual root as 195# its first command. 196# 197# Interface to the command prefix: 198# 199# In normal mode the parser will invoke the command prefix with 200# for arguments appended. See '::htmlparse::debugCallback' for a 201# description. In incremental mode however the generated scripts 202# will invoke the command prefix with five arguments 203# appended. The last four of these are the same which were 204# mentioned above. The first however is a placeholder string 205# (\win\) for a clientdata value to be supplied later during the 206# actual execution of the generated scripts. This could be a tk 207# window path, for example. This allows the user of this package 208# to preprocess HTML strings without commiting them to a 209# specific window, object, whatever during parsing. This 210# connection can be made later. This also means that it is 211# possible to cache preprocessed HTML. Of course, nothing 212# prevents the user of the parser to replace the placeholder 213# with an empty string. 214# 215# Arguments: 216# args An option/value-list followed by the string to 217# parse. Available options are: 218# 219# -cmd The command prefix to invoke for every tag in 220# the HTML string. Defaults to 221# '::htmlparse::debugCallback'. 222# 223# -vroot The virtual root tag to add around the HTML in 224# normal mode. In incremental mode it is the 225# first tag in each chunk processed by the 226# parser, but there will be no closing tags. 227# Defaults to 'hmstart'. 228# 229# -split The size of the groups produced by an 230# incremental mode parser. Ignored when in 231# normal mode. Defaults to 10. Values <= 0 are 232# not allowed. 233# 234# -incvar The name of the variable where to store any 235# incomplete HTML into. Optional. 236# 237# -queue 238# The handle/name of the queue objecct to store 239# the generated scripts into. Activates 240# incremental mode. Normal mode is used if this 241# option is not present. 242# 243# After the options the command expects a single argument 244# containing the HTML string to parse. 245# 246# Side Effects: 247# In normal mode as of the invoked command. Else none. 248# 249# Results: 250# None. 251 252proc ::htmlparse::parse {args} { 253 # Convert the HTML string into a evaluable command sequence. 254 255 variable splitdata 256 257 # Option processing, start with the defaults, then run through the 258 # list of arguments. 259 260 set cmd ::htmlparse::debugCallback 261 set vroot hmstart 262 set incvar "" 263 set split 10 264 set queue "" 265 266 while {[set err [cmdline::getopt args {cmd.arg vroot.arg incvar.arg split.arg queue.arg} opt arg]]} { 267 if {$err < 0} { 268 return -code error "::htmlparse::parse : $arg" 269 } 270 switch -exact -- $opt { 271 cmd - 272 vroot - 273 incvar - 274 queue { 275 if {[string length $arg] == 0} { 276 return -code error "::htmlparse::parse : -$opt illegal argument (empty)" 277 } 278 # Each option has an variable with the same name associated with it. 279 # FRINK: nocheck 280 set $opt $arg 281 } 282 split { 283 if {$arg <= 0} { 284 return -code error "::htmlparse::parse : -split illegal argument (<= 0)" 285 } 286 set split $arg 287 } 288 default {# Can't happen} 289 } 290 } 291 292 if {[llength $args] > 1} { 293 return -code error "::htmlparse::parse : to many arguments behind the options, expected one" 294 } 295 if {[llength $args] < 1} { 296 return -code error "::htmlparse::parse : html string missing" 297 } 298 299 set html [PrepareHtml [lindex $args 0]] 300 301 # Look for incomplete HTML from the last iteration and prepend it 302 # to the input we just got. 303 304 if {$incvar != {}} { 305 upvar $incvar incomplete 306 } else { 307 set incomplete "" 308 } 309 310 if {[catch {set new $incomplete$html}]} {set new $html} 311 set html $new 312 313 # Handle incomplete HTML (Recognize incomplete tag at end, buffer 314 # it up for the next call). 315 316 set end [lindex \{$html\} end] 317 if {[set idx [string last < $end]] > [string last > $end]} { 318 319 if {$incvar == {}} { 320 return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing" 321 } 322 323 # upvar $incvar incomplete -- Already done, s.a. 324 set incomplete [string range $end $idx end] 325 incr idx -1 326 set html [string range $end 0 $idx] 327 328 } else { 329 set incomplete "" 330 } 331 332 # Convert the HTML string into a script. 333 334 set sub "\}\n$cmd {\\1} {} {\\2} \{\}\n$cmd {\\1} {/} {} \{" 335 regsub -all -- {<([^\s>]+)\s*([^>]*)/>} $html $sub html 336 337 set sub "\}\n$cmd {\\2} {\\1} {\\3} \{" 338 regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html 339 340 # The value of queue now determines wether we process the HTML by 341 # ourselves (queue is empty) or if we generate a list of scripts 342 # each of which processes n tags, n the argument to -split. 343 344 if {$queue == {}} { 345 # And evaluate it. This is the main parsing step. 346 347 eval "$cmd {$vroot} {} {} \{$html\}" 348 eval "$cmd {$vroot} / {} {}" 349 } else { 350 # queue defined, generate list of scripts doing small chunks of tags. 351 352 set lcmd [llength $cmd] 353 set key $split,$lcmd 354 355 if {![info exists splitdata($key)]} { 356 for {set i 0; set group {}} {$i < $split} {incr i} { 357 # Use the length of the command prefix to generate 358 # additional variables before the main variable after 359 # which the placeholder will be inserted. 360 361 for {set j 1} {$j < $lcmd} {incr j} { 362 append group "b${j}_$i " 363 } 364 365 append group "a$i c$i d$i e$i f$i\n" 366 } 367 regsub -all -- {(a[0-9]+)} $group {{$\1} @win@} subgroup 368 regsub -all -- {([b-z_0-9]+[0-9]+)} $subgroup {{$\1}} subgroup 369 370 set splitdata($key) [list $group $subgroup] 371 } 372 373 foreach {group subgroup} $splitdata($key) break ; # lassign 374 foreach $group "$cmd {$vroot} {} {} \{$html\}" { 375 $queue put [string trimright [subst $subgroup]] 376 } 377 } 378 return 379} 380 381# htmlparse::PrepareHtml -- 382# 383# Internal helper command of '::htmlparse::parse'. Removes 384# leading DOCTYPE declarations and comments, protects the 385# special characters of tcl from evaluation. 386# 387# Arguments: 388# html The HTML string to prepare 389# 390# Side Effects: 391# None. 392# 393# Results: 394# The provided HTML string with the described modifications 395# applied to it. 396 397proc ::htmlparse::PrepareHtml {html} { 398 # Remove the following items from the text: 399 # - A leading <!DOCTYPE...> declaration. 400 # - All comments <!-- ... --> 401 # 402 # Also normalize the line endings (\r -> \n). 403 404 # Tcllib SF Bug 861287 - Processing of comments. 405 # Recognize EOC by RE, instead of fixed string. 406 407 set html [string map [list \r \n] $html] 408 409 regsub -- "^.*<!DOCTYPE\[^>\]*>" $html {} html 410 regsub -all -- "--(\[ \t\n\]*)>" $html "\001\\1\002" html 411 412 # Recognize borken beginnings of a comment and convert them to PCDATA. 413 regsub -all -- "<--(\[^\001\]*)\001(\[^\002\]*)\002" $html {\<--\1--\2\>} html 414 415 # And now recognize true comments, remove them. 416 regsub -all -- "<!--\[^\001\]*\001(\[^\002\]*)\002" $html {} html 417 418 # Protect characters special to tcl (braces, slashes) by 419 # converting them to their escape sequences. 420 421 return [string map [list \ 422 "\{" "{" \ 423 "\}" "}" \ 424 "\\" "\"] $html] 425} 426 427 428 429# htmlparse::debugCallback -- 430# 431# The standard callback used by the parser in 432# '::htmlparse::parse' if none was specified by the user. Simply 433# dumps its arguments to stdout. This callback can be used for 434# both normal and incremental mode of the calling parser. In 435# other words, it accepts four or five arguments. The last four 436# arguments are described below. The optional fifth argument 437# contains the clientdata value given to the callback by a 438# parser in incremental mode. All callbacks have to follow the 439# signature of this command in the last four arguments, and 440# callbacks used in incremental parsing have to follow this 441# signature in the last five arguments. 442# 443# Arguments: 444# tag The name of the tag currently 445# processed by the parser. 446# 447# slash Either empty or a slash. Allows us to 448# distinguish between opening (slash is 449# empty) and closing tags (slash is 450# equal to a '/'). 451# 452# param The un-interpreted list of parameters 453# to the tag. 454# 455# textBehindTheTag The text found by the parser behind 456# the tag named in 'tag'. 457# 458# Side Effects: 459# None. 460# 461# Results: 462# None. 463 464proc ::htmlparse::debugCallback {args} { 465 # args = ?clientData? tag slash param textBehindTheTag 466 puts "==> $args" 467 return 468} 469 470# htmlparse::mapEscapes -- 471# 472# Takes a HTML string, substitutes all escape sequences with 473# their actual characters and returns the resulting string. 474# HTML not containing escape sequences or invalid escape 475# sequences is returned unchanged. 476# 477# Arguments: 478# html The string to modify 479# 480# Side Effects: 481# None. 482# 483# Results: 484# The argument string with all escape sequences replaced with 485# their actual characters. 486 487proc ::htmlparse::mapEscapes {html} { 488 # Find HTML escape characters of the form &xxx(;|EOW) 489 490 # Quote special Tcl chars so they pass through [subst] unharmed. 491 set new [string map [list \] \\\] \[ \\\[ \$ \\\$ \\ \\\\] $html] 492 regsub -all -- {&([[:alnum:]]{2,7})(;|\M)} $new {[DoNamedMap \1 {\2}]} new 493 regsub -all -- {&#([[:digit:]]{1,5})(;|\M)} $new {[DoDecMap \1 {\2}]} new 494 regsub -all -- {&#x([[:xdigit:]]{1,4})(;|\M)} $new {[DoHexMap \1 {\2}]} new 495 return [subst $new] 496} 497 498proc ::htmlparse::DoNamedMap {name endOf} { 499 variable namedEntities 500 if {[info exist namedEntities($name)]} { 501 return $namedEntities($name) 502 } else { 503 # Put it back.. 504 return "&$name$endOf" 505 } 506} 507 508proc ::htmlparse::DoDecMap {dec endOf} { 509 scan $dec %d dec 510 if {$dec <= 0xFFFD} { 511 return [format %c $dec] 512 } else { 513 # Put it back.. 514 return "&#$dec$endOf" 515 } 516} 517 518proc ::htmlparse::DoHexMap {hex endOf} { 519 scan $hex %x value 520 if {$value <= 0xFFFD} { 521 return [format %c $value] 522 } else { 523 # Put it back.. 524 return "&#x$hex$endOf" 525 } 526} 527 528# htmlparse::2tree -- 529# 530# This command is a wrapper around '::htmlparse::parse' which 531# takes a HTML string and converts it into a tree containing the 532# logical structure of the parsed document. The tree object has 533# to be created by the caller. It is also expected that the tree 534# object provides the same interface as the tree object from 535# tcllib -> struct. It doesn't have to come from that module 536# though. The internal callback does some basic checking of HTML 537# validity and tries to recover from the most basic errors. 538# 539# Arguments: 540# html The HTML string to parse and convert. 541# tree The name of the tree to fill. 542# 543# Side Effects: 544# Creates a tree object (see tcllib -> struct) 545# and modifies it. 546# 547# Results: 548# The contents of 'tree'. 549 550proc ::htmlparse::2tree {html tree} { 551 552 # One internal datastructure is required, a stack of open 553 # tags. This stack is also provided by the 'struct' module of 554 # tcllib. As the operation of this command is synchronuous we 555 # don't have to take care against multiple running copies at the 556 # same times (Such are possible, but will be in different 557 # interpreters and true concurrency is possible only if they are 558 # in different threads too). IOW, no need for tricks to make the 559 # internal datastructure unique. 560 561 catch {::htmlparse::tags destroy} 562 563 ::struct::stack ::htmlparse::tags 564 ::htmlparse::tags push root 565 $tree set root type root 566 567 parse -cmd [list ::htmlparse::2treeCallback $tree] $html 568 569 # A bit hackish, correct the ordering of nodes for the optional 570 # tag types, over a larger area when was seen by the parser itself. 571 572 $tree walk root -order post n { 573 ::htmlparse::Reorder $tree $n 574 } 575 576 ::htmlparse::tags destroy 577 return $tree 578} 579 580# htmlparse::2treeCallback -- 581# 582# Internal helper command. A special callback to 583# '::htmlparse::parse' used by '::htmlparse::2tree' which takes 584# the incoming stream of tags and converts them into a tree 585# representing the inner structure of the parsed HTML 586# document. Recovers from simple HTML errors like missing 587# opening tags, missing closing tags and overlapping tags. 588# 589# Arguments: 590# tree The name of the tree to manipulate. 591# tag See '::htmlparse::debugCallback'. 592# slash See '::htmlparse::debugCallback'. 593# param See '::htmlparse::debugCallback'. 594# textBehindTheTag See '::htmlparse::debugCallback'. 595# 596# Side Effects: 597# Manipulates the tree object whose name was given as the first 598# argument. 599# 600# Results: 601# None. 602 603proc ::htmlparse::2treeCallback {tree tag slash param textBehindTheTag} { 604 # This could be table-driven I think but for now the switches 605 # should work fine. 606 607 # Normalize tag information for later comparisons. Also remove 608 # superfluous whitespace. Don't forget to decode the standard 609 # entities. 610 611 set tag [string tolower $tag] 612 set textBehindTheTag [string trim $textBehindTheTag] 613 if {$textBehindTheTag != {}} { 614 set text [mapEscapes $textBehindTheTag] 615 } 616 617 if {"$slash" == "/"} { 618 # Handle closing tags. Standard operation is to pop the tag 619 # from the stack of open tags. We don't do this for </p> and 620 # </li>. As they were optional they were never pushed onto the 621 # stack (Well, actually they are just popped immediately after 622 # they were pusheed, see below). 623 624 switch -exact -- $tag { 625 base - option - meta - li - p { 626 # Ignore, nothing to do. 627 } 628 default { 629 # The moment we get a closing tag which does not match 630 # the tag on the stack we have two possibilities on how 631 # this came into existence to choose from: 632 # 633 # a) A tag is now closed but was never opened. 634 # b) A tag requiring an end tag was opened but the end 635 # tag was omitted and we now are at a tag which was 636 # opened before the one with the omitted end tag. 637 638 # NOTE: 639 # Pages delivered from the amazon.uk site contain both 640 # cases: </a> without opening, <b> & <font> without 641 # closing. Another error: <a><b></a></b>, i.e. overlapping 642 # tags. Fortunately this can be handled by the algorithm 643 # below, in two cycles, one of which is case (b), followed 644 # by case (a). It seems as if Amazon/UK believes that visual 645 # markup like <b> and <font> is an option (switch-on) instead 646 # of a region. 647 648 # Algorithm used here to deal with these: 649 # 1) Search whole stack for the matching opening tag. 650 # If there is one assume case (b) and pop everything 651 # until and including this opening tag. 652 # 2) If no matching opening tag was found assume case 653 # (a) and ignore the tag. 654 # 655 # Part (1) also subsumes the normal case, i.e. the 656 # matching tag is at the top of the stack. 657 658 set nodes [::htmlparse::tags peek [::htmlparse::tags size]] 659 # Note: First item is top of stack, last item is bottom of stack ! 660 # (This behaviour of tcllib stacks is not documented 661 # -> we should update the manpage). 662 663 #foreach n $nodes {lappend tstring [p get $n -key type]} 664 #puts stderr --[join $tstring]-- 665 666 set level 1 667 set found 0 668 foreach n $nodes { 669 set type [$tree get $n type] 670 if {0 == [string compare $tag $type]} { 671 # Found an earlier open tag -> (b). 672 set found 1 673 break 674 } 675 incr level 676 } 677 if {$found} { 678 ::htmlparse::tags pop $level 679 if {$level > 1} { 680 #foreach n $nodes {lappend tstring [$tree get $n type]} 681 #puts stderr "\tdesync at <$tag> ($tstring) => pop $level" 682 } 683 } else { 684 #foreach n $nodes {lappend tstring [$tree get $n type]} 685 #puts stderr "\tdesync at <$tag> ($tstring) => ignore" 686 } 687 } 688 } 689 690 # If there is text behind a closing tag X it belongs to the 691 # parent tag of X. 692 693 if {$textBehindTheTag != {}} { 694 # Attach the text behind the closing tag to the reopened 695 # context. 696 697 set pcd [$tree insert [::htmlparse::tags peek] end] 698 $tree set $pcd type PCDATA 699 $tree set $pcd data $textBehindTheTag 700 } 701 702 } else { 703 # Handle opening tags. The standard operation for most is to 704 # push them onto the stack and thus open a nested context. 705 # This does not happen for both the optional tags (p, li) and 706 # the ones which don't have closing tags (meta, br, option, 707 # input, area, img). 708 # 709 # The text coming with the tag will be added after the tag if 710 # it is a tag without a matching close, else it will be added 711 # as a node below the tag (as it is the region between the 712 # opening and closing tag and thus nested inside). Empty text 713 # is ignored under all circcumstances. 714 715 set node [$tree insert [::htmlparse::tags peek] end] 716 $tree set $node type $tag 717 $tree set $node data $param 718 719 if {$textBehindTheTag != {}} { 720 switch -exact -- $tag { 721 input - area - img - br { 722 set pcd [$tree insert [::htmlparse::tags peek] end] 723 } 724 default { 725 set pcd [$tree insert $node end] 726 } 727 } 728 $tree set $pcd type PCDATA 729 $tree set $pcd data $textBehindTheTag 730 } 731 732 ::htmlparse::tags push $node 733 734 # Special handling: <p>, <li> may have no closing tag => pop 735 # : them immediately. 736 # 737 # Special handling: <meta>, <br>, <option>, <input>, <area>, 738 # : <img>: no closing tags for these. 739 740 switch -exact -- $tag { 741 hr - base - meta - li - br - option - input - area - img - p - h1 - h2 - h3 - h4 - h5 - h6 { 742 ::htmlparse::tags pop 743 } 744 default {} 745 } 746 } 747} 748 749# htmlparse::removeVisualFluff -- 750# 751# This command walks a tree as generated by '::htmlparse::2tree' 752# and removes all the nodes which represent visual tags and not 753# structural ones. The purpose of the command is to make the 754# tree easier to navigate without getting bogged down in visual 755# information not relevant to the search. 756# 757# Arguments: 758# tree The name of the tree to cut down. 759# 760# Side Effects: 761# Modifies the specified tree. 762# 763# Results: 764# None. 765 766proc ::htmlparse::removeVisualFluff {tree} { 767 $tree walk root -order post n { 768 ::htmlparse::RemoveVisualFluff $tree $n 769 } 770 return 771} 772 773# htmlparse::removeFormDefs -- 774# 775# Like '::htmlparse::removeVisualFluff' this command is here to 776# cut down on the size of the tree as generated by 777# '::htmlparse::2tree'. It removes all nodes representing forms 778# and form elements. 779# 780# Arguments: 781# tree The name of the tree to cut down. 782# 783# Side Effects: 784# Modifies the specified tree. 785# 786# Results: 787# None. 788 789proc ::htmlparse::removeFormDefs {tree} { 790 $tree walk root -order post n { 791 ::htmlparse::RemoveFormDefs $tree $n 792 } 793 return 794} 795 796# htmlparse::RemoveVisualFluff -- 797# 798# Internal helper command to 799# '::htmlparse::removeVisualFluff'. Does the actual work. 800# 801# Arguments: 802# tree The name of the tree currently processed 803# node The name of the node to look at. 804# 805# Side Effects: 806# Modifies the specified tree. 807# 808# Results: 809# None. 810 811proc ::htmlparse::RemoveVisualFluff {tree node} { 812 switch -exact -- [$tree get $node type] { 813 hmstart - html - font - center - div - sup - b - i { 814 # Removes the node, but does not affect the nodes below 815 # it. These are just made into chiildren of the parent of 816 # this node, in its place. 817 818 $tree cut $node 819 } 820 script - option - select - meta - map - img { 821 # Removes this node and everything below it. 822 $tree delete $node 823 } 824 default { 825 # Ignore tag 826 } 827 } 828} 829 830# htmlparse::RemoveFormDefs -- 831# 832# Internal helper command to 833# '::htmlparse::removeFormDefs'. Does the actual work. 834# 835# Arguments: 836# tree The name of the tree currently processed 837# node The name of the node to look at. 838# 839# Side Effects: 840# Modifies the specified tree. 841# 842# Results: 843# None. 844 845proc ::htmlparse::RemoveFormDefs {tree node} { 846 switch -exact -- [$tree get $node type] { 847 form { 848 $tree delete $node 849 } 850 default { 851 # Ignore tag 852 } 853 } 854} 855 856# htmlparse::Reorder -- 857 858# Internal helper command to '::htmlparse::2tree'. Moves the 859# nodes between p/p, li/li and h<i> sequences below the 860# paragraphs and items. IOW, corrects misconstructions for 861# the optional node types. 862# 863# Arguments: 864# tree The name of the tree currently processed 865# node The name of the node to look at. 866# 867# Side Effects: 868# Modifies the specified tree. 869# 870# Results: 871# None. 872 873proc ::htmlparse::Reorder {tree node} { 874 switch -exact -- [set tp [$tree get $node type]] { 875 h1 - h2 - h3 - h4 - h5 - h6 - p - li { 876 # Look for right siblings until the next node with a 877 # similar type (or end of level) and move these below this 878 # node. 879 880 while {1} { 881 set sibling [$tree next $node] 882 if { 883 ($sibling == {}) || 884 ([lsearch -exact {h1 h2 h3 h4 h5 h6 p li} [$tree get $sibling type]] != -1) 885 } { 886 break 887 } 888 $tree move $node end $sibling 889 } 890 } 891 default { 892 # Ignore tag 893 } 894 } 895} 896 897# ### ######### ########################### 898 899package provide htmlparse 1.2 900