1# htmlparse.tcl --
2#
3#	This file implements a simple HTML parsing library in Tcl.
4#	It may take advantage of parsers coded in C in the future.
5#
6#	The functionality here is a subset of the
7#
8#		Simple HTML display library by Stephen Uhler (stephen.uhler@sun.com)
9#		Copyright (c) 1995 by Sun Microsystems
10#		Version 0.3 Fri Sep  1 10:47:17 PDT 1995
11#
12#	The main restriction is that all Tk-related code in the above
13#	was left out of the code here. It is expected that this code
14#	will go into a 'tklib' in the future.
15#
16# Copyright (c) 2001 by ActiveState Tool Corp.
17# See the file license.terms.
18
19package require Tcl       8.2
20package require struct::stack
21package require cmdline   1.1
22
23namespace eval ::htmlparse {
24    namespace export		\
25	    parse		\
26	    debugCallback	\
27	    mapEscapes		\
28	    2tree		\
29	    removeVisualFluff	\
30	    removeFormDefs
31
32    # Table of escape characters. Maps from their names to the actual
33    # character.  See http://htmlhelp.org/reference/html40/entities/
34
35    variable namedEntities
36
37    # I. Latin-1 Entities (HTML 4.01)
38    array set namedEntities {
39	nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4
40	yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9
41	ordf \xaa laquo \xab not \xac shy \xad reg \xae
42	macr \xaf deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3
43	acute \xb4 micro \xb5 para \xb6 middot \xb7 cedil \xb8
44	sup1 \xb9 ordm \xba raquo \xbb frac14 \xbc frac12 \xbd
45	frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2
46	Atilde \xc3 Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7
47	Egrave \xc8 Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc
48	Iacute \xcd Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1
49	Ograve \xd2 Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6
50	times \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb
51	Uuml \xdc Yacute \xdd THORN \xde szlig \xdf agrave \xe0
52	aacute \xe1 acirc \xe2 atilde \xe3 auml \xe4 aring \xe5
53	aelig \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea
54	euml \xeb igrave \xec iacute \xed icirc \xee iuml \xef
55	eth \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4
56	otilde \xf5 ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9
57	uacute \xfa ucirc \xfb uuml \xfc yacute \xfd thorn \xfe
58	yuml \xff
59    }
60
61    # II. Entities for Symbols and Greek Letters (HTML 4.01)
62    array set namedEntities {
63	fnof \u192 Alpha \u391 Beta \u392 Gamma \u393 Delta \u394
64	Epsilon \u395 Zeta \u396 Eta \u397 Theta \u398 Iota \u399
65	Kappa \u39A Lambda \u39B Mu \u39C Nu \u39D Xi \u39E
66	Omicron \u39F Pi \u3A0 Rho \u3A1 Sigma \u3A3 Tau \u3A4
67	Upsilon \u3A5 Phi \u3A6 Chi \u3A7 Psi \u3A8 Omega \u3A9
68	alpha \u3B1 beta \u3B2 gamma \u3B3 delta \u3B4 epsilon \u3B5
69	zeta \u3B6 eta \u3B7 theta \u3B8 iota \u3B9 kappa \u3BA
70	lambda \u3BB mu \u3BC nu \u3BD xi \u3BE omicron \u3BF
71	pi \u3C0 rho \u3C1 sigmaf \u3C2 sigma \u3C3 tau \u3C4
72	upsilon \u3C5 phi \u3C6 chi \u3C7 psi \u3C8 omega \u3C9
73	thetasym \u3D1 upsih \u3D2 piv \u3D6 bull \u2022
74	hellip \u2026 prime \u2032 Prime \u2033 oline \u203E
75	frasl \u2044 weierp \u2118 image \u2111 real \u211C
76	trade \u2122 alefsym \u2135 larr \u2190 uarr \u2191
77	rarr \u2192 darr \u2193 harr \u2194 crarr \u21B5
78	lArr \u21D0 uArr \u21D1 rArr \u21D2 dArr \u21D3 hArr \u21D4
79	forall \u2200 part \u2202 exist \u2203 empty \u2205
80	nabla \u2207 isin \u2208 notin \u2209 ni \u220B prod \u220F
81	sum \u2211 minus \u2212 lowast \u2217 radic \u221A
82	prop \u221D infin \u221E ang \u2220 and \u2227 or \u2228
83	cap \u2229 cup \u222A int \u222B there4 \u2234 sim \u223C
84	cong \u2245 asymp \u2248 ne \u2260 equiv \u2261 le \u2264
85	ge \u2265 sub \u2282 sup \u2283 nsub \u2284 sube \u2286
86	supe \u2287 oplus \u2295 otimes \u2297 perp \u22A5
87	sdot \u22C5 lceil \u2308 rceil \u2309 lfloor \u230A
88	rfloor \u230B lang \u2329 rang \u232A loz \u25CA
89	spades \u2660 clubs \u2663 hearts \u2665 diams \u2666
90    }
91
92    # III. Special Entities (HTML 4.01)
93    array set namedEntities {
94	quot \x22 amp \x26 lt \x3C gt \x3E OElig \u152 oelig \u153
95	Scaron \u160 scaron \u161 Yuml \u178 circ \u2C6
96	tilde \u2DC ensp \u2002 emsp \u2003 thinsp \u2009
97	zwnj \u200C zwj \u200D lrm \u200E rlm \u200F ndash \u2013
98	mdash \u2014 lsquo \u2018 rsquo \u2019 sbquo \u201A
99	ldquo \u201C rdquo \u201D bdquo \u201E dagger \u2020
100	Dagger \u2021 permil \u2030 lsaquo \u2039 rsaquo \u203A
101	euro \u20AC
102    }
103
104    # IV. Special Entities (XHTML, XML)
105    array set namedEntities {
106	apos \u0027
107    }
108
109    # Internal cache for the foreach variable-lists and the
110    # substitution strings used to split a HTML string into
111    # incrementally handleable scripts. This should reduce the
112    # time compute this information for repeated calls with the same
113    # split-factor. The array is indexed by a combination of the
114    # numerical split factor and the length of the command prefix and
115    # maps this to a 2-element list containing variable- and
116    # subst-string.
117
118    variable  splitdata
119    array set splitdata {}
120
121}
122
123# htmlparse::parse --
124#
125#	This command is the basic parser for HTML. It takes a HTML
126#	string, parses it and invokes a command prefix for every tag
127#	encountered. It is not necessary for the HTML to be valid for
128#	this parser to function. It is the responsibility of the
129#	command invoked for every tag to check this. Another
130#	responsibility of the invoked command is the handling of tag
131#	attributes and character entities (escaped characters). The
132#	parser provides the un-interpreted tag attributes to the
133#	invoked command to aid in the former, and the package at large
134#	provides a helper command, '::htmlparse::mapEscapes', to aid
135#	in the handling of the latter. The parser *does* ignore
136#	leading DOCTYPE declarations and all valid HTML comments it
137#	encounters.
138#
139#	All information beyond the HTML string itself is specified via
140#	options, these are explained below.
141#
142#	To help understanding the options some more background
143#	information about the parser.
144#
145#	It is capable to detect incomplete tags in the HTML string
146#	given to it. Under normal circumstances this will cause the
147#	parser to throw an error, but if the option '-incvar' is used
148#	to specify a global (or namespace) variable the parser will
149#	store the incomplete part of the input into this variable
150#	instead. This will aid greatly in the handling of
151#	incrementally arriving HTML as the parser will handle whatever
152#	he can and defer the handling of the incomplete part until
153#	more data has arrived.
154#
155#	Another feature of the parser are its two possible modes of
156#	operation. The normal mode is activated if the option '-queue'
157#	is not present on the command line invoking the parser. If it
158#	is present the parser will go into the incremental mode instead.
159#
160#	The main difference is that a parser in normal mode will
161#	immediately invoke the command prefix for each tag it
162#	encounters. In incremental mode however the parser will
163#	generate a number of scripts which invoke the command prefix
164#	for groups of tags in the HTML string and then store these
165#	scripts in the specified queue. It is then the responsibility
166#	of the caller of the parser to ensure the execution of the
167#	scripts in the queue.
168#
169#	Note: The queue objecct given to the parser has to provide the
170#	same interface as the queue defined in tcllib -> struct. This
171#	does for example mean that all queues created via that part of
172#	tcllib can be immediately used here. Still, the queue doesn't
173#	have to come from tcllib -> struct as long as the same
174#	interface is provided.
175#
176#	In both modes the parser will return an empty string to the
177#	caller.
178#
179#	To a parser in incremental mode the option '-split' can be
180#	given and will specify the size of the groups he creates. In
181#	other words, -split 5 means that each of the generated scripts
182#	will invoke the command prefix for 5 consecutive tags in the
183#	HTML string. A parser in normal mode will ignore this option
184#	and its value.
185#
186#	The option '-vroot' specifies a virtual root tag. A parser in
187#	normal mode will invoke the command prefix for it immediately
188#	before and after he processes the tags in the HTML, thus
189#	simulating that the HTML string is enclosed in a <vroot>
190#	</vroot> combination. In incremental mode however the parser
191#	is unable to provide the closing virtual root as he never
192#	knows when the input is complete. In this case the first
193#	script generated by each invocation of the parser will contain
194#	an invocation of the command prefix for the virtual root as
195#	its first command.
196#
197#	Interface to the command prefix:
198#
199#	In normal mode the parser will invoke the command prefix with
200#	for arguments appended. See '::htmlparse::debugCallback' for a
201#	description. In incremental mode however the generated scripts
202#	will invoke the command prefix with five arguments
203#	appended. The last four of these are the same which were
204#	mentioned above. The first however is a placeholder string
205#	(\win\) for a clientdata value to be supplied later during the
206#	actual execution of the generated scripts. This could be a tk
207#	window path, for example. This allows the user of this package
208#	to preprocess HTML strings without commiting them to a
209#	specific window, object, whatever during parsing. This
210#	connection can be made later. This also means that it is
211#	possible to cache preprocessed HTML. Of course, nothing
212#	prevents the user of the parser to replace the placeholder
213#	with an empty string.
214#
215# Arguments:
216#	args	An option/value-list followed by the string to
217#		parse. Available options are:
218#
219#		-cmd	The command prefix to invoke for every tag in
220#			the HTML string. Defaults to
221#			'::htmlparse::debugCallback'.
222#
223#		-vroot	The virtual root tag to add around the HTML in
224#			normal mode. In incremental mode it is the
225#			first tag in each chunk processed by the
226#			parser, but there will be no closing tags.
227#			Defaults to 'hmstart'.
228#
229#		-split	The size of the groups produced by an
230#			incremental mode parser. Ignored when in
231#			normal mode. Defaults to 10. Values <= 0 are
232#			not allowed.
233#
234#		-incvar	The name of the variable where to store any
235#			incomplete HTML into. Optional.
236#
237#		-queue
238#			The handle/name of the queue objecct to store
239#			the generated scripts into. Activates
240#			incremental mode. Normal mode is used if this
241#			option is not present.
242#
243#		After the options the command expects a single argument
244#		containing the HTML string to parse.
245#
246# Side Effects:
247#	In normal mode as of the invoked command. Else none.
248#
249# Results:
250#	None.
251
252proc ::htmlparse::parse {args} {
253    # Convert the HTML string into a evaluable command sequence.
254
255    variable splitdata
256
257    # Option processing, start with the defaults, then run through the
258    # list of arguments.
259
260    set cmd    ::htmlparse::debugCallback
261    set vroot  hmstart
262    set incvar ""
263    set split  10
264    set queue  ""
265
266    while {[set err [cmdline::getopt args {cmd.arg vroot.arg incvar.arg split.arg queue.arg} opt arg]]} {
267	if {$err < 0} {
268	    return -code error "::htmlparse::parse : $arg"
269	}
270	switch -exact -- $opt {
271	    cmd    -
272	    vroot  -
273	    incvar -
274	    queue  {
275		if {[string length $arg] == 0} {
276		    return -code error "::htmlparse::parse : -$opt illegal argument (empty)"
277		}
278		# Each option has an variable with the same name associated with it.
279		# FRINK: nocheck
280		set $opt $arg
281	    }
282	    split  {
283		if {$arg <= 0} {
284		    return -code error "::htmlparse::parse : -split illegal argument (<= 0)"
285		}
286		set split $arg
287	    }
288	    default {# Can't happen}
289	}
290    }
291
292    if {[llength $args] > 1} {
293	return -code error "::htmlparse::parse : to many arguments behind the options, expected one"
294    }
295    if {[llength $args] < 1} {
296	return -code error "::htmlparse::parse : html string missing"
297    }
298
299    set html [PrepareHtml [lindex $args 0]]
300
301    # Look for incomplete HTML from the last iteration and prepend it
302    # to the input we just got.
303
304    if {$incvar != {}} {
305	upvar $incvar incomplete
306    } else {
307	set incomplete ""
308    }
309
310    if {[catch {set new $incomplete$html}]} {set new $html}
311    set html $new
312
313    # Handle incomplete HTML (Recognize incomplete tag at end, buffer
314    # it up for the next call).
315
316    set end [lindex \{$html\} end]
317    if {[set idx [string last < $end]] > [string last > $end]} {
318
319	if {$incvar == {}} {
320	    return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing"
321	}
322
323	#  upvar $incvar incomplete -- Already done, s.a.
324	set incomplete [string range $end $idx end]
325	incr idx -1
326	set html       [string range $end 0 $idx]
327
328    } else {
329	set incomplete ""
330    }
331
332    # Convert the HTML string into a script.
333
334    set sub "\}\n$cmd {\\1} {} {\\2} \{\}\n$cmd {\\1} {/} {} \{"
335    regsub -all -- {<([^\s>]+)\s*([^>]*)/>} $html $sub html
336
337    set sub "\}\n$cmd {\\2} {\\1} {\\3} \{"
338    regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html
339
340    # The value of queue now determines wether we process the HTML by
341    # ourselves (queue is empty) or if we generate a list of  scripts
342    # each of which processes n tags, n the argument to -split.
343
344    if {$queue == {}} {
345	# And evaluate it. This is the main parsing step.
346
347	eval "$cmd {$vroot} {} {} \{$html\}"
348	eval "$cmd {$vroot} /  {} {}"
349    } else {
350	# queue defined, generate list of scripts doing small chunks of tags.
351
352	set lcmd [llength $cmd]
353	set key  $split,$lcmd
354
355	if {![info exists splitdata($key)]} {
356	    for {set i 0; set group {}} {$i < $split} {incr i} {
357		# Use the length of the command prefix to generate
358		# additional variables before the main variable after
359		# which the placeholder will be inserted.
360
361		for {set j 1} {$j < $lcmd} {incr j} {
362		    append group "b${j}_$i "
363		}
364
365		append group "a$i c$i d$i e$i f$i\n"
366	    }
367	    regsub -all -- {(a[0-9]+)}          $group    {{$\1} @win@} subgroup
368	    regsub -all -- {([b-z_0-9]+[0-9]+)} $subgroup {{$\1}}       subgroup
369
370	    set splitdata($key) [list $group $subgroup]
371	}
372
373	foreach {group subgroup} $splitdata($key) break ; # lassign
374	foreach $group "$cmd {$vroot} {} {} \{$html\}" {
375	    $queue put [string trimright [subst $subgroup]]
376	}
377    }
378    return
379}
380
381# htmlparse::PrepareHtml --
382#
383#	Internal helper command of '::htmlparse::parse'. Removes
384#	leading DOCTYPE declarations and comments, protects the
385#	special characters of tcl from evaluation.
386#
387# Arguments:
388#	html	The HTML string to prepare
389#
390# Side Effects:
391#	None.
392#
393# Results:
394#	The provided HTML string with the described modifications
395#	applied to it.
396
397proc ::htmlparse::PrepareHtml {html} {
398    # Remove the following items from the text:
399    # - A leading	<!DOCTYPE...> declaration.
400    # - All comments	<!-- ... -->
401    #
402    # Also normalize the line endings (\r -> \n).
403
404    # Tcllib SF Bug 861287 - Processing of comments.
405    # Recognize EOC by RE, instead of fixed string.
406
407    set html [string map [list \r \n] $html]
408
409    regsub -- "^.*<!DOCTYPE\[^>\]*>"    $html {}     html
410    regsub -all -- "--(\[ \t\n\]*)>"      $html "\001\\1\002" html
411
412    # Recognize borken beginnings of a comment and convert them to PCDATA.
413    regsub -all -- "<--(\[^\001\]*)\001(\[^\002\]*)\002" $html {\&lt;--\1--\2\&gt;} html
414
415    # And now recognize true comments, remove them.
416    regsub -all -- "<!--\[^\001\]*\001(\[^\002\]*)\002"  $html {}                   html
417
418    # Protect characters special to tcl (braces, slashes) by
419    # converting them to their escape sequences.
420
421    return [string map [list \
422		    "\{" "&#123;" \
423		    "\}" "&#125;" \
424		    "\\" "&#92;"] $html]
425}
426
427
428
429# htmlparse::debugCallback --
430#
431#	The standard callback used by the parser in
432#	'::htmlparse::parse' if none was specified by the user. Simply
433#	dumps its arguments to stdout.  This callback can be used for
434#	both normal and incremental mode of the calling parser. In
435#	other words, it accepts four or five arguments. The last four
436#	arguments are described below. The optional fifth argument
437#	contains the clientdata value given to the callback by a
438#	parser in incremental mode. All callbacks have to follow the
439#	signature of this command in the last four arguments, and
440#	callbacks used in incremental parsing have to follow this
441#	signature in the last five arguments.
442#
443# Arguments:
444#	tag			The name of the tag currently
445#				processed by the parser.
446#
447#	slash			Either empty or a slash. Allows us to
448#				distinguish between opening (slash is
449#				empty) and closing tags (slash is
450#				equal to a '/').
451#
452#	param			The un-interpreted list of parameters
453#				to the tag.
454#
455#	textBehindTheTag	The text found by the parser behind
456#				the tag named in 'tag'.
457#
458# Side Effects:
459#	None.
460#
461# Results:
462#	None.
463
464proc ::htmlparse::debugCallback {args} {
465    # args = ?clientData? tag slash param textBehindTheTag
466    puts "==> $args"
467    return
468}
469
470# htmlparse::mapEscapes --
471#
472#	Takes a HTML string, substitutes all escape sequences with
473#	their actual characters and returns the resulting string.
474#	HTML not containing escape sequences or invalid escape
475#	sequences is returned unchanged.
476#
477# Arguments:
478#	html	The string to modify
479#
480# Side Effects:
481#	None.
482#
483# Results:
484#	The argument string with all escape sequences replaced with
485#	their actual characters.
486
487proc ::htmlparse::mapEscapes {html} {
488    # Find HTML escape characters of the form &xxx(;|EOW)
489
490    # Quote special Tcl chars so they pass through [subst] unharmed.
491    set new [string map [list \] \\\] \[ \\\[ \$ \\\$ \\ \\\\] $html]
492    regsub -all -- {&([[:alnum:]]{2,7})(;|\M)} $new {[DoNamedMap \1 {\2}]} new
493    regsub -all -- {&#([[:digit:]]{1,5})(;|\M)} $new {[DoDecMap \1 {\2}]} new
494    regsub -all -- {&#x([[:xdigit:]]{1,4})(;|\M)} $new {[DoHexMap \1 {\2}]} new
495    return [subst $new]
496}
497
498proc ::htmlparse::DoNamedMap {name endOf} {
499    variable namedEntities
500    if {[info exist namedEntities($name)]} {
501	return $namedEntities($name)
502    } else {
503	# Put it back..
504	return "&$name$endOf"
505    }
506}
507
508proc ::htmlparse::DoDecMap {dec endOf} {
509    scan $dec %d dec
510    if {$dec <= 0xFFFD} {
511	return [format %c $dec]
512    } else {
513	# Put it back..
514	return "&#$dec$endOf"
515    }
516}
517
518proc ::htmlparse::DoHexMap {hex endOf} {
519    scan $hex %x value
520    if {$value <= 0xFFFD} {
521	return [format %c $value]
522    } else {
523	# Put it back..
524	return "&#x$hex$endOf"
525    }
526}
527
528# htmlparse::2tree --
529#
530#	This command is a wrapper around '::htmlparse::parse' which
531#	takes a HTML string and converts it into a tree containing the
532#	logical structure of the parsed document. The tree object has
533#	to be created by the caller. It is also expected that the tree
534#	object provides the same interface as the tree object from
535#	tcllib -> struct. It doesn't have to come from that module
536#	though. The internal callback does some basic checking of HTML
537#	validity and tries to recover from the most basic errors.
538#
539# Arguments:
540#	html	The HTML string to parse and convert.
541#	tree	The name of the tree to fill.
542#
543# Side Effects:
544#	Creates a tree object (see tcllib -> struct)
545#	and modifies it.
546#
547# Results:
548#	The contents of 'tree'.
549
550proc ::htmlparse::2tree {html tree} {
551
552    # One internal datastructure is required, a stack of open
553    # tags. This stack is also provided by the 'struct' module of
554    # tcllib. As the operation of this command is synchronuous we
555    # don't have to take care against multiple running copies at the
556    # same times (Such are possible, but will be in different
557    # interpreters and true concurrency is possible only if they are
558    # in different threads too). IOW, no need for tricks to make the
559    # internal datastructure unique.
560
561    catch {::htmlparse::tags destroy}
562
563    ::struct::stack ::htmlparse::tags
564    ::htmlparse::tags push root
565    $tree set root type root
566
567    parse -cmd [list ::htmlparse::2treeCallback $tree] $html
568
569    # A bit hackish, correct the ordering of nodes for the optional
570    # tag types, over a larger area when was seen by the parser itself.
571
572    $tree walk root -order post n {
573	::htmlparse::Reorder $tree $n
574    }
575
576    ::htmlparse::tags destroy
577    return $tree
578}
579
580# htmlparse::2treeCallback --
581#
582#	Internal helper command. A special callback to
583#	'::htmlparse::parse' used by '::htmlparse::2tree' which takes
584#	the incoming stream of tags and converts them into a tree
585#	representing the inner structure of the parsed HTML
586#	document. Recovers from simple HTML errors like missing
587#	opening tags, missing closing tags and overlapping tags.
588#
589# Arguments:
590#	tree			The name of the tree to manipulate.
591#	tag			See '::htmlparse::debugCallback'.
592#	slash			See '::htmlparse::debugCallback'.
593#	param			See '::htmlparse::debugCallback'.
594#	textBehindTheTag	See '::htmlparse::debugCallback'.
595#
596# Side Effects:
597#	Manipulates the tree object whose name was given as the first
598#	argument.
599#
600# Results:
601#	None.
602
603proc ::htmlparse::2treeCallback {tree tag slash param textBehindTheTag} {
604    # This could be table-driven I think but for now the switches
605    # should work fine.
606
607    # Normalize tag information for later comparisons. Also remove
608    # superfluous whitespace. Don't forget to decode the standard
609    # entities.
610
611    set  tag  [string tolower $tag]
612    set  textBehindTheTag [string trim $textBehindTheTag]
613    if {$textBehindTheTag != {}} {
614	set text [mapEscapes $textBehindTheTag]
615    }
616
617    if {"$slash" == "/"} {
618	# Handle closing tags. Standard operation is to pop the tag
619	# from the stack of open tags. We don't do this for </p> and
620	# </li>. As they were optional they were never pushed onto the
621	# stack (Well, actually they are just popped immediately after
622	# they were pusheed, see below).
623
624	switch -exact -- $tag {
625	    base - option - meta - li - p {
626		# Ignore, nothing to do.
627	    }
628	    default {
629		# The moment we get a closing tag which does not match
630		# the tag on the stack we have two possibilities on how
631		# this came into existence to choose from:
632		#
633		# a) A tag is now closed but was never opened.
634		# b) A tag requiring an end tag was opened but the end
635		#    tag was omitted and we now are at a tag which was
636		#    opened before the one with the omitted end tag.
637
638		# NOTE:
639		# Pages delivered from the amazon.uk site contain both
640		# cases: </a> without opening, <b> & <font> without
641		# closing. Another error: <a><b></a></b>, i.e. overlapping
642		# tags. Fortunately this can be handled by the algorithm
643		# below, in two cycles, one of which is case (b), followed
644		# by case (a). It seems as if Amazon/UK believes that visual
645		# markup like <b> and <font> is an option (switch-on) instead
646		# of a region.
647
648		# Algorithm used here to deal with these:
649		# 1) Search whole stack for the matching opening tag.
650		#    If there is one assume case (b) and pop everything
651		#    until and including this opening tag.
652		# 2) If no matching opening tag was found assume case
653		#    (a) and ignore the tag.
654		#
655		# Part (1) also subsumes the normal case, i.e. the
656		# matching tag is at the top of the stack.
657
658		set nodes [::htmlparse::tags peek [::htmlparse::tags size]]
659		# Note: First item is top of stack, last item is bottom of stack !
660		# (This behaviour of tcllib stacks is not documented
661		# -> we should update the manpage).
662
663		#foreach n $nodes {lappend tstring [p get $n -key type]}
664		#puts stderr --[join $tstring]--
665
666		set level 1
667		set found 0
668		foreach n $nodes {
669		    set type [$tree get $n type]
670		    if {0 == [string compare $tag $type]} {
671			# Found an earlier open tag -> (b).
672			set found 1
673			break
674		    }
675		    incr level
676		}
677		if {$found} {
678		    ::htmlparse::tags pop $level
679		    if {$level > 1} {
680			#foreach n $nodes {lappend tstring [$tree get $n type]}
681			#puts stderr "\tdesync at <$tag> ($tstring) => pop $level"
682		    }
683		} else {
684		    #foreach n $nodes {lappend tstring [$tree get $n type]}
685		    #puts stderr "\tdesync at <$tag> ($tstring) => ignore"
686		}
687	    }
688	}
689
690	# If there is text behind a closing tag X it belongs to the
691	# parent tag of X.
692
693	if {$textBehindTheTag != {}} {
694	    # Attach the text behind the closing tag to the reopened
695	    # context.
696
697	    set        pcd  [$tree insert [::htmlparse::tags peek] end]
698	    $tree set $pcd  type PCDATA
699	    $tree set $pcd  data $textBehindTheTag
700	}
701
702    } else {
703	# Handle opening tags. The standard operation for most is to
704	# push them onto the stack and thus open a nested context.
705	# This does not happen for both the optional tags (p, li) and
706	# the ones which don't have closing tags (meta, br, option,
707	# input, area, img).
708	#
709	# The text coming with the tag will be added after the tag if
710	# it is a tag without a matching close, else it will be added
711	# as a node below the tag (as it is the region between the
712	# opening and closing tag and thus nested inside). Empty text
713	# is ignored under all circcumstances.
714
715	set        node [$tree insert [::htmlparse::tags peek] end]
716	$tree set $node type $tag
717	$tree set $node data $param
718
719	if {$textBehindTheTag != {}} {
720	    switch -exact -- $tag {
721		input -	area - img - br {
722		    set pcd  [$tree insert [::htmlparse::tags peek] end]
723		}
724		default {
725		    set pcd  [$tree insert $node end]
726		}
727	    }
728	    $tree set $pcd type PCDATA
729	    $tree set $pcd data $textBehindTheTag
730	}
731
732	::htmlparse::tags push $node
733
734	# Special handling: <p>, <li> may have no closing tag => pop
735	#                 : them immediately.
736	#
737	# Special handling: <meta>, <br>, <option>, <input>, <area>,
738	#                 : <img>: no closing tags for these.
739
740	switch -exact -- $tag {
741	    hr - base - meta - li - br - option - input - area - img - p - h1 - h2 - h3 - h4 - h5 - h6 {
742		::htmlparse::tags pop
743	    }
744	    default {}
745	}
746    }
747}
748
749# htmlparse::removeVisualFluff --
750#
751#	This command walks a tree as generated by '::htmlparse::2tree'
752#	and removes all the nodes which represent visual tags and not
753#	structural ones. The purpose of the command is to make the
754#	tree easier to navigate without getting bogged down in visual
755#	information not relevant to the search.
756#
757# Arguments:
758#	tree	The name of the tree to cut down.
759#
760# Side Effects:
761#	Modifies the specified tree.
762#
763# Results:
764#	None.
765
766proc ::htmlparse::removeVisualFluff {tree} {
767    $tree walk root -order post n {
768	::htmlparse::RemoveVisualFluff $tree $n
769    }
770    return
771}
772
773# htmlparse::removeFormDefs --
774#
775#	Like '::htmlparse::removeVisualFluff' this command is here to
776#	cut down on the size of the tree as generated by
777#	'::htmlparse::2tree'. It removes all nodes representing forms
778#	and form elements.
779#
780# Arguments:
781#	tree	The name of the tree to cut down.
782#
783# Side Effects:
784#	Modifies the specified tree.
785#
786# Results:
787#	None.
788
789proc ::htmlparse::removeFormDefs {tree} {
790    $tree walk root -order post n {
791	::htmlparse::RemoveFormDefs $tree $n
792    }
793    return
794}
795
796# htmlparse::RemoveVisualFluff --
797#
798#	Internal helper command to
799#	'::htmlparse::removeVisualFluff'. Does the actual work.
800#
801# Arguments:
802#	tree	The name of the tree currently processed
803#	node	The name of the node to look at.
804#
805# Side Effects:
806#	Modifies the specified tree.
807#
808# Results:
809#	None.
810
811proc ::htmlparse::RemoveVisualFluff {tree node} {
812    switch -exact -- [$tree get $node type] {
813	hmstart - html - font - center - div - sup - b - i {
814	    # Removes the node, but does not affect the nodes below
815	    # it. These are just made into chiildren of the parent of
816	    # this node, in its place.
817
818	    $tree cut $node
819	}
820	script - option - select - meta - map - img {
821	    # Removes this node and everything below it.
822	    $tree delete $node
823	}
824	default {
825	    # Ignore tag
826	}
827    }
828}
829
830# htmlparse::RemoveFormDefs --
831#
832#	Internal helper command to
833#	'::htmlparse::removeFormDefs'. Does the actual work.
834#
835# Arguments:
836#	tree	The name of the tree currently processed
837#	node	The name of the node to look at.
838#
839# Side Effects:
840#	Modifies the specified tree.
841#
842# Results:
843#	None.
844
845proc ::htmlparse::RemoveFormDefs {tree node} {
846    switch -exact -- [$tree get $node type] {
847	form {
848	    $tree delete $node
849	}
850	default {
851	    # Ignore tag
852	}
853    }
854}
855
856# htmlparse::Reorder --
857
858#	Internal helper command to '::htmlparse::2tree'. Moves the
859#	nodes between p/p, li/li and h<i> sequences below the
860#	paragraphs and items. IOW, corrects misconstructions for
861#	the optional node types.
862#
863# Arguments:
864#	tree	The name of the tree currently processed
865#	node	The name of the node to look at.
866#
867# Side Effects:
868#	Modifies the specified tree.
869#
870# Results:
871#	None.
872
873proc ::htmlparse::Reorder {tree node} {
874    switch -exact -- [set tp [$tree get $node type]] {
875	h1 - h2 - h3 - h4 - h5 - h6 - p - li {
876	    # Look for right siblings until the next node with a
877	    # similar type (or end of level) and move these below this
878	    # node.
879
880	    while {1} {
881		set sibling [$tree next $node]
882		if {
883		    ($sibling == {}) ||
884		    ([lsearch -exact {h1 h2 h3 h4 h5 h6 p li} [$tree get $sibling type]] != -1)
885		} {
886		    break
887		}
888		$tree move $node end $sibling
889	    }
890	}
891	default {
892	    # Ignore tag
893	}
894    }
895}
896
897# ### ######### ###########################
898
899package provide htmlparse 1.2
900