1# sgmlparser.tcl --
2#
3#	This file provides the generic part of a parser for SGML-based
4#	languages, namely HTML and XML.
5#
6#	NB.  It is a misnomer.  There is no support for parsing
7#	arbitrary SGML as such.
8#
9#	See sgml.tcl for variable definitions.
10#
11# Copyright (c) 1998-2003 Zveno Pty Ltd
12# http://www.zveno.com/
13#
14# Zveno makes this software available free of charge for any purpose.
15# Copies may be made of this software but all of this notice must be included
16# on any copy.
17#
18# The software was developed for research purposes only and Zveno does not
19# warrant that it is error free or fit for any purpose.  Zveno disclaims any
20# liability for all claims, expenses, losses, damages and costs any user may
21# incur as a result of using, copying or modifying this software.
22#
23# Copyright (c) 1997 ANU and CSIRO on behalf of the
24# participants in the CRC for Advanced Computational Systems ('ACSys').
25#
26# ACSys makes this software and all associated data and documentation
27# ('Software') available free of charge for any purpose.  You may make copies
28# of the Software but you must include all of this notice on any copy.
29#
30# The Software was developed for research purposes and ACSys does not warrant
31# that it is error free or fit for any purpose.  ACSys disclaims any
32# liability for all claims, expenses, losses, damages and costs any user may
33# incur as a result of using, copying or modifying the Software.
34#
35# $Id: sgmlparser.tcl,v 1.30 2003/02/25 04:09:20 balls Exp $
36
37package require xmldefs
38
39package require sgml 1.9
40
41package require uri 1.1
42
43package provide sgmlparser 1.0
44
45namespace eval sgml {
46    namespace export tokenise parseEvent
47
48    namespace export parseDTD
49
50    # NB. Most namespace variables are defined in sgml-8.[01].tcl
51    # to account for differences between versions of Tcl.
52    # This especially includes the regular expressions used.
53
54    variable ParseEventNum
55    if {![info exists ParseEventNum]} {
56	set ParseEventNum 0
57    }
58    variable ParseDTDnum
59    if {![info exists ParseDTDNum]} {
60	set ParseDTDNum 0
61    }
62
63    variable declExpr [cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*)
64    variable EntityExpr [cl $::sgml::Wsp]*(%[cl $::sgml::Wsp])?[cl $::sgml::Wsp]*($::sgml::Name)[cl $::sgml::Wsp]+(.*)
65
66    #variable MarkupDeclExpr <([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*)>
67    #variable MarkupDeclSub "} {\\1} {\\2} {\\3} {"
68    variable MarkupDeclExpr <[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^>]*)>
69    variable MarkupDeclSub "\} {\\1} {\\2} \{"
70
71    variable ExternalEntityExpr ^(PUBLIC|SYSTEM)[cl $::sgml::Wsp]+("|')(.*?)\\2([cl $::sgml::Wsp]+("|')(.*?)\\2)?([cl $::sgml::Wsp]+NDATA[cl $::sgml::Wsp]+($::xml::Name))?\$
72
73    variable StdOptions
74    array set StdOptions [list \
75	-elementstartcommand		[namespace current]::noop	\
76	-elementendcommand		[namespace current]::noop	\
77	-characterdatacommand		[namespace current]::noop	\
78	-processinginstructioncommand	[namespace current]::noop	\
79	-externalentitycommand		{}				\
80	-xmldeclcommand			[namespace current]::noop	\
81	-doctypecommand			[namespace current]::noop	\
82	-commentcommand			[namespace current]::noop	\
83	-entitydeclcommand		[namespace current]::noop	\
84	-unparsedentitydeclcommand	[namespace current]::noop	\
85	-parameterentitydeclcommand	[namespace current]::noop	\
86	-notationdeclcommand		[namespace current]::noop	\
87	-elementdeclcommand		[namespace current]::noop	\
88	-attlistdeclcommand		[namespace current]::noop	\
89	-paramentityparsing		1				\
90	-defaultexpandinternalentities	1				\
91	-startdoctypedeclcommand	[namespace current]::noop	\
92	-enddoctypedeclcommand		[namespace current]::noop	\
93	-entityreferencecommand		{}				\
94	-warningcommand			[namespace current]::noop	\
95	-errorcommand			[namespace current]::Error	\
96	-final				1				\
97	-validate			0				\
98	-baseurl			{}				\
99	-name				{}				\
100	-emptyelement			[namespace current]::EmptyElement	\
101	-parseattributelistcommand	[namespace current]::noop	\
102	-parseentitydeclcommand		[namespace current]::noop	\
103	-normalize			1				\
104	-internaldtd			{}				\
105	-reportempty			0				\
106	-ignorewhitespace		0				\
107    ]
108}
109
110# sgml::tokenise --
111#
112#	Transform the given HTML/XML text into a Tcl list.
113#
114# Arguments:
115#	sgml		text to tokenize
116#	elemExpr	RE to recognise tags
117#	elemSub		transform for matched tags
118#	args		options
119#
120# Valid Options:
121#       -internaldtdvariable
122#	-final		boolean		True if no more data is to be supplied
123#	-statevariable	varName		Name of a variable used to store info
124#
125# Results:
126#	Returns a Tcl list representing the document.
127
128proc sgml::tokenise {sgml elemExpr elemSub args} {
129    array set options {-final 1}
130    array set options $args
131    set options(-final) [Boolean $options(-final)]
132
133    # If the data is not final then there must be a variable to store
134    # unused data.
135    if {!$options(-final) && ![info exists options(-statevariable)]} {
136	return -code error {option "-statevariable" required if not final}
137    }
138
139    # Pre-process stage
140    #
141    # Extract the internal DTD subset, if any
142
143    catch {upvar #0 $options(-internaldtdvariable) dtd}
144    if {[regexp {<!DOCTYPE[^[<]+\[([^]]+)\]} $sgml discard dtd]} {
145	regsub {(<!DOCTYPE[^[<]+)(\[[^]]+\])} $sgml {\1\&xml:intdtd;} sgml
146    }
147
148    # Protect Tcl special characters
149    regsub -all {([{}\\])} $sgml {\\\1} sgml
150
151    # Do the translation
152
153    if {[info exists options(-statevariable)]} {
154	# Mats: Several rewrites here to handle -final 0 option.
155	# If any cached unparsed xml (state(leftover)), prepend it.
156	upvar #0 $options(-statevariable) state
157	if {[string length $state(leftover)]} {
158	    regsub -all $elemExpr $state(leftover)$sgml $elemSub sgml
159	    set state(leftover) {}
160	} else {
161	    regsub -all $elemExpr $sgml $elemSub sgml
162	}
163	set sgml "{} {} {} \{$sgml\}"
164
165	# Performance note (Tcl 8.0):
166	#	Use of lindex, lreplace will cause parsing to list object
167
168	# This RE only fixes chopped inside tags, not chopped text.
169	if {[regexp {^([^<]*)(<[^>]*$)} [lindex $sgml end] x text rest]} {
170	    set sgml [lreplace $sgml end end $text]
171	    # Mats: unmatched stuff means that it is chopped off. Cache it for next round.
172	    set state(leftover) $rest
173	}
174
175	# Patch from bug report #596959, Marshall Rose
176	if {[string compare [lindex $sgml 4] ""]} {
177	    set sgml [linsert $sgml 0 {} {} {} {} {}]
178	}
179
180    } else {
181
182	# Performance note (Tcl 8.0):
183	#	In this case, no conversion to list object is performed
184
185	# Mats: This fails if not -final and $sgml is chopped off right in a tag.
186	regsub -all $elemExpr $sgml $elemSub sgml
187	set sgml "{} {} {} \{$sgml\}"
188    }
189
190    return $sgml
191
192}
193
194# sgml::parseEvent --
195#
196#	Produces an event stream for a XML/HTML document,
197#	given the Tcl list format returned by tokenise.
198#
199#	This procedure checks that the document is well-formed,
200#	and throws an error if the document is found to be not
201#	well formed.  Warnings are passed via the -warningcommand script.
202#
203#	The procedure only check for well-formedness,
204#	no DTD is required.  However, facilities are provided for entity expansion.
205#
206# Arguments:
207#	sgml		Instance data, as a Tcl list.
208#	args		option/value pairs
209#
210# Valid Options:
211#	-final			Indicates end of document data
212#	-validate		Boolean to enable validation
213#	-baseurl		URL for resolving relative URLs
214#	-elementstartcommand	Called when an element starts
215#	-elementendcommand	Called when an element ends
216#	-characterdatacommand	Called when character data occurs
217#	-entityreferencecommand	Called when an entity reference occurs
218#	-processinginstructioncommand	Called when a PI occurs
219#	-externalentitycommand	Called for an external entity reference
220#
221#	-xmldeclcommand		Called when the XML declaration occurs
222#	-doctypecommand		Called when the document type declaration occurs
223#	-commentcommand		Called when a comment occurs
224#	-entitydeclcommand	Called when a parsed entity is declared
225#	-unparsedentitydeclcommand	Called when an unparsed external entity is declared
226#	-parameterentitydeclcommand	Called when a parameter entity is declared
227#	-notationdeclcommand	Called when a notation is declared
228#	-elementdeclcommand	Called when an element is declared
229#	-attlistdeclcommand	Called when an attribute list is declared
230#	-paramentityparsing	Boolean to enable/disable parameter entity substitution
231#	-defaultexpandinternalentities	Boolean to enable/disable expansion of entities declared in internal DTD subset
232#
233#	-startdoctypedeclcommand	Called when the Doc Type declaration starts (see also -doctypecommand)
234#	-enddoctypedeclcommand	Called when the Doc Type declaration ends (see also -doctypecommand)
235#
236#	-errorcommand		Script to evaluate for a fatal error
237#	-warningcommand		Script to evaluate for a reportable warning
238#	-statevariable		global state variable
239#	-normalize		whether to normalize names
240#	-reportempty		whether to include an indication of empty elements
241#	-ignorewhitespace	whether to automatically strip whitespace
242#
243# Results:
244#	The various callback scripts are invoked.
245#	Returns empty string.
246#
247# BUGS:
248#	If command options are set to empty string then they should not be invoked.
249
250proc sgml::parseEvent {sgml args} {
251    variable Wsp
252    variable noWsp
253    variable Nmtoken
254    variable Name
255    variable ParseEventNum
256    variable StdOptions
257
258    array set options [array get StdOptions]
259    catch {array set options $args}
260
261    # Mats:
262    # If the data is not final then there must be a variable to persistently store the parse state.
263    if {!$options(-final) && ![info exists options(-statevariable)]} {
264	return -code error {option "-statevariable" required if not final}
265    }
266
267    foreach {opt value} [array get options *command] {
268	if {[string compare $opt "-externalentitycommand"] && ![string length $value]} {
269	    set options($opt) [namespace current]::noop
270	}
271    }
272
273    if {![info exists options(-statevariable)]} {
274	set options(-statevariable) [namespace current]::ParseEvent[incr ParseEventNum]
275    }
276    if {![info exists options(entities)]} {
277	set options(entities) [namespace current]::Entities$ParseEventNum
278	array set $options(entities) [array get [namespace current]::EntityPredef]
279    }
280    if {![info exists options(extentities)]} {
281	set options(extentities) [namespace current]::ExtEntities$ParseEventNum
282    }
283    if {![info exists options(parameterentities)]} {
284	set options(parameterentities) [namespace current]::ParamEntities$ParseEventNum
285    }
286    if {![info exists options(externalparameterentities)]} {
287	set options(externalparameterentities) [namespace current]::ExtParamEntities$ParseEventNum
288    }
289    if {![info exists options(elementdecls)]} {
290	set options(elementdecls) [namespace current]::ElementDecls$ParseEventNum
291    }
292    if {![info exists options(attlistdecls)]} {
293	set options(attlistdecls) [namespace current]::AttListDecls$ParseEventNum
294    }
295    if {![info exists options(notationdecls)]} {
296	set options(notationdecls) [namespace current]::NotationDecls$ParseEventNum
297    }
298    if {![info exists options(namespaces)]} {
299	set options(namespaces) [namespace current]::Namespaces$ParseEventNum
300    }
301
302    # Choose an external entity resolver
303
304    if {![string length $options(-externalentitycommand)]} {
305	if {$options(-validate)} {
306	    set options(-externalentitycommand) [namespace code ResolveEntity]
307	} else {
308	    set options(-externalentitycommand) [namespace code noop]
309	}
310    }
311
312    upvar #0 $options(-statevariable) state
313    upvar #0 $options(entities) entities
314
315    # Mats:
316    # The problem is that the state is not maintained when -final 0 !
317    # I've switched back to an older version here.
318
319    if {![info exists state(line)]} {
320	# Initialise the state variable
321	array set state {
322	    mode normal
323	    haveXMLDecl 0
324	    haveDocElement 0
325	    inDTD 0
326	    context {}
327	    stack {}
328	    line 0
329	    defaultNS {}
330	    defaultNSURI {}
331	}
332    }
333
334    foreach {tag close param text} $sgml {
335
336	# Keep track of lines in the input
337	incr state(line) [regsub -all \n $param {} discard]
338	incr state(line) [regsub -all \n $text {} discard]
339
340	# If the current mode is cdata or comment then we must undo what the
341	# regsub has done to reconstitute the data
342
343	set empty {}
344	switch $state(mode) {
345	    comment {
346		# This had "[string length $param] && " as a guard -
347		# can't remember why :-(
348		if {[regexp ([cl ^-]*)--\$ $tag discard comm1]} {
349		    # end of comment (in tag)
350		    set tag {}
351		    set close {}
352		    set state(mode) normal
353		    uplevel #0 $options(-commentcommand) [list $state(commentdata)<$comm1]
354		    unset state(commentdata)
355		} elseif {[regexp ([cl ^-]*)--\$ $param discard comm1]} {
356		    # end of comment (in attributes)
357		    uplevel #0 $options(-commentcommand) [list $state(commentdata)<$close$tag>$comm1]
358		    unset state(commentdata)
359		    set tag {}
360		    set param {}
361		    set close {}
362		    set state(mode) normal
363		} elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm1 text]} {
364		    # end of comment (in text)
365		    uplevel #0 $options(-commentcommand) [list $state(commentdata)<$close$tag$param>$comm1]
366		    unset state(commentdata)
367		    set tag {}
368		    set param {}
369		    set close {}
370		    set state(mode) normal
371		} else {
372		    # comment continues
373		    append state(commentdata) <$close$tag$param>$text
374		    continue
375		}
376	    }
377	    cdata {
378		if {[string length $param] && [regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $tag discard cdata1]} {
379		    # end of CDATA (in tag)
380		    PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$cdata1]
381		    set text [subst -novariable -nocommand $text]
382		    set tag {}
383		    unset state(cdata)
384		    set state(mode) normal
385		} elseif {[regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $param discard cdata1]} {
386		    # end of CDATA (in attributes)
387		    PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$cdata1]
388		    set text [subst -novariable -nocommand $text]
389		    set tag {}
390		    set param {}
391		    unset state(cdata)
392		    set state(mode) normal
393		} elseif {[regexp (.*)\]\][cl $Wsp]*>(.*) $text discard cdata1 text]} {
394		    # end of CDATA (in text)
395		    PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$param>$cdata1]
396		    set text [subst -novariable -nocommand $text]
397		    set tag {}
398		    set param {}
399		    set close {}
400		    unset state(cdata)
401		    set state(mode) normal
402		} else {
403		    # CDATA continues
404		    append state(cdata) [subst -nocommand -novariable <$close$tag$param>$text]
405		    continue
406		}
407	    }
408	    continue {
409		# We're skipping elements looking for the close tag
410		switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close {
411		    0,* {
412			continue
413		    }
414		    *,0, {
415			if {![string compare $tag $state(continue:tag)]} {
416			    set empty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]]
417			    if {![string length $empty]} {
418				incr state(continue:level)
419			    }
420			}
421			continue
422		    }
423		    *,0,/ {
424			if {![string compare $tag $state(continue:tag)]} {
425			    incr state(continue:level) -1
426			}
427			if {!$state(continue:level)} {
428			    unset state(continue:tag)
429			    unset state(continue:level)
430			    set state(mode) {}
431			}
432		    }
433		    default {
434			continue
435		    }
436		}
437	    }
438	    default {
439		# The trailing slash on empty elements can't be automatically separated out
440		# in the RE, so we must do it here.
441		regexp (.*)(/)[cl $Wsp]*$ $param discard param empty
442	    }
443	}
444
445	# default: normal mode
446
447	# Bug: if the attribute list has a right angle bracket then the empty
448	# element marker will not be seen
449
450	set empty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]]
451
452	switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close,$empty {
453
454	    0,0,, {
455		# Ignore empty tag - dealt with non-normal mode above
456	    }
457	    *,0,, {
458
459		# Start tag for an element.
460
461		# Check if the internal DTD entity is in an attribute value
462		regsub -all &xml:intdtd\; $param \[$options(-internaldtd)\] param
463
464		set code [catch {ParseEvent:ElementOpen $tag $param [array get options]} msg]
465		set state(haveDocElement) 1
466		switch $code {
467		    0 {# OK}
468		    3 {
469			# break
470			return {}
471		    }
472		    4 {
473			# continue
474			# Remember this tag and look for its close
475			set state(continue:tag) $tag
476			set state(continue:level) 1
477			set state(mode) continue
478			continue
479		    }
480		    default {
481			return -code $code -errorinfo $::errorInfo $msg
482		    }
483		}
484
485	    }
486
487	    *,0,/, {
488
489		# End tag for an element.
490
491		set code [catch {ParseEvent:ElementClose $tag [array get options]} msg]
492		switch $code {
493		    0 {# OK}
494		    3 {
495			# break
496			return {}
497		    }
498		    4 {
499			# continue
500			# skip sibling nodes
501			set state(continue:tag) [lindex $state(stack) end]
502			set state(continue:level) 1
503			set state(mode) continue
504			continue
505		    }
506		    default {
507			return -code $code -errorinfo $::errorInfo $msg
508		    }
509		}
510
511	    }
512
513	    *,0,,/ {
514
515		# Empty element
516
517		# The trailing slash sneaks through into the param variable
518		regsub -all /[cl $::sgml::Wsp]*\$ $param {} param
519
520		set code [catch {ParseEvent:ElementOpen $tag $param [array get options] -empty 1} msg]
521		set state(haveDocElement) 1
522		switch $code {
523		    0 {# OK}
524		    3 {
525			# break
526			return {}
527		    }
528		    4 {
529			# continue
530			# Pretty useless since it closes straightaway
531		    }
532		    default {
533			return -code $code -errorinfo $::errorInfo $msg
534		    }
535		}
536		set code [catch {ParseEvent:ElementClose $tag [array get options] -empty 1} msg]
537		switch $code {
538		    0 {# OK}
539		    3 {
540			# break
541			return {}
542		    }
543		    4 {
544			# continue
545			# skip sibling nodes
546			set state(continue:tag) [lindex $state(stack) end]
547			set state(continue:level) 1
548			set state(mode) continue
549			continue
550		    }
551		    default {
552			return -code $code -errorinfo $::errorInfo $msg
553		    }
554		}
555
556	    }
557
558	    *,1,* {
559		# Processing instructions or XML declaration
560		switch -glob -- $tag {
561
562		    {\?xml} {
563			# XML Declaration
564			if {$state(haveXMLDecl)} {
565			    uplevel #0 $options(-errorcommand) [list illegalcharacter "unexpected characters \"<$tag\" around line $state(line)"]
566			} elseif {![regexp {\?$} $param]} {
567			    uplevel #0 $options(-errorcommand) [list missingcharacters "XML Declaration missing characters \"?>\" around line $state(line)"]
568			} else {
569
570			    # We can do the parsing in one step with Tcl 8.1 RE's
571			    # This has the benefit of performing better WF checking
572
573			    set adv_re [format {^[%s]*version[%s]*=[%s]*("|')(-+|[a-zA-Z0-9_.:]+)\1([%s]+encoding[%s]*=[%s]*("|')([A-Za-z][-A-Za-z0-9._]*)\4)?([%s]*standalone[%s]*=[%s]*("|')(yes|no)\7)?[%s]*\?$} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp]
574
575			    if {[catch {regexp $adv_re $param discard delimiter version discard delimiter encoding discard delimiter standalone} matches]} {
576				# Otherwise we must fallback to 8.0.
577				# This won't detect certain well-formedness errors
578
579				# Get the version number
580				if {[regexp [format {[%s]*version[%s]*=[%s]*"(-+|[a-zA-Z0-9_.:]+)"[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard version] || [regexp [format {[%s]*version[%s]*=[%s]*'(-+|[a-zA-Z0-9_.:]+)'[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard version]} {
581				    if {[string compare $version "1.0"]} {
582					# Should we support future versions?
583					# At least 1.X?
584					uplevel #0 $options(-errorcommand) [list versionincompatibility "document XML version \"$version\" is incompatible with XML version 1.0"]
585				    }
586				} else {
587				    uplevel #0 $options(-errorcommand) [list missingversion "XML Declaration missing version information around line $state(line)"]
588				}
589
590				# Get the encoding declaration
591				set encoding {}
592				regexp [format {[%s]*encoding[%s]*=[%s]*"([A-Za-z]([A-Za-z0-9._]|-)*)"[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard encoding
593				regexp [format {[%s]*encoding[%s]*=[%s]*'([A-Za-z]([A-Za-z0-9._]|-)*)'[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard encoding
594
595				# Get the standalone declaration
596				set standalone {}
597				regexp [format {[%s]*standalone[%s]*=[%s]*"(yes|no)"[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard standalone
598				regexp [format {[%s]*standalone[%s]*=[%s]*'(yes|no)'[%s]*} $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp $::sgml::Wsp] $param discard standalone
599
600				# Invoke the callback
601				uplevel #0 $options(-xmldeclcommand) [list $version $encoding $standalone]
602
603			    } elseif {$matches == 0} {
604				uplevel #0 $options(-errorcommand) [list illformeddeclaration "XML Declaration not well-formed around line $state(line)"]
605			    } else {
606
607				# Invoke the callback
608				uplevel #0 $options(-xmldeclcommand) [list $version $encoding $standalone]
609
610			    }
611
612			}
613
614		    }
615
616		    {\?*} {
617			# Processing instruction
618			set tag [string range $tag 1 end]
619			if {[regsub {\?$} $tag {} tag]} {
620			    if {[string length [string trim $param]]} {
621				uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$param\" in processing instruction around line $state(line)"]
622			    }
623			} elseif {![regexp ^$Name\$ $tag]} {
624			    uplevel #0 $options(-errorcommand) [list illegalcharacter "illegal character in processing instruction target \"$tag\""]
625			} elseif {[regexp {^[xX][mM][lL]$} $tag]} {
626			    uplevel #0 $options(-errorcommand) [list illegalcharacters "characters \"xml\" not permitted in processing instruction target \"$tag\""]
627			} elseif {![regsub {\?$} $param {} param]} {
628			    uplevel #0 $options(-errorcommand) [list missingquestion "PI: expected '?' character around line $state(line)"]
629			}
630			set code [catch {uplevel #0 $options(-processinginstructioncommand) [list $tag [string trimleft $param]]} msg]
631			switch $code {
632			    0 {# OK}
633			    3 {
634				# break
635				return {}
636			    }
637			    4 {
638				# continue
639				# skip sibling nodes
640				set state(continue:tag) [lindex $state(stack) end]
641				set state(continue:level) 1
642				set state(mode) continue
643				continue
644			    }
645			    default {
646				return -code $code -errorinfo $::errorInfo $msg
647			    }
648			}
649		    }
650
651		    !DOCTYPE {
652			# External entity reference
653			# This should move into xml.tcl
654			# Parse the params supplied.  Looking for Name, ExternalID and MarkupDecl
655			set matched [regexp ^[cl $Wsp]*($Name)[cl $Wsp]*(.*) $param x state(doc_name) param]
656			set state(doc_name) [Normalize $state(doc_name) $options(-normalize)]
657			set externalID {}
658			set pubidlit {}
659			set systemlit {}
660			set externalID {}
661			if {[regexp -nocase ^[cl $Wsp]*(SYSTEM|PUBLIC)(.*) $param x id param]} {
662			    switch [string toupper $id] {
663				SYSTEM {
664				    if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x systemlit param]} {
665					set externalID [list SYSTEM $systemlit] ;# "
666				    } else {
667					uplevel #0 $options(-errorcommand) {syntaxerror {syntax error: SYSTEM identifier not followed by literal}}
668				    }
669				}
670				PUBLIC {
671				    if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x pubidlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x pubidlit param]} {
672					if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x systemlit param]} {
673					    set externalID [list PUBLIC $pubidlit $systemlit]
674					} else {
675					    uplevel #0 $options(-errorcommand) [list syntaxerror "syntax error: PUBLIC identifier not followed by system literal around line $state(line)"]
676					}
677				    } else {
678					uplevel #0 $options(-errorcommand) [list syntaxerror "syntax error: PUBLIC identifier not followed by literal around line $state(line)"]
679				    }
680				}
681			    }
682			    if {[regexp -nocase ^[cl $Wsp]+NDATA[cl $Wsp]+($Name)(.*) $param x notation param]} {
683				lappend externalID $notation
684			    }
685			}
686
687			set state(inDTD) 1
688
689			ParseEvent:DocTypeDecl [array get options] $state(doc_name) $pubidlit $systemlit $options(-internaldtd)
690
691			set state(inDTD) 0
692
693		    }
694
695		    !--* {
696
697			# Start of a comment
698			# See if it ends in the same tag, otherwise change the
699			# parsing mode
700
701			regexp {!--(.*)} $tag discard comm1
702			if {[regexp ([cl ^-]*)--[cl $Wsp]*\$ $comm1 discard comm1_1]} {
703			    # processed comment (end in tag)
704			    uplevel #0 $options(-commentcommand) [list $comm1_1]
705			} elseif {[regexp ([cl ^-]*)--[cl $Wsp]*\$ $param discard comm2]} {
706			    # processed comment (end in attributes)
707			    uplevel #0 $options(-commentcommand) [list $comm1$comm2]
708			} elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm2 text]} {
709			    # processed comment (end in text)
710			    uplevel #0 $options(-commentcommand) [list $comm1$param$empty>$comm2]
711			} else {
712			    # start of comment
713			    set state(mode) comment
714			    set state(commentdata) "$comm1$param$empty>$text"
715			    continue
716			}
717		    }
718
719		    {!\[CDATA\[*} {
720
721			regexp {!\[CDATA\[(.*)} $tag discard cdata1
722			if {[regexp {(.*)]]$} $cdata1 discard cdata2]} {
723			    # processed CDATA (end in tag)
724			    PCDATA [array get options] [subst -novariable -nocommand $cdata2]
725			    set text [subst -novariable -nocommand $text]
726			} elseif {[regexp {(.*)]]$} $param discard cdata2]} {
727			    # processed CDATA (end in attribute)
728			    # Backslashes in param are quoted at this stage
729			    PCDATA [array get options] $cdata1[subst -novariable -nocommand $cdata2]
730			    set text [subst -novariable -nocommand $text]
731			} elseif {[regexp {(.*)]]>(.*)} $text discard cdata2 text]} {
732			    # processed CDATA (end in text)
733			    # Backslashes in param and text are quoted at this stage
734			    PCDATA [array get options] $cdata1[subst -novariable -nocommand $param]$empty>[subst -novariable -nocommand $cdata2]
735			    set text [subst -novariable -nocommand $text]
736			} else {
737			    # start CDATA
738			    set state(cdata) "$cdata1$param>$text"
739			    set state(mode) cdata
740			    continue
741			}
742
743		    }
744
745		    !ELEMENT -
746		    !ATTLIST -
747		    !ENTITY -
748		    !NOTATION {
749			uplevel #0 $options(-errorcommand) [list illegaldeclaration "[string range $tag 1 end] declaration not expected in document instance around line $state(line)"]
750		    }
751
752		    default {
753			uplevel #0 $options(-errorcommand) [list unknowninstruction "unknown processing instruction \"<$tag>\" around line $state(line)"]
754		    }
755		}
756	    }
757	    *,1,* -
758	    *,0,/,/ {
759		# Syntax error
760	    	uplevel #0 $options(-errorcommand) [list syntaxerror "syntax error: closed/empty tag: tag $tag param $param empty $empty close $close around line $state(line)"]
761	    }
762	}
763
764	# Process character data
765
766	if {$state(haveDocElement) && [llength $state(stack)]} {
767
768	    # Check if the internal DTD entity is in the text
769	    regsub -all &xml:intdtd\; $text \[$options(-internaldtd)\] text
770
771	    # Look for entity references
772	    if {([array size entities] || \
773		    [string length $options(-entityreferencecommand)]) && \
774		    $options(-defaultexpandinternalentities) && \
775		    [regexp {&[^;]+;} $text]} {
776
777		# protect Tcl specials
778		# NB. braces and backslashes may already be protected
779		regsub -all {\\({|}|\\)} $text {\1} text
780		regsub -all {([][$\\{}])} $text {\\\1} text
781
782		# Mark entity references
783		regsub -all {&([^;]+);} $text [format {%s; %s {\1} ; %s %s} \}\} [namespace code [list Entity [array get options] $options(-entityreferencecommand) [namespace code [list PCDATA [array get options]]] $options(entities)]] [namespace code [list DeProtect [namespace code [list PCDATA [array get options]]]]] \{\{] text
784		set text "uplevel #0 [namespace code [list DeProtect1 [namespace code [list PCDATA [array get options]]]]] {{$text}}"
785		eval $text
786	    } else {
787
788		# Restore protected special characters
789		regsub -all {\\([][{}\\])} $text {\1} text
790		PCDATA [array get options] $text
791	    }
792	} elseif {[string length [string trim $text]]} {
793	    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$text\" in document prolog around line $state(line)"]
794	}
795
796    }
797
798    # If this is the end of the document, close all open containers
799    if {$options(-final) && [llength $state(stack)]} {
800	eval $options(-errorcommand) [list unclosedelement "element [lindex $state(stack) end] remains unclosed around line $state(line)"]
801    }
802
803    return {}
804}
805
806# sgml::DeProtect --
807#
808#	Invoke given command after removing protecting backslashes
809#	from given text.
810#
811# Arguments:
812#	cmd	Command to invoke
813#	text	Text to deprotect
814#
815# Results:
816#	Depends on command
817
818proc sgml::DeProtect1 {cmd text} {
819    if {[string compare {} $text]} {
820	regsub -all {\\([]$[{}\\])} $text {\1} text
821	uplevel #0 $cmd [list $text]
822    }
823}
824proc sgml::DeProtect {cmd text} {
825    set text [lindex $text 0]
826    if {[string compare {} $text]} {
827	regsub -all {\\([]$[{}\\])} $text {\1} text
828	uplevel #0 $cmd [list $text]
829    }
830}
831
832# sgml::ParserDelete --
833#
834#	Free all memory associated with parser
835#
836# Arguments:
837#	var	global state array
838#
839# Results:
840#	Variables unset
841
842proc sgml::ParserDelete var {
843    upvar #0 $var state
844
845    if {![info exists state]} {
846	return -code error "unknown parser"
847    }
848
849    catch {unset $state(entities)}
850    catch {unset $state(parameterentities)}
851    catch {unset $state(elementdecls)}
852    catch {unset $state(attlistdecls)}
853    catch {unset $state(notationdecls)}
854    catch {unset $state(namespaces)}
855
856    unset state
857
858    return {}
859}
860
861# sgml::ParseEvent:ElementOpen --
862#
863#	Start of an element.
864#
865# Arguments:
866#	tag	Element name
867#	attr	Attribute list
868#	opts	Options
869#	args	further configuration options
870#
871# Options:
872#	-empty boolean
873#		indicates whether the element was an empty element
874#
875# Results:
876#	Modify state and invoke callback
877
878proc sgml::ParseEvent:ElementOpen {tag attr opts args} {
879    variable Name
880    variable Wsp
881
882    array set options $opts
883    upvar #0 $options(-statevariable) state
884    array set cfg {-empty 0}
885    array set cfg $args
886    set handleEmpty 0
887
888    if {$options(-normalize)} {
889	set tag [string toupper $tag]
890    }
891
892    # Update state
893    lappend state(stack) $tag
894
895    # Parse attribute list into a key-value representation
896    if {[string compare $options(-parseattributelistcommand) {}]} {
897	if {[catch {uplevel #0 $options(-parseattributelistcommand) [list $opts $attr]} attr]} {
898	    if {[string compare [lindex $attr 0] "unterminated attribute value"]} {
899		uplevel #0 $options(-errorcommand) [list unterminatedattribute "$attr around line $state(line)"]
900		set attr {}
901	    } else {
902
903		# It is most likely that a ">" character was in an attribute value.
904		# This manifests itself by ">" appearing in the element's text.
905		# In this case the callback should return a three element list;
906		# the message "unterminated attribute value", the attribute list it
907		# did manage to parse and the remainder of the attribute list.
908
909		foreach {msg attlist brokenattr} $attr break
910
911		upvar text elemText
912		if {[string first > $elemText] >= 0} {
913
914		    # Now piece the attribute list back together
915		    regexp [cl $Wsp]*($Name)[cl $Wsp]*=[cl $Wsp]*("|')(.*) $brokenattr discard attname delimiter attvalue
916		    regexp (.*)>([cl ^>]*)\$ $elemText discard remattlist elemText
917		    regexp ([cl ^$delimiter]*)${delimiter}(.*) $remattlist discard remattvalue remattlist
918
919		    # Gotcha: watch out for empty element syntax
920		    if {[string match */ [string trimright $remattlist]]} {
921			set remattlist [string range $remattlist 0 end-1]
922			set handleEmpty 1
923			set cfg(-empty) 1
924		    }
925
926		    append attvalue >$remattvalue
927		    lappend attlist $attname $attvalue
928
929		    # Complete parsing the attribute list
930		    if {[catch {uplevel #0 $options(-parseattributelistcommand) [list $options(-statevariable) $remattlist]} attr]} {
931			uplevel #0 $options(-errorcommand) [list unterminatedattribute "$attr around line $state(line)"]
932			set attr {}
933			set attlist {}
934		    } else {
935			eval lappend attlist $attr
936		    }
937
938		    set attr $attlist
939
940		} else {
941		    uplevel #0 $options(-errorcommand) [list unterminatedattribute "$attr around line $state(line)"]
942		    set attr {}
943		}
944	    }
945	}
946    }
947
948    set empty {}
949    if {$cfg(-empty) && $options(-reportempty)} {
950	set empty {-empty 1}
951    }
952
953    # Check for namespace declarations
954    upvar #0 $options(namespaces) namespaces
955    set nsdecls {}
956    if {[llength $attr]} {
957	array set attrlist $attr
958	foreach {attrName attrValue} [array get attrlist xmlns*] {
959	    unset attrlist($attrName)
960	    set colon [set prefix {}]
961	    if {[regexp {^xmlns(:(.+))?$} $attrName discard colon prefix]} {
962		switch -glob [string length $colon],[string length $prefix] {
963		    0,0 {
964			# default NS declaration
965			lappend state(defaultNSURI) $attrValue
966			lappend state(defaultNS) [llength $state(stack)]
967			lappend nsdecls $attrValue {}
968		    }
969		    0,* {
970			# Huh?
971		    }
972		    *,0 {
973			# Error
974			uplevel #0 $state(-warningcommand) "no prefix specified for namespace URI \"$attrValue\" in element \"$tag\""
975		    }
976		    default {
977			set namespaces($prefix,[llength $state(stack)]) $attrValue
978			lappend nsdecls $attrValue $prefix
979		    }
980		}
981	    }
982	}
983	if {[llength $nsdecls]} {
984	    set nsdecls [list -namespacedecls $nsdecls]
985	}
986	set attr [array get attrlist]
987    }
988
989    # Check whether this element has an expanded name
990    set ns {}
991    if {[regexp {([^:]+):(.*)$} $tag discard prefix tag]} {
992	set nsspec [lsort -dictionary -decreasing [array names namespaces $prefix,*]]
993	if {[llength $nsspec]} {
994	    set nsuri $namespaces([lindex $nsspec 0])
995	    set ns [list -namespace $nsuri]
996	} else {
997	    uplevel #0 $options(-errorcommand) [list namespaceundeclared "no namespace declared for prefix \"$prefix\" in element $tag"]
998	}
999    } elseif {[llength $state(defaultNSURI)]} {
1000	set ns [list -namespace [lindex $state(defaultNSURI) end]]
1001    }
1002
1003    # Invoke callback
1004    set code [catch {uplevel #0 $options(-elementstartcommand) [list $tag $attr] $empty $ns $nsdecls} msg]
1005
1006    # Sometimes empty elements must be handled here (see above)
1007    if {$code == 0 && $handleEmpty} {
1008	ParseEvent:ElementClose $tag $opts -empty 1
1009    }
1010
1011    return -code $code -errorinfo $::errorInfo $msg
1012}
1013
1014# sgml::ParseEvent:ElementClose --
1015#
1016#	End of an element.
1017#
1018# Arguments:
1019#	tag	Element name
1020#	opts	Options
1021#	args	further configuration options
1022#
1023# Options:
1024#	-empty boolean
1025#		indicates whether the element as an empty element
1026#
1027# Results:
1028#	Modify state and invoke callback
1029
1030proc sgml::ParseEvent:ElementClose {tag opts args} {
1031    array set options $opts
1032    upvar #0 $options(-statevariable) state
1033    array set cfg {-empty 0}
1034    array set cfg $args
1035
1036    # WF check
1037    if {[string compare $tag [lindex $state(stack) end]]} {
1038	uplevel #0 $options(-errorcommand) [list illegalendtag "end tag \"$tag\" does not match open element \"[lindex $state(stack) end]\" around line $state(line)"]
1039	return
1040    }
1041
1042    # Check whether this element has an expanded name
1043    upvar #0 $options(namespaces) namespaces
1044    set ns {}
1045    if {[regexp {([^:]+):(.*)$} $tag discard prefix tag]} {
1046	set nsuri $namespaces([lindex [lsort -dictionary -decreasing [array names namespaces $prefix,*]] 0])
1047	set ns [list -namespace $nsuri]
1048    } elseif {[llength $state(defaultNSURI)]} {
1049	set ns [list -namespace [lindex $state(defaultNSURI) end]]
1050    }
1051
1052    # Pop namespace stacks, if any
1053    if {[llength $state(defaultNS)]} {
1054	if {[llength $state(stack)] == [lindex $state(defaultNS) end]} {
1055	    set state(defaultNS) [lreplace $state(defaultNS) end end]
1056	}
1057    }
1058    foreach nsspec [array names namespaces *,[llength $state(stack)]] {
1059	unset namespaces($nsspec)
1060    }
1061
1062    # Update state
1063    set state(stack) [lreplace $state(stack) end end]
1064
1065    set empty {}
1066    if {$cfg(-empty) && $options(-reportempty)} {
1067	set empty {-empty 1}
1068    }
1069
1070    # Invoke callback
1071    # Mats: Shall be same as sgml::ParseEvent:ElementOpen to handle exceptions in callback.
1072    set code [catch {uplevel #0 $options(-elementendcommand) [list $tag] $empty $ns} msg]
1073    return -code $code -errorinfo $::errorInfo $msg
1074}
1075
1076# sgml::PCDATA --
1077#
1078#	Process PCDATA before passing to application
1079#
1080# Arguments:
1081#	opts	options
1082#	pcdata	Character data to be processed
1083#
1084# Results:
1085#	Checks that characters are legal,
1086#	checks -ignorewhitespace setting.
1087
1088proc sgml::PCDATA {opts pcdata} {
1089    array set options $opts
1090
1091    if {$options(-ignorewhitespace) && \
1092	    ![string length [string trim $pcdata]]} {
1093	return {}
1094    }
1095
1096    if {![regexp ^[cl $::sgml::Char]*\$ $pcdata]} {
1097	upvar \#0 $options(-statevariable) state
1098	uplevel \#0 $options(-errorcommand) [list illegalcharacters "illegal, non-Unicode characters found in text \"$pcdata\" around line $state(line)"]
1099    }
1100
1101    uplevel \#0 $options(-characterdatacommand) [list $pcdata]
1102}
1103
1104# sgml::Normalize --
1105#
1106#	Perform name normalization if required
1107#
1108# Arguments:
1109#	name	name to normalize
1110#	req	normalization required
1111#
1112# Results:
1113#	Name returned as upper-case if normalization required
1114
1115proc sgml::Normalize {name req} {
1116    if {$req} {
1117	return [string toupper $name]
1118    } else {
1119	return $name
1120    }
1121}
1122
1123# sgml::Entity --
1124#
1125#	Resolve XML entity references (syntax: &xxx;).
1126#
1127# Arguments:
1128#	opts		options
1129#	entityrefcmd	application callback for entity references
1130#	pcdatacmd	application callback for character data
1131#	entities	name of array containing entity definitions.
1132#	ref		entity reference (the "xxx" bit)
1133#
1134# Results:
1135#	Returns substitution text for given entity.
1136
1137proc sgml::Entity {opts entityrefcmd pcdatacmd entities ref} {
1138    array set options $opts
1139    upvar #0 $options(-statevariable) state
1140
1141    if {![string length $entities]} {
1142	set entities [namespace current]::EntityPredef
1143    }
1144
1145    switch -glob -- $ref {
1146	%* {
1147	    # Parameter entity - not recognised outside of a DTD
1148	}
1149	#x* {
1150	    # Character entity - hex
1151	    if {[catch {format %c [scan [string range $ref 2 end] %x tmp; set tmp]} char]} {
1152		return -code error "malformed character entity \"$ref\""
1153	    }
1154	    uplevel #0 $pcdatacmd [list $char]
1155
1156	    return {}
1157
1158	}
1159	#* {
1160	    # Character entity - decimal
1161	    if {[catch {format %c [scan [string range $ref 1 end] %d tmp; set tmp]} char]} {
1162		return -code error "malformed character entity \"$ref\""
1163	    }
1164	    uplevel #0 $pcdatacmd [list $char]
1165
1166	    return {}
1167
1168	}
1169	default {
1170	    # General entity
1171	    upvar #0 $entities map
1172	    if {[info exists map($ref)]} {
1173
1174		if {![regexp {<|&} $map($ref)]} {
1175
1176		    # Simple text replacement - optimise
1177		    uplevel #0 $pcdatacmd [list $map($ref)]
1178
1179		    return {}
1180
1181		}
1182
1183		# Otherwise an additional round of parsing is required.
1184		# This only applies to XML, since HTML doesn't have general entities
1185
1186		# Must parse the replacement text for start & end tags, etc
1187		# This text must be self-contained: balanced closing tags, and so on
1188
1189		set tokenised [tokenise $map($ref) $::xml::tokExpr $::xml::substExpr]
1190		set options(-final) 0
1191		eval parseEvent [list $tokenised] [array get options]
1192
1193		return {}
1194
1195	    } elseif {[string compare $entityrefcmd "::sgml::noop"]} {
1196
1197		set result [uplevel #0 $entityrefcmd [list $ref]]
1198
1199		if {[string length $result]} {
1200		    uplevel #0 $pcdatacmd [list $result]
1201		}
1202
1203		return {}
1204
1205	    } else {
1206
1207		# Reconstitute entity reference
1208
1209		uplevel #0 $options(-errorcommand) [list illegalentity "undefined entity reference \"$ref\""]
1210
1211		return {}
1212
1213	    }
1214	}
1215    }
1216
1217    # If all else fails leave the entity reference untouched
1218    uplevel #0 $pcdatacmd [list &$ref\;]
1219
1220    return {}
1221}
1222
1223####################################
1224#
1225# DTD parser for SGML (XML).
1226#
1227# This DTD actually only handles XML DTDs.  Other language's
1228# DTD's, such as HTML, must be written in terms of a XML DTD.
1229#
1230####################################
1231
1232# sgml::ParseEvent:DocTypeDecl --
1233#
1234#	Entry point for DTD parsing
1235#
1236# Arguments:
1237#	opts	configuration options
1238#	docEl	document element name
1239#	pubId	public identifier
1240#	sysId	system identifier (a URI)
1241#	intSSet	internal DTD subset
1242
1243proc sgml::ParseEvent:DocTypeDecl {opts docEl pubId sysId intSSet} {
1244    array set options {}
1245    array set options $opts
1246
1247    set code [catch {uplevel #0 $options(-doctypecommand) [list $docEl $pubId $sysId $intSSet]} err]
1248    switch $code {
1249	3 {
1250	    # break
1251	    return {}
1252	}
1253	0 -
1254	4 {
1255	    # continue
1256	}
1257	default {
1258	    return -code $code $err
1259	}
1260    }
1261
1262    # Otherwise we'll parse the DTD and report it piecemeal
1263
1264    # The internal DTD subset is processed first (XML 2.8)
1265    # During this stage, parameter entities are only allowed
1266    # between markup declarations
1267
1268    ParseDTD:Internal [array get options] $intSSet
1269
1270    # The external DTD subset is processed last (XML 2.8)
1271    # During this stage, parameter entities may occur anywhere
1272
1273    # We must resolve the external identifier to obtain the
1274    # DTD data.  The application may supply its own resolver.
1275
1276    if {[string length $pubId] || [string length $sysId]} {
1277	uplevel #0 $options(-externalentitycommand) [list $options(-name) $options(-baseurl) $sysId $pubId]
1278    }
1279
1280    return {}
1281}
1282
1283# sgml::ParseDTD:Internal --
1284#
1285#	Parse the internal DTD subset.
1286#
1287#	Parameter entities are only allowed between markup declarations.
1288#
1289# Arguments:
1290#	opts	configuration options
1291#	dtd	DTD data
1292#
1293# Results:
1294#	Markup declarations parsed may cause callback invocation
1295
1296proc sgml::ParseDTD:Internal {opts dtd} {
1297    variable MarkupDeclExpr
1298    variable MarkupDeclSub
1299
1300    array set options {}
1301    array set options $opts
1302
1303    upvar #0 $options(-statevariable) state
1304    upvar #0 $options(parameterentities) PEnts
1305    upvar #0 $options(externalparameterentities) ExtPEnts
1306
1307    # Tokenize the DTD
1308
1309    # Protect Tcl special characters
1310    regsub -all {([{}\\])} $dtd {\\\1} dtd
1311
1312    regsub -all $MarkupDeclExpr $dtd $MarkupDeclSub dtd
1313
1314    # Entities may have angle brackets in their replacement
1315    # text, which breaks the RE processing.  So, we must
1316    # use a similar technique to processing doc instances
1317    # to rebuild the declarations from the pieces
1318
1319    set mode {} ;# normal
1320    set delimiter {}
1321    set name {}
1322    set param {}
1323
1324    set state(inInternalDTD) 1
1325
1326    # Process the tokens
1327    foreach {decl value text} [lrange "{} {} \{$dtd\}" 3 end] {
1328
1329	# Keep track of line numbers
1330	incr state(line) [regsub -all \n $text {} discard]
1331
1332	ParseDTD:EntityMode [array get options] mode replText decl value text $delimiter $name $param
1333
1334	ParseDTD:ProcessMarkupDecl [array get options] decl value delimiter name mode replText text param
1335
1336	# There may be parameter entity references between markup decls
1337
1338	if {[regexp {%.*;} $text]} {
1339
1340	    # Protect Tcl special characters
1341	    regsub -all {([{}\\])} $text {\\\1} text
1342
1343	    regsub -all %($::sgml::Name)\; $text "\} {\\1} \{" text
1344
1345	    set PElist "\{$text\}"
1346	    set PElist [lreplace $PElist end end]
1347	    foreach {text entref} $PElist {
1348		if {[string length [string trim $text]]} {
1349		    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text in internal DTD subset around line $state(line)"]
1350		}
1351
1352		# Expand parameter entity and recursively parse
1353		# BUG: no checks yet for recursive entity references
1354
1355		if {[info exists PEnts($entref)]} {
1356		    set externalParser [$options(-name) entityparser]
1357		    $externalParser parse $PEnts($entref) -dtdsubset internal
1358		} elseif {[info exists ExtPEnts($entref)]} {
1359		    set externalParser [$options(-name) entityparser]
1360		    $externalParser parse $ExtPEnts($entref) -dtdsubset external
1361		    #$externalParser free
1362		} else {
1363		    uplevel #0 $options(-errorcommand) [list illegalreference "reference to undeclared parameter entity \"$entref\""]
1364		}
1365	    }
1366
1367	}
1368
1369    }
1370
1371    return {}
1372}
1373
1374# sgml::ParseDTD:EntityMode --
1375#
1376#	Perform special processing for various parser modes
1377#
1378# Arguments:
1379#	opts	configuration options
1380#	modeVar	pass-by-reference mode variable
1381#	replTextVar	pass-by-ref
1382#	declVar	pass-by-ref
1383#	valueVar	pass-by-ref
1384#	textVar	pass-by-ref
1385#	delimiter	delimiter currently in force
1386#	name
1387#	param
1388#
1389# Results:
1390#	Depends on current mode
1391
1392proc sgml::ParseDTD:EntityMode {opts modeVar replTextVar declVar valueVar textVar delimiter name param} {
1393    upvar 1 $modeVar mode
1394    upvar 1 $replTextVar replText
1395    upvar 1 $declVar decl
1396    upvar 1 $valueVar value
1397    upvar 1 $textVar text
1398    array set options $opts
1399
1400    switch $mode {
1401	{} {
1402	    # Pass through to normal processing section
1403	}
1404	entity {
1405	    # Look for closing delimiter
1406	    if {[regexp ([cl ^$delimiter]*)${delimiter}(.*) $decl discard val1 remainder]} {
1407		append replText <$val1
1408		DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter
1409		set decl /
1410		set text $remainder\ $value>$text
1411		set value {}
1412		set mode {}
1413	    } elseif {[regexp ([cl ^$delimiter]*)${delimiter}(.*) $value discard val2 remainder]} {
1414		append replText <$decl\ $val2
1415		DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter
1416		set decl /
1417		set text $remainder>$text
1418		set value {}
1419		set mode {}
1420	    } elseif {[regexp ([cl ^$delimiter]*)${delimiter}(.*) $text discard val3 remainder]} {
1421		append replText <$decl\ $value>$val3
1422		DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter
1423		set decl /
1424		set text $remainder
1425		set value {}
1426		set mode {}
1427	    } else {
1428
1429		# Remain in entity mode
1430		append replText <$decl\ $value>$text
1431		return -code continue
1432
1433	    }
1434	}
1435
1436	ignore {
1437	    upvar #0 $options(-statevariable) state
1438
1439	    if {[regexp {]](.*)$} $decl discard remainder]} {
1440		set state(condSections) [lreplace $state(condSections) end end]
1441		set decl $remainder
1442		set mode {}
1443	    } elseif {[regexp {]](.*)$} $value discard remainder]} {
1444		set state(condSections) [lreplace $state(condSections) end end]
1445		regexp <[cl $::sgml::Wsp]*($::sgml::Name)(.*) $remainder discard decl value
1446		set mode {}
1447	    } elseif {[regexp {]]>(.*)$} $text discard remainder]} {
1448		set state(condSections) [lreplace $state(condSections) end end]
1449		set decl /
1450		set value {}
1451		set text $remainder
1452		#regexp <[cl $::sgml::Wsp]*($::sgml::Name)([cl ^>]*)>(.*) $remainder discard decl value text
1453		set mode {}
1454	    } else {
1455		set decl /
1456	    }
1457
1458	}
1459
1460	comment {
1461	    # Look for closing comment delimiter
1462
1463	    upvar #0 $options(-statevariable) state
1464
1465	    if {[regexp (.*?)--(.*)\$ $decl discard data1 remainder]} {
1466	    } elseif {[regexp (.*?)--(.*)\$ $value discard data1 remainder]} {
1467	    } elseif {[regexp (.*?)--(.*)\$ $text discard data1 remainder]} {
1468	    } else {
1469		# comment continues
1470		append state(commentdata) <$decl\ $value>$text
1471		set decl /
1472		set value {}
1473		set text {}
1474	    }
1475	}
1476
1477    }
1478
1479    return {}
1480}
1481
1482# sgml::ParseDTD:ProcessMarkupDecl --
1483#
1484#	Process a single markup declaration
1485#
1486# Arguments:
1487#	opts	configuration options
1488#	declVar	pass-by-ref
1489#	valueVar	pass-by-ref
1490#	delimiterVar	pass-by-ref for current delimiter in force
1491#	nameVar	pass-by-ref
1492#	modeVar	pass-by-ref for current parser mode
1493#	replTextVar	pass-by-ref
1494#	textVar	pass-by-ref
1495#	paramVar	pass-by-ref
1496#
1497# Results:
1498#	Depends on markup declaration.  May change parser mode
1499
1500proc sgml::ParseDTD:ProcessMarkupDecl {opts declVar valueVar delimiterVar nameVar modeVar replTextVar textVar paramVar} {
1501    upvar 1 $modeVar mode
1502    upvar 1 $replTextVar replText
1503    upvar 1 $textVar text
1504    upvar 1 $declVar decl
1505    upvar 1 $valueVar value
1506    upvar 1 $nameVar name
1507    upvar 1 $delimiterVar delimiter
1508    upvar 1 $paramVar param
1509
1510    variable declExpr
1511    variable ExternalEntityExpr
1512
1513    array set options $opts
1514    upvar #0 $options(-statevariable) state
1515
1516    switch -glob -- $decl {
1517
1518	/ {
1519	    # continuation from entity processing
1520	}
1521
1522	!ELEMENT {
1523	    # Element declaration
1524	    if {[regexp $declExpr $value discard tag cmodel]} {
1525		DTD:ELEMENT [array get options] $tag $cmodel
1526	    } else {
1527		uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed element declaration around line $state(line)"]
1528	    }
1529	}
1530
1531	!ATTLIST {
1532	    # Attribute list declaration
1533	    variable declExpr
1534	    if {[regexp $declExpr $value discard tag attdefns]} {
1535		if {[catch {DTD:ATTLIST [array get options] $tag $attdefns} err]} {
1536		    #puts stderr "Stack trace: $::errorInfo\n***\n"
1537		    # Atttribute parsing has bugs at the moment
1538		    #return -code error "$err around line $state(line)"
1539		    return {}
1540		}
1541	    } else {
1542		uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed attribute list declaration around line $state(line)"]
1543	    }
1544	}
1545
1546	!ENTITY {
1547	    # Entity declaration
1548	    variable EntityExpr
1549
1550	    if {[regexp $EntityExpr $value discard param name value]} {
1551
1552		# Entity replacement text may have a '>' character.
1553		# In this case, the real delimiter will be in the following
1554		# text.  This is complicated by the possibility of there
1555		# being several '<','>' pairs in the replacement text.
1556		# At this point, we are searching for the matching quote delimiter.
1557
1558		if {[regexp $ExternalEntityExpr $value]} {
1559		    DTD:ENTITY [array get options] $name [string trim $param] $value
1560		} elseif {[regexp ("|')(.*?)\\1(.*) $value discard delimiter replText value]} {
1561
1562		    if {[string length [string trim $value]]} {
1563			uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed entity declaration around line $state(line)"]
1564		    } else {
1565			DTD:ENTITY [array get options] $name [string trim $param] $delimiter$replText$delimiter
1566		    }
1567		} elseif {[regexp ("|')(.*) $value discard delimiter replText]} {
1568		    append replText >$text
1569		    set text {}
1570		    set mode entity
1571		} else {
1572		    uplevel #0 $options(-errorcommand) [list illegaldeclaration "no delimiter for entity declaration around line $state(line)"]
1573		}
1574
1575	    } else {
1576		uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed entity declaration around line $state(line)"]
1577	    }
1578	}
1579
1580	!NOTATION {
1581	    # Notation declaration
1582	    if {[regexp $declExpr param discard tag notation]} {
1583		DTD:ENTITY [array get options] $tag $notation
1584	    } else {
1585		uplevel #0 $options(-errorcommand) [list illegaldeclaration "malformed entity declaration around line $state(line)"]
1586	    }
1587	}
1588
1589	!--* {
1590	    # Start of a comment
1591
1592	    if {[regexp !--(.*?)--\$ $decl discard data]} {
1593		if {[string length [string trim $value]]} {
1594		    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$value\""]
1595		}
1596		uplevel #0 $options(-commentcommand) [list $data]
1597		set decl /
1598		set value {}
1599	    } elseif {[regexp -- ^(.*?)--\$ $value discard data2]} {
1600		regexp !--(.*)\$ $decl discard data1
1601		uplevel #0 $options(-commentcommand) [list $data1\ $data2]
1602		set decl /
1603		set value {}
1604	    } elseif {[regexp (.*?)-->(.*)\$ $text discard data3 remainder]} {
1605		regexp !--(.*)\$ $decl discard data1
1606		uplevel #0 $options(-commentcommand) [list $data1\ $value>$data3]
1607		set decl /
1608		set value {}
1609		set text $remainder
1610	    } else {
1611		regexp !--(.*)\$ $decl discard data1
1612		set state(commentdata) $data1\ $value>$text
1613		set decl /
1614		set value {}
1615		set text {}
1616		set mode comment
1617	    }
1618	}
1619
1620	!*INCLUDE* -
1621	!*IGNORE* {
1622	    if {$state(inInternalDTD)} {
1623		uplevel #0 $options(-errorcommand) [list illegalsection "conditional section not permitted in internal DTD subset around line $state(line)"]
1624	    }
1625
1626	    if {[regexp {^!\[INCLUDE\[(.*)} $decl discard remainder]} {
1627		# Push conditional section stack, popped by ]]> sequence
1628
1629		if {[regexp {(.*?)]]$} $remainder discard r2]} {
1630		    # section closed immediately
1631		    if {[string length [string trim $r2]]} {
1632			uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"]
1633		    }
1634		} elseif {[regexp {(.*?)]](.*)} $value discard r2 r3]} {
1635		    # section closed immediately
1636		    if {[string length [string trim $r2]]} {
1637			uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"]
1638		    }
1639		    if {[string length [string trim $r3]]} {
1640			uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r3\" in conditional section"]
1641		    }
1642		} else {
1643
1644		    lappend state(condSections) INCLUDE
1645
1646		    set parser [$options(-name) entityparser]
1647		    $parser parse $remainder\ $value> -dtdsubset external
1648		    #$parser free
1649
1650		    if {[regexp {(.*?)]]>(.*)} $text discard t1 t2]} {
1651			if {[string length [string trim $t1]]} {
1652			    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$t1\""]
1653			}
1654			if {![llength $state(condSections)]} {
1655			    uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"]
1656			}
1657			set state(condSections) [lreplace $state(condSections) end end]
1658			set text $t2
1659		    }
1660
1661		}
1662	    } elseif {[regexp {^!\[IGNORE\[(.*)} $decl discard remainder]} {
1663		# Set ignore mode.  Still need a stack
1664		set mode ignore
1665
1666		if {[regexp {(.*?)]]$} $remainder discard r2]} {
1667		    # section closed immediately
1668		    if {[string length [string trim $r2]]} {
1669			uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"]
1670		    }
1671		} elseif {[regexp {(.*?)]](.*)} $value discard r2 r3]} {
1672		    # section closed immediately
1673		    if {[string length [string trim $r2]]} {
1674			uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r2\" in conditional section"]
1675		    }
1676		    if {[string length [string trim $r3]]} {
1677			uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$r3\" in conditional section"]
1678		    }
1679		} else {
1680
1681		    lappend state(condSections) IGNORE
1682
1683		    if {[regexp {(.*?)]]>(.*)} $text discard t1 t2]} {
1684			if {[string length [string trim $t1]]} {
1685			    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$t1\""]
1686			}
1687			if {![llength $state(condSections)]} {
1688			    uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"]
1689			}
1690			set state(condSections) [lreplace $state(condSections) end end]
1691			set text $t2
1692		    }
1693
1694		}
1695	    } else {
1696		uplevel #0 $options(-errorcommand) [list illegaldeclaration "illegal markup declaration \"$decl\" around line $state(line)"]
1697	    }
1698
1699	}
1700
1701	default {
1702	    if {[regexp {^\?(.*)} $decl discard target]} {
1703		# Processing instruction
1704	    } else {
1705		uplevel #0 $options(-errorcommand) [list illegaldeclaration "illegal markup declaration \"$decl\""]
1706	    }
1707	}
1708    }
1709
1710    return {}
1711}
1712
1713# sgml::ParseDTD:External --
1714#
1715#	Parse the external DTD subset.
1716#
1717#	Parameter entities are allowed anywhere.
1718#
1719# Arguments:
1720#	opts	configuration options
1721#	dtd	DTD data
1722#
1723# Results:
1724#	Markup declarations parsed may cause callback invocation
1725
1726proc sgml::ParseDTD:External {opts dtd} {
1727    variable MarkupDeclExpr
1728    variable MarkupDeclSub
1729    variable declExpr
1730
1731    array set options $opts
1732    upvar #0 $options(parameterentities) PEnts
1733    upvar #0 $options(externalparameterentities) ExtPEnts
1734    upvar #0 $options(-statevariable) state
1735
1736    # As with the internal DTD subset, watch out for
1737    # entities with angle brackets
1738    set mode {} ;# normal
1739    set delimiter {}
1740    set name {}
1741    set param {}
1742
1743    set oldState 0
1744    catch {set oldState $state(inInternalDTD)}
1745    set state(inInternalDTD) 0
1746
1747    # Initialise conditional section stack
1748    if {![info exists state(condSections)]} {
1749	set state(condSections) {}
1750    }
1751    set startCondSectionDepth [llength $state(condSections)]
1752
1753    while {[string length $dtd]} {
1754	set progress 0
1755	set PEref {}
1756	if {![string compare $mode "ignore"]} {
1757	    set progress 1
1758	    if {[regexp {]]>(.*)} $dtd discard dtd]} {
1759		set remainder {}
1760		set mode {} ;# normal
1761		set state(condSections) [lreplace $state(condSections) end end]
1762		continue
1763	    } else {
1764		uplevel #0 $options(-errorcommand) [list missingdelimiter "IGNORE conditional section closing delimiter not found"]
1765	    }
1766	} elseif {[regexp ^(.*?)%($::sgml::Name)\;(.*)\$ $dtd discard data PEref remainder]} {
1767	    set progress 1
1768	} else {
1769	    set data $dtd
1770	    set dtd {}
1771	    set remainder {}
1772	}
1773
1774	# Tokenize the DTD (so far)
1775
1776	# Protect Tcl special characters
1777	regsub -all {([{}\\])} $data {\\\1} dataP
1778
1779	set n [regsub -all $MarkupDeclExpr $dataP $MarkupDeclSub dataP]
1780
1781	if {$n} {
1782	    set progress 1
1783	    # All but the last markup declaration should have no text
1784	    set dataP [lrange "{} {} \{$dataP\}" 3 end]
1785	    if {[llength $dataP] > 3} {
1786		foreach {decl value text} [lrange $dataP 0 [expr [llength $dataP] - 4]] {
1787		    ParseDTD:EntityMode [array get options] mode replText decl value text $delimiter $name $param
1788		    ParseDTD:ProcessMarkupDecl [array get options] decl value delimiter name mode repltextVar text param
1789
1790		    if {[string length [string trim $text]]} {
1791			# check for conditional section close
1792			if {[regexp {]]>(.*)$} $text discard text]} {
1793			    if {[string length [string trim $text]]} {
1794				uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$text\""]
1795			    }
1796			    if {![llength $state(condSections)]} {
1797				uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"]
1798			    }
1799			    set state(condSections) [lreplace $state(condSections) end end]
1800			    if {![string compare $mode "ignore"]} {
1801				set mode {} ;# normal
1802			    }
1803			} else {
1804			    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$text\""]
1805			}
1806		    }
1807		}
1808	    }
1809	    # Do the last declaration
1810	    foreach {decl value text} [lrange $dataP [expr [llength $dataP] - 3] end] {
1811		ParseDTD:EntityMode [array get options] mode replText decl value text $delimiter $name $param
1812		ParseDTD:ProcessMarkupDecl [array get options] decl value delimiter name mode repltextVar text param
1813	    }
1814	}
1815
1816	# Now expand the PE reference, if any
1817	switch -glob $mode,[string length $PEref],$n {
1818	    ignore,0,* {
1819		set dtd $text
1820	    }
1821	    ignore,*,* {
1822		set dtd $text$remainder
1823	    }
1824	    *,0,0 {
1825		set dtd $data
1826	    }
1827	    *,0,* {
1828		set dtd $text
1829	    }
1830	    *,*,0 {
1831		if {[catch {append data $PEnts($PEref)}]} {
1832		    if {[info exists ExtPEnts($PEref)]} {
1833			set externalParser [$options(-name) entityparser]
1834			$externalParser parse $ExtPEnts($PEref) -dtdsubset external
1835			#$externalParser free
1836		    } else {
1837			uplevel #0 $options(-errorcommand) [list entityundeclared "parameter entity \"$PEref\" not declared"]
1838		    }
1839		}
1840		set dtd $data$remainder
1841	    }
1842	    default {
1843		if {[catch {append text $PEnts($PEref)}]} {
1844		    if {[info exists ExtPEnts($PEref)]} {
1845			set externalParser [$options(-name) entityparser]
1846			$externalParser parse $ExtPEnts($PEref) -dtdsubset external
1847			#$externalParser free
1848		    } else {
1849			uplevel #0 $options(-errorcommand) [list entityundeclared "parameter entity \"$PEref\" not declared"]
1850		    }
1851		}
1852		set dtd $text$remainder
1853	    }
1854	}
1855
1856	# Check whether a conditional section has been terminated
1857	if {[regexp {^(.*?)]]>(.*)$} $dtd discard t1 t2]} {
1858	    if {![regexp <.*> $t1]} {
1859		if {[string length [string trim $t1]]} {
1860		    uplevel #0 $options(-errorcommand) [list unexpectedtext "unexpected text \"$t1\""]
1861		}
1862		if {![llength $state(condSections)]} {
1863		    uplevel #0 $options(-errorcommand) [list illegalsection "extraneous conditional section close"]
1864		}
1865		set state(condSections) [lreplace $state(condSections) end end]
1866		if {![string compare $mode "ignore"]} {
1867		    set mode {} ;# normal
1868		}
1869		set dtd $t2
1870		set progress 1
1871	    }
1872	}
1873
1874	if {!$progress} {
1875	    # No parameter entity references were found and
1876	    # the text does not contain a well-formed markup declaration
1877	    # Avoid going into an infinite loop
1878	    upvar #0 $options(-errorcommand) [list syntaxerror "external entity does not contain well-formed markup declaration"]
1879	    break
1880	}
1881    }
1882
1883    set state(inInternalDTD) $oldState
1884
1885    # Check that conditional sections have been closed properly
1886    if {[llength $state(condSections)] > $startCondSectionDepth} {
1887	uplevel #0 $options(-errorcommand) [list syntaxerror "[lindex $state(condSections) end] conditional section not closed"]
1888    }
1889    if {[llength $state(condSections)] < $startCondSectionDepth} {
1890	uplevel #0 $options(-errorcommand) [list syntaxerror "too many conditional section closures"]
1891    }
1892
1893    return {}
1894}
1895
1896# Procedures for handling the various declarative elements in a DTD.
1897# New elements may be added by creating a procedure of the form
1898# parse:DTD:_element_
1899
1900# For each of these procedures, the various regular expressions they use
1901# are created outside of the proc to avoid overhead at runtime
1902
1903# sgml::DTD:ELEMENT --
1904#
1905#	<!ELEMENT ...> defines an element.
1906#
1907#	The content model for the element is stored in the contentmodel array,
1908#	indexed by the element name.  The content model is parsed into the
1909#	following list form:
1910#
1911#		{}	Content model is EMPTY.
1912#			Indicated by an empty list.
1913#		*	Content model is ANY.
1914#			Indicated by an asterix.
1915#		{ELEMENT ...}
1916#			Content model is element-only.
1917#		{MIXED {element1 element2 ...}}
1918#			Content model is mixed (PCDATA and elements).
1919#			The second element of the list contains the
1920#			elements that may occur.  #PCDATA is assumed
1921#			(ie. the list is normalised).
1922#
1923# Arguments:
1924#	opts	configuration options
1925#	name	element GI
1926#	modspec	unparsed content model specification
1927
1928proc sgml::DTD:ELEMENT {opts name modspec} {
1929    variable Wsp
1930    array set options $opts
1931
1932    upvar #0 $options(elementdecls) elements
1933
1934    if {$options(-validate) && [info exists elements($name)]} {
1935	eval $options(-errorcommand) [list elementdeclared "element \"$name\" already declared"]
1936    } else {
1937	switch -- $modspec {
1938	    EMPTY {
1939	    	set elements($name) {}
1940		uplevel #0 $options(-elementdeclcommand) $name {{}}
1941	    }
1942	    ANY {
1943	    	set elements($name) *
1944		uplevel #0 $options(-elementdeclcommand) $name *
1945	    }
1946	    default {
1947		# Don't parse the content model for now,
1948		# just pass the model to the application
1949		if {0 && [regexp [format {^\([%s]*#PCDATA[%s]*(\|([^)]+))?[%s]*\)*[%s]*$} $Wsp $Wsp $Wsp $Wsp] discard discard mtoks]} {
1950		    set cm($name) [list MIXED [split $mtoks |]]
1951		} elseif {0} {
1952		    if {[catch {CModelParse $state(state) $value} result]} {
1953			eval $options(-errorcommand) [list element? $result]
1954		    } else {
1955			set cm($id) [list ELEMENT $result]
1956		    }
1957		} else {
1958		    set elements($name) $modspec
1959		    uplevel #0 $options(-elementdeclcommand) $name [list $modspec]
1960		}
1961	    }
1962	}
1963    }
1964}
1965
1966# sgml::CModelParse --
1967#
1968#	Parse an element content model (non-mixed).
1969#	A syntax tree is constructed.
1970#	A transition table is built next.
1971#
1972#	This is going to need alot of work!
1973#
1974# Arguments:
1975#	state	state array variable
1976#	value	the content model data
1977#
1978# Results:
1979#	A Tcl list representing the content model.
1980
1981proc sgml::CModelParse {state value} {
1982    upvar #0 $state var
1983
1984    # First build syntax tree
1985    set syntaxTree [CModelMakeSyntaxTree $state $value]
1986
1987    # Build transition table
1988    set transitionTable [CModelMakeTransitionTable $state $syntaxTree]
1989
1990    return [list $syntaxTree $transitionTable]
1991}
1992
1993# sgml::CModelMakeSyntaxTree --
1994#
1995#	Construct a syntax tree for the regular expression.
1996#
1997#	Syntax tree is represented as a Tcl list:
1998#	rep {:choice|:seq {{rep list1} {rep list2} ...}}
1999#	where:	rep is repetition character, *, + or ?. {} for no repetition
2000#		listN is nested expression or Name
2001#
2002# Arguments:
2003#	spec	Element specification
2004#
2005# Results:
2006#	Syntax tree for element spec as nested Tcl list.
2007#
2008#	Examples:
2009#	(memo)
2010#		{} {:seq {{} memo}}
2011#	(front, body, back?)
2012#		{} {:seq {{} front} {{} body} {? back}}
2013#	(head, (p | list | note)*, div2*)
2014#		{} {:seq {{} head} {* {:choice {{} p} {{} list} {{} note}}} {* div2}}
2015#	(p | a | ul)+
2016#		+ {:choice {{} p} {{} a} {{} ul}}
2017
2018proc sgml::CModelMakeSyntaxTree {state spec} {
2019    upvar #0 $state var
2020    variable Wsp
2021    variable name
2022
2023    # Translate the spec into a Tcl list.
2024
2025    # None of the Tcl special characters are allowed in a content model spec.
2026    if {[regexp {\$|\[|\]|\{|\}} $spec]} {
2027	return -code error "illegal characters in specification"
2028    }
2029
2030    regsub -all [format {(%s)[%s]*(\?|\*|\+)?[%s]*(,|\|)?} $name $Wsp $Wsp] $spec [format {%sCModelSTname %s {\1} {\2} {\3}} \n $state] spec
2031    regsub -all {\(} $spec "\nCModelSTopenParen $state " spec
2032    regsub -all [format {\)[%s]*(\?|\*|\+)?[%s]*(,|\|)?} $Wsp $Wsp] $spec [format {%sCModelSTcloseParen %s {\1} {\2}} \n $state] spec
2033
2034    array set var {stack {} state start}
2035    eval $spec
2036
2037    # Peel off the outer seq, its redundant
2038    return [lindex [lindex $var(stack) 1] 0]
2039}
2040
2041# sgml::CModelSTname --
2042#
2043#	Processes a name in a content model spec.
2044#
2045# Arguments:
2046#	state	state array variable
2047#	name	name specified
2048#	rep	repetition operator
2049#	cs	choice or sequence delimiter
2050#
2051# Results:
2052#	See CModelSTcp.
2053
2054proc sgml::CModelSTname {state name rep cs args} {
2055    if {[llength $args]} {
2056	return -code error "syntax error in specification: \"$args\""
2057    }
2058
2059    CModelSTcp $state $name $rep $cs
2060}
2061
2062# sgml::CModelSTcp --
2063#
2064#	Process a content particle.
2065#
2066# Arguments:
2067#	state	state array variable
2068#	name	name specified
2069#	rep	repetition operator
2070#	cs	choice or sequence delimiter
2071#
2072# Results:
2073#	The content particle is added to the current group.
2074
2075proc sgml::CModelSTcp {state cp rep cs} {
2076    upvar #0 $state var
2077
2078    switch -glob -- [lindex $var(state) end]=$cs {
2079	start= {
2080	    set var(state) [lreplace $var(state) end end end]
2081	    # Add (dummy) grouping, either choice or sequence will do
2082	    CModelSTcsSet $state ,
2083	    CModelSTcpAdd $state $cp $rep
2084	}
2085	:choice= -
2086	:seq= {
2087	    set var(state) [lreplace $var(state) end end end]
2088	    CModelSTcpAdd $state $cp $rep
2089	}
2090	start=| -
2091	start=, {
2092	    set var(state) [lreplace $var(state) end end [expr {$cs == "," ? ":seq" : ":choice"}]]
2093	    CModelSTcsSet $state $cs
2094	    CModelSTcpAdd $state $cp $rep
2095	}
2096	:choice=| -
2097	:seq=, {
2098	    CModelSTcpAdd $state $cp $rep
2099	}
2100	:choice=, -
2101	:seq=| {
2102	    return -code error "syntax error in specification: incorrect delimiter after \"$cp\", should be \"[expr {$cs == "," ? "|" : ","}]\""
2103	}
2104	end=* {
2105	    return -code error "syntax error in specification: no delimiter before \"$cp\""
2106	}
2107	default {
2108	    return -code error "syntax error"
2109	}
2110    }
2111
2112}
2113
2114# sgml::CModelSTcsSet --
2115#
2116#	Start a choice or sequence on the stack.
2117#
2118# Arguments:
2119#	state	state array
2120#	cs	choice oir sequence
2121#
2122# Results:
2123#	state is modified: end element of state is appended.
2124
2125proc sgml::CModelSTcsSet {state cs} {
2126    upvar #0 $state var
2127
2128    set cs [expr {$cs == "," ? ":seq" : ":choice"}]
2129
2130    if {[llength $var(stack)]} {
2131	set var(stack) [lreplace $var(stack) end end $cs]
2132    } else {
2133	set var(stack) [list $cs {}]
2134    }
2135}
2136
2137# sgml::CModelSTcpAdd --
2138#
2139#	Append a content particle to the top of the stack.
2140#
2141# Arguments:
2142#	state	state array
2143#	cp	content particle
2144#	rep	repetition
2145#
2146# Results:
2147#	state is modified: end element of state is appended.
2148
2149proc sgml::CModelSTcpAdd {state cp rep} {
2150    upvar #0 $state var
2151
2152    if {[llength $var(stack)]} {
2153	set top [lindex $var(stack) end]
2154    	lappend top [list $rep $cp]
2155	set var(stack) [lreplace $var(stack) end end $top]
2156    } else {
2157	set var(stack) [list $rep $cp]
2158    }
2159}
2160
2161# sgml::CModelSTopenParen --
2162#
2163#	Processes a '(' in a content model spec.
2164#
2165# Arguments:
2166#	state	state array
2167#
2168# Results:
2169#	Pushes stack in state array.
2170
2171proc sgml::CModelSTopenParen {state args} {
2172    upvar #0 $state var
2173
2174    if {[llength $args]} {
2175	return -code error "syntax error in specification: \"$args\""
2176    }
2177
2178    lappend var(state) start
2179    lappend var(stack) [list {} {}]
2180}
2181
2182# sgml::CModelSTcloseParen --
2183#
2184#	Processes a ')' in a content model spec.
2185#
2186# Arguments:
2187#	state	state array
2188#	rep	repetition
2189#	cs	choice or sequence delimiter
2190#
2191# Results:
2192#	Stack is popped, and former top of stack is appended to previous element.
2193
2194proc sgml::CModelSTcloseParen {state rep cs args} {
2195    upvar #0 $state var
2196
2197    if {[llength $args]} {
2198	return -code error "syntax error in specification: \"$args\""
2199    }
2200
2201    set cp [lindex $var(stack) end]
2202    set var(stack) [lreplace $var(stack) end end]
2203    set var(state) [lreplace $var(state) end end]
2204    CModelSTcp $state $cp $rep $cs
2205}
2206
2207# sgml::CModelMakeTransitionTable --
2208#
2209#	Given a content model's syntax tree, constructs
2210#	the transition table for the regular expression.
2211#
2212#	See "Compilers, Principles, Techniques, and Tools",
2213#	Aho, Sethi and Ullman.  Section 3.9, algorithm 3.5.
2214#
2215# Arguments:
2216#	state	state array variable
2217#	st	syntax tree
2218#
2219# Results:
2220#	The transition table is returned, as a key/value Tcl list.
2221
2222proc sgml::CModelMakeTransitionTable {state st} {
2223    upvar #0 $state var
2224
2225    # Construct nullable, firstpos and lastpos functions
2226    array set var {number 0}
2227    foreach {nullable firstpos lastpos} [	\
2228	TraverseDepth1st $state $st {
2229	    # Evaluated for leaf nodes
2230	    # Compute nullable(n)
2231	    # Compute firstpos(n)
2232	    # Compute lastpos(n)
2233	    set nullable [nullable leaf $rep $name]
2234	    set firstpos [list {} $var(number)]
2235	    set lastpos [list {} $var(number)]
2236	    set var(pos:$var(number)) $name
2237	} {
2238	    # Evaluated for nonterminal nodes
2239	    # Compute nullable, firstpos, lastpos
2240	    set firstpos [firstpos $cs $firstpos $nullable]
2241	    set lastpos  [lastpos  $cs $lastpos  $nullable]
2242	    set nullable [nullable nonterm $rep $cs $nullable]
2243	}	\
2244    ] break
2245
2246    set accepting [incr var(number)]
2247    set var(pos:$accepting) #
2248
2249    # var(pos:N) maps from position to symbol.
2250    # Construct reverse map for convenience.
2251    # NB. A symbol may appear in more than one position.
2252    # var is about to be reset, so use different arrays.
2253
2254    foreach {pos symbol} [array get var pos:*] {
2255	set pos [lindex [split $pos :] 1]
2256	set pos2symbol($pos) $symbol
2257	lappend sym2pos($symbol) $pos
2258    }
2259
2260    # Construct the followpos functions
2261    catch {unset var}
2262    followpos $state $st $firstpos $lastpos
2263
2264    # Construct transition table
2265    # Dstates is [union $marked $unmarked]
2266    set unmarked [list [lindex $firstpos 1]]
2267    while {[llength $unmarked]} {
2268	set T [lindex $unmarked 0]
2269	lappend marked $T
2270	set unmarked [lrange $unmarked 1 end]
2271
2272	# Find which input symbols occur in T
2273	set symbols {}
2274	foreach pos $T {
2275	    if {$pos != $accepting && [lsearch $symbols $pos2symbol($pos)] < 0} {
2276		lappend symbols $pos2symbol($pos)
2277	    }
2278	}
2279	foreach a $symbols {
2280	    set U {}
2281	    foreach pos $sym2pos($a) {
2282		if {[lsearch $T $pos] >= 0} {
2283		    # add followpos($pos)
2284	    	    if {$var($pos) == {}} {
2285	    	    	lappend U $accepting
2286	    	    } else {
2287	    	    	eval lappend U $var($pos)
2288	    	    }
2289		}
2290	    }
2291	    set U [makeSet $U]
2292	    if {[llength $U] && [lsearch $marked $U] < 0 && [lsearch $unmarked $U] < 0} {
2293		lappend unmarked $U
2294	    }
2295	    set Dtran($T,$a) $U
2296	}
2297
2298    }
2299
2300    return [list [array get Dtran] [array get sym2pos] $accepting]
2301}
2302
2303# sgml::followpos --
2304#
2305#	Compute the followpos function, using the already computed
2306#	firstpos and lastpos.
2307#
2308# Arguments:
2309#	state		array variable to store followpos functions
2310#	st		syntax tree
2311#	firstpos	firstpos functions for the syntax tree
2312#	lastpos		lastpos functions
2313#
2314# Results:
2315#	followpos functions for each leaf node, in name/value format
2316
2317proc sgml::followpos {state st firstpos lastpos} {
2318    upvar #0 $state var
2319
2320    switch -- [lindex [lindex $st 1] 0] {
2321	:seq {
2322	    for {set i 1} {$i < [llength [lindex $st 1]]} {incr i} {
2323	    	followpos $state [lindex [lindex $st 1] $i]			\
2324			[lindex [lindex $firstpos 0] [expr $i - 1]]	\
2325			[lindex [lindex $lastpos 0] [expr $i - 1]]
2326	    	foreach pos [lindex [lindex [lindex $lastpos 0] [expr $i - 1]] 1] {
2327		    eval lappend var($pos) [lindex [lindex [lindex $firstpos 0] $i] 1]
2328		    set var($pos) [makeSet $var($pos)]
2329	    	}
2330	    }
2331	}
2332	:choice {
2333	    for {set i 1} {$i < [llength [lindex $st 1]]} {incr i} {
2334		followpos $state [lindex [lindex $st 1] $i]			\
2335			[lindex [lindex $firstpos 0] [expr $i - 1]]	\
2336			[lindex [lindex $lastpos 0] [expr $i - 1]]
2337	    }
2338	}
2339	default {
2340	    # No action at leaf nodes
2341	}
2342    }
2343
2344    switch -- [lindex $st 0] {
2345	? {
2346	    # We having nothing to do here ! Doing the same as
2347	    # for * effectively converts this qualifier into the other.
2348	}
2349	* {
2350	    foreach pos [lindex $lastpos 1] {
2351		eval lappend var($pos) [lindex $firstpos 1]
2352		set var($pos) [makeSet $var($pos)]
2353	    }
2354	}
2355    }
2356
2357}
2358
2359# sgml::TraverseDepth1st --
2360#
2361#	Perform depth-first traversal of a tree.
2362#	A new tree is constructed, with each node computed by f.
2363#
2364# Arguments:
2365#	state	state array variable
2366#	t	The tree to traverse, a Tcl list
2367#	leaf	Evaluated at a leaf node
2368#	nonTerm	Evaluated at a nonterminal node
2369#
2370# Results:
2371#	A new tree is returned.
2372
2373proc sgml::TraverseDepth1st {state t leaf nonTerm} {
2374    upvar #0 $state var
2375
2376    set nullable {}
2377    set firstpos {}
2378    set lastpos {}
2379
2380    switch -- [lindex [lindex $t 1] 0] {
2381	:seq -
2382	:choice {
2383	    set rep [lindex $t 0]
2384	    set cs [lindex [lindex $t 1] 0]
2385
2386	    foreach child [lrange [lindex $t 1] 1 end] {
2387		foreach {childNullable childFirstpos childLastpos} \
2388			[TraverseDepth1st $state $child $leaf $nonTerm] break
2389		lappend nullable $childNullable
2390		lappend firstpos $childFirstpos
2391		lappend lastpos  $childLastpos
2392	    }
2393
2394	    eval $nonTerm
2395	}
2396	default {
2397	    incr var(number)
2398	    set rep [lindex [lindex $t 0] 0]
2399	    set name [lindex [lindex $t 1] 0]
2400	    eval $leaf
2401	}
2402    }
2403
2404    return [list $nullable $firstpos $lastpos]
2405}
2406
2407# sgml::firstpos --
2408#
2409#	Computes the firstpos function for a nonterminal node.
2410#
2411# Arguments:
2412#	cs		node type, choice or sequence
2413#	firstpos	firstpos functions for the subtree
2414#	nullable	nullable functions for the subtree
2415#
2416# Results:
2417#	firstpos function for this node is returned.
2418
2419proc sgml::firstpos {cs firstpos nullable} {
2420    switch -- $cs {
2421	:seq {
2422	    set result [lindex [lindex $firstpos 0] 1]
2423	    for {set i 0} {$i < [llength $nullable]} {incr i} {
2424	    	if {[lindex [lindex $nullable $i] 1]} {
2425	    	    eval lappend result [lindex [lindex $firstpos [expr $i + 1]] 1]
2426		} else {
2427		    break
2428		}
2429	    }
2430	}
2431	:choice {
2432	    foreach child $firstpos {
2433		eval lappend result $child
2434	    }
2435	}
2436    }
2437
2438    return [list $firstpos [makeSet $result]]
2439}
2440
2441# sgml::lastpos --
2442#
2443#	Computes the lastpos function for a nonterminal node.
2444#	Same as firstpos, only logic is reversed
2445#
2446# Arguments:
2447#	cs		node type, choice or sequence
2448#	lastpos		lastpos functions for the subtree
2449#	nullable	nullable functions forthe subtree
2450#
2451# Results:
2452#	lastpos function for this node is returned.
2453
2454proc sgml::lastpos {cs lastpos nullable} {
2455    switch -- $cs {
2456	:seq {
2457	    set result [lindex [lindex $lastpos end] 1]
2458	    for {set i [expr [llength $nullable] - 1]} {$i >= 0} {incr i -1} {
2459		if {[lindex [lindex $nullable $i] 1]} {
2460		    eval lappend result [lindex [lindex $lastpos $i] 1]
2461		} else {
2462		    break
2463		}
2464	    }
2465	}
2466	:choice {
2467	    foreach child $lastpos {
2468		eval lappend result $child
2469	    }
2470	}
2471    }
2472
2473    return [list $lastpos [makeSet $result]]
2474}
2475
2476# sgml::makeSet --
2477#
2478#	Turn a list into a set, ie. remove duplicates.
2479#
2480# Arguments:
2481#	s	a list
2482#
2483# Results:
2484#	A set is returned, which is a list with duplicates removed.
2485
2486proc sgml::makeSet s {
2487    foreach r $s {
2488	if {[llength $r]} {
2489	    set unique($r) {}
2490	}
2491    }
2492    return [array names unique]
2493}
2494
2495# sgml::nullable --
2496#
2497#	Compute the nullable function for a node.
2498#
2499# Arguments:
2500#	nodeType	leaf or nonterminal
2501#	rep		repetition applying to this node
2502#	name		leaf node: symbol for this node, nonterm node: choice or seq node
2503#	subtree		nonterm node: nullable functions for the subtree
2504#
2505# Results:
2506#	Returns nullable function for this branch of the tree.
2507
2508proc sgml::nullable {nodeType rep name {subtree {}}} {
2509    switch -glob -- $rep:$nodeType {
2510	:leaf -
2511	+:leaf {
2512	    return [list {} 0]
2513	}
2514	\\*:leaf -
2515	\\?:leaf {
2516	    return [list {} 1]
2517	}
2518	\\*:nonterm -
2519	\\?:nonterm {
2520	    return [list $subtree 1]
2521	}
2522	:nonterm -
2523	+:nonterm {
2524	    switch -- $name {
2525		:choice {
2526		    set result 0
2527		    foreach child $subtree {
2528			set result [expr $result || [lindex $child 1]]
2529		    }
2530		}
2531		:seq {
2532		    set result 1
2533		    foreach child $subtree {
2534			set result [expr $result && [lindex $child 1]]
2535		    }
2536		}
2537	    }
2538	    return [list $subtree $result]
2539	}
2540    }
2541}
2542
2543# sgml::DTD:ATTLIST --
2544#
2545#	<!ATTLIST ...> defines an attribute list.
2546#
2547# Arguments:
2548#	opts	configuration opions
2549#	name	Element GI
2550#	attspec	unparsed attribute definitions
2551#
2552# Results:
2553#	Attribute list variables are modified.
2554
2555proc sgml::DTD:ATTLIST {opts name attspec} {
2556    variable attlist_exp
2557    variable attlist_enum_exp
2558    variable attlist_fixed_exp
2559
2560    array set options $opts
2561
2562    # Parse the attribute list.  If it were regular, could just use foreach,
2563    # but some attributes may have values.
2564    regsub -all {([][$\\])} $attspec {\\\1} attspec
2565    regsub -all $attlist_exp $attspec "\}\nDTDAttribute {$options(-attlistdeclcommand)} $name $options(attlistdecls) {\\1} {\\2} {\\3} {} \{" attspec
2566    regsub -all $attlist_enum_exp $attspec "\}\nDTDAttribute {$options(-attlistdeclcommand)} $name $options(attlistdecls) {\\1} {\\2} {} {\\4} \{" attspec
2567    regsub -all $attlist_fixed_exp $attspec "\}\nDTDAttribute {$options(-attlistdeclcommand)} $name $options(attlistdecls) {\\1} {\\2} {\\3} {\\4} \{" attspec
2568
2569    eval "noop \{$attspec\}"
2570
2571    return {}
2572}
2573
2574# sgml::DTDAttribute --
2575#
2576#	Parse definition of a single attribute.
2577#
2578# Arguments:
2579#	callback	attribute defn callback
2580#	name	element name
2581#	var	array variable
2582#	att	attribute name
2583#	type	type of this attribute
2584#	default	default value of the attribute
2585#	value	other information
2586#	text	other text (should be empty)
2587#
2588# Results:
2589#	Attribute defn added to array, unless it already exists
2590
2591proc sgml::DTDAttribute args {
2592    # BUG: Some problems with parameter passing - deal with it later
2593    foreach {callback name var att type default value text} $args break
2594
2595    upvar #0 $var atts
2596
2597    if {[string length [string trim $text]]} {
2598	return -code error "unexpected text \"$text\" in attribute definition"
2599    }
2600
2601    # What about overridden attribute defns?
2602    # A non-validating app may want to know about them
2603    # (eg. an editor)
2604    if {![info exists atts($name/$att)]} {
2605	set atts($name/$att) [list $type $default $value]
2606	uplevel #0 $callback [list $name $att $type $default $value]
2607    }
2608
2609    return {}
2610}
2611
2612# sgml::DTD:ENTITY --
2613#
2614#	<!ENTITY ...> declaration.
2615#
2616#	Callbacks:
2617#	-entitydeclcommand for general entity declaration
2618#	-unparsedentitydeclcommand for unparsed external entity declaration
2619#	-parameterentitydeclcommand for parameter entity declaration
2620#
2621# Arguments:
2622#	opts	configuration options
2623#	name	name of entity being defined
2624#	param	whether a parameter entity is being defined
2625#	value	unparsed replacement text
2626#
2627# Results:
2628#	Modifies the caller's entities array variable
2629
2630proc sgml::DTD:ENTITY {opts name param value} {
2631
2632    array set options $opts
2633
2634    if {[string compare % $param]} {
2635	# Entity declaration - general or external
2636	upvar #0 $options(entities) ents
2637	upvar #0 $options(extentities) externals
2638
2639	if {[info exists ents($name)] || [info exists externals($name)]} {
2640	    eval $options(-warningcommand) entity [list "entity \"$name\" already declared"]
2641	} else {
2642	    if {[catch {uplevel #0 $options(-parseentitydeclcommand) [list $value]} value]} {
2643		return -code error "unable to parse entity declaration due to \"$value\""
2644	    }
2645	    switch -glob [lindex $value 0],[lindex $value 3] {
2646		internal, {
2647		    set ents($name) [EntitySubst [array get options] [lindex $value 1]]
2648		    uplevel #0 $options(-entitydeclcommand) [list $name $ents($name)]
2649		}
2650		internal,* {
2651		    return -code error "unexpected NDATA declaration"
2652		}
2653		external, {
2654		    set externals($name) [lrange $value 1 2]
2655		    uplevel #0 $options(-entitydeclcommand) [eval list $name [lrange $value 1 2]]
2656		}
2657		external,* {
2658		    set externals($name) [lrange $value 1 3]
2659		    uplevel #0 $options(-unparsedentitydeclcommand) [eval list $name [lrange $value 1 3]]
2660		}
2661		default {
2662		    return -code error "internal error: unexpected parser state"
2663		}
2664	    }
2665	}
2666    } else {
2667	# Parameter entity declaration
2668	upvar #0 $options(parameterentities) PEnts
2669	upvar #0 $options(externalparameterentities) ExtPEnts
2670
2671	if {[info exists PEnts($name)] || [info exists ExtPEnts($name)]} {
2672	    eval $options(-warningcommand) parameterentity [list "parameter entity \"$name\" already declared"]
2673	} else {
2674	    if {[catch {uplevel #0 $options(-parseentitydeclcommand) [list $value]} value]} {
2675		return -code error "unable to parse parameter entity declaration due to \"$value\""
2676	    }
2677	    if {[string length [lindex $value 3]]} {
2678		return -code error "NDATA illegal in parameter entity declaration"
2679	    }
2680	    switch [lindex $value 0] {
2681		internal {
2682		    # Substitute character references and PEs (XML: 4.5)
2683		    set value [EntitySubst [array get options] [lindex $value 1]]
2684
2685		    set PEnts($name) $value
2686		    uplevel #0 $options(-parameterentitydeclcommand) [list $name $value]
2687		}
2688		external -
2689		default {
2690		    # Get the replacement text now.
2691		    # Could wait until the first reference, but easier
2692		    # to just do it now.
2693
2694		    set token [uri::geturl [uri::resolve $options(-baseurl) [lindex $value 1]]]
2695
2696		    set ExtPEnts($name) [lindex [array get $token data] 1]
2697		    uplevel #0 $options(-parameterentitydeclcommand) [eval list $name [lrange $value 1 2]]
2698		}
2699	    }
2700	}
2701    }
2702}
2703
2704# sgml::EntitySubst --
2705#
2706#	Perform entity substitution on an entity replacement text.
2707#	This differs slightly from other substitution procedures,
2708#	because only parameter and character entity substitution
2709#	is performed, not general entities.
2710#	See XML Rec. section 4.5.
2711#
2712# Arguments:
2713#	opts	configuration options
2714#	value	Literal entity value
2715#
2716# Results:
2717#	Expanded replacement text
2718
2719proc sgml::EntitySubst {opts value} {
2720    array set options $opts
2721
2722    # Protect Tcl special characters
2723    regsub -all {([{}\\])} $value {\\\1} value
2724
2725    # Find entity references
2726    regsub -all (&#\[0-9\]+|&#x\[0-9a-fA-F\]+|%${::sgml::Name})\; $value "\[EntitySubstValue [list $options(parameterentities)] {\\1}\]" value
2727
2728    set result [subst $value]
2729
2730    return $result
2731}
2732
2733# sgml::EntitySubstValue --
2734#
2735#	Handle a single character or parameter entity substitution
2736#
2737# Arguments:
2738#	PEvar	array variable containing PE declarations
2739#	ref	character or parameter entity reference
2740#
2741# Results:
2742#	Replacement text
2743
2744proc sgml::EntitySubstValue {PEvar ref} {
2745    switch -glob -- $ref {
2746	&#x* {
2747	    scan [string range $ref 3 end] %x hex
2748	    return [format %c $hex]
2749	}
2750	&#* {
2751	    return [format %c [string range $ref 2 end]]
2752	}
2753	%* {
2754	    upvar #0 $PEvar PEs
2755	    set ref [string range $ref 1 end]
2756	    if {[info exists PEs($ref)]} {
2757		return $PEs($ref)
2758	    } else {
2759		return -code error "parameter entity \"$ref\" not declared"
2760	    }
2761	}
2762	default {
2763	    return -code error "internal error - unexpected entity reference"
2764	}
2765    }
2766    return {}
2767}
2768
2769# sgml::DTD:NOTATION --
2770#
2771#	Process notation declaration
2772#
2773# Arguments:
2774#	opts	configuration options
2775#	name	notation name
2776#	value	unparsed notation spec
2777
2778proc sgml::DTD:NOTATION {opts name value} {
2779    return {}
2780
2781    variable notation_exp
2782    upvar opts state
2783
2784    if {[regexp $notation_exp $value x scheme data] == 2} {
2785    } else {
2786	eval $state(-errorcommand) [list notationvalue "notation value \"$value\" incorrectly specified"]
2787    }
2788}
2789
2790# sgml::ResolveEntity --
2791#
2792#	Default entity resolution routine
2793#
2794# Arguments:
2795#	name	name of parent parser
2796#	base	base URL for relative URLs
2797#	sysId	system identifier
2798#	pubId	public identifier
2799
2800proc sgml::ResolveEntity {name base sysId pubId} {
2801    variable ParseEventNum
2802
2803    if {[catch {uri::resolve $base $sysId} url]} {
2804	return -code error "unable to resolve system identifier \"$sysId\""
2805    }
2806    if {[catch {uri::geturl $url} token]} {
2807	return -code error "unable to retrieve external entity \"$url\" for system identifier \"$sysId\""
2808    }
2809
2810    upvar #0 $token data
2811
2812    set parser [uplevel #0 $name entityparser]
2813
2814    $parser parse $data(body) -dtdsubset external
2815    #$parser free
2816
2817    return {}
2818}
2819