1# -*- tcl -*-
2#
3# Copyright (c) 2005 by Andreas Kupries <andreas_kupries@users.sourceforge.net>
4# Parser Generator / Transformation - Normalize PEG AST for later.
5
6# This package assumes to be used from within a PAGE plugin. It uses
7# the API commands listed below. These are identical across the major
8# types of PAGE plugins, allowing this package to be used in reader,
9# transform, and writer plugins. It cannot be used in a configuration
10# plugin, and this makes no sense either.
11#
12# To ensure that our assumption is ok we require the relevant pseudo
13# package setup by the PAGE plugin management code.
14#
15# -----------------+--
16# page_info        | Reporting to the user.
17# page_warning     |
18# page_error       |
19# -----------------+--
20# page_log_error   | Reporting of internals.
21# page_log_warning |
22# page_log_info    |
23# -----------------+--
24
25# ### ### ### ######### ######### #########
26## Requisites
27
28# @mdgen NODEP: page::plugin
29
30package require page::plugin ; # S.a. pseudo-package.
31package require treeql
32package require page::util::quote
33
34namespace eval ::page::util::norm::peg {
35    # Get the peg char de/encoder commands.
36    # (unquote, quote'tcl)
37
38    namespace import ::page::util::quote::*
39}
40
41# ### ### ### ######### ######### #########
42## API
43
44proc ::page::util::norm::peg {t} {
45    set q [treeql q -tree $t]
46
47    page_info {[PEG Normalization]}
48    page_log_info ..Terminals   ; peg::Terminals   $q $t
49    page_log_info ..Chains      ; peg::CutChains   $q $t
50    page_log_info ..Metadata    ; peg::Metadata    $q $t
51    page_log_info ..Definitions ; peg::Definitions $q $t
52    page_log_info ..Expressions ; peg::Expressions $q $t
53
54    # Sentinel for PE algorithms.
55    $t set root symbol <StartExpression>
56    $q destroy
57
58    page_log_info Ok
59    return
60}
61
62# ### ### ### ######### ######### #########
63## Documentation
64#
65## See doc_normalize.txt for the specification of the publicly visible
66## attributes.
67##
68## Internal attributes
69## - DATA - Transient storage for terminal data.
70
71# ### ### ### ######### ######### #########
72## Internal. Helpers
73
74proc ::page::util::norm::peg::Terminals {q t} {
75    # The data for all terminals is stored in their grandparental
76    # nodes. We get rid of both terminals and their parents.
77
78    $q query tree withatt type terminal over n {
79	set p  [$t parent $n]
80	set gp [$t parent $p]
81
82	CopyLocation $t $n $gp
83	AttrCopy     $t $n detail $gp DATA
84	TokReduce    $t           $gp DATA
85	$t delete $p
86    }
87
88    # We can now drop the type attribute, as all the remaining nodes
89    # (which have it) will contain the value 'nonterminal'.
90
91    $q query tree hasatt type over n {
92	$t unset $n type
93    }
94    return
95}
96
97proc ::page::util::norm::peg::CutChains {q t} {
98    # All nodes which have exactly one child are irrelevant. We get
99    # rid of them. The root node is the sole exception. The immediate
100    # child of the root however is superfluous as well.
101
102    $q query tree notq {root} over n {
103	if {[llength [$t children $n]] != 1} continue
104	$t cut $n
105    }
106
107    foreach n [$t children root] {$t cut $n}
108    return
109}
110
111proc ::page::util::norm::peg::Metadata {q t} {
112    # Having the name of the grammar in a tree node is overkill. We
113    # move this information into an attribute of the root node.
114    # The node keeping the start expression separate is irrelevant as
115    # well. We get rid of it, and tag the root of the start expression
116    # with a marker attribute.
117
118    $q query tree withatt detail Header over n {
119	set tmp    [Child $t $n 0]
120	set sexpr  [Child $t $n 1]
121
122	AttrCopy $t $tmp DATA root name
123	$t cut $tmp
124	$t cut $n
125	break
126    }
127
128    # Remember the node for the start expression in the root for quick
129    # access by later stages.
130
131    $t set root start $sexpr
132    return
133}
134
135proc ::page::util::norm::peg::Definitions {q t} {
136    # We move nonterminal hint information from nodes into attributes,
137    # and delete the now irrelevant nodes.
138
139    # NOTE: This transformation is dependent on the removal of all
140    # nodes with exactly one child, as it removes the all 'Attribute'
141    # nodes already. Otherwise this transformation would have to put
142    # the information into the grandparental node.
143
144    # The default mode for nonterminals is 'value'.
145
146    $q query tree withatt detail Definition over n {
147	$t set $n mode value
148    }
149
150    foreach {a mode} {
151	VOID  discard
152	MATCH match
153	LEAF  leaf
154    } {
155	$q query tree withatt detail $a over n {
156	    set p [$t parent $n]
157	    $t set $p mode $mode
158	    $t delete $n
159	}
160    }
161
162    # Like with the global metadata we move definition specific
163    # information out of nodes into attributes, get rid of the
164    # superfluous nodes, and tag the definition roots with marker
165    # attributes.
166
167    set defs {}
168    $q query tree withatt detail Definition over n {
169	# Define mode information for all nonterminals without an
170	# explicit specification. We also save the mode information
171	# from deletion when we redo the definition node.
172
173	set first [Child $t $n 0]
174
175	set sym [$t get $first DATA]
176	$t set $n symbol $sym
177	$t set $n label  $sym
178	$t set $n users  {}
179
180	# Now determine the range in the input covered by the
181	# definition. The left extent comes from the terminal for the
182	# nonterminal symbol it defines. The right extent comes from
183	# the rightmost child under the definition. While this not an
184	# expression tree yet the location data is sound already.
185
186	MergeLocations $t $first [Rightmost $t $n] $n
187	$t unset $n detail
188
189	lappend defs $sym $n
190	$t cut $first
191    }
192
193    # We remember a mapping from nonterminal names to their defining
194    # nodes in the root as well, for quick reference later, when we
195    # build nonterminal usage references
196
197    $t set root definitions $defs
198    return
199}
200
201proc ::page::util::norm::peg::Rightmost {t n} {
202    # Determine the rightmost leaf under the specified node.
203
204    if {[$t isleaf $n]} {return $n}
205    return [Rightmost $t [lindex [$t children $n] end]]
206}
207
208proc ::page::util::norm::peg::Expressions {q t} {
209    # We now transform the remaining nodes into proper expression
210    # trees. The order matters, to shed as much nodes as possible
211    # early, and to avoid unncessary work.
212
213    ExprRanges       $q $t
214    ExprUnaryOps     $q $t
215    ExprChars        $q $t
216    ExprNonterminals $q $t
217    ExprOperators    $q $t
218    ExprFlatten      $q $t
219    return
220}
221
222proc ::page::util::norm::peg::ExprRanges {q t} {
223    # Ranges = .. operator
224
225    $q query tree withatt detail Range over n {
226	# Two the children, both of text 'Char', their data is what we
227	# take. The children become irrelevant and are removed.
228
229	foreach {b e} [$t children $n] break
230	set begin [unquote [$t get $b DATA]]
231	set end   [unquote [$t get $e DATA]]
232
233	$t set $n op ..
234	$t set $n begin $begin
235	$t set $n end   $end
236
237	MergeLocations $t $b $e $n
238
239	$t unset $n detail
240
241	$t delete $b
242	$t delete $e
243    }
244    return
245}
246
247proc ::page::util::norm::peg::ExprUnaryOps {q t} {
248    # Unary operators ... Their transformation sheds more nodes.
249
250    foreach {a op} {
251	QUESTION ?
252	STAR     *
253	PLUS     +
254	AND      &
255	NOT      !
256    } {
257	$q query tree withatt detail $a over n {
258	    set p [$t parent $n]
259
260	    $t set $p op $op
261	    $t cut $n
262
263	    $t unset $p detail
264	}
265    }
266    return
267}
268
269proc ::page::util::norm::peg::ExprChars {q t} {
270    # Chars = t operator (The remaining Char'acters are plain terminal
271    # symbols.
272
273    $q query tree withatt detail Char over n {
274	set ch [unquote [$t get $n DATA]]
275
276	$t set $n op   t
277	$t set $n char $ch
278
279	$t unset $n detail
280	$t unset $n DATA
281    }
282    return
283}
284
285proc ::page::util::norm::peg::ExprNonterminals {q t} {
286    # Identifiers = n operator (nonterminal references) ...
287
288    array set defs [$t get root definitions]
289    array set undefined {}
290
291    $q query tree withatt detail Identifier over n {
292	set sym [$t get $n DATA]
293
294	$t set $n op  n
295	$t set $n sym $sym
296
297	$t unset $n detail
298	$t unset $n DATA
299
300	# Create x-references between the users and the definition of
301	# a nonterminal symbol.
302
303	if {![info exists defs($sym)]} {
304	    $t set $n def {}
305	    lappend undefined($sym) $n
306	    continue
307	} else {
308	    set def $defs($sym)
309	    $t set $n def $def
310	}
311
312	set users [$t get $def users]
313	lappend users $n
314	$t set $def users $users
315    }
316
317    $t set root undefined [array get undefined]
318    return
319}
320
321proc ::page::util::norm::peg::ExprOperators {q t} {
322    # The remaining operator nodes can be changed directly from node
323    # text to operator. Se we do.
324
325    foreach {a op} {
326	EPSILON    epsilon
327	ALNUM      alnum
328	ALPHA      alpha
329	DOT        dot
330	Literal    x
331	Class      /
332	Sequence   x
333	Expression /
334    } {
335	$q query tree withatt detail $a over n {
336	    $t set   $n op $op
337	    $t unset $n detail
338	}
339    }
340    return
341}
342
343proc ::page::util::norm::peg::ExprFlatten {q t} {
344    # Last tweaks of the expressions. Classes inside of Expressions,
345    # and Literals in Sequences create nested / or x expressions. We
346    # locate such and flatten the nested expression, cutting out the
347    # superfluous operator.
348
349    foreach op {x /} {
350	# Locate all x operators, whose parents are x operators as
351	# well, then go back to the child operators and cut them out.
352
353	$q query tree withatt op $op \
354		parent unique withatt op $op \
355		children withatt op $op \
356		over n {
357	    $t cut $n
358	}
359
360	# Locate all x operators without children and convert them
361	# into epsilon operators. Because that is what they accept,
362	# nothing.
363
364	$q query tree withatt op $op over n {
365	    if {[$t numchildren $n]} continue
366	    $t set $n op epsilon
367	}
368    }
369    return
370}
371
372# ### ### ### ######### ######### #########
373## Internal. Low-level helpers.
374
375proc ::page::util::norm::peg::CopyLocation {t src dst} {
376    $t set $dst range    [$t get $src range]
377    $t set $dst range_lc [$t get $src range_lc]
378    return
379}
380
381proc ::page::util::norm::peg::MergeLocations {t srca srcb dst} {
382    set ar   [$t get $srca range]
383    set arlc [$t get $srca range_lc]
384
385    set br   [$t get $srcb range]
386    set brlc [$t get $srcb range_lc]
387
388    $t set $dst range    [list [lindex $ar   0] [lindex $br   1]]
389    $t set $dst range_lc [list [lindex $arlc 0] [lindex $brlc 1]]
390    return
391}
392
393proc ::page::util::norm::peg::TokReduce {t src attr} {
394    set tokens [$t get $src $attr]
395    set ch     {}
396    foreach tok $tokens {
397	lappend ch [lindex $tok 0]
398    }
399    $t set $src $attr [join $ch {}]
400    return
401}
402
403proc ::page::util::norm::peg::AttrCopy {t src asrc dst adst} {
404    $t set $dst $adst [$t get $src $asrc]
405    return
406}
407
408proc ::page::util::norm::peg::Child {t n index} {
409    return [lindex [$t children $n] $index]
410}
411
412# ### ### ### ######### ######### #########
413## Ready
414
415package provide page::util::norm::peg 0.1
416