1# cgen.tcl --
2#
3#	Generator core for compiler of magic(5) files into recognizers
4#	based on the 'rtcore'.
5#
6# Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net>
7# Copyright (c) 2005      Andreas Kupries <andreas_kupries@users.sourceforge.net>
8#
9# See the file "license.terms" for information on usage and redistribution
10# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11#
12# RCS: @(#) $Id: cgen.tcl,v 1.7 2007/06/23 03:39:34 andreas_kupries Exp $
13
14#####
15#
16# "mime type recognition in pure tcl"
17# http://wiki.tcl.tk/12526
18#
19# Tcl code harvested on:  10 Feb 2005, 04:06 GMT
20# Wiki page last updated: ???
21#
22#####
23
24# ### ### ### ######### ######### #########
25## Requirements
26
27package require Tcl 8.4
28package require fileutil::magic::rt ; # Runtime core, for Access to the typemap
29package require struct::list        ; # Our data structures.
30package require struct::tree        ; #
31
32package provide fileutil::magic::cgen 1.0
33
34# ### ### ### ######### ######### #########
35## Implementation
36
37namespace eval ::fileutil::magic::cgen {
38    # Import the runtime typemap into our scope.
39    variable ::fileutil::magic::rt::typemap
40
41    # The tree most operations use for their work.
42    variable tree {}
43
44    # Generator data structure.
45    variable regions
46
47    # Type mapping for indirect offsets.
48    # empty -> long/Q, because this uses native byteorder.
49
50    array set otmap {
51        .b c    .B c
52        .s s    .S S
53        .l i    .L I
54	{} Q
55    }
56
57    # Export the API
58    namespace export 2tree treedump treegen
59}
60
61
62# Optimisations:
63
64# reorder tests according to expected or observed frequency this
65# conflicts with reduction in strength optimisations.
66
67# Rewriting within a level will require pulling apart the list of
68# tests at that level and reordering them.  There is an inconsistency
69# between handling at 0-level and deeper level - this has to be
70# removed or justified.
71
72# Hypothetically, every test at the same level should be mutually
73# exclusive, but this is not given, and should be detected.  If true,
74# this allows reduction in strength to switch on Numeric tests
75
76# reduce Numeric tests at the same level to switches
77#
78# - first pass through clauses at same level to categorise as
79#   variant values over same test (type and offset).
80
81# work out some way to cache String comparisons
82
83# Reduce seek/reads for String comparisons at same level:
84#
85# - first pass through clauses at same level to determine string ranges.
86#
87# - String tests at same level over overlapping ranges can be
88#   written as sub-string comparisons over the maximum range
89#   this saves re-reading the same string from file.
90#
91# - common prefix strings will have to be guarded against, by
92#   sorting string values, then sorting the tests in reverse length order.
93
94
95proc ::fileutil::magic::cgen::path {tree} {
96    # Annotates the tree. In each node we store the path from the root
97    # to this node, as list of nodes, with the current node the last
98    # element. The root node is never stored in the path.
99
100    $tree set root path {}
101    foreach child [$tree children root] {
102   	$tree walk $child -type dfs node {
103   	    set path [$tree get [$tree parent $node] path]
104   	    lappend path [$tree index $node]
105   	    $tree set $node path $path
106   	}
107    }
108    return
109}
110
111proc ::fileutil::magic::cgen::tree_el {tree parent file line type qual comp offset val message args} {
112
113    # Recursively creates and annotates a node for the specified
114    # tests, and its sub-tests (args).
115
116    set     node [$tree insert $parent end]
117    set     path [$tree get    $parent path]
118    lappend path [$tree index  $node]
119    $tree set $node path $path
120
121    # generate a proc call type for the type, Numeric or String
122    variable ::fileutil::magic::rt::typemap
123
124    switch -glob -- $type {
125   	*byte* -
126   	*short* -
127   	*long* -
128   	*date* {
129   	    set otype N
130   	    set type [lindex $typemap($type) 1]
131   	}
132   	*string {
133   	    set otype S
134   	}
135   	default {
136   	    puts stderr "Unknown type: '$type'"
137   	}
138    }
139
140    # Stores the type determined above, and the arguments into
141    # attributes of the new node.
142
143    foreach key {line type qual comp offset val message file otype} {
144   	if {[catch {
145   	    $tree set $node $key [set $key]
146   	} result]} {
147	    upvar ::errorInfo eo
148   	    puts "Tree: $eo - $file $line $type"
149   	}
150    }
151
152    # now add children
153    foreach el $args {
154	eval [linsert $el 0 tree_el $tree $node $file]
155   	# 8.5 # tree_el $tree $node $file {*}$el
156    }
157    return $node
158}
159
160proc ::fileutil::magic::cgen::2tree {script} {
161
162    # Converts a recognizer which is in a simple script form into a
163    # tree.
164
165    variable tree
166    set tree [::struct::tree]
167
168    $tree set root path ""
169    $tree set root otype Root
170    $tree set root type root
171    $tree set root message "unknown"
172
173    # generate a test for each match
174    set file "unknown"
175    foreach el $script {
176   	#puts "EL: $el"
177   	if {[lindex $el 0] eq "file"} {
178   	    set file [lindex $el 1]
179   	} else {
180	    set node [eval [linsert $el 0 tree_el $tree root $file]]
181	    # 8.5 # set more [tree_el $tree root $file {*}$el]
182   	    append result $node
183   	}
184    }
185    optNum $tree root
186    #optStr $tree root
187    puts stderr "Script contains [llength [$tree children root]] discriminators"
188    path $tree
189
190    # Decoding the offsets, determination if we have to handle
191    # relative offsets, and where. The less, the better.
192    Offsets $tree
193
194    return $tree
195}
196
197proc ::fileutil::magic::cgen::isStr {tree node} {
198    return [expr {"S" eq [$tree get $node otype]}]
199}
200
201proc ::fileutil::magic::cgen::sortRegion {r1 r2} {
202    set cmp 0
203    if {[catch {
204   	if {[string match (*) $r1] || [string match (*) $r2]} {
205   	    set cmp [string compare $r1 $r2]
206   	} else {
207   	    set cmp [expr {[lindex $r1 0] - [lindex $r2 0]}]
208   	    if {!$cmp} {
209   		set cmp 0
210   		set cmp [expr {[lindex $r1 1] - [lindex $r2 1]}]
211   	    }
212   	}
213    } result]} {
214   	set cmp [string compare $r1 $r2]
215    }
216    return $cmp
217}
218
219proc ::fileutil::magic::cgen::optStr {tree node} {
220    variable regions
221    catch {unset regions}
222    array set regions {}
223
224    optStr1 $tree $node
225
226    puts stderr "Regions [array statistics regions]"
227    foreach region [lsort \
228	    -index   0 \
229	    -command ::fileutil::magic::cgen::sortRegion \
230	    [array name regions]] {
231   	puts "$region - $regions($region)"
232    }
233}
234
235proc ::fileutil::magic::cgen::optStr1 {tree node} {
236    variable regions
237
238    # traverse each numeric element of this node's children,
239    # categorising them
240
241    set kids [$tree children $node]
242    foreach child $kids {
243   	optStr1 $tree $child
244    }
245
246    set strings [$tree children $node filter ::fileutil::magic::cgen::isStr]
247    #puts stderr "optstr: $node: $strings"
248
249    foreach el $strings {
250   	#if {[$tree get $el otype] eq "String"} {puts "[$tree getall $el] - [string length [$tree get $el val]]"}
251	if {[$tree get $el comp] eq "x"} {
252	    continue
253	}
254
255	set offset [$tree get $el offset]
256	set len    [string length [$tree get $el val]]
257	lappend regions([list $offset $len]) $el
258    }
259}
260
261proc ::fileutil::magic::cgen::isNum {tree node} {
262    return [expr {"N" eq [$tree get $node otype]}]
263}
264
265proc ::fileutil::magic::cgen::switchNSort {tree n1 n2} {
266    return [expr {[$tree get $n1 val] - [$tree get $n1 val]}]
267}
268
269proc ::fileutil::magic::cgen::optNum {tree node} {
270    array set offsets {}
271
272    # traverse each numeric element of this node's children,
273    # categorising them
274
275    set kids [$tree children $node]
276    foreach child $kids {
277	optNum $tree $child
278    }
279
280    set numerics [$tree children $node filter ::fileutil::magic::cgen::isNum]
281    #puts stderr "optNum: $node: $numerics"
282    if {[llength $numerics] < 2} {
283	return
284    }
285
286    foreach el $numerics {
287	if {[$tree get $el comp] ne "=="} {
288	    continue
289	}
290	lappend offsets([$tree get $el type],[$tree get $el offset],[$tree get $el qual]) $el
291    }
292
293    #puts "Offset: stderr [array get offsets]"
294    foreach {match nodes} [array get offsets] {
295	if {[llength $nodes] < 2} {
296	    continue
297	}
298
299	catch {unset matcher}
300	foreach n $nodes {
301	    set nv [expr [$tree get $n val]]
302	    if {[info exists matcher($nv)]} {
303		puts stderr "*====================================="
304		puts stderr "* Node         <[$tree getall $n]>"
305		puts stderr "* clashes with <[$tree getall $matcher($nv)]>"
306		puts stderr "*====================================="
307	    } else {
308		set matcher($nv) $n
309	    }
310	}
311
312	foreach {type offset qual} [split $match ,] break
313	set switch [$tree insert $node [$tree index [lindex $nodes 0]]]
314	$tree set $switch otype   Switch
315	$tree set $switch message $match
316	$tree set $switch offset  $offset
317	$tree set $switch type    $type
318	$tree set $switch qual    $qual
319
320	set nodes [lsort -command [list ::fileutil::magic::cgen::switchNSort $tree] $nodes]
321
322	eval [linsert $nodes 0 $tree move $switch end]
323	# 8.5 # $tree move $switch end {*}$nodes
324	set     path [$tree get [$tree parent $switch] path]
325	lappend path [$tree index $switch]
326	$tree set $switch path $path
327    }
328}
329
330proc ::fileutil::magic::cgen::Offsets {tree} {
331
332    # Indicator if a node has to save field location information for
333    # relative addressing. The 'kill' attribute is an accumulated
334    # 'save' over the whole subtree. It will be used to determine when
335    # level information was destroyed by subnodes and has to be
336    # regenerated at the current level.
337
338    $tree walk root -type dfs node {
339	$tree set $node save 0
340	$tree set $node kill 0
341    }
342
343    # We walk from the leafs up to the root, synthesizing the data
344    # needed, as we go.
345    $tree walk root -type dfs -order post node {
346	if {$node eq "root"} continue
347	DecodeOffset $tree $node [$tree get $node offset]
348
349	# If the current node's parent is a switch, and the node has
350	# to save, then the switch has to save. Because the current
351	# node is not relevant during code generation anymore, the
352	# switch is.
353
354	if {[$tree get $node save]} {
355	    # We save, therefore we kill.
356	    $tree set $node kill 1
357	    if {[$tree get [$tree parent $node] otype] eq "Switch"} {
358		$tree set [$tree parent $node] save 1
359	    }
360	} else {
361	    # We don't save i.e. kill, but we may inherit it from
362	    # children which kill.
363
364	    foreach c [$tree children $node] {
365		if {[$tree get $c kill]} {
366		    $tree set $node kill 1
367		    break
368		}
369	    }
370	}
371    }
372}
373
374proc ::fileutil::magic::cgen::DecodeOffset {tree node offset} {
375    if {[string match "(*)" $offset]} {
376	# Indirection offset. (Decoding is non-trivial, therefore
377	# packed into a proc).
378
379	set ind 1 ; # Indirect location
380	foreach {rel base itype idelta} [DecodeIndirectOffset $offset] break
381
382    } elseif {[string match "&*" $offset]} {
383	# Direct relative offset. (Decoding is trivial)
384
385	set ind    0       ; # Direct location
386	set rel    1       ; # Relative
387	set base   [string range $offset 1 end] ; # Base Delta
388	set itype  {}      ; # No data for indirect
389	set idelta {}      ; # s.a.
390
391    } else {
392	set ind    0       ; # Direct location
393	set rel    0       ; # Absolute
394	set base   $offset ; # Here!
395	set itype  {}      ; # No data for indirect
396	set idelta {}      ; # s.a.
397    }
398
399    # Store the expanded data back into the tree.
400
401    foreach v {ind rel base itype idelta} {
402	$tree set $node $v [set $v]
403    }
404
405    # For nodes with adressing relative to last field above the latter
406    # has to save this information.
407
408    if {$rel} {
409	$tree set [$tree parent $node] save 1
410    }
411    return
412}
413
414proc ::fileutil::magic::cgen::DecodeIndirectOffset {offset} {
415    variable otmap ; # Offset typemap.
416
417    # Offset parser.
418    # Syntax:
419    #   ( ?&? number ?.[bslBSL]? ?[+-]? ?number? )
420
421    set n {(([0-9]+)|(0x[0-9A-Fa-f]+))}
422    set o "\\((&?)(${n})((\\.\[bslBSL])?)(\[+-]?)(${n}?)\\)"
423    #         |   | ||| ||               |       | |||
424    #         1   2 345 67               8       9 012
425    #         ^   ^     ^                ^       ^
426    #         rel base  type             sign    index
427    #
428    #                            1   2    3 4 5 6    7 8    9   0 1 2
429    set ok [regexp $o $offset -> rel base _ _ _ type _ sign idx _ _ _]
430
431    if {!$ok} {
432        return -code error "Bad offset \"$offset\""
433    }
434
435    # rel is in {"", &}, map to 0|1
436    if {$rel eq ""} {set rel 0} else {set rel 1}
437
438    # base is a number, enforce decimal. Not optional.
439    set base [expr $base]
440
441    # Type is in .b .s .l .B .S .L, and "". Map to a regular magic
442    # type code.
443    set type $otmap($type)
444
445    # sign is in {+,-,""}. Map to -|"" (Becomes sign of index)
446    if {$sign eq "+"} {set sign ""}
447
448    # Index is optional number. Enforce decimal, empty is zero. Add in
449    # the sign as well for a proper signed index.
450
451    if {$idx eq ""} {set idx 0}
452    set idx $sign[expr $idx]
453
454    return [list $rel $base $type $idx]
455}
456
457proc ::fileutil::magic::cgen::treedump {tree} {
458    set result ""
459    $tree walk root -type dfs node {
460	set path  [$tree get $node path]
461	set depth [llength $path]
462
463	append result [string repeat "  " $depth] [list $path] ": " [$tree get $node type]:
464
465	if {[$tree keyexists $node offset]} {
466	    append result " ,O|[$tree get $node offset]|"
467
468	    set x {}
469	    foreach v {ind rel base itype idelta} {lappend x [$tree get $node $v]}
470	    append result "=<[join $x !]>"
471	}
472	if {[$tree keyexists $node qual]} {
473	    set q [$tree get $node qual]
474	    if {$q ne ""} {
475		append result " ,q/$q/"
476	    }
477	}
478
479	if {[$tree keyexists $node comp]} {
480	    append result " " C([$tree get $node comp])
481	}
482	if {[$tree keyexists $node val]} {
483	    append result " " V([$tree get $node val])
484	}
485
486	if {[$tree keyexists $node otype]} {
487	    append result " " [$tree get $node otype]/[$tree get $node save]
488	}
489
490	if {$depth == 1} {
491	    set msg [$tree get $node message]
492	    set n $node
493	    while {($n != {}) && ($msg == "")} {
494		set n [lindex [$tree children $n] 0]
495		if {$n != {}} {
496		    set msg [$tree get $n message]
497		}
498	    }
499	    append result " " ( $msg )
500	    if {[$tree keyexists $node file]} {
501		append result " - " [$tree get $node file]
502	    }
503	}
504
505	#append result " <" [$tree getall $node] >
506	append result \n
507    }
508    return $result
509}
510
511proc ::fileutil::magic::cgen::treegen {tree node} {
512    return "[treegen1 $tree $node]\nresult\n"
513}
514
515proc ::fileutil::magic::cgen::treegen1 {tree node} {
516    variable ::fileutil::magic::rt::typemap
517
518    set result ""
519    foreach k {otype type offset comp val qual message save path} {
520	if {[$tree keyexists $node $k]} {
521	    set $k [$tree get $node $k]
522	}
523    }
524
525    set level [llength $path]
526
527    # Generate code for each node per its type.
528
529    switch $otype {
530	N -
531	S {
532	    if {$save} {
533		# We have to save field data for relative adressing under this
534		# leaf.
535		if {$otype eq "N"} {
536		    set type [list Nx $level $type]
537		} elseif {$otype eq "S"} {
538		    set type [list Sx $level]
539		}
540	    } else {
541		# Regular fetching of information.
542		if {$otype eq "N"} {
543		    set type [list N $type]
544		} elseif {$otype eq "S"} {
545		    set type S
546		}
547	    }
548
549	    set offset [GenerateOffset $tree $node]
550
551	    if {$qual eq ""} {
552		append result "if \{\[$type $offset $comp [list $val]\]\} \{"
553	    } else {
554		append result "if \{\[$type $offset $comp [list $val] $qual\]\} \{"
555	    }
556
557	    if {[$tree isleaf $node]} {
558		if {$message ne ""} {
559		    append result "emit [list $message]"
560		} else {
561		    append result "emit [$tree get $node path]"
562		}
563	    } else {
564		# If we saved data the child branches may destroy
565		# level information. We regenerate it if needed.
566
567		if {$message ne ""} {
568		    append result "emit [list $message]\n"
569		}
570
571		set killed 0
572		foreach child [$tree children $node] {
573		    if {$save && $killed && [$tree get $child rel]} {
574			# This location already does not regenerate if
575			# the killing subnode was last. We also do not
576			# need to regenerate if the current subnode
577			# does not use relative adressing.
578			append result "L $level;"
579			set killed 0
580		    }
581		    append result [treegen1 $tree $child]
582		    set killed [expr {$killed || [$tree get $child kill]}]
583		}
584		#append result "\nreturn \$result"
585	    }
586
587	    append result "\}\n"
588	}
589	Root {
590	    foreach child [$tree children $node] {
591		append result [treegen1 $tree $child]
592	    }
593	}
594	Switch {
595	    set offset [GenerateOffset $tree $node]
596
597	    if {$save} {
598		set fetch "Nvx $level"
599	    } else {
600		set fetch Nv
601	    }
602
603	    append fetch " " $type " " $offset
604	    if {$qual ne ""} {
605		append fetch " " $qual
606	    }
607	    append result "switch -- \[$fetch\] "
608
609	    set scan [lindex $typemap($type) 1]
610
611	    set ckilled 0
612	    foreach child [$tree children $node] {
613		binary scan [binary format $scan [$tree get $child val]] $scan val
614		append result "$val \{"
615
616		if {$save && $ckilled} {
617		    # This location already does not regenerate if
618		    # the killing subnode was last. We also do not
619		    # need to regenerate if the current subnode
620		    # does not use relative adressing.
621		    append result "L $level;"
622		    set ckilled 0
623		}
624
625		if {[$tree isleaf $child]} {
626		    append result "emit [list [$tree get $child message]]"
627		} else {
628		    set killed 0
629		    append result "emit [list [$tree get $child message]]\n"
630		    foreach grandchild [$tree children $child] {
631			if {$save && $killed && [$tree get $grandchild rel]} {
632			    # This location already does not regenerate if
633			    # the killing subnode was last. We also do not
634			    # need to regenerate if the current subnode
635			    # does not use relative adressing.
636			    append result "L $level;"
637			    set killed 0
638			}
639			append result [treegen1 $tree $grandchild]
640			set killed [expr {$killed || [$tree get $grandchild kill]}]
641		    }
642		}
643
644		set ckilled [expr {$ckilled || [$tree get $child kill]}]
645		append result "\} "
646	    }
647	    append result "\n"
648	}
649    }
650    return $result
651}
652
653proc ::fileutil::magic::cgen::GenerateOffset {tree node} {
654    # Examples:
655    # direct absolute:     45      -> 45
656    # direct relative:    &45      -> [R 45]
657    # indirect absolute:  (45.s+1) -> [I 45 s 1]
658    # indirect relative: (&45.s+1) -> [I [R 45] s 1]
659
660    foreach v {ind rel base itype idelta} {
661	set $v [$tree get $node $v]
662    }
663
664    if {$rel} {set base "\[R $base\]"}
665    if {$ind} {set base "\[I $base $itype $idelta\]"}
666    return $base
667}
668
669# ### ### ### ######### ######### #########
670## Ready for use.
671# EOF
672