1# cgen.tcl -- 2# 3# Generator core for compiler of magic(5) files into recognizers 4# based on the 'rtcore'. 5# 6# Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net> 7# Copyright (c) 2005 Andreas Kupries <andreas_kupries@users.sourceforge.net> 8# 9# See the file "license.terms" for information on usage and redistribution 10# of this file, and for a DISCLAIMER OF ALL WARRANTIES. 11# 12# RCS: @(#) $Id: cgen.tcl,v 1.7 2007/06/23 03:39:34 andreas_kupries Exp $ 13 14##### 15# 16# "mime type recognition in pure tcl" 17# http://wiki.tcl.tk/12526 18# 19# Tcl code harvested on: 10 Feb 2005, 04:06 GMT 20# Wiki page last updated: ??? 21# 22##### 23 24# ### ### ### ######### ######### ######### 25## Requirements 26 27package require Tcl 8.4 28package require fileutil::magic::rt ; # Runtime core, for Access to the typemap 29package require struct::list ; # Our data structures. 30package require struct::tree ; # 31 32package provide fileutil::magic::cgen 1.0 33 34# ### ### ### ######### ######### ######### 35## Implementation 36 37namespace eval ::fileutil::magic::cgen { 38 # Import the runtime typemap into our scope. 39 variable ::fileutil::magic::rt::typemap 40 41 # The tree most operations use for their work. 42 variable tree {} 43 44 # Generator data structure. 45 variable regions 46 47 # Type mapping for indirect offsets. 48 # empty -> long/Q, because this uses native byteorder. 49 50 array set otmap { 51 .b c .B c 52 .s s .S S 53 .l i .L I 54 {} Q 55 } 56 57 # Export the API 58 namespace export 2tree treedump treegen 59} 60 61 62# Optimisations: 63 64# reorder tests according to expected or observed frequency this 65# conflicts with reduction in strength optimisations. 66 67# Rewriting within a level will require pulling apart the list of 68# tests at that level and reordering them. There is an inconsistency 69# between handling at 0-level and deeper level - this has to be 70# removed or justified. 71 72# Hypothetically, every test at the same level should be mutually 73# exclusive, but this is not given, and should be detected. If true, 74# this allows reduction in strength to switch on Numeric tests 75 76# reduce Numeric tests at the same level to switches 77# 78# - first pass through clauses at same level to categorise as 79# variant values over same test (type and offset). 80 81# work out some way to cache String comparisons 82 83# Reduce seek/reads for String comparisons at same level: 84# 85# - first pass through clauses at same level to determine string ranges. 86# 87# - String tests at same level over overlapping ranges can be 88# written as sub-string comparisons over the maximum range 89# this saves re-reading the same string from file. 90# 91# - common prefix strings will have to be guarded against, by 92# sorting string values, then sorting the tests in reverse length order. 93 94 95proc ::fileutil::magic::cgen::path {tree} { 96 # Annotates the tree. In each node we store the path from the root 97 # to this node, as list of nodes, with the current node the last 98 # element. The root node is never stored in the path. 99 100 $tree set root path {} 101 foreach child [$tree children root] { 102 $tree walk $child -type dfs node { 103 set path [$tree get [$tree parent $node] path] 104 lappend path [$tree index $node] 105 $tree set $node path $path 106 } 107 } 108 return 109} 110 111proc ::fileutil::magic::cgen::tree_el {tree parent file line type qual comp offset val message args} { 112 113 # Recursively creates and annotates a node for the specified 114 # tests, and its sub-tests (args). 115 116 set node [$tree insert $parent end] 117 set path [$tree get $parent path] 118 lappend path [$tree index $node] 119 $tree set $node path $path 120 121 # generate a proc call type for the type, Numeric or String 122 variable ::fileutil::magic::rt::typemap 123 124 switch -glob -- $type { 125 *byte* - 126 *short* - 127 *long* - 128 *date* { 129 set otype N 130 set type [lindex $typemap($type) 1] 131 } 132 *string { 133 set otype S 134 } 135 default { 136 puts stderr "Unknown type: '$type'" 137 } 138 } 139 140 # Stores the type determined above, and the arguments into 141 # attributes of the new node. 142 143 foreach key {line type qual comp offset val message file otype} { 144 if {[catch { 145 $tree set $node $key [set $key] 146 } result]} { 147 upvar ::errorInfo eo 148 puts "Tree: $eo - $file $line $type" 149 } 150 } 151 152 # now add children 153 foreach el $args { 154 eval [linsert $el 0 tree_el $tree $node $file] 155 # 8.5 # tree_el $tree $node $file {*}$el 156 } 157 return $node 158} 159 160proc ::fileutil::magic::cgen::2tree {script} { 161 162 # Converts a recognizer which is in a simple script form into a 163 # tree. 164 165 variable tree 166 set tree [::struct::tree] 167 168 $tree set root path "" 169 $tree set root otype Root 170 $tree set root type root 171 $tree set root message "unknown" 172 173 # generate a test for each match 174 set file "unknown" 175 foreach el $script { 176 #puts "EL: $el" 177 if {[lindex $el 0] eq "file"} { 178 set file [lindex $el 1] 179 } else { 180 set node [eval [linsert $el 0 tree_el $tree root $file]] 181 # 8.5 # set more [tree_el $tree root $file {*}$el] 182 append result $node 183 } 184 } 185 optNum $tree root 186 #optStr $tree root 187 puts stderr "Script contains [llength [$tree children root]] discriminators" 188 path $tree 189 190 # Decoding the offsets, determination if we have to handle 191 # relative offsets, and where. The less, the better. 192 Offsets $tree 193 194 return $tree 195} 196 197proc ::fileutil::magic::cgen::isStr {tree node} { 198 return [expr {"S" eq [$tree get $node otype]}] 199} 200 201proc ::fileutil::magic::cgen::sortRegion {r1 r2} { 202 set cmp 0 203 if {[catch { 204 if {[string match (*) $r1] || [string match (*) $r2]} { 205 set cmp [string compare $r1 $r2] 206 } else { 207 set cmp [expr {[lindex $r1 0] - [lindex $r2 0]}] 208 if {!$cmp} { 209 set cmp 0 210 set cmp [expr {[lindex $r1 1] - [lindex $r2 1]}] 211 } 212 } 213 } result]} { 214 set cmp [string compare $r1 $r2] 215 } 216 return $cmp 217} 218 219proc ::fileutil::magic::cgen::optStr {tree node} { 220 variable regions 221 catch {unset regions} 222 array set regions {} 223 224 optStr1 $tree $node 225 226 puts stderr "Regions [array statistics regions]" 227 foreach region [lsort \ 228 -index 0 \ 229 -command ::fileutil::magic::cgen::sortRegion \ 230 [array name regions]] { 231 puts "$region - $regions($region)" 232 } 233} 234 235proc ::fileutil::magic::cgen::optStr1 {tree node} { 236 variable regions 237 238 # traverse each numeric element of this node's children, 239 # categorising them 240 241 set kids [$tree children $node] 242 foreach child $kids { 243 optStr1 $tree $child 244 } 245 246 set strings [$tree children $node filter ::fileutil::magic::cgen::isStr] 247 #puts stderr "optstr: $node: $strings" 248 249 foreach el $strings { 250 #if {[$tree get $el otype] eq "String"} {puts "[$tree getall $el] - [string length [$tree get $el val]]"} 251 if {[$tree get $el comp] eq "x"} { 252 continue 253 } 254 255 set offset [$tree get $el offset] 256 set len [string length [$tree get $el val]] 257 lappend regions([list $offset $len]) $el 258 } 259} 260 261proc ::fileutil::magic::cgen::isNum {tree node} { 262 return [expr {"N" eq [$tree get $node otype]}] 263} 264 265proc ::fileutil::magic::cgen::switchNSort {tree n1 n2} { 266 return [expr {[$tree get $n1 val] - [$tree get $n1 val]}] 267} 268 269proc ::fileutil::magic::cgen::optNum {tree node} { 270 array set offsets {} 271 272 # traverse each numeric element of this node's children, 273 # categorising them 274 275 set kids [$tree children $node] 276 foreach child $kids { 277 optNum $tree $child 278 } 279 280 set numerics [$tree children $node filter ::fileutil::magic::cgen::isNum] 281 #puts stderr "optNum: $node: $numerics" 282 if {[llength $numerics] < 2} { 283 return 284 } 285 286 foreach el $numerics { 287 if {[$tree get $el comp] ne "=="} { 288 continue 289 } 290 lappend offsets([$tree get $el type],[$tree get $el offset],[$tree get $el qual]) $el 291 } 292 293 #puts "Offset: stderr [array get offsets]" 294 foreach {match nodes} [array get offsets] { 295 if {[llength $nodes] < 2} { 296 continue 297 } 298 299 catch {unset matcher} 300 foreach n $nodes { 301 set nv [expr [$tree get $n val]] 302 if {[info exists matcher($nv)]} { 303 puts stderr "*=====================================" 304 puts stderr "* Node <[$tree getall $n]>" 305 puts stderr "* clashes with <[$tree getall $matcher($nv)]>" 306 puts stderr "*=====================================" 307 } else { 308 set matcher($nv) $n 309 } 310 } 311 312 foreach {type offset qual} [split $match ,] break 313 set switch [$tree insert $node [$tree index [lindex $nodes 0]]] 314 $tree set $switch otype Switch 315 $tree set $switch message $match 316 $tree set $switch offset $offset 317 $tree set $switch type $type 318 $tree set $switch qual $qual 319 320 set nodes [lsort -command [list ::fileutil::magic::cgen::switchNSort $tree] $nodes] 321 322 eval [linsert $nodes 0 $tree move $switch end] 323 # 8.5 # $tree move $switch end {*}$nodes 324 set path [$tree get [$tree parent $switch] path] 325 lappend path [$tree index $switch] 326 $tree set $switch path $path 327 } 328} 329 330proc ::fileutil::magic::cgen::Offsets {tree} { 331 332 # Indicator if a node has to save field location information for 333 # relative addressing. The 'kill' attribute is an accumulated 334 # 'save' over the whole subtree. It will be used to determine when 335 # level information was destroyed by subnodes and has to be 336 # regenerated at the current level. 337 338 $tree walk root -type dfs node { 339 $tree set $node save 0 340 $tree set $node kill 0 341 } 342 343 # We walk from the leafs up to the root, synthesizing the data 344 # needed, as we go. 345 $tree walk root -type dfs -order post node { 346 if {$node eq "root"} continue 347 DecodeOffset $tree $node [$tree get $node offset] 348 349 # If the current node's parent is a switch, and the node has 350 # to save, then the switch has to save. Because the current 351 # node is not relevant during code generation anymore, the 352 # switch is. 353 354 if {[$tree get $node save]} { 355 # We save, therefore we kill. 356 $tree set $node kill 1 357 if {[$tree get [$tree parent $node] otype] eq "Switch"} { 358 $tree set [$tree parent $node] save 1 359 } 360 } else { 361 # We don't save i.e. kill, but we may inherit it from 362 # children which kill. 363 364 foreach c [$tree children $node] { 365 if {[$tree get $c kill]} { 366 $tree set $node kill 1 367 break 368 } 369 } 370 } 371 } 372} 373 374proc ::fileutil::magic::cgen::DecodeOffset {tree node offset} { 375 if {[string match "(*)" $offset]} { 376 # Indirection offset. (Decoding is non-trivial, therefore 377 # packed into a proc). 378 379 set ind 1 ; # Indirect location 380 foreach {rel base itype idelta} [DecodeIndirectOffset $offset] break 381 382 } elseif {[string match "&*" $offset]} { 383 # Direct relative offset. (Decoding is trivial) 384 385 set ind 0 ; # Direct location 386 set rel 1 ; # Relative 387 set base [string range $offset 1 end] ; # Base Delta 388 set itype {} ; # No data for indirect 389 set idelta {} ; # s.a. 390 391 } else { 392 set ind 0 ; # Direct location 393 set rel 0 ; # Absolute 394 set base $offset ; # Here! 395 set itype {} ; # No data for indirect 396 set idelta {} ; # s.a. 397 } 398 399 # Store the expanded data back into the tree. 400 401 foreach v {ind rel base itype idelta} { 402 $tree set $node $v [set $v] 403 } 404 405 # For nodes with adressing relative to last field above the latter 406 # has to save this information. 407 408 if {$rel} { 409 $tree set [$tree parent $node] save 1 410 } 411 return 412} 413 414proc ::fileutil::magic::cgen::DecodeIndirectOffset {offset} { 415 variable otmap ; # Offset typemap. 416 417 # Offset parser. 418 # Syntax: 419 # ( ?&? number ?.[bslBSL]? ?[+-]? ?number? ) 420 421 set n {(([0-9]+)|(0x[0-9A-Fa-f]+))} 422 set o "\\((&?)(${n})((\\.\[bslBSL])?)(\[+-]?)(${n}?)\\)" 423 # | | ||| || | | ||| 424 # 1 2 345 67 8 9 012 425 # ^ ^ ^ ^ ^ 426 # rel base type sign index 427 # 428 # 1 2 3 4 5 6 7 8 9 0 1 2 429 set ok [regexp $o $offset -> rel base _ _ _ type _ sign idx _ _ _] 430 431 if {!$ok} { 432 return -code error "Bad offset \"$offset\"" 433 } 434 435 # rel is in {"", &}, map to 0|1 436 if {$rel eq ""} {set rel 0} else {set rel 1} 437 438 # base is a number, enforce decimal. Not optional. 439 set base [expr $base] 440 441 # Type is in .b .s .l .B .S .L, and "". Map to a regular magic 442 # type code. 443 set type $otmap($type) 444 445 # sign is in {+,-,""}. Map to -|"" (Becomes sign of index) 446 if {$sign eq "+"} {set sign ""} 447 448 # Index is optional number. Enforce decimal, empty is zero. Add in 449 # the sign as well for a proper signed index. 450 451 if {$idx eq ""} {set idx 0} 452 set idx $sign[expr $idx] 453 454 return [list $rel $base $type $idx] 455} 456 457proc ::fileutil::magic::cgen::treedump {tree} { 458 set result "" 459 $tree walk root -type dfs node { 460 set path [$tree get $node path] 461 set depth [llength $path] 462 463 append result [string repeat " " $depth] [list $path] ": " [$tree get $node type]: 464 465 if {[$tree keyexists $node offset]} { 466 append result " ,O|[$tree get $node offset]|" 467 468 set x {} 469 foreach v {ind rel base itype idelta} {lappend x [$tree get $node $v]} 470 append result "=<[join $x !]>" 471 } 472 if {[$tree keyexists $node qual]} { 473 set q [$tree get $node qual] 474 if {$q ne ""} { 475 append result " ,q/$q/" 476 } 477 } 478 479 if {[$tree keyexists $node comp]} { 480 append result " " C([$tree get $node comp]) 481 } 482 if {[$tree keyexists $node val]} { 483 append result " " V([$tree get $node val]) 484 } 485 486 if {[$tree keyexists $node otype]} { 487 append result " " [$tree get $node otype]/[$tree get $node save] 488 } 489 490 if {$depth == 1} { 491 set msg [$tree get $node message] 492 set n $node 493 while {($n != {}) && ($msg == "")} { 494 set n [lindex [$tree children $n] 0] 495 if {$n != {}} { 496 set msg [$tree get $n message] 497 } 498 } 499 append result " " ( $msg ) 500 if {[$tree keyexists $node file]} { 501 append result " - " [$tree get $node file] 502 } 503 } 504 505 #append result " <" [$tree getall $node] > 506 append result \n 507 } 508 return $result 509} 510 511proc ::fileutil::magic::cgen::treegen {tree node} { 512 return "[treegen1 $tree $node]\nresult\n" 513} 514 515proc ::fileutil::magic::cgen::treegen1 {tree node} { 516 variable ::fileutil::magic::rt::typemap 517 518 set result "" 519 foreach k {otype type offset comp val qual message save path} { 520 if {[$tree keyexists $node $k]} { 521 set $k [$tree get $node $k] 522 } 523 } 524 525 set level [llength $path] 526 527 # Generate code for each node per its type. 528 529 switch $otype { 530 N - 531 S { 532 if {$save} { 533 # We have to save field data for relative adressing under this 534 # leaf. 535 if {$otype eq "N"} { 536 set type [list Nx $level $type] 537 } elseif {$otype eq "S"} { 538 set type [list Sx $level] 539 } 540 } else { 541 # Regular fetching of information. 542 if {$otype eq "N"} { 543 set type [list N $type] 544 } elseif {$otype eq "S"} { 545 set type S 546 } 547 } 548 549 set offset [GenerateOffset $tree $node] 550 551 if {$qual eq ""} { 552 append result "if \{\[$type $offset $comp [list $val]\]\} \{" 553 } else { 554 append result "if \{\[$type $offset $comp [list $val] $qual\]\} \{" 555 } 556 557 if {[$tree isleaf $node]} { 558 if {$message ne ""} { 559 append result "emit [list $message]" 560 } else { 561 append result "emit [$tree get $node path]" 562 } 563 } else { 564 # If we saved data the child branches may destroy 565 # level information. We regenerate it if needed. 566 567 if {$message ne ""} { 568 append result "emit [list $message]\n" 569 } 570 571 set killed 0 572 foreach child [$tree children $node] { 573 if {$save && $killed && [$tree get $child rel]} { 574 # This location already does not regenerate if 575 # the killing subnode was last. We also do not 576 # need to regenerate if the current subnode 577 # does not use relative adressing. 578 append result "L $level;" 579 set killed 0 580 } 581 append result [treegen1 $tree $child] 582 set killed [expr {$killed || [$tree get $child kill]}] 583 } 584 #append result "\nreturn \$result" 585 } 586 587 append result "\}\n" 588 } 589 Root { 590 foreach child [$tree children $node] { 591 append result [treegen1 $tree $child] 592 } 593 } 594 Switch { 595 set offset [GenerateOffset $tree $node] 596 597 if {$save} { 598 set fetch "Nvx $level" 599 } else { 600 set fetch Nv 601 } 602 603 append fetch " " $type " " $offset 604 if {$qual ne ""} { 605 append fetch " " $qual 606 } 607 append result "switch -- \[$fetch\] " 608 609 set scan [lindex $typemap($type) 1] 610 611 set ckilled 0 612 foreach child [$tree children $node] { 613 binary scan [binary format $scan [$tree get $child val]] $scan val 614 append result "$val \{" 615 616 if {$save && $ckilled} { 617 # This location already does not regenerate if 618 # the killing subnode was last. We also do not 619 # need to regenerate if the current subnode 620 # does not use relative adressing. 621 append result "L $level;" 622 set ckilled 0 623 } 624 625 if {[$tree isleaf $child]} { 626 append result "emit [list [$tree get $child message]]" 627 } else { 628 set killed 0 629 append result "emit [list [$tree get $child message]]\n" 630 foreach grandchild [$tree children $child] { 631 if {$save && $killed && [$tree get $grandchild rel]} { 632 # This location already does not regenerate if 633 # the killing subnode was last. We also do not 634 # need to regenerate if the current subnode 635 # does not use relative adressing. 636 append result "L $level;" 637 set killed 0 638 } 639 append result [treegen1 $tree $grandchild] 640 set killed [expr {$killed || [$tree get $grandchild kill]}] 641 } 642 } 643 644 set ckilled [expr {$ckilled || [$tree get $child kill]}] 645 append result "\} " 646 } 647 append result "\n" 648 } 649 } 650 return $result 651} 652 653proc ::fileutil::magic::cgen::GenerateOffset {tree node} { 654 # Examples: 655 # direct absolute: 45 -> 45 656 # direct relative: &45 -> [R 45] 657 # indirect absolute: (45.s+1) -> [I 45 s 1] 658 # indirect relative: (&45.s+1) -> [I [R 45] s 1] 659 660 foreach v {ind rel base itype idelta} { 661 set $v [$tree get $node $v] 662 } 663 664 if {$rel} {set base "\[R $base\]"} 665 if {$ind} {set base "\[I $base $itype $idelta\]"} 666 return $base 667} 668 669# ### ### ### ######### ######### ######### 670## Ready for use. 671# EOF 672