1# rtcore.tcl -- 2# 3# Runtime core for file type recognition engines written in pure Tcl. 4# 5# Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net> 6# Copyright (c) 2005 Andreas Kupries <andreas_kupries@users.sourceforge.net> 7# 8# See the file "license.terms" for information on usage and redistribution 9# of this file, and for a DISCLAIMER OF ALL WARRANTIES. 10# 11# RCS: @(#) $Id: rtcore.tcl,v 1.5 2005/09/28 04:51:19 andreas_kupries Exp $ 12 13##### 14# 15# "mime type recognition in pure tcl" 16# http://wiki.tcl.tk/12526 17# 18# Tcl code harvested on: 10 Feb 2005, 04:06 GMT 19# Wiki page last updated: ??? 20# 21##### 22 23# TODO - Required Functionality: 24 25# implement full offset language 26# implement pstring (pascal string, blerk) 27# implement regex form (blerk!) 28# implement string qualifiers 29 30# ### ### ### ######### ######### ######### 31## Requirements 32 33package require Tcl 8.4 34 35# ### ### ### ######### ######### ######### 36## Implementation 37 38namespace eval ::fileutil::magic::rt { 39 # Configuration flag. (De)activate debugging output. 40 # This is done during initialization. 41 # Changes at runtime have no effect. 42 43 variable debug 0 44 45 # Runtime state. 46 47 variable fd {} ; # Channel to file under scrutiny 48 variable strbuf {} ; # Input cache [*]. 49 variable cache ; # Cache of fetched and decoded numeric 50 array set cache {} ; # values. 51 variable result {} ; # Accumulated recognition result. 52 variable string {} ; # Last recognized string | For substitution 53 variable numeric -9999 ; # Last recognized number | into the message 54 55 variable last ; # Behind last fetch locations, 56 array set last {} ; # per nesting level. 57 58 # [*] The vast majority of magic strings are in the first 4k of the file. 59 60 # Export APIs (full public, recognizer public) 61 namespace export open close file_start result 62 namespace export emit offset Nv N S Nvx Nx Sx L R I 63} 64 65# ### ### ### ######### ######### ######### 66## Public API, general use. 67 68# open the file to be scanned 69proc ::fileutil::magic::rt::open {file} { 70 variable result {} 71 variable string {} 72 variable numeric -9999 73 variable strbuf 74 variable fd 75 variable cache 76 77 set fd [::open $file] 78 ::fconfigure $fd -translation binary 79 80 # fill the string cache 81 set strbuf [::read $fd 4096] 82 83 # clear the fetch cache 84 catch {unset cache} 85 array set cache {} 86 87 return $fd 88} 89 90proc ::fileutil::magic::rt::close {} { 91 variable fd 92 ::close $fd 93 return 94} 95 96# mark the start of a magic file in debugging 97proc ::fileutil::magic::rt::file_start {name} { 98 ::fileutil::magic::rt::Debug {puts stderr "File: $name"} 99} 100 101# return the emitted result 102proc ::fileutil::magic::rt::result {{msg ""}} { 103 variable result 104 if {$msg ne ""} {emit $msg} 105 return -code return $result 106} 107 108proc ::fileutil::magic::rt::resultv {{msg ""}} { 109 variable result 110 if {$msg ne ""} {emit $msg} 111 return $result 112} 113 114# ### ### ### ######### ######### ######### 115## Public API, for use by a recognizer. 116 117# emit a message 118proc ::fileutil::magic::rt::emit {msg} { 119 variable string 120 variable numeric 121 variable result 122 123 set map [list \ 124 \\b "" \ 125 %s $string \ 126 %ld $numeric \ 127 %d $numeric \ 128 ] 129 130 lappend result [::string map $map $msg] 131 return 132} 133 134# handle complex offsets - TODO 135proc ::fileutil::magic::rt::offset {where} { 136 ::fileutil::magic::rt::Debug {puts stderr "OFFSET: $where"} 137 return 0 138} 139 140proc ::fileutil::magic::rt::Nv {type offset {qual ""}} { 141 variable typemap 142 variable numeric 143 144 # unpack the type characteristics 145 foreach {size scan} $typemap($type) break 146 147 # fetch the numeric field from the file 148 set numeric [Fetch $offset $size $scan] 149 150 if {$qual ne ""} { 151 # there's a mask to be applied 152 set numeric [expr $numeric $qual] 153 } 154 155 ::fileutil::magic::rt::Debug {puts stderr "NV $type $offset $qual: $numeric"} 156 return $numeric 157} 158 159# Numeric - get bytes of $type at $offset and $compare to $val 160# qual might be a mask 161proc ::fileutil::magic::rt::N {type offset comp val {qual ""}} { 162 variable typemap 163 variable numeric 164 165 # unpack the type characteristics 166 foreach {size scan} $typemap($type) break 167 168 # fetch the numeric field 169 set numeric [Fetch $offset $size $scan] 170 171 # Would moving this before the fetch an optimisation ? The 172 # tradeoff is that we give up filling the cache, and it is unclear 173 # how often that value would be used. -- Profile! 174 if {$comp eq "x"} { 175 # anything matches - don't care 176 return 1 177 } 178 179 # get value in binary form, then back to numeric 180 # this avoids problems with sign, as both values are 181 # [binary scan]-converted identically 182 binary scan [binary format $scan $val] $scan val 183 184 if {$qual ne ""} { 185 # there's a mask to be applied 186 set numeric [expr $numeric $qual] 187 } 188 189 # perform comparison 190 set c [expr $val $comp $numeric] 191 192 ::fileutil::magic::rt::Debug {puts stderr "numeric $type: $val $comp $numeric / $qual - $c"} 193 return $c 194} 195 196proc ::fileutil::magic::rt::S {offset comp val {qual ""}} { 197 variable fd 198 variable string 199 200 # convert any backslashes 201 set val [subst -nocommands -novariables $val] 202 203 if {$comp eq "x"} { 204 # match anything - don't care, just get the value 205 set string "" 206 207 # Query: Can we use GetString here ? 208 # Or at least the strbuf cache ? 209 210 # move to the offset 211 ::seek $fd $offset 212 while { 213 ([::string length $string] < 100) && 214 [::string is print [set c [::read $fd 1]]] 215 } { 216 if {[::string is space $c]} { 217 break 218 } 219 append string $c 220 } 221 222 return 1 223 } 224 225 # get the string and compare it 226 set string [GetString $offset [::string length $val]] 227 set cmp [::string compare $val $string] 228 set c [expr $cmp $comp 0] 229 230 ::fileutil::magic::rt::Debug { 231 puts "String '$val' $comp '$string' - $c" 232 if {$c} { 233 puts "offset $offset - $string" 234 } 235 } 236 return $c 237} 238 239proc ::fileutil::magic::rt::Nvx {atlevel type offset {qual ""}} { 240 variable typemap 241 variable numeric 242 variable last 243 244 upvar 1 level l 245 set l $atlevel 246 247 # unpack the type characteristics 248 foreach {size scan} $typemap($type) break 249 250 # fetch the numeric field from the file 251 set numeric [Fetch $offset $size $scan] 252 253 set last($atlevel) [expr {$offset + $size}] 254 255 if {$qual ne ""} { 256 # there's a mask to be applied 257 set numeric [expr $numeric $qual] 258 } 259 260 ::fileutil::magic::rt::Debug {puts stderr "NV $type $offset $qual: $numeric"} 261 return $numeric 262} 263 264# Numeric - get bytes of $type at $offset and $compare to $val 265# qual might be a mask 266proc ::fileutil::magic::rt::Nx {atlevel type offset comp val {qual ""}} { 267 variable typemap 268 variable numeric 269 variable last 270 271 upvar 1 level l 272 set l $atlevel 273 274 # unpack the type characteristics 275 foreach {size scan} $typemap($type) break 276 277 set last($atlevel) [expr {$offset + $size}] 278 279 # fetch the numeric field 280 set numeric [Fetch $offset $size $scan] 281 282 if {$comp eq "x"} { 283 # anything matches - don't care 284 return 1 285 } 286 287 # get value in binary form, then back to numeric 288 # this avoids problems with sign, as both values are 289 # [binary scan]-converted identically 290 binary scan [binary format $scan $val] $scan val 291 292 if {$qual ne ""} { 293 # there's a mask to be applied 294 set numeric [expr $numeric $qual] 295 } 296 297 # perform comparison 298 set c [expr $val $comp $numeric] 299 300 ::fileutil::magic::rt::Debug {puts stderr "numeric $type: $val $comp $numeric / $qual - $c"} 301 return $c 302} 303 304proc ::fileutil::magic::rt::Sx {atlevel offset comp val {qual ""}} { 305 variable fd 306 variable string 307 variable last 308 309 upvar 1 level l 310 set l $atlevel 311 312 # convert any backslashes 313 set val [subst -nocommands -novariables $val] 314 315 if {$comp eq "x"} { 316 # match anything - don't care, just get the value 317 set string "" 318 319 # Query: Can we use GetString here ? 320 # Or at least the strbuf cache ? 321 322 # move to the offset 323 ::seek $fd $offset 324 while { 325 ([::string length $string] < 100) && 326 [::string is print [set c [::read $fd 1]]] 327 } { 328 if {[::string is space $c]} { 329 break 330 } 331 append string $c 332 } 333 334 set last($atlevel) [expr {$offset + [string length $string]}] 335 336 return 1 337 } 338 339 set len [::string length $val] 340 set last($atlevel) [expr {$offset + $len}] 341 342 # get the string and compare it 343 set string [GetString $offset $len] 344 set cmp [::string compare $val $string] 345 set c [expr $cmp $comp 0] 346 347 ::fileutil::magic::rt::Debug { 348 puts "String '$val' $comp '$string' - $c" 349 if {$c} { 350 puts "offset $offset - $string" 351 } 352 } 353 return $c 354} 355proc ::fileutil::magic::rt::L {newlevel} { 356 # Regenerate level information in the calling context. 357 upvar 1 level l ; set l $newlevel 358 return 359} 360 361proc ::fileutil::magic::rt::I {base type delta} { 362 # Handling of base locations specified indirectly through the 363 # contents of the inspected file. 364 365 variable typemap 366 foreach {size scan} $typemap($type) break 367 return [expr {[Fetch $base $size $scan] + $delta}] 368} 369 370proc ::fileutil::magic::rt::R {base} { 371 # Handling of base locations specified relative to the end of the 372 # last field one level above. 373 374 variable last ; # Remembered locations. 375 upvar 1 level l ; # The level to get data from. 376 return [expr {$last($l) + $base}] 377} 378 379# ### ### ### ######### ######### ######### 380## Internal. Retrieval of the data used in comparisons. 381 382# fetch and cache a numeric value from the file 383proc ::fileutil::magic::rt::Fetch {where what scan} { 384 variable cache 385 variable numeric 386 variable fd 387 388 if {![info exists cache($where,$what,$scan)]} { 389 ::seek $fd $where 390 binary scan [::read $fd $what] $scan numeric 391 set cache($where,$what,$scan) $numeric 392 393 # Optimization: If we got 4 bytes, i.e. long we implicitly 394 # know the short and byte data as well. Should put them into 395 # the cache. -- Profile: How often does such an overlap truly 396 # happen ? 397 398 } else { 399 set numeric $cache($where,$what,$scan) 400 } 401 return $numeric 402} 403 404proc ::fileutil::magic::rt::GetString {offset len} { 405 # We have the first 1k of the file cached 406 variable string 407 variable strbuf 408 variable fd 409 410 set end [expr {$offset + $len - 1}] 411 if {$end < 4096} { 412 # in the string cache, copy the requested part. 413 set string [::string range $strbuf $offset $end] 414 } else { 415 # an unusual one, move to the offset and read directly from 416 # the file. 417 ::seek $fd $offset 418 set string [::read $fd $len] 419 } 420 return $string 421} 422 423# ### ### ### ######### ######### ######### 424## Internal, debugging. 425 426if {!$::fileutil::magic::rt::debug} { 427 # This procedure definition is optimized out of using code by the 428 # core bcc. It knows that neither argument checks are required, 429 # nor is anything done. So neither results, nor errors are 430 # possible, a true no-operation. 431 proc ::fileutil::magic::rt::Debug {args} {} 432 433} else { 434 proc ::fileutil::magic::rt::Debug {script} { 435 # Run the commands in the debug script. This usually generates 436 # some output. The uplevel is required to ensure the proper 437 # resolution of all variables found in the script. 438 uplevel 1 $script 439 return 440 } 441} 442 443# ### ### ### ######### ######### ######### 444## Initialize constants 445 446namespace eval ::fileutil::magic::rt { 447 # maps magic typenames to field characteristics: size (#byte), 448 # binary scan format 449 450 variable typemap 451} 452 453proc ::fileutil::magic::rt::Init {} { 454 variable typemap 455 global tcl_platform 456 457 # Set the definitions for all types which have their endianess 458 # explicitly specified n their name. 459 460 array set typemap { 461 byte {1 c} ubyte {1 c} 462 beshort {2 S} ubeshort {2 S} 463 leshort {2 s} uleshort {2 s} 464 belong {4 I} ubelong {4 I} 465 lelong {4 i} ulelong {4 i} 466 bedate {4 S} ledate {4 s} 467 beldate {4 I} leldate {4 i} 468 469 long {4 Q} ulong {4 Q} date {4 Q} ldate {4 Q} 470 short {2 Y} ushort {2 Y} 471 } 472 473 # Now set the definitions for the types without explicit 474 # endianess. They assume/use 'native' byteorder. We also put in 475 # special forms for the compiler, so that it can use short names 476 # for the native-endian types as well. 477 478 # generate short form names 479 foreach {n v} [array get typemap] { 480 foreach {len scan} $v break 481 #puts stderr "Adding $scan - [list $len $scan]" 482 set typemap($scan) [list $len $scan] 483 } 484 485 # The special Q and Y short forms are incorrect, correct now to 486 # use the proper native endianess. 487 488 if {$tcl_platform(byteOrder) eq "littleEndian"} { 489 array set typemap {Q {4 i} Y {2 s}} 490 } else { 491 array set typemap {Q {4 I} Y {2 S}} 492 } 493} 494 495::fileutil::magic::rt::Init 496# ### ### ### ######### ######### ######### 497## Ready for use. 498 499package provide fileutil::magic::rt 1.0 500# EOF 501