1# rtcore.tcl --
2#
3#	Runtime core for file type recognition engines written in pure Tcl.
4#
5# Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net>
6# Copyright (c) 2005      Andreas Kupries <andreas_kupries@users.sourceforge.net>
7#
8# See the file "license.terms" for information on usage and redistribution
9# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
10#
11# RCS: @(#) $Id: rtcore.tcl,v 1.5 2005/09/28 04:51:19 andreas_kupries Exp $
12
13#####
14#
15# "mime type recognition in pure tcl"
16# http://wiki.tcl.tk/12526
17#
18# Tcl code harvested on:  10 Feb 2005, 04:06 GMT
19# Wiki page last updated: ???
20#
21#####
22
23# TODO - Required Functionality:
24
25# implement full offset language
26# implement pstring (pascal string, blerk)
27# implement regex form (blerk!)
28# implement string qualifiers
29
30# ### ### ### ######### ######### #########
31## Requirements
32
33package require Tcl 8.4
34
35# ### ### ### ######### ######### #########
36## Implementation
37
38namespace eval ::fileutil::magic::rt {
39    # Configuration flag. (De)activate debugging output.
40    # This is done during initialization.
41    # Changes at runtime have no effect.
42
43    variable debug 0
44
45    # Runtime state.
46
47    variable fd     {}     ; # Channel to file under scrutiny
48    variable strbuf {}     ; # Input cache [*].
49    variable cache         ; # Cache of fetched and decoded numeric
50    array set cache {}	   ; # values.
51    variable result {}     ; # Accumulated recognition result.
52    variable string {}     ; # Last recognized string | For substitution
53    variable numeric -9999 ; # Last recognized number | into the message
54
55    variable  last         ; # Behind last fetch locations,
56    array set last {}      ; # per nesting level.
57
58    # [*] The vast majority of magic strings are in the first 4k of the file.
59
60    # Export APIs (full public, recognizer public)
61    namespace export open close file_start result
62    namespace export emit offset Nv N S Nvx Nx Sx L R I
63}
64
65# ### ### ### ######### ######### #########
66## Public API, general use.
67
68# open the file to be scanned
69proc ::fileutil::magic::rt::open {file} {
70    variable result {}
71    variable string {}
72    variable numeric -9999
73    variable strbuf
74    variable fd
75    variable cache
76
77    set fd [::open $file]
78    ::fconfigure $fd -translation binary
79
80    # fill the string cache
81    set strbuf [::read $fd 4096]
82
83    # clear the fetch cache
84    catch {unset cache}
85    array set cache {}
86
87    return $fd
88}
89
90proc ::fileutil::magic::rt::close {} {
91    variable fd
92    ::close $fd
93    return
94}
95
96# mark the start of a magic file in debugging
97proc ::fileutil::magic::rt::file_start {name} {
98    ::fileutil::magic::rt::Debug {puts stderr "File: $name"}
99}
100
101# return the emitted result
102proc ::fileutil::magic::rt::result {{msg ""}} {
103    variable result
104    if {$msg ne ""} {emit $msg}
105    return -code return $result
106}
107
108proc ::fileutil::magic::rt::resultv {{msg ""}} {
109    variable result
110    if {$msg ne ""} {emit $msg}
111    return $result
112}
113
114# ### ### ### ######### ######### #########
115## Public API, for use by a recognizer.
116
117# emit a message
118proc ::fileutil::magic::rt::emit {msg} {
119    variable string
120    variable numeric
121    variable result
122
123    set map [list \
124	    \\b "" \
125	    %s  $string \
126	    %ld $numeric \
127	    %d  $numeric \
128	    ]
129
130    lappend result [::string map $map $msg]
131    return
132}
133
134# handle complex offsets - TODO
135proc ::fileutil::magic::rt::offset {where} {
136    ::fileutil::magic::rt::Debug {puts stderr "OFFSET: $where"}
137    return 0
138}
139
140proc ::fileutil::magic::rt::Nv {type offset {qual ""}} {
141    variable typemap
142    variable numeric
143
144    # unpack the type characteristics
145    foreach {size scan} $typemap($type) break
146
147    # fetch the numeric field from the file
148    set numeric [Fetch $offset $size $scan]
149
150    if {$qual ne ""} {
151	# there's a mask to be applied
152	set numeric [expr $numeric $qual]
153    }
154
155    ::fileutil::magic::rt::Debug {puts stderr "NV $type $offset $qual: $numeric"}
156    return $numeric
157}
158
159# Numeric - get bytes of $type at $offset and $compare to $val
160# qual might be a mask
161proc ::fileutil::magic::rt::N {type offset comp val {qual ""}} {
162    variable typemap
163    variable numeric
164
165    # unpack the type characteristics
166    foreach {size scan} $typemap($type) break
167
168    # fetch the numeric field
169    set numeric [Fetch $offset $size $scan]
170
171    # Would moving this before the fetch an optimisation ? The
172    # tradeoff is that we give up filling the cache, and it is unclear
173    # how often that value would be used. -- Profile!
174    if {$comp eq "x"} {
175	# anything matches - don't care
176	return 1
177    }
178
179    # get value in binary form, then back to numeric
180    # this avoids problems with sign, as both values are
181    # [binary scan]-converted identically
182    binary scan [binary format $scan $val] $scan val
183
184    if {$qual ne ""} {
185	# there's a mask to be applied
186	set numeric [expr $numeric $qual]
187    }
188
189    # perform comparison
190    set c [expr $val $comp $numeric]
191
192    ::fileutil::magic::rt::Debug {puts stderr "numeric $type: $val $comp $numeric / $qual - $c"}
193    return $c
194}
195
196proc ::fileutil::magic::rt::S {offset comp val {qual ""}} {
197    variable fd
198    variable string
199
200    # convert any backslashes
201    set val [subst -nocommands -novariables $val]
202
203    if {$comp eq "x"} {
204	# match anything - don't care, just get the value
205	set string ""
206
207	# Query: Can we use GetString here ?
208	# Or at least the strbuf cache ?
209
210	# move to the offset
211	::seek $fd $offset
212	while {
213	    ([::string length $string] < 100) &&
214	    [::string is print [set c [::read $fd 1]]]
215	} {
216	    if {[::string is space $c]} {
217		break
218	    }
219	    append string $c
220	}
221
222	return 1
223    }
224
225    # get the string and compare it
226    set string [GetString $offset [::string length $val]]
227    set cmp    [::string compare $val $string]
228    set c      [expr $cmp $comp 0]
229
230    ::fileutil::magic::rt::Debug {
231	puts "String '$val' $comp '$string' - $c"
232	if {$c} {
233	    puts "offset $offset - $string"
234	}
235    }
236    return $c
237}
238
239proc ::fileutil::magic::rt::Nvx {atlevel type offset {qual ""}} {
240    variable typemap
241    variable numeric
242    variable last
243
244    upvar 1 level l
245    set  l $atlevel
246
247    # unpack the type characteristics
248    foreach {size scan} $typemap($type) break
249
250    # fetch the numeric field from the file
251    set numeric [Fetch $offset $size $scan]
252
253    set last($atlevel) [expr {$offset + $size}]
254
255    if {$qual ne ""} {
256	# there's a mask to be applied
257	set numeric [expr $numeric $qual]
258    }
259
260    ::fileutil::magic::rt::Debug {puts stderr "NV $type $offset $qual: $numeric"}
261    return $numeric
262}
263
264# Numeric - get bytes of $type at $offset and $compare to $val
265# qual might be a mask
266proc ::fileutil::magic::rt::Nx {atlevel type offset comp val {qual ""}} {
267    variable typemap
268    variable numeric
269    variable last
270
271    upvar 1 level l
272    set  l $atlevel
273
274    # unpack the type characteristics
275    foreach {size scan} $typemap($type) break
276
277    set last($atlevel) [expr {$offset + $size}]
278
279    # fetch the numeric field
280    set numeric [Fetch $offset $size $scan]
281
282    if {$comp eq "x"} {
283	# anything matches - don't care
284	return 1
285    }
286
287    # get value in binary form, then back to numeric
288    # this avoids problems with sign, as both values are
289    # [binary scan]-converted identically
290    binary scan [binary format $scan $val] $scan val
291
292    if {$qual ne ""} {
293	# there's a mask to be applied
294	set numeric [expr $numeric $qual]
295    }
296
297    # perform comparison
298    set c [expr $val $comp $numeric]
299
300    ::fileutil::magic::rt::Debug {puts stderr "numeric $type: $val $comp $numeric / $qual - $c"}
301    return $c
302}
303
304proc ::fileutil::magic::rt::Sx {atlevel offset comp val {qual ""}} {
305    variable fd
306    variable string
307    variable last
308
309    upvar 1 level l
310    set  l $atlevel
311
312    # convert any backslashes
313    set val [subst -nocommands -novariables $val]
314
315    if {$comp eq "x"} {
316	# match anything - don't care, just get the value
317	set string ""
318
319	# Query: Can we use GetString here ?
320	# Or at least the strbuf cache ?
321
322	# move to the offset
323	::seek $fd $offset
324	while {
325	    ([::string length $string] < 100) &&
326	    [::string is print [set c [::read $fd 1]]]
327	} {
328	    if {[::string is space $c]} {
329		break
330	    }
331	    append string $c
332	}
333
334	set last($atlevel) [expr {$offset + [string length $string]}]
335
336	return 1
337    }
338
339    set len [::string length $val]
340    set last($atlevel) [expr {$offset + $len}]
341
342    # get the string and compare it
343    set string [GetString $offset $len]
344    set cmp    [::string compare $val $string]
345    set c      [expr $cmp $comp 0]
346
347    ::fileutil::magic::rt::Debug {
348	puts "String '$val' $comp '$string' - $c"
349	if {$c} {
350	    puts "offset $offset - $string"
351	}
352    }
353    return $c
354}
355proc ::fileutil::magic::rt::L {newlevel} {
356    # Regenerate level information in the calling context.
357    upvar 1 level l ; set l $newlevel
358    return
359}
360
361proc ::fileutil::magic::rt::I {base type delta} {
362    # Handling of base locations specified indirectly through the
363    # contents of the inspected file.
364
365    variable typemap
366    foreach {size scan} $typemap($type) break
367    return [expr {[Fetch $base $size $scan] + $delta}]
368}
369
370proc ::fileutil::magic::rt::R {base} {
371    # Handling of base locations specified relative to the end of the
372    # last field one level above.
373
374    variable last   ; # Remembered locations.
375    upvar 1 level l ; # The level to get data from.
376    return [expr {$last($l) + $base}]
377}
378
379# ### ### ### ######### ######### #########
380## Internal. Retrieval of the data used in comparisons.
381
382# fetch and cache a numeric value from the file
383proc ::fileutil::magic::rt::Fetch {where what scan} {
384    variable cache
385    variable numeric
386    variable fd
387
388    if {![info exists cache($where,$what,$scan)]} {
389	::seek $fd $where
390	binary scan [::read $fd $what] $scan numeric
391	set cache($where,$what,$scan) $numeric
392
393	# Optimization: If we got 4 bytes, i.e. long we implicitly
394	# know the short and byte data as well. Should put them into
395	# the cache. -- Profile: How often does such an overlap truly
396	# happen ?
397
398    } else {
399	set numeric $cache($where,$what,$scan)
400    }
401    return $numeric
402}
403
404proc ::fileutil::magic::rt::GetString {offset len} {
405    # We have the first 1k of the file cached
406    variable string
407    variable strbuf
408    variable fd
409
410    set end [expr {$offset + $len - 1}]
411    if {$end < 4096} {
412	# in the string cache, copy the requested part.
413	set string [::string range $strbuf $offset $end]
414    } else {
415	# an unusual one, move to the offset and read directly from
416	# the file.
417	::seek $fd $offset
418	set string [::read $fd $len]
419    }
420    return $string
421}
422
423# ### ### ### ######### ######### #########
424## Internal, debugging.
425
426if {!$::fileutil::magic::rt::debug} {
427    # This procedure definition is optimized out of using code by the
428    # core bcc. It knows that neither argument checks are required,
429    # nor is anything done. So neither results, nor errors are
430    # possible, a true no-operation.
431    proc ::fileutil::magic::rt::Debug {args} {}
432
433} else {
434    proc ::fileutil::magic::rt::Debug {script} {
435	# Run the commands in the debug script. This usually generates
436	# some output. The uplevel is required to ensure the proper
437	# resolution of all variables found in the script.
438	uplevel 1 $script
439	return
440    }
441}
442
443# ### ### ### ######### ######### #########
444## Initialize constants
445
446namespace eval ::fileutil::magic::rt {
447    # maps magic typenames to field characteristics: size (#byte),
448    # binary scan format
449
450    variable typemap
451}
452
453proc ::fileutil::magic::rt::Init {} {
454    variable typemap
455    global tcl_platform
456
457    # Set the definitions for all types which have their endianess
458    # explicitly specified n their name.
459
460    array set typemap {
461	byte    {1 c}  ubyte    {1 c}
462	beshort {2 S}  ubeshort {2 S}
463	leshort {2 s}  uleshort {2 s}
464	belong  {4 I}  ubelong  {4 I}
465	lelong  {4 i}  ulelong  {4 i}
466	bedate  {4 S}  ledate   {4 s}
467	beldate {4 I}  leldate  {4 i}
468
469	long  {4 Q} ulong  {4 Q} date  {4 Q} ldate {4 Q}
470	short {2 Y} ushort {2 Y}
471    }
472
473    # Now set the definitions for the types without explicit
474    # endianess. They assume/use 'native' byteorder. We also put in
475    # special forms for the compiler, so that it can use short names
476    # for the native-endian types as well.
477
478    # generate short form names
479    foreach {n v} [array get typemap] {
480	foreach {len scan} $v break
481	#puts stderr "Adding $scan - [list $len $scan]"
482	set typemap($scan) [list $len $scan]
483    }
484
485    # The special Q and Y short forms are incorrect, correct now to
486    # use the proper native endianess.
487
488    if {$tcl_platform(byteOrder) eq "littleEndian"} {
489	array set typemap {Q {4 i} Y {2 s}}
490    } else {
491	array set typemap {Q {4 I} Y {2 S}}
492    }
493}
494
495::fileutil::magic::rt::Init
496# ### ### ### ######### ######### #########
497## Ready for use.
498
499package provide fileutil::magic::rt 1.0
500# EOF
501