1# tdelta.tcl --
2#
3#	Produce an rdiff-style delta signature of one file with respect to another,
4#	and re-create one file by applying the delta to the other.
5#
6# Written by Stephen Huntley (stephen.huntley@alum.mit.edu)
7# License: Tcl license
8# Version 1.0
9#
10# Usage:
11#
12# tdelta <reference file | channel> <target file | channel> [sizecheck [fingerprint]]
13#	Returns a delta of the target file with respect to the reference file.
14#	i.e., using patch to apply the delta to the target file will re-create the reference file.
15#
16#	sizecheck and fingerprint are booleans which enable time-saving checks:
17#
18#	if sizecheck is True then if the file size is
19#	less than five times the block size, then no delta calculation is done and the
20#	signature contains the full reference file contents.
21#
22#	if fingerprint is True then 10 small strings ("fingerprints") are taken from the target
23#	file and searched for in the reference file.  If at least three aren't found, then
24#	no delta calculation is done and the signature contains the full reference file contents.
25#
26# tpatch <target file | channel> <delta signature> <output file (duplicate of reference file) | channel>
27#	Reconstitute original reference file by applying delta to target file.
28#
29#
30# global variables:
31#
32# blockSize
33#	Size of file segments to compare.
34# 	Smaller blockSize tends to create smaller delta.
35# 	Larger blockSize tends to take more time to compute delta.
36# md5Size
37#	Substring of md5 checksum to store in delta signature.
38#	If security is less of a concern, set md5Size to a number
39#	between 1-32 to create a more compact signature.
40
41package provide trsync 1.0
42
43namespace eval ::trsync {
44
45if ![info exists blockSize] {variable blockSize 100}
46if ![info exists Mod] {variable Mod [expr pow(2,16)]}
47if ![info exists md5Size] {variable md5Size 32}
48
49variable temp
50if ![info exists temp] {
51	catch {set temp $::env(TMP)}
52	catch {set temp $::env(TEMP)}
53	catch {set temp $::env(TRSYNC_TEMP)}
54	if [catch {file mkdir $temp}] {set temp [pwd]}
55}
56if ![file writable $temp] {error "temp location not writable"}
57
58proc Backup {args} {
59	return
60}
61
62proc ConstructFile {copyinstructions {eolNative 0} {backup {}}} {
63	if [catch {package present md5 2}] {package forget md5 ; package require md5 2}
64
65	set fileToConstruct [lindex $copyinstructions 0]
66	set existingFile [lindex $copyinstructions 1]
67	set blockSize [lindex $copyinstructions 2]
68	array set fileStats [lindex $copyinstructions 3]
69	array set digestInstructionArray [DigestInstructionsExpand [lindex $copyinstructions 4] $blockSize]
70	array set dataInstructionArray [lindex $copyinstructions 5]
71	unset copyinstructions
72
73	if {[lsearch [file channels] $existingFile] == -1} {
74		set existingFile [FileNameNormalize $existingFile]
75		if {$fileToConstruct == {}} {file delete -force $existingFile ; return}
76		catch {
77			set existingID [open $existingFile r]
78			fconfigure $existingID -translation binary
79		}
80	} else {
81		set existingID $existingFile
82		fconfigure $existingID -translation binary
83	}
84
85	set temp $::trsync::temp
86
87	if {[lsearch [file channels] $fileToConstruct] == -1} {
88		set fileToConstruct [FileNameNormalize $fileToConstruct]
89		set constructTag "trsync.[md5::md5 -hex "[clock seconds] [clock clicks]"]"
90		set constructID [open $temp/$constructTag w]
91	} else {
92		set constructID $fileToConstruct
93	}
94	fconfigure $constructID -translation binary
95
96	if $eolNative {set eolNative [string is ascii -strict [array get dataInstructionArray]]}
97
98	set filePointer 1
99	while {$filePointer <= $fileStats(size)} {
100		if {[array names dataInstructionArray $filePointer] != {}} {
101			puts -nonewline $constructID $dataInstructionArray($filePointer)
102			set segmentLength [string length $dataInstructionArray($filePointer)]
103			array unset dataInstructionArray $filePointer
104			set filePointer [expr $filePointer + $segmentLength]
105		} elseif {[array names digestInstructionArray $filePointer] != {}} {
106			if ![info exists existingID] {error "Corrupt copy instructions."}
107			set blockNumber [lindex $digestInstructionArray($filePointer) 0]
108			set blockMd5Sum [lindex $digestInstructionArray($filePointer) 1]
109
110			seek $existingID [expr $blockNumber * $blockSize]
111
112			set existingBlock [read $existingID $blockSize]
113			set existingBlockMd5Sum [string range [md5::md5 -hex -- $existingBlock] 0 [expr [string length $blockMd5Sum] - 1]]
114			if ![string equal -nocase $blockMd5Sum $existingBlockMd5Sum] {error "digest file contents mismatch"}
115			puts -nonewline $constructID $existingBlock
116
117			if $eolNative {set eolNative [string is ascii -strict $existingBlock]}
118			unset existingBlock
119			set filePointer [expr $filePointer + $blockSize]
120		} else {
121			error "Corrupt copy instructions."
122		}
123	}
124	catch {close $existingID}
125	set fileStats(eolNative) $eolNative
126	if {[lsearch [file channels] $fileToConstruct] > -1} {return [array get fileStats]}
127
128	close $constructID
129
130	if $eolNative {
131		fcopy [set fin [open $temp/$constructTag r]] [set fout [open $temp/${constructTag}fcopy w]]
132		close $fin
133		close $fout
134		file delete -force $temp/$constructTag
135		set constructTag "${constructTag}fcopy"
136	}
137
138	catch {file attributes $temp/$constructTag -readonly 0} result
139	catch {file attributes $temp/$constructTag -permissions rw-rw-rw-} result
140	catch {file attributes $temp/$constructTag -owner $fileStats(uid)} result
141	catch {file attributes $temp/$constructTag -group $fileStats(gid)} result
142	catch {file mtime $temp/$constructTag $fileStats(mtime)} result
143	catch {file atime $temp/$constructTag $fileStats(atime)} result
144	if [string equal $fileToConstruct $existingFile] {
145		catch {file attributes $existingFile -readonly 0} result
146		catch {file attributes $existingFile -permissions rw-rw-rw-} result
147	}
148
149	Backup $backup $fileToConstruct
150
151	file mkdir [file dirname $fileToConstruct]
152	file rename -force $temp/$constructTag $fileToConstruct
153	array set attributes $fileStats(attributes)
154	array set attrConstruct [file attributes $fileToConstruct]
155	foreach attr [array names attributes] {
156		if [string equal [array get attributes $attr] [array get attrConstruct $attr]] {continue}
157		if {[string equal $attr "-longname"] || [string equal $attr "-shortname"] || [string equal $attr "-permissions"]} {continue}
158		catch {file attributes $fileToConstruct $attr $attributes($attr)} result
159	}
160	catch {file attributes $fileToConstruct -permissions $fileStats(mode)} result
161	return
162}
163
164proc CopyInstructions {filename digest} {
165	if [catch {package present md5 2}] {package forget md5 ; package require md5 2}
166
167	if {[lsearch [file channels] $filename] == -1} {
168		set filename [FileNameNormalize $filename]
169		file stat $filename fileStats
170		array set fileAttributes [file attributes $filename]
171		array unset fileAttributes -longname
172		array unset fileAttributes -shortname
173		set arrayadd attributes ; lappend arrayadd [array get fileAttributes] ; array set fileStats $arrayadd
174		set f [open $filename r]
175	} else {
176		set f $filename
177		set fileStats(attributes) {}
178	}
179	fconfigure $f -translation binary
180	seek $f 0 end
181	set fileSize [tell $f]
182	seek $f 0
183	set fileStats(size) $fileSize
184	set digestFileName [lindex $digest 0]
185	set blockSize [lindex $digest 1]
186	set digest [lrange $digest 2 end]
187
188	if {[lsearch -exact $digest fingerprints] > -1} {
189		set fingerPrints [lindex $digest end]
190		set digest [lrange $digest 0 end-2]
191		set fileContents [read $f]
192		set matchCount 0
193		foreach fP $fingerPrints {
194			if {[string first $fP $fileContents] > -1} {incr matchCount}
195			if {$matchCount > 3} {break}
196		}
197		unset fileContents
198		seek $f 0
199		if {$matchCount < 3} {set digest {}}
200	}
201
202	set digestLength [llength $digest]
203	for {set i 0} {$i < $digestLength} {incr i} {
204		set arrayadd [lindex [lindex $digest $i] 1]
205		lappend arrayadd $i
206		array set Checksums $arrayadd
207	}
208	set digestInstructions {}
209	set dataInstructions {}
210	set weakChecksum {}
211	set startBlockPointer 0
212	set endBlockPointer 0
213
214	if ![array exists Checksums] {
215		set dataInstructions 1
216		lappend dataInstructions [read $f]
217		set endBlockPointer $fileSize
218	}
219
220	while {$endBlockPointer < $fileSize} {
221		set endBlockPointer [expr $startBlockPointer + $blockSize]
222		incr startBlockPointer
223		if {$weakChecksum == {}} {
224			set blockContents [read $f $blockSize]
225			set blockNumberSequence [SequenceBlock $blockContents]
226			set weakChecksumInfo [WeakChecksum $blockNumberSequence]
227			set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]]
228			set startDataPointer $startBlockPointer
229			set endDataPointer $startDataPointer
230			set dataBuffer {}
231		}
232		if {[array names Checksums $weakChecksum] != {}} {
233			set md5Sum [md5::md5 -hex -- $blockContents]
234			set blockIndex $Checksums($weakChecksum)
235			set digestmd5Sum [lindex [lindex $digest $blockIndex] 0]
236			if [string equal -nocase $digestmd5Sum $md5Sum] {
237				if {$endDataPointer > $startDataPointer} {
238					lappend dataInstructions $startDataPointer
239					lappend dataInstructions $dataBuffer
240				}
241				lappend digestInstructions $startBlockPointer
242				lappend digestInstructions "$blockIndex [string range $md5Sum 0 [expr $::trsync::md5Size - 1]]"
243				set weakChecksum {}
244				set startBlockPointer $endBlockPointer
245				continue
246			}
247		}
248		if {$endBlockPointer >= $fileSize} {
249			lappend dataInstructions $startDataPointer
250			lappend dataInstructions $dataBuffer$blockContents
251			break
252		}
253		set rollChar [read $f 1]
254		binary scan $rollChar c* rollNumber
255		set rollNumber [expr ($rollNumber + 0x100)%0x100]
256		lappend blockNumberSequence $rollNumber
257		set blockNumberSequence [lrange $blockNumberSequence 1 end]
258
259		binary scan $blockContents a1a* rollOffChar blockContents
260		set blockContents $blockContents$rollChar
261		set dataBuffer $dataBuffer$rollOffChar
262		incr endDataPointer
263
264		set weakChecksumInfo "[eval RollChecksum [lrange $weakChecksumInfo 1 5] $rollNumber] [lindex $blockNumberSequence 0]"
265		set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]]
266	}
267	close $f
268
269	lappend copyInstructions $filename
270	lappend copyInstructions $digestFileName
271	lappend copyInstructions $blockSize
272	lappend copyInstructions [array get fileStats]
273	lappend copyInstructions [DigestInstructionsCompress $digestInstructions $blockSize]
274	lappend copyInstructions $dataInstructions
275	return $copyInstructions
276}
277
278proc Digest {filename blockSize {sizecheck 0} {fingerprint 0}} {
279	if [catch {package present md5 2}] {package forget md5 ; package require md5 2}
280
281	set digest "[list $filename] $blockSize"
282	if {[lsearch [file channels] $filename] == -1} {
283		set filename [FileNameNormalize $filename]
284		set digest "[list $filename] $blockSize"
285		if {!([file isfile $filename] && [file readable $filename])} {return $digest}
286		set f [open $filename r]
287	} else {
288		set f $filename
289	}
290	fconfigure $f -translation binary
291	seek $f 0 end
292	set fileSize [tell $f]
293	seek $f 0
294	if {$sizecheck && ($fileSize < [expr $blockSize * 5])} {close $f ; return $digest}
295
296	while {![eof $f]} {
297		set blockContents [read $f $blockSize]
298		set md5Sum [md5::md5 -hex -- $blockContents]
299		set blockNumberSequence [SequenceBlock $blockContents]
300		set weakChecksum [lindex [WeakChecksum $blockNumberSequence] 0]
301		lappend digest "$md5Sum [format %.0f $weakChecksum]"
302	}
303	if $fingerprint {
304		set fileIncrement [expr $fileSize/10]
305		set fpLocation [expr $fileSize - 21]
306		set i 0
307		while {$i < 10} {
308			if {$fpLocation < 0} {set fpLocation 0}
309			seek $f $fpLocation
310			lappend fingerPrints [read $f 20]
311			set fpLocation [expr $fpLocation - $fileIncrement]
312			incr i
313		}
314		lappend digest fingerprints
315		lappend digest [lsort -unique $fingerPrints]
316	}
317	close $f
318	return $digest
319}
320
321proc DigestInstructionsCompress {digestInstructions blockSize} {
322	if [string equal $digestInstructions {}] {return {}}
323	set blockSpan $blockSize
324	foreach {pointer blockInfo} $digestInstructions {
325		if ![info exists currentBlockInfo] {
326			set currentPointer $pointer
327			set currentBlockInfo $blockInfo
328			set md5Size [string length [lindex $blockInfo 1]]
329			continue
330		}
331		if {$pointer == [expr $currentPointer + $blockSpan]} {
332			set md5 [lindex $blockInfo 1]
333			lappend currentBlockInfo $md5
334			incr blockSpan $blockSize
335		} else {
336			lappend newDigestInstructions $currentPointer
337			lappend newDigestInstructions "[lindex $currentBlockInfo 0] [list "$md5Size [string map {{ } {}} [lrange $currentBlockInfo 1 end]]"]"
338
339			set currentPointer $pointer
340			set currentBlockInfo $blockInfo
341			set blockSpan $blockSize
342		}
343	}
344	lappend newDigestInstructions $currentPointer
345	lappend newDigestInstructions "[lindex $currentBlockInfo 0] [list "$md5Size [string map {{ } {}} [lrange $currentBlockInfo 1 end]]"]"
346	return $newDigestInstructions
347}
348
349proc DigestInstructionsExpand {digestInstructions blockSize} {
350	if [string equal $digestInstructions {}] {return {}}
351	foreach {pointer blockInfo} $digestInstructions {
352		set blockNumber [lindex $blockInfo 0]
353		set md5Size [lindex [lindex $blockInfo 1] 0]
354		set blockString [lindex [lindex $blockInfo 1] 1]
355		set blockLength [string length $blockString]
356
357		set expandedBlock {}
358		for {set i $md5Size} {$i <= $blockLength} {incr i $md5Size} {
359			append expandedBlock " [string range $blockString [expr $i - $md5Size] [expr $i - 1]]"
360		}
361
362		set blockInfo "$blockNumber $expandedBlock"
363		foreach md5 [lrange $blockInfo 1 end] {
364			lappend newDigestInstructions $pointer
365			lappend newDigestInstructions "$blockNumber $md5"
366			incr pointer $blockSize
367			incr blockNumber
368		}
369	}
370	return $newDigestInstructions
371}
372
373proc FileNameNormalize {filename} {
374	file normalize $filename
375}
376
377proc RollChecksum {a(k,l)_ b(k,l)_ k l Xsub_k Xsub_l+1 } {
378	set Mod $trsync::Mod
379
380	set a(k+1,l+1)_ [expr ${a(k,l)_} - $Xsub_k + ${Xsub_l+1}]
381	set b(k+1,l+1)_ [expr ${b(k,l)_} - (($l - $k + 1) * $Xsub_k) + ${a(k+1,l+1)_}]
382
383	set a(k+1,l+1)_ [expr fmod(${a(k+1,l+1)_},$Mod)]
384	set b(k+1,l+1)_ [expr fmod(${b(k+1,l+1)_},$Mod)]
385	set s(k+1,l+1)_ [expr ${a(k+1,l+1)_} + ($Mod * ${b(k+1,l+1)_})]
386	return "${s(k+1,l+1)_} ${a(k+1,l+1)_} ${b(k+1,l+1)_} [incr k] [incr l]"
387}
388
389proc SequenceBlock {blockcontents} {
390	binary scan $blockcontents c* blockNumberSequence
391	set blockNumberSequenceLength [llength $blockNumberSequence]
392	for {set i 0} {$i < $blockNumberSequenceLength} {incr i} {
393		set blockNumberSequence [lreplace $blockNumberSequence $i $i [expr ([lindex $blockNumberSequence $i] + 0x100)%0x100]]
394	}
395	return $blockNumberSequence
396}
397
398proc WeakChecksum {Xsub_k...Xsub_l} {
399	set a(k,i)_ 0
400	set b(k,i)_ 0
401	set Mod $trsync::Mod
402	set k 1
403	set l [llength ${Xsub_k...Xsub_l}]
404	for {set i $k} {$i <= $l} {incr i} {
405		set Xsub_i [lindex ${Xsub_k...Xsub_l} [expr $i - 1]]
406		set a(k,i)_ [expr ${a(k,i)_} + $Xsub_i]
407		set b(k,i)_ [expr ${b(k,i)_} + (($l - $i + 1) * $Xsub_i)]
408	}
409	set a(k,l)_ [expr fmod(${a(k,i)_},$Mod)]
410	set b(k,l)_ [expr fmod(${b(k,i)_},$Mod)]
411	set s(k,l)_ [expr ${a(k,l)_} + ($Mod * ${b(k,l)_})]
412	return "${s(k,l)_} ${a(k,l)_} ${b(k,l)_} $k $l [lindex ${Xsub_k...Xsub_l} 0]"
413}
414
415proc tdelta {referenceFile targetFile blockSize {sizecheck 0} {fingerprint 0}} {
416	if {$::trsync::md5Size < 1} {error "md5Size must be greater than zero."}
417	set signature [Digest $targetFile $blockSize $sizecheck $fingerprint]
418	return [CopyInstructions $referenceFile $signature]
419}
420
421proc tpatch {targetFile copyInstructions fileToConstruct {eolNative 0}} {
422	set copyInstructions [lreplace $copyInstructions 0 1 $fileToConstruct $targetFile]
423	return [ConstructFile $copyInstructions $eolNative]
424}
425
426namespace export tdelta tpatch
427
428}
429# end namespace eval ::trsync
430
431