1# tdelta.tcl -- 2# 3# Produce an rdiff-style delta signature of one file with respect to another, 4# and re-create one file by applying the delta to the other. 5# 6# Written by Stephen Huntley (stephen.huntley@alum.mit.edu) 7# License: Tcl license 8# Version 1.0 9# 10# Usage: 11# 12# tdelta <reference file | channel> <target file | channel> [sizecheck [fingerprint]] 13# Returns a delta of the target file with respect to the reference file. 14# i.e., using patch to apply the delta to the target file will re-create the reference file. 15# 16# sizecheck and fingerprint are booleans which enable time-saving checks: 17# 18# if sizecheck is True then if the file size is 19# less than five times the block size, then no delta calculation is done and the 20# signature contains the full reference file contents. 21# 22# if fingerprint is True then 10 small strings ("fingerprints") are taken from the target 23# file and searched for in the reference file. If at least three aren't found, then 24# no delta calculation is done and the signature contains the full reference file contents. 25# 26# tpatch <target file | channel> <delta signature> <output file (duplicate of reference file) | channel> 27# Reconstitute original reference file by applying delta to target file. 28# 29# 30# global variables: 31# 32# blockSize 33# Size of file segments to compare. 34# Smaller blockSize tends to create smaller delta. 35# Larger blockSize tends to take more time to compute delta. 36# md5Size 37# Substring of md5 checksum to store in delta signature. 38# If security is less of a concern, set md5Size to a number 39# between 1-32 to create a more compact signature. 40 41package provide trsync 1.0 42 43namespace eval ::trsync { 44 45if ![info exists blockSize] {variable blockSize 100} 46if ![info exists Mod] {variable Mod [expr pow(2,16)]} 47if ![info exists md5Size] {variable md5Size 32} 48 49variable temp 50if ![info exists temp] { 51 catch {set temp $::env(TMP)} 52 catch {set temp $::env(TEMP)} 53 catch {set temp $::env(TRSYNC_TEMP)} 54 if [catch {file mkdir $temp}] {set temp [pwd]} 55} 56if ![file writable $temp] {error "temp location not writable"} 57 58proc Backup {args} { 59 return 60} 61 62proc ConstructFile {copyinstructions {eolNative 0} {backup {}}} { 63 if [catch {package present md5 2}] {package forget md5 ; package require md5 2} 64 65 set fileToConstruct [lindex $copyinstructions 0] 66 set existingFile [lindex $copyinstructions 1] 67 set blockSize [lindex $copyinstructions 2] 68 array set fileStats [lindex $copyinstructions 3] 69 array set digestInstructionArray [DigestInstructionsExpand [lindex $copyinstructions 4] $blockSize] 70 array set dataInstructionArray [lindex $copyinstructions 5] 71 unset copyinstructions 72 73 if {[lsearch [file channels] $existingFile] == -1} { 74 set existingFile [FileNameNormalize $existingFile] 75 if {$fileToConstruct == {}} {file delete -force $existingFile ; return} 76 catch { 77 set existingID [open $existingFile r] 78 fconfigure $existingID -translation binary 79 } 80 } else { 81 set existingID $existingFile 82 fconfigure $existingID -translation binary 83 } 84 85 set temp $::trsync::temp 86 87 if {[lsearch [file channels] $fileToConstruct] == -1} { 88 set fileToConstruct [FileNameNormalize $fileToConstruct] 89 set constructTag "trsync.[md5::md5 -hex "[clock seconds] [clock clicks]"]" 90 set constructID [open $temp/$constructTag w] 91 } else { 92 set constructID $fileToConstruct 93 } 94 fconfigure $constructID -translation binary 95 96 if $eolNative {set eolNative [string is ascii -strict [array get dataInstructionArray]]} 97 98 set filePointer 1 99 while {$filePointer <= $fileStats(size)} { 100 if {[array names dataInstructionArray $filePointer] != {}} { 101 puts -nonewline $constructID $dataInstructionArray($filePointer) 102 set segmentLength [string length $dataInstructionArray($filePointer)] 103 array unset dataInstructionArray $filePointer 104 set filePointer [expr $filePointer + $segmentLength] 105 } elseif {[array names digestInstructionArray $filePointer] != {}} { 106 if ![info exists existingID] {error "Corrupt copy instructions."} 107 set blockNumber [lindex $digestInstructionArray($filePointer) 0] 108 set blockMd5Sum [lindex $digestInstructionArray($filePointer) 1] 109 110 seek $existingID [expr $blockNumber * $blockSize] 111 112 set existingBlock [read $existingID $blockSize] 113 set existingBlockMd5Sum [string range [md5::md5 -hex -- $existingBlock] 0 [expr [string length $blockMd5Sum] - 1]] 114 if ![string equal -nocase $blockMd5Sum $existingBlockMd5Sum] {error "digest file contents mismatch"} 115 puts -nonewline $constructID $existingBlock 116 117 if $eolNative {set eolNative [string is ascii -strict $existingBlock]} 118 unset existingBlock 119 set filePointer [expr $filePointer + $blockSize] 120 } else { 121 error "Corrupt copy instructions." 122 } 123 } 124 catch {close $existingID} 125 set fileStats(eolNative) $eolNative 126 if {[lsearch [file channels] $fileToConstruct] > -1} {return [array get fileStats]} 127 128 close $constructID 129 130 if $eolNative { 131 fcopy [set fin [open $temp/$constructTag r]] [set fout [open $temp/${constructTag}fcopy w]] 132 close $fin 133 close $fout 134 file delete -force $temp/$constructTag 135 set constructTag "${constructTag}fcopy" 136 } 137 138 catch {file attributes $temp/$constructTag -readonly 0} result 139 catch {file attributes $temp/$constructTag -permissions rw-rw-rw-} result 140 catch {file attributes $temp/$constructTag -owner $fileStats(uid)} result 141 catch {file attributes $temp/$constructTag -group $fileStats(gid)} result 142 catch {file mtime $temp/$constructTag $fileStats(mtime)} result 143 catch {file atime $temp/$constructTag $fileStats(atime)} result 144 if [string equal $fileToConstruct $existingFile] { 145 catch {file attributes $existingFile -readonly 0} result 146 catch {file attributes $existingFile -permissions rw-rw-rw-} result 147 } 148 149 Backup $backup $fileToConstruct 150 151 file mkdir [file dirname $fileToConstruct] 152 file rename -force $temp/$constructTag $fileToConstruct 153 array set attributes $fileStats(attributes) 154 array set attrConstruct [file attributes $fileToConstruct] 155 foreach attr [array names attributes] { 156 if [string equal [array get attributes $attr] [array get attrConstruct $attr]] {continue} 157 if {[string equal $attr "-longname"] || [string equal $attr "-shortname"] || [string equal $attr "-permissions"]} {continue} 158 catch {file attributes $fileToConstruct $attr $attributes($attr)} result 159 } 160 catch {file attributes $fileToConstruct -permissions $fileStats(mode)} result 161 return 162} 163 164proc CopyInstructions {filename digest} { 165 if [catch {package present md5 2}] {package forget md5 ; package require md5 2} 166 167 if {[lsearch [file channels] $filename] == -1} { 168 set filename [FileNameNormalize $filename] 169 file stat $filename fileStats 170 array set fileAttributes [file attributes $filename] 171 array unset fileAttributes -longname 172 array unset fileAttributes -shortname 173 set arrayadd attributes ; lappend arrayadd [array get fileAttributes] ; array set fileStats $arrayadd 174 set f [open $filename r] 175 } else { 176 set f $filename 177 set fileStats(attributes) {} 178 } 179 fconfigure $f -translation binary 180 seek $f 0 end 181 set fileSize [tell $f] 182 seek $f 0 183 set fileStats(size) $fileSize 184 set digestFileName [lindex $digest 0] 185 set blockSize [lindex $digest 1] 186 set digest [lrange $digest 2 end] 187 188 if {[lsearch -exact $digest fingerprints] > -1} { 189 set fingerPrints [lindex $digest end] 190 set digest [lrange $digest 0 end-2] 191 set fileContents [read $f] 192 set matchCount 0 193 foreach fP $fingerPrints { 194 if {[string first $fP $fileContents] > -1} {incr matchCount} 195 if {$matchCount > 3} {break} 196 } 197 unset fileContents 198 seek $f 0 199 if {$matchCount < 3} {set digest {}} 200 } 201 202 set digestLength [llength $digest] 203 for {set i 0} {$i < $digestLength} {incr i} { 204 set arrayadd [lindex [lindex $digest $i] 1] 205 lappend arrayadd $i 206 array set Checksums $arrayadd 207 } 208 set digestInstructions {} 209 set dataInstructions {} 210 set weakChecksum {} 211 set startBlockPointer 0 212 set endBlockPointer 0 213 214 if ![array exists Checksums] { 215 set dataInstructions 1 216 lappend dataInstructions [read $f] 217 set endBlockPointer $fileSize 218 } 219 220 while {$endBlockPointer < $fileSize} { 221 set endBlockPointer [expr $startBlockPointer + $blockSize] 222 incr startBlockPointer 223 if {$weakChecksum == {}} { 224 set blockContents [read $f $blockSize] 225 set blockNumberSequence [SequenceBlock $blockContents] 226 set weakChecksumInfo [WeakChecksum $blockNumberSequence] 227 set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]] 228 set startDataPointer $startBlockPointer 229 set endDataPointer $startDataPointer 230 set dataBuffer {} 231 } 232 if {[array names Checksums $weakChecksum] != {}} { 233 set md5Sum [md5::md5 -hex -- $blockContents] 234 set blockIndex $Checksums($weakChecksum) 235 set digestmd5Sum [lindex [lindex $digest $blockIndex] 0] 236 if [string equal -nocase $digestmd5Sum $md5Sum] { 237 if {$endDataPointer > $startDataPointer} { 238 lappend dataInstructions $startDataPointer 239 lappend dataInstructions $dataBuffer 240 } 241 lappend digestInstructions $startBlockPointer 242 lappend digestInstructions "$blockIndex [string range $md5Sum 0 [expr $::trsync::md5Size - 1]]" 243 set weakChecksum {} 244 set startBlockPointer $endBlockPointer 245 continue 246 } 247 } 248 if {$endBlockPointer >= $fileSize} { 249 lappend dataInstructions $startDataPointer 250 lappend dataInstructions $dataBuffer$blockContents 251 break 252 } 253 set rollChar [read $f 1] 254 binary scan $rollChar c* rollNumber 255 set rollNumber [expr ($rollNumber + 0x100)%0x100] 256 lappend blockNumberSequence $rollNumber 257 set blockNumberSequence [lrange $blockNumberSequence 1 end] 258 259 binary scan $blockContents a1a* rollOffChar blockContents 260 set blockContents $blockContents$rollChar 261 set dataBuffer $dataBuffer$rollOffChar 262 incr endDataPointer 263 264 set weakChecksumInfo "[eval RollChecksum [lrange $weakChecksumInfo 1 5] $rollNumber] [lindex $blockNumberSequence 0]" 265 set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]] 266 } 267 close $f 268 269 lappend copyInstructions $filename 270 lappend copyInstructions $digestFileName 271 lappend copyInstructions $blockSize 272 lappend copyInstructions [array get fileStats] 273 lappend copyInstructions [DigestInstructionsCompress $digestInstructions $blockSize] 274 lappend copyInstructions $dataInstructions 275 return $copyInstructions 276} 277 278proc Digest {filename blockSize {sizecheck 0} {fingerprint 0}} { 279 if [catch {package present md5 2}] {package forget md5 ; package require md5 2} 280 281 set digest "[list $filename] $blockSize" 282 if {[lsearch [file channels] $filename] == -1} { 283 set filename [FileNameNormalize $filename] 284 set digest "[list $filename] $blockSize" 285 if {!([file isfile $filename] && [file readable $filename])} {return $digest} 286 set f [open $filename r] 287 } else { 288 set f $filename 289 } 290 fconfigure $f -translation binary 291 seek $f 0 end 292 set fileSize [tell $f] 293 seek $f 0 294 if {$sizecheck && ($fileSize < [expr $blockSize * 5])} {close $f ; return $digest} 295 296 while {![eof $f]} { 297 set blockContents [read $f $blockSize] 298 set md5Sum [md5::md5 -hex -- $blockContents] 299 set blockNumberSequence [SequenceBlock $blockContents] 300 set weakChecksum [lindex [WeakChecksum $blockNumberSequence] 0] 301 lappend digest "$md5Sum [format %.0f $weakChecksum]" 302 } 303 if $fingerprint { 304 set fileIncrement [expr $fileSize/10] 305 set fpLocation [expr $fileSize - 21] 306 set i 0 307 while {$i < 10} { 308 if {$fpLocation < 0} {set fpLocation 0} 309 seek $f $fpLocation 310 lappend fingerPrints [read $f 20] 311 set fpLocation [expr $fpLocation - $fileIncrement] 312 incr i 313 } 314 lappend digest fingerprints 315 lappend digest [lsort -unique $fingerPrints] 316 } 317 close $f 318 return $digest 319} 320 321proc DigestInstructionsCompress {digestInstructions blockSize} { 322 if [string equal $digestInstructions {}] {return {}} 323 set blockSpan $blockSize 324 foreach {pointer blockInfo} $digestInstructions { 325 if ![info exists currentBlockInfo] { 326 set currentPointer $pointer 327 set currentBlockInfo $blockInfo 328 set md5Size [string length [lindex $blockInfo 1]] 329 continue 330 } 331 if {$pointer == [expr $currentPointer + $blockSpan]} { 332 set md5 [lindex $blockInfo 1] 333 lappend currentBlockInfo $md5 334 incr blockSpan $blockSize 335 } else { 336 lappend newDigestInstructions $currentPointer 337 lappend newDigestInstructions "[lindex $currentBlockInfo 0] [list "$md5Size [string map {{ } {}} [lrange $currentBlockInfo 1 end]]"]" 338 339 set currentPointer $pointer 340 set currentBlockInfo $blockInfo 341 set blockSpan $blockSize 342 } 343 } 344 lappend newDigestInstructions $currentPointer 345 lappend newDigestInstructions "[lindex $currentBlockInfo 0] [list "$md5Size [string map {{ } {}} [lrange $currentBlockInfo 1 end]]"]" 346 return $newDigestInstructions 347} 348 349proc DigestInstructionsExpand {digestInstructions blockSize} { 350 if [string equal $digestInstructions {}] {return {}} 351 foreach {pointer blockInfo} $digestInstructions { 352 set blockNumber [lindex $blockInfo 0] 353 set md5Size [lindex [lindex $blockInfo 1] 0] 354 set blockString [lindex [lindex $blockInfo 1] 1] 355 set blockLength [string length $blockString] 356 357 set expandedBlock {} 358 for {set i $md5Size} {$i <= $blockLength} {incr i $md5Size} { 359 append expandedBlock " [string range $blockString [expr $i - $md5Size] [expr $i - 1]]" 360 } 361 362 set blockInfo "$blockNumber $expandedBlock" 363 foreach md5 [lrange $blockInfo 1 end] { 364 lappend newDigestInstructions $pointer 365 lappend newDigestInstructions "$blockNumber $md5" 366 incr pointer $blockSize 367 incr blockNumber 368 } 369 } 370 return $newDigestInstructions 371} 372 373proc FileNameNormalize {filename} { 374 file normalize $filename 375} 376 377proc RollChecksum {a(k,l)_ b(k,l)_ k l Xsub_k Xsub_l+1 } { 378 set Mod $trsync::Mod 379 380 set a(k+1,l+1)_ [expr ${a(k,l)_} - $Xsub_k + ${Xsub_l+1}] 381 set b(k+1,l+1)_ [expr ${b(k,l)_} - (($l - $k + 1) * $Xsub_k) + ${a(k+1,l+1)_}] 382 383 set a(k+1,l+1)_ [expr fmod(${a(k+1,l+1)_},$Mod)] 384 set b(k+1,l+1)_ [expr fmod(${b(k+1,l+1)_},$Mod)] 385 set s(k+1,l+1)_ [expr ${a(k+1,l+1)_} + ($Mod * ${b(k+1,l+1)_})] 386 return "${s(k+1,l+1)_} ${a(k+1,l+1)_} ${b(k+1,l+1)_} [incr k] [incr l]" 387} 388 389proc SequenceBlock {blockcontents} { 390 binary scan $blockcontents c* blockNumberSequence 391 set blockNumberSequenceLength [llength $blockNumberSequence] 392 for {set i 0} {$i < $blockNumberSequenceLength} {incr i} { 393 set blockNumberSequence [lreplace $blockNumberSequence $i $i [expr ([lindex $blockNumberSequence $i] + 0x100)%0x100]] 394 } 395 return $blockNumberSequence 396} 397 398proc WeakChecksum {Xsub_k...Xsub_l} { 399 set a(k,i)_ 0 400 set b(k,i)_ 0 401 set Mod $trsync::Mod 402 set k 1 403 set l [llength ${Xsub_k...Xsub_l}] 404 for {set i $k} {$i <= $l} {incr i} { 405 set Xsub_i [lindex ${Xsub_k...Xsub_l} [expr $i - 1]] 406 set a(k,i)_ [expr ${a(k,i)_} + $Xsub_i] 407 set b(k,i)_ [expr ${b(k,i)_} + (($l - $i + 1) * $Xsub_i)] 408 } 409 set a(k,l)_ [expr fmod(${a(k,i)_},$Mod)] 410 set b(k,l)_ [expr fmod(${b(k,i)_},$Mod)] 411 set s(k,l)_ [expr ${a(k,l)_} + ($Mod * ${b(k,l)_})] 412 return "${s(k,l)_} ${a(k,l)_} ${b(k,l)_} $k $l [lindex ${Xsub_k...Xsub_l} 0]" 413} 414 415proc tdelta {referenceFile targetFile blockSize {sizecheck 0} {fingerprint 0}} { 416 if {$::trsync::md5Size < 1} {error "md5Size must be greater than zero."} 417 set signature [Digest $targetFile $blockSize $sizecheck $fingerprint] 418 return [CopyInstructions $referenceFile $signature] 419} 420 421proc tpatch {targetFile copyInstructions fileToConstruct {eolNative 0}} { 422 set copyInstructions [lreplace $copyInstructions 0 1 $fileToConstruct $targetFile] 423 return [ConstructFile $copyInstructions $eolNative] 424} 425 426namespace export tdelta tpatch 427 428} 429# end namespace eval ::trsync 430 431