1234285Sdim 2234285Sdim; match.asm -- Pentium-Pro optimized version of longest_match() 3234285Sdim; 4234285Sdim; Updated for zlib 1.1.3 and converted to MASM 6.1x 5234285Sdim; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com> 6234285Sdim; and Chuck Walbourn <chuckw@kinesoft.com> 7234285Sdim; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro> 8234285Sdim; 9234285Sdim; This is free software; you can redistribute it and/or modify it 10234285Sdim; under the terms of the GNU General Public License. 11234285Sdim 12234285Sdim; Based on match.S 13234285Sdim; Written for zlib 1.1.2 14234285Sdim; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 15234285Sdim; 16234285Sdim; Modified by Gilles Vollant (2005) for add gzhead and gzindex 17234285Sdim 18234285Sdim .686P 19234285Sdim .MODEL FLAT 20234285Sdim 21234285Sdim;=========================================================================== 22234285Sdim; EQUATES 23234285Sdim;=========================================================================== 24234285Sdim 25234285SdimMAX_MATCH EQU 258 26234285SdimMIN_MATCH EQU 3 27234285SdimMIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) 28234285SdimMAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) 29249423Sdim 30249423Sdim;=========================================================================== 31234285Sdim; STRUCTURES 32234285Sdim;=========================================================================== 33234285Sdim 34234285Sdim; This STRUCT assumes a 4-byte alignment 35234285Sdim 36234285SdimDEFLATE_STATE STRUCT 37234285Sdimds_strm dd ? 38234285Sdimds_status dd ? 39234285Sdimds_pending_buf dd ? 40234285Sdimds_pending_buf_size dd ? 41234285Sdimds_pending_out dd ? 42234285Sdimds_pending dd ? 43234285Sdimds_wrap dd ? 44234285Sdim; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h) 45234285Sdimds_gzhead dd ? 46234285Sdimds_gzindex dd ? 47234285Sdimds_data_type db ? 48234285Sdimds_method db ? 49234285Sdim db ? ; padding 50234285Sdim db ? ; padding 51234285Sdimds_last_flush dd ? 52234285Sdimds_w_size dd ? ; used 53234285Sdimds_w_bits dd ? 54234285Sdimds_w_mask dd ? ; used 55234285Sdimds_window dd ? ; used 56234285Sdimds_window_size dd ? 57234285Sdimds_prev dd ? ; used 58234285Sdimds_head dd ? 59234285Sdimds_ins_h dd ? 60234285Sdimds_hash_size dd ? 61234285Sdimds_hash_bits dd ? 62234285Sdimds_hash_mask dd ? 63234285Sdimds_hash_shift dd ? 64234285Sdimds_block_start dd ? 65234285Sdimds_match_length dd ? ; used 66234285Sdimds_prev_match dd ? ; used 67234285Sdimds_match_available dd ? 68234285Sdimds_strstart dd ? ; used 69234285Sdimds_match_start dd ? ; used 70234285Sdimds_lookahead dd ? ; used 71234285Sdimds_prev_length dd ? ; used 72234285Sdimds_max_chain_length dd ? ; used 73234285Sdimds_max_laxy_match dd ? 74234285Sdimds_level dd ? 75234285Sdimds_strategy dd ? 76234285Sdimds_good_match dd ? ; used 77234285Sdimds_nice_match dd ? ; used 78234285Sdim 79234285Sdim; Don't need anymore of the struct for match 80234285SdimDEFLATE_STATE ENDS 81234285Sdim 82234285Sdim;=========================================================================== 83234285Sdim; CODE 84234285Sdim;=========================================================================== 85234285Sdim_TEXT SEGMENT 86234285Sdim 87234285Sdim;--------------------------------------------------------------------------- 88234285Sdim; match_init 89234285Sdim;--------------------------------------------------------------------------- 90234285Sdim ALIGN 4 91234285SdimPUBLIC _match_init 92234285Sdim_match_init PROC 93234285Sdim ; no initialization needed 94234285Sdim ret 95234285Sdim_match_init ENDP 96234285Sdim 97234285Sdim;--------------------------------------------------------------------------- 98234285Sdim; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 99234285Sdim;--------------------------------------------------------------------------- 100234285Sdim ALIGN 4 101234285Sdim 102234285SdimPUBLIC _longest_match 103239462Sdim_longest_match PROC 104234285Sdim 105234285Sdim; Since this code uses EBP for a scratch register, the stack frame must 106234285Sdim; be manually constructed and referenced relative to the ESP register. 107234285Sdim 108234285Sdim; Stack image 109239462Sdim; Variables 110234285Sdimchainlenwmask = 0 ; high word: current chain len 111234285Sdim ; low word: s->wmask 112234285Sdimwindow = 4 ; local copy of s->window 113239462Sdimwindowbestlen = 8 ; s->window + bestlen 114234285Sdimscanend = 12 ; last two bytes of string 115234285Sdimscanstart = 16 ; first two bytes of string 116234285Sdimscanalign = 20 ; dword-misalignment of string 117234285Sdimnicematch = 24 ; a good enough match size 118234285Sdimbestlen = 28 ; size of best match so far 119239462Sdimscan = 32 ; ptr to string wanting match 120234285Sdimvarsize = 36 ; number of bytes (also offset to last saved register) 121234285Sdim 122234285Sdim; Saved Registers (actually pushed into place) 123234285Sdimebx_save = 36 124234285Sdimedi_save = 40 125234285Sdimesi_save = 44 126234285Sdimebp_save = 48 127234285Sdim 128234285Sdim; Parameters 129234285Sdimretaddr = 52 130234285Sdimdeflatestate = 56 131234285Sdimcurmatch = 60 132234285Sdim 133239462Sdim; Save registers that the compiler may be using 134234285Sdim push ebp 135234285Sdim push edi 136234285Sdim push esi 137234285Sdim push ebx 138239462Sdim 139239462Sdim; Allocate local variable space 140234285Sdim sub esp,varsize 141239462Sdim 142239462Sdim; Retrieve the function arguments. ecx will hold cur_match 143234285Sdim; throughout the entire function. edx will hold the pointer to the 144234285Sdim; deflate_state structure during the function's setup (before 145234285Sdim; entering the main loop). 146234285Sdim 147234285Sdim mov edx, [esp+deflatestate] 148239462SdimASSUME edx:PTR DEFLATE_STATE 149234285Sdim 150234285Sdim mov ecx, [esp+curmatch] 151239462Sdim 152234285Sdim; uInt wmask = s->w_mask; 153234285Sdim; unsigned chain_length = s->max_chain_length; 154234285Sdim; if (s->prev_length >= s->good_match) { 155234285Sdim; chain_length >>= 2; 156234285Sdim; } 157234285Sdim 158234285Sdim mov eax, [edx].ds_prev_length 159234285Sdim mov ebx, [edx].ds_good_match 160234285Sdim cmp eax, ebx 161239462Sdim mov eax, [edx].ds_w_mask 162239462Sdim mov ebx, [edx].ds_max_chain_length 163263508Sdim jl SHORT LastMatchGood 164263508Sdim shr ebx, 2 165239462SdimLastMatchGood: 166234285Sdim 167239462Sdim; chainlen is decremented once beforehand so that the function can 168239462Sdim; use the sign flag instead of the zero flag for the exit test. 169239462Sdim; It is then shifted into the high word, to make room for the wmask 170239462Sdim; value, which it will always accompany. 171239462Sdim 172239462Sdim dec ebx 173234285Sdim shl ebx, 16 174234285Sdim or ebx, eax 175234285Sdim mov [esp+chainlenwmask], ebx 176234285Sdim 177234285Sdim; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 178239462Sdim 179234285Sdim mov eax, [edx].ds_nice_match 180234285Sdim mov ebx, [edx].ds_lookahead 181239462Sdim cmp ebx, eax 182234285Sdim jl SHORT LookaheadLess 183234285Sdim mov ebx, eax 184234285SdimLookaheadLess: 185234285Sdim mov [esp+nicematch], ebx 186239462Sdim 187239462Sdim;/* register Bytef *scan = s->window + s->strstart; */ 188239462Sdim 189239462Sdim mov esi, [edx].ds_window 190239462Sdim mov [esp+window], esi 191234285Sdim mov ebp, [edx].ds_strstart 192234285Sdim lea edi, [esi+ebp] 193234285Sdim mov [esp+scan],edi 194234285Sdim 195234285Sdim;/* Determine how many bytes the scan ptr is off from being */ 196234285Sdim;/* dword-aligned. */ 197234285Sdim 198234285Sdim mov eax, edi 199234285Sdim neg eax 200239462Sdim and eax, 3 201234285Sdim mov [esp+scanalign], eax 202234285Sdim 203234285Sdim;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 204239462Sdim;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 205234285Sdim 206239462Sdim mov eax, [edx].ds_w_size 207234285Sdim sub eax, MIN_LOOKAHEAD 208234285Sdim sub ebp, eax 209234285Sdim jg SHORT LimitPositive 210234285Sdim xor ebp, ebp 211234285SdimLimitPositive: 212234285Sdim 213234285Sdim;/* int best_len = s->prev_length; */ 214234285Sdim 215234285Sdim mov eax, [edx].ds_prev_length 216234285Sdim mov [esp+bestlen], eax 217234285Sdim 218234285Sdim;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 219239462Sdim 220234285Sdim add esi, eax 221234285Sdim mov [esp+windowbestlen], esi 222234285Sdim 223234285Sdim;/* register ush scan_start = *(ushf*)scan; */ 224239462Sdim;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 225239462Sdim;/* Posf *prev = s->prev; */ 226234285Sdim 227 movzx ebx, WORD PTR[edi] 228 mov [esp+scanstart], ebx 229 movzx ebx, WORD PTR[eax+edi-1] 230 mov [esp+scanend], ebx 231 mov edi, [edx].ds_prev 232 233;/* Jump into the main loop. */ 234 235 mov edx, [esp+chainlenwmask] 236 jmp SHORT LoopEntry 237 238;/* do { 239; * match = s->window + cur_match; 240; * if (*(ushf*)(match+best_len-1) != scan_end || 241; * *(ushf*)match != scan_start) continue; 242; * [...] 243; * } while ((cur_match = prev[cur_match & wmask]) > limit 244; * && --chain_length != 0); 245; * 246; * Here is the inner loop of the function. The function will spend the 247; * majority of its time in this loop, and majority of that time will 248; * be spent in the first ten instructions. 249; * 250; * Within this loop: 251; * %ebx = scanend 252; * %ecx = curmatch 253; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 254; * %esi = windowbestlen - i.e., (window + bestlen) 255; * %edi = prev 256; * %ebp = limit 257; */ 258 259 ALIGN 4 260LookupLoop: 261 and ecx, edx 262 movzx ecx, WORD PTR[edi+ecx*2] 263 cmp ecx, ebp 264 jbe LeaveNow 265 sub edx, 000010000H 266 js LeaveNow 267 268LoopEntry: 269 movzx eax, WORD PTR[esi+ecx-1] 270 cmp eax, ebx 271 jnz SHORT LookupLoop 272 273 mov eax, [esp+window] 274 movzx eax, WORD PTR[eax+ecx] 275 cmp eax, [esp+scanstart] 276 jnz SHORT LookupLoop 277 278;/* Store the current value of chainlen. */ 279 280 mov [esp+chainlenwmask], edx 281 282;/* Point %edi to the string under scrutiny, and %esi to the string we */ 283;/* are hoping to match it up with. In actuality, %esi and %edi are */ 284;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 285;/* initialized to -(MAX_MATCH_8 - scanalign). */ 286 287 mov esi, [esp+window] 288 mov edi, [esp+scan] 289 add esi, ecx 290 mov eax, [esp+scanalign] 291 mov edx, -MAX_MATCH_8 292 lea edi, [edi+eax+MAX_MATCH_8] 293 lea esi, [esi+eax+MAX_MATCH_8] 294 295;/* Test the strings for equality, 8 bytes at a time. At the end, 296; * adjust %edx so that it is offset to the exact byte that mismatched. 297; * 298; * We already know at this point that the first three bytes of the 299; * strings match each other, and they can be safely passed over before 300; * starting the compare loop. So what this code does is skip over 0-3 301; * bytes, as much as necessary in order to dword-align the %edi 302; * pointer. (%esi will still be misaligned three times out of four.) 303; * 304; * It should be confessed that this loop usually does not represent 305; * much of the total running time. Replacing it with a more 306; * straightforward "rep cmpsb" would not drastically degrade 307; * performance. 308; */ 309 310LoopCmps: 311 mov eax, DWORD PTR[esi+edx] 312 xor eax, DWORD PTR[edi+edx] 313 jnz SHORT LeaveLoopCmps 314 315 mov eax, DWORD PTR[esi+edx+4] 316 xor eax, DWORD PTR[edi+edx+4] 317 jnz SHORT LeaveLoopCmps4 318 319 add edx, 8 320 jnz SHORT LoopCmps 321 jmp LenMaximum 322 ALIGN 4 323 324LeaveLoopCmps4: 325 add edx, 4 326 327LeaveLoopCmps: 328 test eax, 00000FFFFH 329 jnz SHORT LenLower 330 331 add edx, 2 332 shr eax, 16 333 334LenLower: 335 sub al, 1 336 adc edx, 0 337 338;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 339;/* then automatically accept it as the best possible match and leave. */ 340 341 lea eax, [edi+edx] 342 mov edi, [esp+scan] 343 sub eax, edi 344 cmp eax, MAX_MATCH 345 jge SHORT LenMaximum 346 347;/* If the length of the match is not longer than the best match we */ 348;/* have so far, then forget it and return to the lookup loop. */ 349 350 mov edx, [esp+deflatestate] 351 mov ebx, [esp+bestlen] 352 cmp eax, ebx 353 jg SHORT LongerMatch 354 mov esi, [esp+windowbestlen] 355 mov edi, [edx].ds_prev 356 mov ebx, [esp+scanend] 357 mov edx, [esp+chainlenwmask] 358 jmp LookupLoop 359 ALIGN 4 360 361;/* s->match_start = cur_match; */ 362;/* best_len = len; */ 363;/* if (len >= nice_match) break; */ 364;/* scan_end = *(ushf*)(scan+best_len-1); */ 365 366LongerMatch: 367 mov ebx, [esp+nicematch] 368 mov [esp+bestlen], eax 369 mov [edx].ds_match_start, ecx 370 cmp eax, ebx 371 jge SHORT LeaveNow 372 mov esi, [esp+window] 373 add esi, eax 374 mov [esp+windowbestlen], esi 375 movzx ebx, WORD PTR[edi+eax-1] 376 mov edi, [edx].ds_prev 377 mov [esp+scanend], ebx 378 mov edx, [esp+chainlenwmask] 379 jmp LookupLoop 380 ALIGN 4 381 382;/* Accept the current string, with the maximum possible length. */ 383 384LenMaximum: 385 mov edx, [esp+deflatestate] 386 mov DWORD PTR[esp+bestlen], MAX_MATCH 387 mov [edx].ds_match_start, ecx 388 389;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 390;/* return s->lookahead; */ 391 392LeaveNow: 393 mov edx, [esp+deflatestate] 394 mov ebx, [esp+bestlen] 395 mov eax, [edx].ds_lookahead 396 cmp ebx, eax 397 jg SHORT LookaheadRet 398 mov eax, ebx 399LookaheadRet: 400 401; Restore the stack and return from whence we came. 402 403 add esp, varsize 404 pop ebx 405 pop esi 406 pop edi 407 pop ebp 408 ret 409 410_longest_match ENDP 411 412_TEXT ENDS 413END 414