1249259Sdim/* match.S -- x86 assembly version of the zlib longest_match() function. 2249259Sdim * Optimized for the Intel 686 chips (PPro and later). 3249259Sdim * 4249259Sdim * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com> 5249259Sdim * 6249259Sdim * This software is provided 'as-is', without any express or implied 7249259Sdim * warranty. In no event will the author be held liable for any damages 8249259Sdim * arising from the use of this software. 9249259Sdim * 10249259Sdim * Permission is granted to anyone to use this software for any purpose, 11249259Sdim * including commercial applications, and to alter it and redistribute it 12249259Sdim * freely, subject to the following restrictions: 13249259Sdim * 14249259Sdim * 1. The origin of this software must not be misrepresented; you must not 15249259Sdim * claim that you wrote the original software. If you use this software 16249259Sdim * in a product, an acknowledgment in the product documentation would be 17249259Sdim * appreciated but is not required. 18249259Sdim * 2. Altered source versions must be plainly marked as such, and must not be 19249259Sdim * misrepresented as being the original software. 20249259Sdim * 3. This notice may not be removed or altered from any source distribution. 21249259Sdim */ 22249259Sdim 23249259Sdim#ifndef NO_UNDERLINE 24249259Sdim#define match_init _match_init 25249259Sdim#define longest_match _longest_match 26249259Sdim#endif 27249259Sdim 28249259Sdim#define MAX_MATCH (258) 29249259Sdim#define MIN_MATCH (3) 30249259Sdim#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) 31249259Sdim#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) 32249259Sdim 33249259Sdim/* stack frame offsets */ 34249259Sdim 35249259Sdim#define chainlenwmask 0 /* high word: current chain len */ 36249259Sdim /* low word: s->wmask */ 37249259Sdim#define window 4 /* local copy of s->window */ 38249259Sdim#define windowbestlen 8 /* s->window + bestlen */ 39249259Sdim#define scanstart 16 /* first two bytes of string */ 40249259Sdim#define scanend 12 /* last two bytes of string */ 41249259Sdim#define scanalign 20 /* dword-misalignment of string */ 42249259Sdim#define nicematch 24 /* a good enough match size */ 43249259Sdim#define bestlen 28 /* size of best match so far */ 44249259Sdim#define scan 32 /* ptr to string wanting match */ 45249259Sdim 46249259Sdim#define LocalVarsSize (36) 47249259Sdim/* saved ebx 36 */ 48249259Sdim/* saved edi 40 */ 49249259Sdim/* saved esi 44 */ 50249259Sdim/* saved ebp 48 */ 51249259Sdim/* return address 52 */ 52249259Sdim#define deflatestate 56 /* the function arguments */ 53249259Sdim#define curmatch 60 54249259Sdim 55249259Sdim/* All the +zlib1222add offsets are due to the addition of fields 56249259Sdim * in zlib in the deflate_state structure since the asm code was first written 57249259Sdim * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 58249259Sdim * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 59249259Sdim * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 60249259Sdim */ 61249259Sdim 62249259Sdim#define zlib1222add (8) 63249259Sdim 64249259Sdim#define dsWSize (36+zlib1222add) 65249259Sdim#define dsWMask (44+zlib1222add) 66249259Sdim#define dsWindow (48+zlib1222add) 67249259Sdim#define dsPrev (56+zlib1222add) 68249259Sdim#define dsMatchLen (88+zlib1222add) 69249259Sdim#define dsPrevMatch (92+zlib1222add) 70249259Sdim#define dsStrStart (100+zlib1222add) 71249259Sdim#define dsMatchStart (104+zlib1222add) 72249259Sdim#define dsLookahead (108+zlib1222add) 73249259Sdim#define dsPrevLen (112+zlib1222add) 74249259Sdim#define dsMaxChainLen (116+zlib1222add) 75249259Sdim#define dsGoodMatch (132+zlib1222add) 76249259Sdim#define dsNiceMatch (136+zlib1222add) 77249259Sdim 78249259Sdim 79249259Sdim.file "match.S" 80249259Sdim 81249259Sdim.globl match_init, longest_match 82249259Sdim 83249259Sdim.text 84249259Sdim 85249259Sdim/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ 86249259Sdim.cfi_sections .debug_frame 87249259Sdim 88249259Sdimlongest_match: 89249259Sdim 90249259Sdim.cfi_startproc 91249259Sdim/* Save registers that the compiler may be using, and adjust %esp to */ 92249259Sdim/* make room for our stack frame. */ 93249259Sdim 94249259Sdim pushl %ebp 95249259Sdim .cfi_def_cfa_offset 8 96249259Sdim .cfi_offset ebp, -8 97249259Sdim pushl %edi 98249259Sdim .cfi_def_cfa_offset 12 99249259Sdim pushl %esi 100249259Sdim .cfi_def_cfa_offset 16 101249259Sdim pushl %ebx 102249259Sdim .cfi_def_cfa_offset 20 103249259Sdim subl $LocalVarsSize, %esp 104249259Sdim .cfi_def_cfa_offset LocalVarsSize+20 105249259Sdim 106249259Sdim/* Retrieve the function arguments. %ecx will hold cur_match */ 107249259Sdim/* throughout the entire function. %edx will hold the pointer to the */ 108249259Sdim/* deflate_state structure during the function's setup (before */ 109249259Sdim/* entering the main loop). */ 110249259Sdim 111249259Sdim movl deflatestate(%esp), %edx 112249259Sdim movl curmatch(%esp), %ecx 113249259Sdim 114249259Sdim/* uInt wmask = s->w_mask; */ 115263508Sdim/* unsigned chain_length = s->max_chain_length; */ 116263508Sdim/* if (s->prev_length >= s->good_match) { */ 117249259Sdim/* chain_length >>= 2; */ 118249259Sdim/* } */ 119263508Sdim 120263508Sdim movl dsPrevLen(%edx), %eax 121249259Sdim movl dsGoodMatch(%edx), %ebx 122249259Sdim cmpl %ebx, %eax 123263508Sdim movl dsWMask(%edx), %eax 124249259Sdim movl dsMaxChainLen(%edx), %ebx 125249259Sdim jl LastMatchGood 126249259Sdim shrl $2, %ebx 127263508SdimLastMatchGood: 128249259Sdim 129249259Sdim/* chainlen is decremented once beforehand so that the function can */ 130249259Sdim/* use the sign flag instead of the zero flag for the exit test. */ 131249259Sdim/* It is then shifted into the high word, to make room for the wmask */ 132263508Sdim/* value, which it will always accompany. */ 133249259Sdim 134249259Sdim decl %ebx 135249259Sdim shll $16, %ebx 136249259Sdim orl %eax, %ebx 137249259Sdim movl %ebx, chainlenwmask(%esp) 138249259Sdim 139249259Sdim/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ 140249259Sdim 141249259Sdim movl dsNiceMatch(%edx), %eax 142249259Sdim movl dsLookahead(%edx), %ebx 143249259Sdim cmpl %eax, %ebx 144249259Sdim jl LookaheadLess 145249259Sdim movl %eax, %ebx 146249259SdimLookaheadLess: movl %ebx, nicematch(%esp) 147249259Sdim 148263508Sdim/* register Bytef *scan = s->window + s->strstart; */ 149249259Sdim 150249259Sdim movl dsWindow(%edx), %esi 151249259Sdim movl %esi, window(%esp) 152249259Sdim movl dsStrStart(%edx), %ebp 153263508Sdim lea (%esi,%ebp), %edi 154249259Sdim movl %edi, scan(%esp) 155249259Sdim 156249259Sdim/* Determine how many bytes the scan ptr is off from being */ 157249259Sdim/* dword-aligned. */ 158249259Sdim 159249259Sdim movl %edi, %eax 160249259Sdim negl %eax 161249259Sdim andl $3, %eax 162249259Sdim movl %eax, scanalign(%esp) 163249259Sdim 164249259Sdim/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 165249259Sdim/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 166249259Sdim 167249259Sdim movl dsWSize(%edx), %eax 168249259Sdim subl $MIN_LOOKAHEAD, %eax 169249259Sdim subl %eax, %ebp 170263508Sdim jg LimitPositive 171249259Sdim xorl %ebp, %ebp 172249259SdimLimitPositive: 173249259Sdim 174249259Sdim/* int best_len = s->prev_length; */ 175249259Sdim 176249259Sdim movl dsPrevLen(%edx), %eax 177249259Sdim movl %eax, bestlen(%esp) 178249259Sdim 179249259Sdim/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 180249259Sdim 181249259Sdim addl %eax, %esi 182249259Sdim movl %esi, windowbestlen(%esp) 183263508Sdim 184249259Sdim/* register ush scan_start = *(ushf*)scan; */ 185249259Sdim/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 186249259Sdim/* Posf *prev = s->prev; */ 187249259Sdim 188249259Sdim movzwl (%edi), %ebx 189249259Sdim movl %ebx, scanstart(%esp) 190249259Sdim movzwl -1(%edi,%eax), %ebx 191249259Sdim movl %ebx, scanend(%esp) 192249259Sdim movl dsPrev(%edx), %edi 193249259Sdim 194249259Sdim/* Jump into the main loop. */ 195249259Sdim 196249259Sdim movl chainlenwmask(%esp), %edx 197249259Sdim jmp LoopEntry 198249259Sdim 199249259Sdim.balign 16 200249259Sdim 201249259Sdim/* do { 202249259Sdim * match = s->window + cur_match; 203249259Sdim * if (*(ushf*)(match+best_len-1) != scan_end || 204263508Sdim * *(ushf*)match != scan_start) continue; 205249259Sdim * [...] 206249259Sdim * } while ((cur_match = prev[cur_match & wmask]) > limit 207249259Sdim * && --chain_length != 0); 208249259Sdim * 209249259Sdim * Here is the inner loop of the function. The function will spend the 210249259Sdim * majority of its time in this loop, and majority of that time will 211249259Sdim * be spent in the first ten instructions. 212249259Sdim * 213249259Sdim * Within this loop: 214249259Sdim * %ebx = scanend 215249259Sdim * %ecx = curmatch 216249259Sdim * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 217249259Sdim * %esi = windowbestlen - i.e., (window + bestlen) 218249259Sdim * %edi = prev 219249259Sdim * %ebp = limit 220249259Sdim */ 221249259SdimLookupLoop: 222249259Sdim andl %edx, %ecx 223249259Sdim movzwl (%edi,%ecx,2), %ecx 224249259Sdim cmpl %ebp, %ecx 225249259Sdim jbe LeaveNow 226249259Sdim subl $0x00010000, %edx 227249259Sdim js LeaveNow 228249259SdimLoopEntry: movzwl -1(%esi,%ecx), %eax 229249259Sdim cmpl %ebx, %eax 230249259Sdim jnz LookupLoop 231249259Sdim movl window(%esp), %eax 232249259Sdim movzwl (%eax,%ecx), %eax 233249259Sdim cmpl scanstart(%esp), %eax 234249259Sdim jnz LookupLoop 235249259Sdim 236249259Sdim/* Store the current value of chainlen. */ 237263508Sdim 238249259Sdim movl %edx, chainlenwmask(%esp) 239249259Sdim 240249259Sdim/* Point %edi to the string under scrutiny, and %esi to the string we */ 241249259Sdim/* are hoping to match it up with. In actuality, %esi and %edi are */ 242249259Sdim/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 243249259Sdim/* initialized to -(MAX_MATCH_8 - scanalign). */ 244249259Sdim 245249259Sdim movl window(%esp), %esi 246249259Sdim movl scan(%esp), %edi 247249259Sdim addl %ecx, %esi 248249259Sdim movl scanalign(%esp), %eax 249249259Sdim movl $(-MAX_MATCH_8), %edx 250249259Sdim lea MAX_MATCH_8(%edi,%eax), %edi 251249259Sdim lea MAX_MATCH_8(%esi,%eax), %esi 252249259Sdim 253249259Sdim/* Test the strings for equality, 8 bytes at a time. At the end, 254249259Sdim * adjust %edx so that it is offset to the exact byte that mismatched. 255249259Sdim * 256249259Sdim * We already know at this point that the first three bytes of the 257249259Sdim * strings match each other, and they can be safely passed over before 258249259Sdim * starting the compare loop. So what this code does is skip over 0-3 259249259Sdim * bytes, as much as necessary in order to dword-align the %edi 260249259Sdim * pointer. (%esi will still be misaligned three times out of four.) 261249259Sdim * 262249259Sdim * It should be confessed that this loop usually does not represent 263249259Sdim * much of the total running time. Replacing it with a more 264249259Sdim * straightforward "rep cmpsb" would not drastically degrade 265249259Sdim * performance. 266249259Sdim */ 267249259SdimLoopCmps: 268249259Sdim movl (%esi,%edx), %eax 269263508Sdim xorl (%edi,%edx), %eax 270263508Sdim jnz LeaveLoopCmps 271263508Sdim movl 4(%esi,%edx), %eax 272249259Sdim xorl 4(%edi,%edx), %eax 273249259Sdim jnz LeaveLoopCmps4 274263508Sdim addl $8, %edx 275263508Sdim jnz LoopCmps 276249259Sdim jmp LenMaximum 277249259SdimLeaveLoopCmps4: addl $4, %edx 278249259SdimLeaveLoopCmps: testl $0x0000FFFF, %eax 279263508Sdim jnz LenLower 280263508Sdim addl $2, %edx 281263508Sdim shrl $16, %eax 282263508SdimLenLower: subb $1, %al 283263508Sdim adcl $0, %edx 284249259Sdim 285249259Sdim/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 286249259Sdim/* then automatically accept it as the best possible match and leave. */ 287249259Sdim 288249259Sdim lea (%edi,%edx), %eax 289249259Sdim movl scan(%esp), %edi 290249259Sdim subl %edi, %eax 291249259Sdim cmpl $MAX_MATCH, %eax 292249259Sdim jge LenMaximum 293249259Sdim 294249259Sdim/* If the length of the match is not longer than the best match we */ 295/* have so far, then forget it and return to the lookup loop. */ 296 297 movl deflatestate(%esp), %edx 298 movl bestlen(%esp), %ebx 299 cmpl %ebx, %eax 300 jg LongerMatch 301 movl windowbestlen(%esp), %esi 302 movl dsPrev(%edx), %edi 303 movl scanend(%esp), %ebx 304 movl chainlenwmask(%esp), %edx 305 jmp LookupLoop 306 307/* s->match_start = cur_match; */ 308/* best_len = len; */ 309/* if (len >= nice_match) break; */ 310/* scan_end = *(ushf*)(scan+best_len-1); */ 311 312LongerMatch: movl nicematch(%esp), %ebx 313 movl %eax, bestlen(%esp) 314 movl %ecx, dsMatchStart(%edx) 315 cmpl %ebx, %eax 316 jge LeaveNow 317 movl window(%esp), %esi 318 addl %eax, %esi 319 movl %esi, windowbestlen(%esp) 320 movzwl -1(%edi,%eax), %ebx 321 movl dsPrev(%edx), %edi 322 movl %ebx, scanend(%esp) 323 movl chainlenwmask(%esp), %edx 324 jmp LookupLoop 325 326/* Accept the current string, with the maximum possible length. */ 327 328LenMaximum: movl deflatestate(%esp), %edx 329 movl $MAX_MATCH, bestlen(%esp) 330 movl %ecx, dsMatchStart(%edx) 331 332/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 333/* return s->lookahead; */ 334 335LeaveNow: 336 movl deflatestate(%esp), %edx 337 movl bestlen(%esp), %ebx 338 movl dsLookahead(%edx), %eax 339 cmpl %eax, %ebx 340 jg LookaheadRet 341 movl %ebx, %eax 342LookaheadRet: 343 344/* Restore the stack and return from whence we came. */ 345 346 addl $LocalVarsSize, %esp 347 .cfi_def_cfa_offset 20 348 popl %ebx 349 .cfi_def_cfa_offset 16 350 popl %esi 351 .cfi_def_cfa_offset 12 352 popl %edi 353 .cfi_def_cfa_offset 8 354 popl %ebp 355 .cfi_def_cfa_offset 4 356.cfi_endproc 357match_init: ret 358