1205194Sdelphij/* match.S -- x86 assembly version of the zlib longest_match() function. 2205194Sdelphij * Optimized for the Intel 686 chips (PPro and later). 3205194Sdelphij * 4205194Sdelphij * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com> 5205194Sdelphij * 6205194Sdelphij * This software is provided 'as-is', without any express or implied 7205194Sdelphij * warranty. In no event will the author be held liable for any damages 8205194Sdelphij * arising from the use of this software. 9205194Sdelphij * 10205194Sdelphij * Permission is granted to anyone to use this software for any purpose, 11205194Sdelphij * including commercial applications, and to alter it and redistribute it 12205194Sdelphij * freely, subject to the following restrictions: 13205194Sdelphij * 14205194Sdelphij * 1. The origin of this software must not be misrepresented; you must not 15205194Sdelphij * claim that you wrote the original software. If you use this software 16205194Sdelphij * in a product, an acknowledgment in the product documentation would be 17205194Sdelphij * appreciated but is not required. 18205194Sdelphij * 2. Altered source versions must be plainly marked as such, and must not be 19205194Sdelphij * misrepresented as being the original software. 20205194Sdelphij * 3. This notice may not be removed or altered from any source distribution. 21205194Sdelphij */ 22205194Sdelphij 23205194Sdelphij#ifndef NO_UNDERLINE 24205194Sdelphij#define match_init _match_init 25205194Sdelphij#define longest_match _longest_match 26205194Sdelphij#endif 27205194Sdelphij 28205194Sdelphij#define MAX_MATCH (258) 29205194Sdelphij#define MIN_MATCH (3) 30205194Sdelphij#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) 31205194Sdelphij#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) 32205194Sdelphij 33205194Sdelphij/* stack frame offsets */ 34205194Sdelphij 35205194Sdelphij#define chainlenwmask 0 /* high word: current chain len */ 36205194Sdelphij /* low word: s->wmask */ 37205194Sdelphij#define window 4 /* local copy of s->window */ 38205194Sdelphij#define windowbestlen 8 /* s->window + bestlen */ 39205194Sdelphij#define scanstart 16 /* first two bytes of string */ 40205194Sdelphij#define scanend 12 /* last two bytes of string */ 41205194Sdelphij#define scanalign 20 /* dword-misalignment of string */ 42205194Sdelphij#define nicematch 24 /* a good enough match size */ 43205194Sdelphij#define bestlen 28 /* size of best match so far */ 44205194Sdelphij#define scan 32 /* ptr to string wanting match */ 45205194Sdelphij 46205194Sdelphij#define LocalVarsSize (36) 47205194Sdelphij/* saved ebx 36 */ 48205194Sdelphij/* saved edi 40 */ 49205194Sdelphij/* saved esi 44 */ 50205194Sdelphij/* saved ebp 48 */ 51205194Sdelphij/* return address 52 */ 52205194Sdelphij#define deflatestate 56 /* the function arguments */ 53205194Sdelphij#define curmatch 60 54205194Sdelphij 55205194Sdelphij/* All the +zlib1222add offsets are due to the addition of fields 56205194Sdelphij * in zlib in the deflate_state structure since the asm code was first written 57205194Sdelphij * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 58205194Sdelphij * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 59205194Sdelphij * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 60205194Sdelphij */ 61205194Sdelphij 62205194Sdelphij#define zlib1222add (8) 63205194Sdelphij 64205194Sdelphij#define dsWSize (36+zlib1222add) 65205194Sdelphij#define dsWMask (44+zlib1222add) 66205194Sdelphij#define dsWindow (48+zlib1222add) 67205194Sdelphij#define dsPrev (56+zlib1222add) 68205194Sdelphij#define dsMatchLen (88+zlib1222add) 69205194Sdelphij#define dsPrevMatch (92+zlib1222add) 70205194Sdelphij#define dsStrStart (100+zlib1222add) 71205194Sdelphij#define dsMatchStart (104+zlib1222add) 72205194Sdelphij#define dsLookahead (108+zlib1222add) 73205194Sdelphij#define dsPrevLen (112+zlib1222add) 74205194Sdelphij#define dsMaxChainLen (116+zlib1222add) 75205194Sdelphij#define dsGoodMatch (132+zlib1222add) 76205194Sdelphij#define dsNiceMatch (136+zlib1222add) 77205194Sdelphij 78205194Sdelphij 79205194Sdelphij.file "match.S" 80205194Sdelphij 81205194Sdelphij.globl match_init, longest_match 82205194Sdelphij 83205194Sdelphij.text 84205194Sdelphij 85205194Sdelphij/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ 86230837Sdelphij.cfi_sections .debug_frame 87205194Sdelphij 88205194Sdelphijlongest_match: 89205194Sdelphij 90230837Sdelphij.cfi_startproc 91205194Sdelphij/* Save registers that the compiler may be using, and adjust %esp to */ 92205194Sdelphij/* make room for our stack frame. */ 93205194Sdelphij 94205194Sdelphij pushl %ebp 95230837Sdelphij .cfi_def_cfa_offset 8 96230837Sdelphij .cfi_offset ebp, -8 97205194Sdelphij pushl %edi 98230837Sdelphij .cfi_def_cfa_offset 12 99205194Sdelphij pushl %esi 100230837Sdelphij .cfi_def_cfa_offset 16 101205194Sdelphij pushl %ebx 102230837Sdelphij .cfi_def_cfa_offset 20 103205194Sdelphij subl $LocalVarsSize, %esp 104230837Sdelphij .cfi_def_cfa_offset LocalVarsSize+20 105205194Sdelphij 106205194Sdelphij/* Retrieve the function arguments. %ecx will hold cur_match */ 107205194Sdelphij/* throughout the entire function. %edx will hold the pointer to the */ 108205194Sdelphij/* deflate_state structure during the function's setup (before */ 109205194Sdelphij/* entering the main loop). */ 110205194Sdelphij 111205194Sdelphij movl deflatestate(%esp), %edx 112205194Sdelphij movl curmatch(%esp), %ecx 113205194Sdelphij 114205194Sdelphij/* uInt wmask = s->w_mask; */ 115205194Sdelphij/* unsigned chain_length = s->max_chain_length; */ 116205194Sdelphij/* if (s->prev_length >= s->good_match) { */ 117205194Sdelphij/* chain_length >>= 2; */ 118205194Sdelphij/* } */ 119230837Sdelphij 120205194Sdelphij movl dsPrevLen(%edx), %eax 121205194Sdelphij movl dsGoodMatch(%edx), %ebx 122205194Sdelphij cmpl %ebx, %eax 123205194Sdelphij movl dsWMask(%edx), %eax 124205194Sdelphij movl dsMaxChainLen(%edx), %ebx 125205194Sdelphij jl LastMatchGood 126205194Sdelphij shrl $2, %ebx 127205194SdelphijLastMatchGood: 128205194Sdelphij 129205194Sdelphij/* chainlen is decremented once beforehand so that the function can */ 130205194Sdelphij/* use the sign flag instead of the zero flag for the exit test. */ 131205194Sdelphij/* It is then shifted into the high word, to make room for the wmask */ 132205194Sdelphij/* value, which it will always accompany. */ 133205194Sdelphij 134205194Sdelphij decl %ebx 135205194Sdelphij shll $16, %ebx 136205194Sdelphij orl %eax, %ebx 137205194Sdelphij movl %ebx, chainlenwmask(%esp) 138205194Sdelphij 139205194Sdelphij/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ 140205194Sdelphij 141205194Sdelphij movl dsNiceMatch(%edx), %eax 142205194Sdelphij movl dsLookahead(%edx), %ebx 143205194Sdelphij cmpl %eax, %ebx 144205194Sdelphij jl LookaheadLess 145205194Sdelphij movl %eax, %ebx 146205194SdelphijLookaheadLess: movl %ebx, nicematch(%esp) 147205194Sdelphij 148205194Sdelphij/* register Bytef *scan = s->window + s->strstart; */ 149205194Sdelphij 150205194Sdelphij movl dsWindow(%edx), %esi 151205194Sdelphij movl %esi, window(%esp) 152205194Sdelphij movl dsStrStart(%edx), %ebp 153205194Sdelphij lea (%esi,%ebp), %edi 154205194Sdelphij movl %edi, scan(%esp) 155205194Sdelphij 156205194Sdelphij/* Determine how many bytes the scan ptr is off from being */ 157205194Sdelphij/* dword-aligned. */ 158205194Sdelphij 159205194Sdelphij movl %edi, %eax 160205194Sdelphij negl %eax 161205194Sdelphij andl $3, %eax 162205194Sdelphij movl %eax, scanalign(%esp) 163205194Sdelphij 164205194Sdelphij/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 165205194Sdelphij/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 166205194Sdelphij 167205194Sdelphij movl dsWSize(%edx), %eax 168205194Sdelphij subl $MIN_LOOKAHEAD, %eax 169205194Sdelphij subl %eax, %ebp 170205194Sdelphij jg LimitPositive 171205194Sdelphij xorl %ebp, %ebp 172205194SdelphijLimitPositive: 173205194Sdelphij 174205194Sdelphij/* int best_len = s->prev_length; */ 175205194Sdelphij 176205194Sdelphij movl dsPrevLen(%edx), %eax 177205194Sdelphij movl %eax, bestlen(%esp) 178205194Sdelphij 179205194Sdelphij/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 180205194Sdelphij 181205194Sdelphij addl %eax, %esi 182205194Sdelphij movl %esi, windowbestlen(%esp) 183205194Sdelphij 184205194Sdelphij/* register ush scan_start = *(ushf*)scan; */ 185205194Sdelphij/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 186205194Sdelphij/* Posf *prev = s->prev; */ 187205194Sdelphij 188205194Sdelphij movzwl (%edi), %ebx 189205194Sdelphij movl %ebx, scanstart(%esp) 190205194Sdelphij movzwl -1(%edi,%eax), %ebx 191205194Sdelphij movl %ebx, scanend(%esp) 192205194Sdelphij movl dsPrev(%edx), %edi 193205194Sdelphij 194205194Sdelphij/* Jump into the main loop. */ 195205194Sdelphij 196205194Sdelphij movl chainlenwmask(%esp), %edx 197205194Sdelphij jmp LoopEntry 198205194Sdelphij 199205194Sdelphij.balign 16 200205194Sdelphij 201205194Sdelphij/* do { 202205194Sdelphij * match = s->window + cur_match; 203205194Sdelphij * if (*(ushf*)(match+best_len-1) != scan_end || 204205194Sdelphij * *(ushf*)match != scan_start) continue; 205205194Sdelphij * [...] 206205194Sdelphij * } while ((cur_match = prev[cur_match & wmask]) > limit 207205194Sdelphij * && --chain_length != 0); 208205194Sdelphij * 209205194Sdelphij * Here is the inner loop of the function. The function will spend the 210205194Sdelphij * majority of its time in this loop, and majority of that time will 211205194Sdelphij * be spent in the first ten instructions. 212205194Sdelphij * 213205194Sdelphij * Within this loop: 214205194Sdelphij * %ebx = scanend 215205194Sdelphij * %ecx = curmatch 216205194Sdelphij * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 217205194Sdelphij * %esi = windowbestlen - i.e., (window + bestlen) 218205194Sdelphij * %edi = prev 219205194Sdelphij * %ebp = limit 220205194Sdelphij */ 221205194SdelphijLookupLoop: 222205194Sdelphij andl %edx, %ecx 223205194Sdelphij movzwl (%edi,%ecx,2), %ecx 224205194Sdelphij cmpl %ebp, %ecx 225205194Sdelphij jbe LeaveNow 226205194Sdelphij subl $0x00010000, %edx 227205194Sdelphij js LeaveNow 228205194SdelphijLoopEntry: movzwl -1(%esi,%ecx), %eax 229205194Sdelphij cmpl %ebx, %eax 230205194Sdelphij jnz LookupLoop 231205194Sdelphij movl window(%esp), %eax 232205194Sdelphij movzwl (%eax,%ecx), %eax 233205194Sdelphij cmpl scanstart(%esp), %eax 234205194Sdelphij jnz LookupLoop 235205194Sdelphij 236205194Sdelphij/* Store the current value of chainlen. */ 237205194Sdelphij 238205194Sdelphij movl %edx, chainlenwmask(%esp) 239205194Sdelphij 240205194Sdelphij/* Point %edi to the string under scrutiny, and %esi to the string we */ 241205194Sdelphij/* are hoping to match it up with. In actuality, %esi and %edi are */ 242205194Sdelphij/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 243205194Sdelphij/* initialized to -(MAX_MATCH_8 - scanalign). */ 244205194Sdelphij 245205194Sdelphij movl window(%esp), %esi 246205194Sdelphij movl scan(%esp), %edi 247205194Sdelphij addl %ecx, %esi 248205194Sdelphij movl scanalign(%esp), %eax 249205194Sdelphij movl $(-MAX_MATCH_8), %edx 250205194Sdelphij lea MAX_MATCH_8(%edi,%eax), %edi 251205194Sdelphij lea MAX_MATCH_8(%esi,%eax), %esi 252205194Sdelphij 253205194Sdelphij/* Test the strings for equality, 8 bytes at a time. At the end, 254205194Sdelphij * adjust %edx so that it is offset to the exact byte that mismatched. 255205194Sdelphij * 256205194Sdelphij * We already know at this point that the first three bytes of the 257205194Sdelphij * strings match each other, and they can be safely passed over before 258205194Sdelphij * starting the compare loop. So what this code does is skip over 0-3 259205194Sdelphij * bytes, as much as necessary in order to dword-align the %edi 260205194Sdelphij * pointer. (%esi will still be misaligned three times out of four.) 261205194Sdelphij * 262205194Sdelphij * It should be confessed that this loop usually does not represent 263205194Sdelphij * much of the total running time. Replacing it with a more 264205194Sdelphij * straightforward "rep cmpsb" would not drastically degrade 265205194Sdelphij * performance. 266205194Sdelphij */ 267205194SdelphijLoopCmps: 268205194Sdelphij movl (%esi,%edx), %eax 269205194Sdelphij xorl (%edi,%edx), %eax 270205194Sdelphij jnz LeaveLoopCmps 271205194Sdelphij movl 4(%esi,%edx), %eax 272205194Sdelphij xorl 4(%edi,%edx), %eax 273205194Sdelphij jnz LeaveLoopCmps4 274205194Sdelphij addl $8, %edx 275205194Sdelphij jnz LoopCmps 276205194Sdelphij jmp LenMaximum 277205194SdelphijLeaveLoopCmps4: addl $4, %edx 278205194SdelphijLeaveLoopCmps: testl $0x0000FFFF, %eax 279205194Sdelphij jnz LenLower 280205194Sdelphij addl $2, %edx 281205194Sdelphij shrl $16, %eax 282205194SdelphijLenLower: subb $1, %al 283205194Sdelphij adcl $0, %edx 284205194Sdelphij 285205194Sdelphij/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 286205194Sdelphij/* then automatically accept it as the best possible match and leave. */ 287205194Sdelphij 288205194Sdelphij lea (%edi,%edx), %eax 289205194Sdelphij movl scan(%esp), %edi 290205194Sdelphij subl %edi, %eax 291205194Sdelphij cmpl $MAX_MATCH, %eax 292205194Sdelphij jge LenMaximum 293205194Sdelphij 294205194Sdelphij/* If the length of the match is not longer than the best match we */ 295205194Sdelphij/* have so far, then forget it and return to the lookup loop. */ 296205194Sdelphij 297205194Sdelphij movl deflatestate(%esp), %edx 298205194Sdelphij movl bestlen(%esp), %ebx 299205194Sdelphij cmpl %ebx, %eax 300205194Sdelphij jg LongerMatch 301205194Sdelphij movl windowbestlen(%esp), %esi 302205194Sdelphij movl dsPrev(%edx), %edi 303205194Sdelphij movl scanend(%esp), %ebx 304205194Sdelphij movl chainlenwmask(%esp), %edx 305205194Sdelphij jmp LookupLoop 306205194Sdelphij 307205194Sdelphij/* s->match_start = cur_match; */ 308205194Sdelphij/* best_len = len; */ 309205194Sdelphij/* if (len >= nice_match) break; */ 310205194Sdelphij/* scan_end = *(ushf*)(scan+best_len-1); */ 311205194Sdelphij 312205194SdelphijLongerMatch: movl nicematch(%esp), %ebx 313205194Sdelphij movl %eax, bestlen(%esp) 314205194Sdelphij movl %ecx, dsMatchStart(%edx) 315205194Sdelphij cmpl %ebx, %eax 316205194Sdelphij jge LeaveNow 317205194Sdelphij movl window(%esp), %esi 318205194Sdelphij addl %eax, %esi 319205194Sdelphij movl %esi, windowbestlen(%esp) 320205194Sdelphij movzwl -1(%edi,%eax), %ebx 321205194Sdelphij movl dsPrev(%edx), %edi 322205194Sdelphij movl %ebx, scanend(%esp) 323205194Sdelphij movl chainlenwmask(%esp), %edx 324205194Sdelphij jmp LookupLoop 325205194Sdelphij 326205194Sdelphij/* Accept the current string, with the maximum possible length. */ 327205194Sdelphij 328205194SdelphijLenMaximum: movl deflatestate(%esp), %edx 329205194Sdelphij movl $MAX_MATCH, bestlen(%esp) 330205194Sdelphij movl %ecx, dsMatchStart(%edx) 331205194Sdelphij 332205194Sdelphij/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 333205194Sdelphij/* return s->lookahead; */ 334205194Sdelphij 335205194SdelphijLeaveNow: 336205194Sdelphij movl deflatestate(%esp), %edx 337205194Sdelphij movl bestlen(%esp), %ebx 338205194Sdelphij movl dsLookahead(%edx), %eax 339205194Sdelphij cmpl %eax, %ebx 340205194Sdelphij jg LookaheadRet 341205194Sdelphij movl %ebx, %eax 342205194SdelphijLookaheadRet: 343205194Sdelphij 344205194Sdelphij/* Restore the stack and return from whence we came. */ 345205194Sdelphij 346205194Sdelphij addl $LocalVarsSize, %esp 347230837Sdelphij .cfi_def_cfa_offset 20 348205194Sdelphij popl %ebx 349230837Sdelphij .cfi_def_cfa_offset 16 350205194Sdelphij popl %esi 351230837Sdelphij .cfi_def_cfa_offset 12 352205194Sdelphij popl %edi 353230837Sdelphij .cfi_def_cfa_offset 8 354205194Sdelphij popl %ebp 355230837Sdelphij .cfi_def_cfa_offset 4 356230837Sdelphij.cfi_endproc 357205194Sdelphijmatch_init: ret 358