match.S revision 205194
1189623Srwatson/* match.S -- x86 assembly version of the zlib longest_match() function. 2189623Srwatson * Optimized for the Intel 686 chips (PPro and later). 3189623Srwatson * 4189623Srwatson * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com> 5189623Srwatson * 6189623Srwatson * This software is provided 'as-is', without any express or implied 7189623Srwatson * warranty. In no event will the author be held liable for any damages 8189623Srwatson * arising from the use of this software. 9189623Srwatson * 10189623Srwatson * Permission is granted to anyone to use this software for any purpose, 11189623Srwatson * including commercial applications, and to alter it and redistribute it 12189623Srwatson * freely, subject to the following restrictions: 13189623Srwatson * 14189623Srwatson * 1. The origin of this software must not be misrepresented; you must not 15189623Srwatson * claim that you wrote the original software. If you use this software 16189623Srwatson * in a product, an acknowledgment in the product documentation would be 17189623Srwatson * appreciated but is not required. 18189623Srwatson * 2. Altered source versions must be plainly marked as such, and must not be 19189623Srwatson * misrepresented as being the original software. 20189623Srwatson * 3. This notice may not be removed or altered from any source distribution. 21189623Srwatson */ 22189623Srwatson 23189623Srwatson#ifndef NO_UNDERLINE 24189623Srwatson#define match_init _match_init 25189623Srwatson#define longest_match _longest_match 26189623Srwatson#endif 27189623Srwatson 28189623Srwatson#define MAX_MATCH (258) 29189623Srwatson#define MIN_MATCH (3) 30189623Srwatson#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) 31189623Srwatson#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) 32189623Srwatson 33189623Srwatson/* stack frame offsets */ 34189623Srwatson 35189623Srwatson#define chainlenwmask 0 /* high word: current chain len */ 36189623Srwatson /* low word: s->wmask */ 37189623Srwatson#define window 4 /* local copy of s->window */ 38189623Srwatson#define windowbestlen 8 /* s->window + bestlen */ 39189623Srwatson#define scanstart 16 /* first two bytes of string */ 40189623Srwatson#define scanend 12 /* last two bytes of string */ 41189623Srwatson#define scanalign 20 /* dword-misalignment of string */ 42189623Srwatson#define nicematch 24 /* a good enough match size */ 43189623Srwatson#define bestlen 28 /* size of best match so far */ 44189623Srwatson#define scan 32 /* ptr to string wanting match */ 45189623Srwatson 46189623Srwatson#define LocalVarsSize (36) 47189623Srwatson/* saved ebx 36 */ 48189623Srwatson/* saved edi 40 */ 49189623Srwatson/* saved esi 44 */ 50189623Srwatson/* saved ebp 48 */ 51189623Srwatson/* return address 52 */ 52189623Srwatson#define deflatestate 56 /* the function arguments */ 53189623Srwatson#define curmatch 60 54189623Srwatson 55189623Srwatson/* All the +zlib1222add offsets are due to the addition of fields 56189623Srwatson * in zlib in the deflate_state structure since the asm code was first written 57189623Srwatson * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 58189623Srwatson * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 59189623Srwatson * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 60189623Srwatson */ 61189623Srwatson 62189623Srwatson#define zlib1222add (8) 63189623Srwatson 64189623Srwatson#define dsWSize (36+zlib1222add) 65189623Srwatson#define dsWMask (44+zlib1222add) 66189623Srwatson#define dsWindow (48+zlib1222add) 67189623Srwatson#define dsPrev (56+zlib1222add) 68189623Srwatson#define dsMatchLen (88+zlib1222add) 69189623Srwatson#define dsPrevMatch (92+zlib1222add) 70189623Srwatson#define dsStrStart (100+zlib1222add) 71189623Srwatson#define dsMatchStart (104+zlib1222add) 72189623Srwatson#define dsLookahead (108+zlib1222add) 73189623Srwatson#define dsPrevLen (112+zlib1222add) 74189623Srwatson#define dsMaxChainLen (116+zlib1222add) 75189623Srwatson#define dsGoodMatch (132+zlib1222add) 76189623Srwatson#define dsNiceMatch (136+zlib1222add) 77189623Srwatson 78189623Srwatson 79189623Srwatson.file "match.S" 80189623Srwatson 81189623Srwatson.globl match_init, longest_match 82189623Srwatson 83189623Srwatson.text 84189623Srwatson 85189623Srwatson/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ 86189623Srwatson 87189623Srwatsonlongest_match: 88189623Srwatson 89189623Srwatson/* Save registers that the compiler may be using, and adjust %esp to */ 90189623Srwatson/* make room for our stack frame. */ 91189623Srwatson 92189623Srwatson pushl %ebp 93189623Srwatson pushl %edi 94189623Srwatson pushl %esi 95189623Srwatson pushl %ebx 96189623Srwatson subl $LocalVarsSize, %esp 97189623Srwatson 98189623Srwatson/* Retrieve the function arguments. %ecx will hold cur_match */ 99189623Srwatson/* throughout the entire function. %edx will hold the pointer to the */ 100189623Srwatson/* deflate_state structure during the function's setup (before */ 101189623Srwatson/* entering the main loop). */ 102189623Srwatson 103189623Srwatson movl deflatestate(%esp), %edx 104189623Srwatson movl curmatch(%esp), %ecx 105189623Srwatson 106189623Srwatson/* uInt wmask = s->w_mask; */ 107189623Srwatson/* unsigned chain_length = s->max_chain_length; */ 108189623Srwatson/* if (s->prev_length >= s->good_match) { */ 109189623Srwatson/* chain_length >>= 2; */ 110189623Srwatson/* } */ 111189623Srwatson 112189623Srwatson movl dsPrevLen(%edx), %eax 113189623Srwatson movl dsGoodMatch(%edx), %ebx 114189623Srwatson cmpl %ebx, %eax 115189623Srwatson movl dsWMask(%edx), %eax 116189623Srwatson movl dsMaxChainLen(%edx), %ebx 117189623Srwatson jl LastMatchGood 118189623Srwatson shrl $2, %ebx 119189623SrwatsonLastMatchGood: 120189623Srwatson 121189623Srwatson/* chainlen is decremented once beforehand so that the function can */ 122189623Srwatson/* use the sign flag instead of the zero flag for the exit test. */ 123189623Srwatson/* It is then shifted into the high word, to make room for the wmask */ 124189623Srwatson/* value, which it will always accompany. */ 125189623Srwatson 126189623Srwatson decl %ebx 127189623Srwatson shll $16, %ebx 128189623Srwatson orl %eax, %ebx 129189623Srwatson movl %ebx, chainlenwmask(%esp) 130189623Srwatson 131189623Srwatson/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ 132189623Srwatson 133189623Srwatson movl dsNiceMatch(%edx), %eax 134189623Srwatson movl dsLookahead(%edx), %ebx 135189623Srwatson cmpl %eax, %ebx 136189623Srwatson jl LookaheadLess 137189623Srwatson movl %eax, %ebx 138189623SrwatsonLookaheadLess: movl %ebx, nicematch(%esp) 139189623Srwatson 140189623Srwatson/* register Bytef *scan = s->window + s->strstart; */ 141189623Srwatson 142189623Srwatson movl dsWindow(%edx), %esi 143189623Srwatson movl %esi, window(%esp) 144189623Srwatson movl dsStrStart(%edx), %ebp 145189623Srwatson lea (%esi,%ebp), %edi 146189623Srwatson movl %edi, scan(%esp) 147189623Srwatson 148189623Srwatson/* Determine how many bytes the scan ptr is off from being */ 149189623Srwatson/* dword-aligned. */ 150189623Srwatson 151189623Srwatson movl %edi, %eax 152189623Srwatson negl %eax 153189623Srwatson andl $3, %eax 154189623Srwatson movl %eax, scanalign(%esp) 155189623Srwatson 156189623Srwatson/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 157189623Srwatson/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 158189623Srwatson 159189623Srwatson movl dsWSize(%edx), %eax 160189623Srwatson subl $MIN_LOOKAHEAD, %eax 161189623Srwatson subl %eax, %ebp 162189623Srwatson jg LimitPositive 163189623Srwatson xorl %ebp, %ebp 164189623SrwatsonLimitPositive: 165189623Srwatson 166189623Srwatson/* int best_len = s->prev_length; */ 167189623Srwatson 168189623Srwatson movl dsPrevLen(%edx), %eax 169189623Srwatson movl %eax, bestlen(%esp) 170189623Srwatson 171189623Srwatson/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 172189623Srwatson 173189623Srwatson addl %eax, %esi 174189623Srwatson movl %esi, windowbestlen(%esp) 175189623Srwatson 176189623Srwatson/* register ush scan_start = *(ushf*)scan; */ 177189623Srwatson/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 178189623Srwatson/* Posf *prev = s->prev; */ 179189623Srwatson 180189623Srwatson movzwl (%edi), %ebx 181189623Srwatson movl %ebx, scanstart(%esp) 182189623Srwatson movzwl -1(%edi,%eax), %ebx 183189623Srwatson movl %ebx, scanend(%esp) 184189623Srwatson movl dsPrev(%edx), %edi 185189623Srwatson 186189623Srwatson/* Jump into the main loop. */ 187189623Srwatson 188189623Srwatson movl chainlenwmask(%esp), %edx 189189623Srwatson jmp LoopEntry 190189623Srwatson 191189623Srwatson.balign 16 192189623Srwatson 193189623Srwatson/* do { 194189623Srwatson * match = s->window + cur_match; 195189623Srwatson * if (*(ushf*)(match+best_len-1) != scan_end || 196189623Srwatson * *(ushf*)match != scan_start) continue; 197189623Srwatson * [...] 198189623Srwatson * } while ((cur_match = prev[cur_match & wmask]) > limit 199189623Srwatson * && --chain_length != 0); 200189623Srwatson * 201189623Srwatson * Here is the inner loop of the function. The function will spend the 202189623Srwatson * majority of its time in this loop, and majority of that time will 203189623Srwatson * be spent in the first ten instructions. 204189623Srwatson * 205189623Srwatson * Within this loop: 206189623Srwatson * %ebx = scanend 207189623Srwatson * %ecx = curmatch 208189623Srwatson * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 209189623Srwatson * %esi = windowbestlen - i.e., (window + bestlen) 210189623Srwatson * %edi = prev 211189623Srwatson * %ebp = limit 212189623Srwatson */ 213189623SrwatsonLookupLoop: 214189623Srwatson andl %edx, %ecx 215189623Srwatson movzwl (%edi,%ecx,2), %ecx 216189623Srwatson cmpl %ebp, %ecx 217189623Srwatson jbe LeaveNow 218189623Srwatson subl $0x00010000, %edx 219189623Srwatson js LeaveNow 220189623SrwatsonLoopEntry: movzwl -1(%esi,%ecx), %eax 221189623Srwatson cmpl %ebx, %eax 222189623Srwatson jnz LookupLoop 223189623Srwatson movl window(%esp), %eax 224189623Srwatson movzwl (%eax,%ecx), %eax 225189623Srwatson cmpl scanstart(%esp), %eax 226189623Srwatson jnz LookupLoop 227189623Srwatson 228189623Srwatson/* Store the current value of chainlen. */ 229189623Srwatson 230189623Srwatson movl %edx, chainlenwmask(%esp) 231189623Srwatson 232189623Srwatson/* Point %edi to the string under scrutiny, and %esi to the string we */ 233189623Srwatson/* are hoping to match it up with. In actuality, %esi and %edi are */ 234189623Srwatson/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 235189623Srwatson/* initialized to -(MAX_MATCH_8 - scanalign). */ 236189623Srwatson 237189623Srwatson movl window(%esp), %esi 238189623Srwatson movl scan(%esp), %edi 239189623Srwatson addl %ecx, %esi 240189623Srwatson movl scanalign(%esp), %eax 241189623Srwatson movl $(-MAX_MATCH_8), %edx 242189623Srwatson lea MAX_MATCH_8(%edi,%eax), %edi 243189623Srwatson lea MAX_MATCH_8(%esi,%eax), %esi 244189623Srwatson 245189623Srwatson/* Test the strings for equality, 8 bytes at a time. At the end, 246189623Srwatson * adjust %edx so that it is offset to the exact byte that mismatched. 247189623Srwatson * 248189623Srwatson * We already know at this point that the first three bytes of the 249189623Srwatson * strings match each other, and they can be safely passed over before 250189623Srwatson * starting the compare loop. So what this code does is skip over 0-3 251189623Srwatson * bytes, as much as necessary in order to dword-align the %edi 252189623Srwatson * pointer. (%esi will still be misaligned three times out of four.) 253189623Srwatson * 254189623Srwatson * It should be confessed that this loop usually does not represent 255189623Srwatson * much of the total running time. Replacing it with a more 256189623Srwatson * straightforward "rep cmpsb" would not drastically degrade 257189623Srwatson * performance. 258189623Srwatson */ 259189623SrwatsonLoopCmps: 260189623Srwatson movl (%esi,%edx), %eax 261189623Srwatson xorl (%edi,%edx), %eax 262189623Srwatson jnz LeaveLoopCmps 263189623Srwatson movl 4(%esi,%edx), %eax 264189623Srwatson xorl 4(%edi,%edx), %eax 265189623Srwatson jnz LeaveLoopCmps4 266189623Srwatson addl $8, %edx 267189623Srwatson jnz LoopCmps 268189623Srwatson jmp LenMaximum 269189623SrwatsonLeaveLoopCmps4: addl $4, %edx 270189623SrwatsonLeaveLoopCmps: testl $0x0000FFFF, %eax 271189623Srwatson jnz LenLower 272189623Srwatson addl $2, %edx 273189623Srwatson shrl $16, %eax 274189623SrwatsonLenLower: subb $1, %al 275189623Srwatson adcl $0, %edx 276189623Srwatson 277189623Srwatson/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 278189623Srwatson/* then automatically accept it as the best possible match and leave. */ 279189623Srwatson 280189623Srwatson lea (%edi,%edx), %eax 281189623Srwatson movl scan(%esp), %edi 282189623Srwatson subl %edi, %eax 283189623Srwatson cmpl $MAX_MATCH, %eax 284189623Srwatson jge LenMaximum 285189623Srwatson 286189623Srwatson/* If the length of the match is not longer than the best match we */ 287189623Srwatson/* have so far, then forget it and return to the lookup loop. */ 288189623Srwatson 289189623Srwatson movl deflatestate(%esp), %edx 290189623Srwatson movl bestlen(%esp), %ebx 291189623Srwatson cmpl %ebx, %eax 292189623Srwatson jg LongerMatch 293189623Srwatson movl windowbestlen(%esp), %esi 294189623Srwatson movl dsPrev(%edx), %edi 295189623Srwatson movl scanend(%esp), %ebx 296189623Srwatson movl chainlenwmask(%esp), %edx 297189623Srwatson jmp LookupLoop 298189623Srwatson 299189623Srwatson/* s->match_start = cur_match; */ 300189623Srwatson/* best_len = len; */ 301189623Srwatson/* if (len >= nice_match) break; */ 302189623Srwatson/* scan_end = *(ushf*)(scan+best_len-1); */ 303189623Srwatson 304189623SrwatsonLongerMatch: movl nicematch(%esp), %ebx 305189623Srwatson movl %eax, bestlen(%esp) 306189623Srwatson movl %ecx, dsMatchStart(%edx) 307189623Srwatson cmpl %ebx, %eax 308189623Srwatson jge LeaveNow 309189623Srwatson movl window(%esp), %esi 310189623Srwatson addl %eax, %esi 311189623Srwatson movl %esi, windowbestlen(%esp) 312189623Srwatson movzwl -1(%edi,%eax), %ebx 313189623Srwatson movl dsPrev(%edx), %edi 314189623Srwatson movl %ebx, scanend(%esp) 315189623Srwatson movl chainlenwmask(%esp), %edx 316189623Srwatson jmp LookupLoop 317189623Srwatson 318189623Srwatson/* Accept the current string, with the maximum possible length. */ 319189623Srwatson 320189623SrwatsonLenMaximum: movl deflatestate(%esp), %edx 321189623Srwatson movl $MAX_MATCH, bestlen(%esp) 322189623Srwatson movl %ecx, dsMatchStart(%edx) 323189623Srwatson 324189623Srwatson/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 325189623Srwatson/* return s->lookahead; */ 326189623Srwatson 327189623SrwatsonLeaveNow: 328189623Srwatson movl deflatestate(%esp), %edx 329189623Srwatson movl bestlen(%esp), %ebx 330189623Srwatson movl dsLookahead(%edx), %eax 331189623Srwatson cmpl %eax, %ebx 332189623Srwatson jg LookaheadRet 333189623Srwatson movl %ebx, %eax 334189623SrwatsonLookaheadRet: 335189623Srwatson 336189623Srwatson/* Restore the stack and return from whence we came. */ 337189623Srwatson 338189623Srwatson addl $LocalVarsSize, %esp 339189623Srwatson popl %ebx 340189623Srwatson popl %esi 341 popl %edi 342 popl %ebp 343match_init: ret 344