;===========================================================================
; Copyright (c) 1990-2007 Info-ZIP.  All rights reserved.
;
; See the accompanying file LICENSE, version 2000-Apr-09 or later
; (the contents of which are also included in zip.h) for terms of use.
; If, for some reason, all these files are missing, the Info-ZIP license
; also may be found at:  ftp://ftp.info-zip.org/pub/infozip/license.html
;===========================================================================
; crc_i386.asm, optimized CRC calculation function for Zip and UnZip,
; created by Paul Kienitz and Christian Spieler.  Last revised 07 Jan 2007.
;
; Revised 06-Oct-96, Scott Field (sfield@microsoft.com)
;   fixed to assemble with masm by not using .model directive which makes
;   assumptions about segment alignment.  Also,
;   avoid using loop, and j[e]cxz where possible.  Use mov + inc, rather
;   than lodsb, and other misc. changes resulting in the following performance
;   increases:
;
;      unrolled loops                NO_UNROLLED_LOOPS
;      *8    >8      <8              *8      >8      <8
;
;      +54%  +42%    +35%            +82%    +52%    +25%
;
;   first item in each table is input buffer length, even multiple of 8
;   second item in each table is input buffer length, > 8
;   third item in each table is input buffer length, < 8
;
; Revised 02-Apr-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au)
;   Incorporated Rodney Brown's 32-bit-reads optimization as found in the
;   UNIX AS source crc_i386.S. This new code can be disabled by defining
;   the macro symbol NO_32_BIT_LOADS.
;
; Revised 12-Oct-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au)
;   Incorporated Rodney Brown's additional tweaks for 32-bit-optimized CPUs
;   (like the Pentium Pro, Pentium II, and probably some Pentium clones).
;   This optimization is controlled by the macro symbol __686 and is disabled
;   by default. (This default is based on the assumption that most users
;   do not yet work on a Pentium Pro or Pentium II machine ...)
;
; Revised 25-Mar-98, Cosmin Truta (cosmint@cs.ubbcluj.ro)
;   Working without .model directive caused tasm32 version 5.0 to produce
;   bad object code. The optimized alignments can be optionally disabled
;   by defining NO_ALIGN, thus allowing to use .model flat. There is no need
;   to define this macro if using other versions of tasm.
;
; Revised 16-Jan-2005, Cosmin Truta (cosmint@cs.ubbcluj.ro)
;   Enabled the 686 build by default, because there are hardly any pre-686 CPUs
;   in serious use nowadays. (See the 12-Oct-97 note above.)
;
; Revised 03-Jan-2006, Chr. Spieler
;   Enlarged unrolling loops to "do 16 bytes per turn"; optimized access to
;   data buffer in loop body (adjust pointer only once in loop body and use
;   offsets to access each item); added additional support for the "unfolded
;   tables" optimization variant (enabled by IZ_CRCOPTIM_UNFOLDTBL).
;
; Revised 07-Jan-2007, Chr. Spieler
;   Recognize additional conditional flag CRC_TABLE_ONLY that prevents
;   compilation of the crc32() function.
;
; FLAT memory model assumed.
;
; Loop unrolling can be disabled by defining the macro NO_UNROLLED_LOOPS.
; This results in shorter code at the expense of reduced performance.
;
;==============================================================================
;
; Summary of the assembly-time switches tested in this file:
;   USE_ZLIB, CRC_TABLE_ONLY  if either is defined, nothing is assembled
;   NO_ALIGN                  use ".model flat", omit the "align 16" directives
;   PRE_686 / __686           __686 is defined here unless PRE_686 is defined
;   NO_STD_STACKFRAME         address arguments via esp instead of an ebp frame
;   NO_32_BIT_LOADS           fetch the buffer byte by byte only
;   IZ_CRCOPTIM_UNFOLDTBL     dword step reads table+0/+1024/+2048/+3072
;   NO_UNROLLED_LOOPS         assemble only the simple byte loop
;
; Do NOT assemble this source if external crc32 routine from zlib gets used,
; or only the precomputed CRC_32_Table is needed.
;
    IFNDEF USE_ZLIB
    IFNDEF CRC_TABLE_ONLY
;
                .386p
                name    crc_i386

    IFDEF NO_ALIGN
                .model flat
    ENDIF

    IFNDEF PRE_686
    IFNDEF __686
__686   EQU     1 ; optimize for Pentium Pro, Pentium II and compatible CPUs
    ENDIF
    ENDIF

extrn   _get_crc_table:near    ; ZCONST ulg near *get_crc_table(void);

;
; Argument access.  Arg1..Arg3 are only valid after _crc32 has executed
; STD_ENTRY and its five register pushes.
;
    IFNDEF NO_STD_STACKFRAME
        ; Use a `standard' stack frame setup on routine entry and exit.
        ; Actually, this option is set as default, because it results
        ; in smaller code !!
STD_ENTRY       MACRO
                push    ebp
                mov     ebp,esp
        ENDM

        ; [ebp] = saved ebp, [ebp+4] = return address, arguments follow
        Arg1    EQU     08H[ebp]
        Arg2    EQU     0CH[ebp]
        Arg3    EQU     10H[ebp]

STD_LEAVE       MACRO
                pop     ebp
        ENDM

    ELSE  ; NO_STD_STACKFRAME

STD_ENTRY       MACRO
        ENDM

        ; 18H = five pushed registers (5*4) + return address (4);
        ; these offsets must track the push sequence in _crc32
        Arg1    EQU     18H[esp]
        Arg2    EQU     1CH[esp]
        Arg3    EQU     20H[esp]

STD_LEAVE       MACRO
        ENDM

    ENDIF ; ?NO_STD_STACKFRAME

; These two (three) macros make up the loop body of the CRC32 cruncher.
; registers modified:
;   eax  : crc value "c"
;   esi  : pointer to next data byte (or dword) "buf++"
; registers read:
;   edi  : pointer to base of crc_table array
; scratch registers:
;   ebx  : index into crc_table array
;          (requires upper three bytes = 0 when __686 is undefined)

; Do_CRC: one table step, c = (c >> 8) ^ table[c & 0xFF]
    IFNDEF  __686 ; optimize for 386, 486, Pentium
Do_CRC  MACRO
                mov     bl,al                ; tmp = c & 0xFF
                shr     eax,8                ; c = (c >> 8)
                xor     eax,[edi+ebx*4]      ;  ^ table[tmp]
        ENDM
    ELSE ; __686 : optimize for Pentium Pro, Pentium II and compatible CPUs
Do_CRC  MACRO
                movzx   ebx,al               ; tmp = c & 0xFF
                shr     eax,8                ; c = (c >> 8)
                xor     eax,[edi+ebx*4]      ;  ^ table[tmp]
        ENDM
    ENDIF ; ?__686

; Do_CRC_byte: fold in the byte at [esi], advance esi by one
Do_CRC_byte     MACRO
                xor     al, byte ptr [esi]   ; c ^= *buf
                inc     esi                  ; buf++
                Do_CRC                       ; c = (c >> 8) ^ table[c & 0xFF]
        ENDM

; Do_CRC_byteof: fold in the byte at [esi+ofs]; esi is left unchanged
Do_CRC_byteof   MACRO   ofs
                xor     al, byte ptr [esi+ofs] ; c ^= *(buf+ofs)
                Do_CRC                       ; c = (c >> 8) ^ table[c & 0xFF]
        ENDM

    IFNDEF  NO_32_BIT_LOADS
      IFDEF IZ_CRCOPTIM_UNFOLDTBL
        ; the edx register is needed in crc calculation,
        ; so the remaining length is parked in the "len" argument slot
        SavLen  EQU     Arg3

; UpdCRC_dword: update c after a whole dword has been xor-ed into eax.
; Reads four 256-entry tables at edi+0, +1024, +2048 and +3072;
; clobbers ebx and edx.
; NOTE(review): this relies on get_crc_table() returning a table with at
; least 4*256 entries in this configuration -- confirm on the C side.
UpdCRC_dword    MACRO
                movzx   ebx,al               ; tmp = c & 0xFF
                mov     edx,[edi+ebx*4+3072] ;  table[256*3+tmp]
                movzx   ebx,ah               ; tmp = (c>>8) & 0xFF
                shr     eax,16               ;
                xor     edx,[edi+ebx*4+2048] ;  ^ table[256*2+tmp]
                movzx   ebx,al               ; tmp = (c>>16) & 0xFF
                shr     eax,8                ; tmp = (c>>24)
                xor     edx,[edi+ebx*4+1024] ;  ^ table[256*1+tmp]
                mov     eax,[edi+eax*4]      ;  ^ table[256*0+tmp]
                xor     eax,edx              ; ..
        ENDM

; UpdCRC_dword_sh: same update as UpdCRC_dword, additionally advancing
; esi by dwPtrIncr dwords in the middle of the sequence.
UpdCRC_dword_sh MACRO   dwPtrIncr
                movzx   ebx,al               ; tmp = c & 0xFF
                mov     edx,[edi+ebx*4+3072] ;  table[256*3+tmp]
                movzx   ebx,ah               ; tmp = (c>>8) & 0xFF
                xor     edx,[edi+ebx*4+2048] ;  ^ table[256*2+tmp]
                shr     eax,16               ;
                movzx   ebx,al               ; tmp = (c>>16) & 0xFF
                add     esi, 4*dwPtrIncr     ; ((ulg *)buf) += dwPtrIncr
                shr     eax,8                ; tmp = (c>>24)
                xor     edx,[edi+ebx*4+1024] ;  ^ table[256*1+tmp]
                mov     eax,[edi+eax*4]      ;  ^ table[256*0+tmp]
                xor     eax,edx              ; ..
        ENDM

      ELSE ; IZ_CRCOPTIM_UNFOLDTBL
        ; the edx register is not needed anywhere else
        SavLen  EQU     edx

; UpdCRC_dword: four single-byte table steps on the dword held in eax
UpdCRC_dword    MACRO
                Do_CRC
                Do_CRC
                Do_CRC
                Do_CRC
        ENDM

; UpdCRC_dword_sh: as UpdCRC_dword, additionally advancing esi by
; dwPtrIncr dwords between the second and third step
UpdCRC_dword_sh MACRO   dwPtrIncr
                Do_CRC
                Do_CRC
                add     esi, 4*dwPtrIncr     ; ((ulg *)buf) += dwPtrIncr
                Do_CRC
                Do_CRC
        ENDM
      ENDIF ; ?IZ_CRCOPTIM_UNFOLDTBL

; Do_CRC_dword: process the dword at [esi], advance esi by 4
Do_CRC_dword    MACRO
                xor     eax, dword ptr [esi] ; c ^= *(ulg *)buf
                UpdCRC_dword_sh 1            ; ... ((ulg *)buf)++
        ENDM

; Do_CRC_4dword: process 16 bytes at [esi]; esi is advanced only once,
; by 16, inside the last update
Do_CRC_4dword   MACRO
                xor     eax, dword ptr [esi]    ; c ^= *(ulg *)buf
                UpdCRC_dword
                xor     eax, dword ptr [esi+4]  ; c ^= *((ulg *)buf+1)
                UpdCRC_dword
                xor     eax, dword ptr [esi+8]  ; c ^= *((ulg *)buf+2)
                UpdCRC_dword
                xor     eax, dword ptr [esi+12] ; c ^= *((ulg *)buf+3)
                UpdCRC_dword_sh 4               ; ... ((ulg *)buf)+=4
        ENDM

    ELSE ; NO_32_BIT_LOADS
        ; The unrolled 16-byte loop parks the length in SavLen even when only
        ; byte loads are used.  Without this definition the combination
        ; "NO_32_BIT_LOADS defined, NO_UNROLLED_LOOPS undefined" references an
        ; undefined symbol.  edx is not touched by the byte-wise loop body
        ; and is saved/restored by _crc32, so it is free to hold the length.
        SavLen  EQU     edx
    ENDIF ; ?NO_32_BIT_LOADS

    IFNDEF  NO_ALIGN
_TEXT   segment use32 para public 'CODE'
    ELSE
_TEXT   segment use32
    ENDIF
        assume  CS: _TEXT

;------------------------------------------------------------------------------
; ulg crc32(ulg crc, ZCONST uch *buf, extent len)
;
; Calling: 32-bit flat model, arguments on the stack (Arg1 = crc,
;          Arg2 = buf, Arg3 = len); plain "ret", so the caller removes
;          the arguments.
; Out:     eax = 0 when buf is NULL, otherwise the complemented running
;          value c after feeding len bytes of buf into c = ~crc.
; Saved:   edi, esi, ebx, edx, ecx (and ebp with the standard stack frame)
;          are pushed on entry and restored on exit.  Flags are clobbered.
; Calls:   _get_crc_table; its eax result becomes the table base (edi).
; Note:    with IZ_CRCOPTIM_UNFOLDTBL and 32-bit loads, SavLen is Arg3, so
;          the caller's "len" stack slot is overwritten.
;
; Register roles inside the routine:
;   eax = running crc "c"      esi = buf pointer      edi = table base
;   ecx = loop counter         ebx = table index      SavLen = saved length
;------------------------------------------------------------------------------
        public  _crc32
_crc32          proc    near  ; ulg crc32(ulg crc, ZCONST uch *buf, extent len)
                STD_ENTRY
                push    edi
                push    esi
                push    ebx
                push    edx
                push    ecx

                mov     esi,Arg2             ; 2nd arg: uch *buf
                sub     eax,eax              ;> if (!buf)
                test    esi,esi              ;>   return 0;
                jz      fine                 ;> else {

                call    _get_crc_table
                mov     edi,eax
                mov     eax,Arg1             ; 1st arg: ulg crc
    IFNDEF __686
                sub     ebx,ebx              ; ebx=0; make bl usable as a dword
    ENDIF
                mov     ecx,Arg3             ; 3rd arg: extent len
                not     eax                  ;>   c = ~crc;

                test    ecx,ecx
    IFNDEF  NO_UNROLLED_LOOPS
                jz      bail
      IFNDEF  NO_32_BIT_LOADS
                ; consume single bytes until buf is dword aligned
                ; (or the input is exhausted)
align_loop:
                test    esi,3                ; align buf pointer on next
                jz      SHORT aligned_now    ;  dword boundary
                Do_CRC_byte
                dec     ecx
                jnz     align_loop
aligned_now:
      ENDIF ; !NO_32_BIT_LOADS
                mov     SavLen,ecx           ; save current len for later
                shr     ecx,4                ; ecx = len / 16
                jz      No_Sixteens
      IFNDEF NO_ALIGN
; align loop head at start of 486 internal cache line !!
                align   16
      ENDIF
Next_Sixteen:
      IFNDEF  NO_32_BIT_LOADS
                Do_CRC_4dword
      ELSE ; NO_32_BIT_LOADS
                Do_CRC_byteof   0
                Do_CRC_byteof   1
                Do_CRC_byteof   2
                Do_CRC_byteof   3
                Do_CRC_byteof   4
                Do_CRC_byteof   5
                Do_CRC_byteof   6
                Do_CRC_byteof   7
                Do_CRC_byteof   8
                Do_CRC_byteof   9
                Do_CRC_byteof   10
                Do_CRC_byteof   11
                Do_CRC_byteof   12
                Do_CRC_byteof   13
                Do_CRC_byteof   14
                Do_CRC_byteof   15
                add     esi, 16              ; buf += 16
      ENDIF ; ?NO_32_BIT_LOADS
                dec     ecx
                jnz     Next_Sixteen
No_Sixteens:
                mov     ecx,SavLen
                and     ecx,00000000FH       ; ecx = len % 16
      IFNDEF  NO_32_BIT_LOADS
                shr     ecx,2                ; ecx = (len % 16) / 4
                jz      SHORT No_Fours
Next_Four:
                Do_CRC_dword
                dec     ecx
                jnz     Next_Four
No_Fours:
                mov     ecx,SavLen
                and     ecx,000000003H       ; ecx = len % 4
      ENDIF ; !NO_32_BIT_LOADS
    ENDIF ; !NO_UNROLLED_LOOPS
                ; ZF here comes from the last "test"/"and" on ecx above
                jz      SHORT bail           ;>   if (len)
    IFNDEF NO_ALIGN
; align loop head at start of 486 internal cache line !!
                align   16
    ENDIF
loupe:                                       ;>     do {
                Do_CRC_byte                  ;        c = CRC32(c,*buf++,crctab);
                dec     ecx                  ;>     } while (--len);
                jnz     loupe

bail:                                        ;> }
                not     eax                  ;> return ~c;
fine:
                pop     ecx
                pop     edx
                pop     ebx
                pop     esi
                pop     edi
                STD_LEAVE
                ret
_crc32          endp

_TEXT   ends
;
    ENDIF ; !CRC_TABLE_ONLY
    ENDIF ; !USE_ZLIB
;
end