;===========================================================================
; Copyright (c) 1990-2007 Info-ZIP.  All rights reserved.
;
; See the accompanying file LICENSE, version 2000-Apr-09 or later
; (the contents of which are also included in zip.h) for terms of use.
; If, for some reason, all these files are missing, the Info-ZIP license
; also may be found at:  ftp://ftp.info-zip.org/pub/infozip/license.html
;===========================================================================
; crc_i386.asm, optimized CRC calculation function for Zip and UnZip,
; created by Paul Kienitz and Christian Spieler.  Last revised 07 Jan 2007.
;
; Revised 06-Oct-96, Scott Field (sfield@microsoft.com)
;   fixed to assemble with masm by not using .model directive which makes
;   assumptions about segment alignment.  Also,
;   avoid using loop, and j[e]cxz where possible.  Use mov + inc, rather
;   than lodsb, and other misc. changes resulting in the following performance
;   increases:
;
;      unrolled loops                NO_UNROLLED_LOOPS
;      *8    >8      <8              *8      >8      <8
;
;      +54%  +42%    +35%            +82%    +52%    +25%
;
;   first item in each table is input buffer length, even multiple of 8
;   second item in each table is input buffer length, > 8
;   third item in each table is input buffer length, < 8
;
; Revised 02-Apr-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au)
;   Incorporated Rodney Brown's 32-bit-reads optimization as found in the
;   UNIX AS source crc_i386.S. This new code can be disabled by defining
;   the macro symbol NO_32_BIT_LOADS.
;
; Revised 12-Oct-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au)
;   Incorporated Rodney Brown's additional tweaks for 32-bit-optimized CPUs
;   (like the Pentium Pro, Pentium II, and probably some Pentium clones).
;   This optimization is controlled by the macro symbol __686 and is disabled
;   by default. (This default is based on the assumption that most users
;   do not yet work on a Pentium Pro or Pentium II machine ...)
;
; Revised 25-Mar-98, Cosmin Truta (cosmint@cs.ubbcluj.ro)
;   Working without .model directive caused tasm32 version 5.0 to produce
;   bad object code. The optimized alignments can be optionally disabled
;   by defining NO_ALIGN, thus allowing the use of .model flat. There is no
;   need to define this macro if using other versions of tasm.
;
; Revised 16-Jan-2005, Cosmin Truta (cosmint@cs.ubbcluj.ro)
;   Enabled the 686 build by default, because there are hardly any pre-686 CPUs
;   in serious use nowadays. (See the 12-Oct-97 note above.)
;
; Revised 03-Jan-2006, Chr. Spieler
;   Enlarged unrolling loops to "do 16 bytes per turn"; optimized access to
;   data buffer in loop body (adjust pointer only once in loop body and use
;   offsets to access each item); added additional support for the "unfolded
;   tables" optimization variant (enabled by IZ_CRCOPTIM_UNFOLDTBL).
;
; Revised 07-Jan-2007, Chr. Spieler
;   Recognize additional conditional flag CRC_TABLE_ONLY that prevents
;   compilation of the crc32() function.
;
; FLAT memory model assumed.
;
; Loop unrolling can be disabled by defining the macro NO_UNROLLED_LOOPS.
; This results in shorter code at the expense of reduced performance.
;
;==============================================================================
;
; Do NOT assemble this source if external crc32 routine from zlib gets used,
; or only the precomputed CRC_32_Table is needed.
;
;
; Module guard and assembler setup.
; Nothing below is assembled when USE_ZLIB or CRC_TABLE_ONLY is defined; the
; two IFNDEFs opened here are closed by the ENDIFs at the end of the file.
;
    IFNDEF USE_ZLIB
    IFNDEF CRC_TABLE_ONLY
;
        .386p
        name    crc_i386

; NO_ALIGN builds use the .model directive instead of explicit segment
; alignment attributes.
    IFDEF NO_ALIGN
        .model flat
    ENDIF

; Default to the 686-tuned instruction sequences unless the builder defined
; PRE_686, or has already defined __686 itself.
    IFNDEF PRE_686
    IFNDEF __686
__686   EQU     1 ; optimize for Pentium Pro, Pentium II and compatible CPUs
    ENDIF
    ENDIF

; External routine; crc32() calls it and uses the value returned in eax as
; the base address of the CRC table.
extrn   _get_crc_table:near    ; ZCONST ulg near *get_crc_table(void);

;
; Stack-frame helpers.
;   STD_ENTRY / STD_LEAVE : routine prologue / epilogue
;   Arg1 .. Arg3          : text equates addressing the three dword stack
;                           arguments of crc32()
;
    IFNDEF NO_STD_STACKFRAME
        ; Use a `standard' stack frame setup on routine entry and exit.
        ; Actually, this option is set as default, because it results
        ; in smaller code !!
STD_ENTRY       MACRO
                push    ebp
                mov     ebp,esp
        ENDM

        ; ebp-relative: [ebp] = saved ebp, [ebp+4] = return address,
        ; so the first argument lives at [ebp+8].
        Arg1    EQU     08H[ebp]
        Arg2    EQU     0CH[ebp]
        Arg3    EQU     10H[ebp]

STD_LEAVE       MACRO
                pop     ebp
        ENDM

    ELSE  ; NO_STD_STACKFRAME

        ; No frame pointer: the prologue and epilogue macros expand to nothing.
STD_ENTRY       MACRO
        ENDM

        ; esp-relative: valid only after crc32() has pushed its five
        ; registers (edi, esi, ebx, edx, ecx = 14H bytes) and while esp is
        ; otherwise unchanged; 14H + 4 (return address) = 18H.
        Arg1    EQU     18H[esp]
        Arg2    EQU     1CH[esp]
        Arg3    EQU     20H[esp]

STD_LEAVE       MACRO
        ENDM

    ENDIF ; ?NO_STD_STACKFRAME

; These two (three) macros make up the loop body of the CRC32 cruncher.
; registers modified:
;   eax  : crc value "c"
;   esi  : pointer to next data byte (or dword) "buf++"
; registers read:
;   edi  : pointer to base of crc_table array
; scratch registers:
;   ebx  : index into crc_table array
;          (requires upper three bytes = 0 when __686 is undefined)

; Do_CRC: one table step, c = (c >> 8) ^ table[c & 0xFF].
; The caller must already have XORed the data byte into al.
    IFNDEF  __686 ; optimize for 386, 486, Pentium
Do_CRC  MACRO
                mov     bl,al                ; tmp = c & 0xFF
                shr     eax,8                ; c = (c >> 8)
                xor     eax,[edi+ebx*4]      ;  ^ table[tmp]
        ENDM
    ELSE ; __686 : optimize for Pentium Pro, Pentium II and compatible CPUs
Do_CRC  MACRO
                movzx   ebx,al                 ; tmp = c & 0xFF
                shr     eax,8                  ; c = (c >> 8)
                xor     eax,[edi+ebx*4]        ;  ^ table[tmp]
        ENDM
    ENDIF ; ?__686

; Do_CRC_byte: consume the byte at [esi] and advance esi by one.
Do_CRC_byte     MACRO
                xor     al, byte ptr [esi]     ; c ^= *buf
                inc     esi                    ; buf++
                Do_CRC                         ; c = (c >> 8) ^ table[c & 0xFF]
        ENDM

; Do_CRC_byteof ofs: consume the byte at [esi+ofs]; esi is left unchanged,
; so the caller adjusts the pointer once after a run of these.
Do_CRC_byteof   MACRO   ofs
                xor     al, byte ptr [esi+ofs] ; c ^= *(buf+ofs)
                Do_CRC                         ; c = (c >> 8) ^ table[c & 0xFF]
        ENDM
;
; Dword-at-a-time loop-body macros and the SavLen equate.
;
; SavLen names the location where crc32() parks the remaining byte count
; while ecx is busy as a loop counter.  It must be defined in EVERY build
; that keeps the unrolled loops, including NO_32_BIT_LOADS builds, because
; crc32() refers to it outside of its NO_32_BIT_LOADS conditionals.
;
; Macros defined here (only when NO_32_BIT_LOADS is undefined):
;   UpdCRC_dword        fold the 4 bytes already XORed into eax
;   UpdCRC_dword_sh n   same, and additionally advance esi by 4*n bytes
;   Do_CRC_dword        consume one dword at [esi], esi += 4
;   Do_CRC_4dword       consume four dwords at [esi], esi += 16
;
    IFNDEF  NO_32_BIT_LOADS
      IFDEF IZ_CRCOPTIM_UNFOLDTBL
        ; the edx register is needed in crc calculation
        SavLen  EQU     Arg3

; Unfolded-table variant: each of the four bytes of eax indexes its own
; 256-entry dword table, located at byte offsets 3072, 2048, 1024 and 0
; from edi.  Clobbers ebx and edx.
UpdCRC_dword    MACRO
                movzx   ebx,al                 ; tmp = c & 0xFF
                mov     edx,[edi+ebx*4+3072]   ;  table[256*3+tmp]
                movzx   ebx,ah                 ; tmp = (c>>8) & 0xFF
                shr     eax,16                 ;
                xor     edx,[edi+ebx*4+2048]   ;  ^ table[256*2+tmp]
                movzx   ebx,al                 ; tmp = (c>>16) & 0xFF
                shr     eax,8                  ; tmp = (c>>24)
                xor     edx,[edi+ebx*4+1024]   ;  ^ table[256*1+tmp]
                mov     eax,[edi+eax*4]        ;  ^ table[256*0+tmp]
                xor     eax,edx                ; ..
        ENDM
UpdCRC_dword_sh MACRO   dwPtrIncr
                movzx   ebx,al                 ; tmp = c & 0xFF
                mov     edx,[edi+ebx*4+3072]   ;  table[256*3+tmp]
                movzx   ebx,ah                 ; tmp = (c>>8) & 0xFF
                xor     edx,[edi+ebx*4+2048]   ;  ^ table[256*2+tmp]
                shr     eax,16                 ;
                movzx   ebx,al                 ; tmp = (c>>16) & 0xFF
                add     esi, 4*dwPtrIncr       ; ((ulg *)buf) += dwPtrIncr
                shr     eax,8                  ; tmp = (c>>24)
                xor     edx,[edi+ebx*4+1024]   ;  ^ table[256*1+tmp]
                mov     eax,[edi+eax*4]        ;  ^ table[256*0+tmp]
                xor     eax,edx                ; ..
        ENDM
      ELSE ; IZ_CRCOPTIM_UNFOLDTBL
        ; the edx register is not needed anywhere else
        SavLen  EQU     edx

; Single-table variant: four consecutive byte steps on the dword in eax.
UpdCRC_dword    MACRO
                Do_CRC
                Do_CRC
                Do_CRC
                Do_CRC
        ENDM
UpdCRC_dword_sh MACRO   dwPtrIncr
                Do_CRC
                Do_CRC
                add     esi, 4*dwPtrIncr       ; ((ulg *)buf) += dwPtrIncr
                Do_CRC
                Do_CRC
        ENDM
      ENDIF ; ?IZ_CRCOPTIM_UNFOLDTBL
Do_CRC_dword    MACRO
                xor     eax, dword ptr [esi]   ; c ^= *(ulg *)buf
                UpdCRC_dword_sh 1              ; ... ((ulg *)buf)++
        ENDM
Do_CRC_4dword   MACRO
                xor     eax, dword ptr [esi]    ; c ^= *(ulg *)buf
                UpdCRC_dword
                xor     eax, dword ptr [esi+4]  ; c ^= *((ulg *)buf+1)
                UpdCRC_dword
                xor     eax, dword ptr [esi+8]  ; c ^= *((ulg *)buf+2)
                UpdCRC_dword
                xor     eax, dword ptr [esi+12] ; c ^= *((ulg *)buf+3)
                UpdCRC_dword_sh 4               ; ... ((ulg *)buf)+=4
        ENDM
    ELSE ; NO_32_BIT_LOADS
        ; Byte-wise builds never touch edx in the loop bodies (Do_CRC_byteof
        ; uses only eax, ebx, esi and edi), so the remaining length can be
        ; kept in edx.  Without this equate a build that defines
        ; NO_32_BIT_LOADS but keeps the unrolled loops references an
        ; undefined SavLen symbol.
        SavLen  EQU     edx
    ENDIF ; ?NO_32_BIT_LOADS
;
; Code segment.  In the default build the segment is declared `para'
; (16-byte aligned), matching the `align 16' directives placed in front of
; the loop heads below.
;
    IFNDEF NO_ALIGN
_TEXT   segment use32 para public 'CODE'
    ELSE
_TEXT   segment use32
    ENDIF
        assume  CS: _TEXT

        public  _crc32
;-----------------------------------------------------------------------
; ulg crc32(ulg crc, ZCONST uch *buf, extent len)
;
; In:    Arg1 = crc   running CRC value
;        Arg2 = buf   pointer to the data; NULL makes the routine return 0
;        Arg3 = len   number of bytes at buf
;        (all three are read from the stack through the Arg1..Arg3 equates)
; Out:   eax = ~c, where c starts as ~crc and is updated with every byte
;              of buf through the table returned by _get_crc_table;
;        eax = 0 when buf is NULL
; Saved: edi, esi, ebx, edx, ecx are pushed on entry and popped on exit
;        (plus ebp when the standard stack frame is in use)
;
; Register roles after setup:
;   eax = c (working CRC)       esi = buf (advances as data is consumed)
;   edi = CRC table base        ecx = current loop counter
;   ebx = table index scratch   SavLen = bytes left after alignment
;-----------------------------------------------------------------------
_crc32          proc    near  ; ulg crc32(ulg crc, ZCONST uch *buf, extent len)
                STD_ENTRY
                push    edi
                push    esi
                push    ebx
                push    edx
                push    ecx

                mov     esi,Arg2            ; 2nd arg: uch *buf
                sub     eax,eax             ;> if (!buf)
                test    esi,esi             ;>   return 0;
                jz      fine                ;> else {

                call    _get_crc_table      ; eax = table base
                mov     edi,eax
                mov     eax,Arg1            ; 1st arg: ulg crc
    IFNDEF __686
                sub     ebx,ebx             ; ebx=0; make bl usable as a dword
    ENDIF
                mov     ecx,Arg3            ; 3rd arg: extent len
                not     eax                 ;>   c = ~crc;

                ; ZF from this test feeds `jz bail' below when unrolling is
                ; enabled, or the final `jz SHORT bail' when it is not.
                test    ecx,ecx
    IFNDEF  NO_UNROLLED_LOOPS
                jz      bail
    IFNDEF  NO_32_BIT_LOADS
                ; Phase 1: single bytes until esi is dword aligned or the
                ; data runs out (ecx = 0 then falls through every later phase).
align_loop:
                test    esi,3               ; align buf pointer on next
                jz      SHORT aligned_now   ;  dword boundary
                Do_CRC_byte
                dec     ecx
                jnz     align_loop
aligned_now:
    ENDIF ; !NO_32_BIT_LOADS
                ; Phase 2: blocks of 16 bytes.
                mov     SavLen,ecx          ; save current len for later
                shr     ecx,4               ; ecx = len / 16
                jz      No_Sixteens
    IFNDEF NO_ALIGN
; align loop head at start of 486 internal cache line !!
                align   16
    ENDIF
Next_Sixteen:
    IFNDEF  NO_32_BIT_LOADS
                Do_CRC_4dword               ; 16 bytes, esi += 16
    ELSE ; NO_32_BIT_LOADS
                Do_CRC_byteof   0
                Do_CRC_byteof   1
                Do_CRC_byteof   2
                Do_CRC_byteof   3
                Do_CRC_byteof   4
                Do_CRC_byteof   5
                Do_CRC_byteof   6
                Do_CRC_byteof   7
                Do_CRC_byteof   8
                Do_CRC_byteof   9
                Do_CRC_byteof   10
                Do_CRC_byteof   11
                Do_CRC_byteof   12
                Do_CRC_byteof   13
                Do_CRC_byteof   14
                Do_CRC_byteof   15
                add     esi, 16                 ; buf += 16
    ENDIF ; ?NO_32_BIT_LOADS
                dec     ecx
                jnz     Next_Sixteen
No_Sixteens:
                mov     ecx,SavLen
                and     ecx,00000000FH      ; ecx = len % 16
    IFNDEF  NO_32_BIT_LOADS
                ; Phase 3: remaining whole dwords (0..3 of them).
                shr     ecx,2               ; ecx = (len % 16) / 4
                jz      SHORT No_Fours
Next_Four:
                Do_CRC_dword
                dec     ecx
                jnz     Next_Four
No_Fours:
                mov     ecx,SavLen
                and     ecx,000000003H      ; ecx = len % 4
    ENDIF ; !NO_32_BIT_LOADS
    ENDIF ; !NO_UNROLLED_LOOPS
                ; Phase 4: trailing bytes.  ZF here comes from the last
                ; `and ecx,...' above, or from `test ecx,ecx' when
                ; NO_UNROLLED_LOOPS is defined.
                jz      SHORT bail          ;>   if (len)
    IFNDEF NO_ALIGN
; align loop head at start of 486 internal cache line !!
                align   16
    ENDIF
loupe:                                      ;>     do {
                Do_CRC_byte                 ;        c = CRC32(c,*buf++,crctab);
                dec     ecx                 ;>     } while (--len);
                jnz     loupe

bail:                                       ;> }
                not     eax                 ;> return ~c;
fine:
                ; Restore in reverse push order; eax carries the result.
                pop     ecx
                pop     edx
                pop     ebx
                pop     esi
                pop     edi
                STD_LEAVE
                ret
_crc32          endp

_TEXT   ends
;
; Close the two module guards opened at the top of the file.
    ENDIF ; !CRC_TABLE_ONLY
    ENDIF ; !USE_ZLIB
;
end
331