/* 	sha1edp.s : this file provides optimized x86_64 and i386 implementations of the sha1 function
	CoreOS - vector and numerics group
	cclee	6-21-10

	The implementation is based on the principle described in an Intel online article
	"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
	http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/


	Updating HASH[] by processing one 64-byte block from MESSAGE[] can be represented by the following C function

void SHA1( int HASH[], int MESSAGE[] )
{
    int A[81], B[81], C[81], D[81], E[81];
    int W[80];

    int i, FN;

    A[0] = HASH[0];
    B[0] = HASH[1];
    C[0] = HASH[2];
    D[0] = HASH[3];
    E[0] = HASH[4];

    for ( i=0; i<80; ++i )
    {
        if ( i < 16 )
            W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
        else
            W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

        FN = F( i, B[i], C[i], D[i] );

        A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
        B[i+1] = A[i];
        C[i+1] = ROTATE_LEFT( B[i], 30 );
        D[i+1] = C[i];
        E[i+1] = D[i];
    }

    HASH[0] += A[80];
    HASH[1] += B[80];
    HASH[2] += C[80];
    HASH[3] += D[80];
    HASH[4] += E[80];
}

	For i=0:15, W[i] is simply the big-endian load of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

	The following approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79:

	1. the computation is done on 4 consecutive W[i] values in a single XMM register
    W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

    2. the last lane is missing its W[i-3] term (that term is W[i], computed in the same step), so it needs an additional fix-up, which unfortunately costs a few extra operations
    W[i+3] ^= W[i] rol 1

    3. once we have 4 W[i] values in an XMM register, we can also add four K values with one instruction
    W[i:i+3] += {K,K,K,K}

	Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on.
	The Dean Gaudet approach can be expressed as

	1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
	2. W[i+3] ^= W[i] rol 1
	3. W0 += {K,K,K,K}

	For i>=32, the Intel online article suggests that (using the basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to

	1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

	Note:
	1. In total, we need 8 16-byte registers or memory locations for W0,W4,...,W28. W0 and W32 can be the same register or memory location.
	2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16 bytes)
		i=0, W28,W24,...,W0
		i=4, W24,W20,...,W28
		i=8, W20,W16,...,W24
		.
		.
		and so forth.
	3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
		a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
		b. palignr is used to simplify the computation of left_shift(concatenate(W16,W12),64) (and, for i>=32, left_shift(concatenate(W8,W4),64))
	4. we probe __cpu_capabilities to detect ssse3 support and dispatch the ssse3 code when it is available.
	   If ssse3 is not supported, a suboptimal variant (with pshufb and palignr worked around) is dispatched instead.

*/

/* the code can be compiled into single-block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define	Multiple_Blocks	1

#if defined (__x86_64__) || defined(__i386__)		// x86_64 or i386 architectures

#if defined(__x86_64__)

	// set up for x86_64
#define	stack_size	(8+16*11+16*4)					// 8 (alignment) + 11 16-byte slots (xmm0-xmm10) + 4 16-byte slots for intermediate WK(t) storage
#define	sp			%rsp							// unifying architectural stack pointer representation
#define	ctx			%rdi							// 1st input argument, will move to HASH_PTR (%r9)
#define	buf			%rsi							// 2nd input argument, will move to BUFFER_PTR (%r10)
#define	cnt			%r11							// will copy from the 3rd input argument (%rdx)
#define K_BASE		%r8								// an aligned pointer to the table of K values and the pshufb byte-swap constant
#define HASH_PTR	%r9								// pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR	%r10							// pointer to input blocks

#else	// !__x86_64__

	// set up for i386
#define stack_size	(12+16*2+16*11+16*4)			// 12 bytes (alignment) + 2 extra 16-byte slots + 3 slots (W24/W28/XMM_SHUFB_BSWAP) + 8 slots (xmm0-xmm7) + 4 slots (WK(t))
#define	sp			%esp							// unifying architectural stack pointer representation
#define HASH_PTR	stack_size+16+4(sp)				// use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
#define BUFFER_PTR	stack_size+16+8(sp)				// use 2nd input argument from caller function
#define cnt			stack_size+16+12(sp)			// use 3rd input argument from caller function
#define K_BASE		stack_size-4(sp)				// use for K_BASE

#endif	// __x86_64__

// symbolizing registers or stack memory with algorithmic variables	W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support

#define W_TMP  	%xmm0
#define W_TMP2 	%xmm1
#define W0  	%xmm2
#define W4  	%xmm3
#define W8  	%xmm4
#define W12 	%xmm5
#define W16 	%xmm6
#define W20 	%xmm7
#if defined(__x86_64__)
#define W24 	%xmm8
#define W28 	%xmm9
#define XMM_SHUFB_BSWAP %xmm10				// used only when ssse3 is supported
#else	// defined (__i386__)
#define W24     12*16(sp)
#define W28     13*16(sp)
#define XMM_SHUFB_BSWAP 14*16(sp)			// used only when ssse3 is supported
#endif

#define	xmov	movaps						// aligned 16-byte move
#define	xmovu	movups						// unaligned 16-byte move

// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#define D %ebp
#define E %edx

// temp variables
#define T1 %eax
#define T2 %ebx

#define	WK(t)	(t&15)*4(sp)
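	// WK(t) is the 4-byte stack slot holding the precomputed W[t]+K[t]; only 16 slots
	// are kept and reused circularly (index t mod 16), each slot being refilled by the
	// vector W pre-calculation 16 rounds ahead of its next use.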

	// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
	// result in T1
	.macro	F1
	mov	$1, T1
	xor	$2, T1
	and	$0, T1
	xor	$2, T1
	.endm

	// int F2(int B, int C, int D) { return (D ^ B ^ C); }
	// result in T1
	.macro	F2
	mov	$2, T1
	xor	$1, T1
	xor	$0, T1
	.endm

	// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
	// result in T1
	.macro	F3
		mov $1, T1
        mov $0, T2
        or  $0, T1
        and $1, T2
        and $2, T1
        or  T2, T1
	.endm
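	// note : the sequence above computes (B & C) | (D & (B | C)), which equals the
	// majority function (B & C) | (D & (B ^ C)) quoted above, since the extra
	// (B & C & D) term is absorbed by (B & C).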

	// for i=60:79, F4 is identical to F2
	#define	F4	F2


	/*
		i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);

		with ssse3 support, this is achieved via
		for (i=0;i<16;i+=4) {
			1. W_TMP = next 16 bytes from MESSAGE[]
			2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
			3. W_TMP += {K,K,K,K};
			4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
		}

		each step is represented in one of the following 4 macro definitions
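
		As an illustration only (this sketch is not part of this file, and the function
		name and parameters are made up), the same four steps could be written with C
		intrinsics roughly as follows:

		#include <tmmintrin.h>		// SSSE3 intrinsic _mm_shuffle_epi8

		// message : pointer into the 64-byte block, i : 0/4/8/12,
		// k4 : {K,K,K,K}, w : circular-buffer slot, wk : the 16-dword W+K stack area
		static inline void w_precalc_00_15(const void *message, int i, __m128i k4,
		                                   __m128i *w, unsigned int wk[16])
		{
			const __m128i bswap = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
			__m128i t = _mm_loadu_si128((const __m128i *)((const char *)message + i*4));	// step 1
			t  = _mm_shuffle_epi8(t, bswap);			// step 2 : per-word byte swap (big-endian load)
			*w = t;										//          keep W[i..i+3] for the later W updates
			t  = _mm_add_epi32(t, k4);					// step 3 : W_TMP += {K,K,K,K}
			_mm_storeu_si128((__m128i *)&wk[i], t);		// step 4 : save W[i..i+3]+K
		}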

	*/

	.macro	W_PRECALC_00_15_0_ssse3			// input argument $0 : 0/4/8/12
#if defined (__x86_64__)					// BUFFER_PTR is already an address register in x86_64
	xmovu	$0*4(BUFFER_PTR), W_TMP			// read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
#else										// BUFFER_PTR is from the argument set up in the caller
	mov     BUFFER_PTR, T1					// T1 = BUFFER_PTR
    xmovu  $0*4(T1), W_TMP					// read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
#endif
	.endm

	.macro	W_PRECALC_00_15_1_ssse3			// input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
	pshufb	XMM_SHUFB_BSWAP, W_TMP			// convert W_TMP from little-endian into big-endian
	xmov	W_TMP, $0						// save W_TMP in the circular buffer
	.endm

	.macro	W_PRECALC_00_15_2				// K_BASE points to the current K quadruple.
#if defined (__x86_64__)					// K_BASE is already an address register in x86_64
	paddd	(K_BASE), W_TMP					// W_TMP += {K,K,K,K};
#else										// K_BASE is previously set up in the stack memory
	mov     K_BASE, T1						// T1 = K_BASE
    paddd   (T1), W_TMP						// W_TMP += {K,K,K,K};
#endif
	.endm

	.macro	W_PRECALC_00_15_3
	xmov	W_TMP, WK($0&~3)				// save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
	.endm

	/*
		without ssse3 support, steps 1 and 2 need to be modified
		1. sequentially load 4 words into T1, bswap T1, and save it to 4-bytes in the stack space
		2. load the 16-bytes from the aligned stack memory into W_TMP
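
		As a hypothetical C sketch (not part of this file), one big-endian word load
		in this no-ssse3 path amounts to:

		#include <stdint.h>

		// portable big-endian 32-bit load; the assembly below uses bswap instead
		static inline uint32_t load_be32(const unsigned char *p)
		{
			return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
			       ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
		}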
	*/

	.macro	W_PRECALC_00_15_0_nossse3		// input argument $0 : 0/4/8/12

#if	defined (__x86_64__)
	#define	BUFFERP	BUFFER_PTR
#else
	mov		BUFFER_PTR, T2					// copy BUFFER_PTR (from caller 2nd argument) to T2
	#define	BUFFERP	T2
#endif

	// load 1st word, bswap it, save it to stack
	mov		$0*4(BUFFERP), T1
	bswap	T1
	mov		T1, 14*16(sp)

	// load 2nd word, bswap it, save it to stack
	mov		4+$0*4(BUFFERP), T1
	bswap	T1
	mov		T1, 4+14*16(sp)

	// load 3rd word, bswap it, save it to stack
	mov		8+$0*4(BUFFERP), T1
	bswap	T1
	mov		T1, 8+14*16(sp)

	// load 4th word, bswap it, save it to stack
	mov		12+$0*4(BUFFERP), T1
	bswap	T1
	mov		T1, 12+14*16(sp)
	.endm

	.macro	W_PRECALC_00_15_1_nossse3 		// input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
	xmov	14*16(sp), W_TMP				// load the bswapped 16-bytes from the aligned stack memory
	xmov	W_TMP, $0						// save W = W_TMP in the circular buffer
	.endm

	// rounds 16-31 : compute the current W vector (W0) using the vectorization approach by Dean Gaudet
	/*
	W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

	W[i+3] ^= W[i] rol 1;	// this W[i] has already been rotated left by 1; if we took the initial W (before the rol 1), we would have to rotate it by 2 instead

	The operation (updating W and W+K) is divided into the following 4 steps

	0. W_TMP = W3; W = W14 ^ W8
	1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
	2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 between W_TMP2 and W
	3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP + K;
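
	For illustration only (this sketch is not part of this file and its names are made
	up), the whole update of one W-vector for rounds 16-31 could be written with C
	intrinsics roughly as follows:

	#include <tmmintrin.h>		// SSSE3 intrinsic _mm_alignr_epi8

	// w16 = {W[i-16]..W[i-13]}, w12, w8, w4 likewise (lowest lane = oldest word);
	// returns {W[i]..W[i+3]}, still without the +K step
	static inline __m128i w_update_16_31(__m128i w16, __m128i w12, __m128i w8, __m128i w4)
	{
		__m128i w14 = _mm_alignr_epi8(w12, w16, 8);							// {W[i-14]..W[i-11]}
		__m128i w3  = _mm_srli_si128(w4, 4);								// {W[i-3],W[i-2],W[i-1],0}
		__m128i t   = _mm_xor_si128(_mm_xor_si128(w3, w8), _mm_xor_si128(w14, w16));
		t = _mm_or_si128(_mm_slli_epi32(t, 1), _mm_srli_epi32(t, 31));		// rol 1 in each lane
		__m128i fix = _mm_slli_si128(t, 12);								// (W[i] 0 0 0)
		fix = _mm_or_si128(_mm_slli_epi32(fix, 1), _mm_srli_epi32(fix, 31));	// W[i] rol 1
		return _mm_xor_si128(t, fix);										// W[i+3] ^= W[i] rol 1
	}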

	*/

	.macro	W_PRECALC_16_31_0_ssse3	// input arguments : W16,W12,W8,W4,W
	xmov	$1, $4					// W = W12
	palignr	$$8, $0, $4				// W = W14
	xmov	$3, W_TMP				// W_TMP = W4
	psrldq	$$4, W_TMP				// W_TMP = W3
	pxor	$2, $4					// W = W8 ^ W14
	.endm

	.macro	W_PRECALC_16_31_1		// input arguments : W16,W
	pxor	$0, W_TMP				// W_TMP = W3 ^ W16
	pxor	W_TMP, $1				// W = W3 ^ W16 ^ W8 ^ W14
	xmov	$1, W_TMP2				// W_TMP2 = W3 ^ W16 ^ W8 ^ W14
	xmov	$1, W_TMP				// W_TMP = W3 ^ W16 ^ W8 ^ W14
	pslldq	$$12, W_TMP2			// W_TMP2 = (W[i] 0 0 0)
	.endm

	.macro	W_PRECALC_16_31_2		// input argument : W
	psrld	$$31, $0				// (W3 ^ W16 ^ W8 ^ W14)>>31
	pslld	$$1, W_TMP				// (W3 ^ W16 ^ W8 ^ W14)<<1
	por		$0, W_TMP				// W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
	xmov	W_TMP2, $0				// copy W[i] at location of W[i+3]
	psrld	$$30, W_TMP2			// W_TMP2 = W[i] lower 2 bits after rol 2
	pslld	$$2, $0					// W = W[i] higher 30 bits after rol 2
	.endm

	.macro	W_PRECALC_16_31_3		// input arguments : W, i, K_XMM
#if defined (__i386__)
	mov     K_BASE, T1				// K_BASE is stored in the stack memory for i386
#endif
	pxor	$0, W_TMP
	pxor	W_TMP2, W_TMP			// W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
	xmov	W_TMP, $0				// save W = W_TMP in the W circular buffer
#if defined (__x86_64__)
	paddd	$2(K_BASE), W_TMP		// W+K
#else
    paddd   $2(T1), W_TMP			// W+K
#endif
	xmov	W_TMP, WK($1&~3)		// save WK = W+K for later update of the hashes A/B/C/D/E
	.endm

	// the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for systems without ssse3; palignr is replaced with 4 instructions

	.macro	W_PRECALC_16_31_0_nossse3	// input arguments : W16,W12,W8,W4,W
	xmov	$1, $4						// W = W12 = (w9 w10 w11 w12)

	// the following is a workaround for palignr
	xmov	$0, W_TMP					// W16 = (w13 w14 w15 w16)
	pslldq	$$8, $4						// shift left to make (w11 w12 0 0)
	psrldq	$$8, W_TMP					// shift right to make (0 0 w13 w14)
	por		W_TMP, $4					// W = W14 = (w11 w12 w13 w14)

	xmov	$3, W_TMP					// W_TMP = W4 = (w1 w2 w3 w4)
	psrldq	$$4, W_TMP					// W_TMP = W3 = (0 w1 w2 w3)
	pxor	$2, $4						// W = W8 ^ W14
	.endm

	/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article

		W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

		where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.


	0. W_TMP = W6; W = W28 ^ W32;
	1. W = W_TMP = W6 ^ W16 ^ W28 ^ W32;
	2. W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2;
	3. W = W_TMP; WK = W_TMP + K;
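
	For illustration only (this sketch is not part of this file and its names are made
	up), the same update could be written with C intrinsics roughly as follows:

	#include <tmmintrin.h>		// SSSE3 intrinsic _mm_alignr_epi8

	// w32 = {W[i-32]..W[i-29]}, w28, w16, w8, w4 likewise (lowest lane = oldest word);
	// returns {W[i]..W[i+3]}, still without the +K step
	static inline __m128i w_update_32_79(__m128i w32, __m128i w28, __m128i w16,
	                                     __m128i w8,  __m128i w4)
	{
		__m128i w6 = _mm_alignr_epi8(w4, w8, 8);							// {W[i-6]..W[i-3]}
		__m128i t  = _mm_xor_si128(_mm_xor_si128(w6, w16), _mm_xor_si128(w28, w32));
		return _mm_or_si128(_mm_slli_epi32(t, 2), _mm_srli_epi32(t, 30));	// rol 2 in each lane
	}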

	*/


	.macro	W_PRECALC_32_79_0_ssse3		// input arguments : W28,W8,W4,W
	xmov	$2, W_TMP					// (w1 w2 w3 w4)
	pxor	$0, $3						// W = W28 ^ W32;
	palignr	$$8, $1, W_TMP				// W_tmp = (w3 w4 w5 w6) = W6;
	.endm

	// the following is a variant and will be used for systems without ssse3 support
	.macro	W_PRECALC_32_79_0_nossse3	// input arguments : W28,W8,W4,W
	xmov	$2, W_TMP					// (w1 w2 w3 w4)
	xmov    $1, W_TMP2					// (w5 w6 w7 w8)
	pxor	$0, $3						// W = W28 ^ W32
	pslldq	$$8, W_TMP					// (w3 w4 0 0)
	psrldq	$$8, W_TMP2					// (0 0 w5 w6)
	por		W_TMP2, W_TMP				// W_tmp = (w3 w4 w5 w6) = W6
	.endm

	// this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
	.macro  W_PRECALC_32_79_0_i386_ssse3	// input arguments : W28,W8,W4,W
    xmov    $3, W_TMP						// W32
    pxor    $0, W_TMP						// W28 ^ W32
    xmov    W_TMP, $3						// W = W28 ^ W32;
    xmov    $2, W_TMP						// W4
    palignr $$8, $1, W_TMP					// W_tmp = (w3 w4 w5 w6) = W6;
    .endm

	// this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers)
	.macro  W_PRECALC_32_79_0_i386_nossse3  // input arguments : W28,W8,W4,W
    xmov    $3, W_TMP						// W32
    pxor    $0, W_TMP						// W28 ^ W32
    xmov    W_TMP, $3						// W = W28 ^ W32
    xmov    $2, W_TMP						// W4 = (w1 w2 w3 w4)
	xmov    $1, W_TMP2						// W8 = (w5 w6 w7 w8)
	pslldq	$$8, W_TMP						// (w3 w4 0 0)
	psrldq	$$8, W_TMP2						// (0 0 w5 w6)
	por		W_TMP2, W_TMP					// W_tmp = (w3 w4 w5 w6) = W6
    .endm

	.macro	W_PRECALC_32_79_1			// input arguments : W16,W
	pxor	$0, W_TMP					// W_tmp = W6 ^ W16
	pxor	$1, W_TMP					// W_tmp = W6 ^ W16 ^ W28 ^ W32
	xmov	W_TMP, $1					// W = W_tmp = W6 ^ W16 ^ W28 ^ W32
	.endm

	.macro	W_PRECALC_32_79_2			// input argument : W
	psrld	$$30, $0					// W >> 30
	pslld	$$2, W_TMP					// W << 2
	por		$0, W_TMP					// W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
	.endm

	// this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
	// this should be used when the input is either W24 or W28 on the i386 architecture
    .macro  W_PRECALC_32_79_2_i386  	// input argument : W
    xmov    $0, W_TMP2					// W
    psrld   $$30, W_TMP2				// W >> 30
    xmov    W_TMP2, $0					// save (W >> 30) at W
    pslld   $$2, W_TMP					// W_tmp << 2
    por     $0, W_TMP					// W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
    .endm

	.macro	W_PRECALC_32_79_3			// input arguments : W, i, K_XMM
#if defined (__x86_64__)
	xmov	W_TMP, $0					// W = (W6 ^ W16 ^ W28 ^ W32) rol 2
	paddd	$2(K_BASE), W_TMP			// W + K
	xmov	W_TMP, WK($1&~3)			// write W+K
#else
    mov     K_BASE, T1					// T1 = K_BASE (stored in the stack memory for i386)
    xmov    W_TMP, $0					// W = (W6 ^ W16 ^ W28 ^ W32) rol 2
    paddd   $2(T1), W_TMP				// W_tmp = W + K
    xmov    W_TMP, WK($1&~3)			// write WK
#endif
	.endm


	/* The hash update operation is completed by the following statements.

		A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
        B[i+1] = A[i];
        C[i+1] = ROTATE_LEFT( B[i], 30 );
        D[i+1] = C[i];
        E[i+1] = D[i];

		Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:

		A1 = FN + E0 + rol(A0,5) + WK;
		B1 = A0;
		C1 = rol(B0, 30);
		D1 = C0;
		E1 = D0;

		To avoid excessive data movement between registers,
			1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
			2. C1 = rol(B0,30) can be temporarily saved in B0.

		Therefore, ignoring the time index, the update operation is equivalent to
			1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
			2. B = rol(B,30)
			3. the hashes are now stored in the order of E,A,B,C,D


		To pack 2 hash update operations into 1 iteration, starting with A,B,C,D,E
		1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
		2. B = rol(B,30)
		// now the hashes are in the order of E,A,B,C,D
		3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
		4. A = rol(A,30)
		// now the hashes are in the order of D,E,A,B,C

		These operations are distributed into the following 2 macro definitions RR0 and RR1.
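
		For illustration only (this sketch is not part of this file and its names are
		made up), one RR0/RR1 pair corresponds to the following C sketch, where F is
		one of F1..F4 and wk0/wk1 are the precomputed W+K values for rounds i and i+1:

		#include <stdint.h>

		static inline uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
		#define F1(b,c,d)  ((d) ^ ((b) & ((c) ^ (d))))		// e.g. rounds 0-19

		#define ROUND_PAIR(F, a, b, c, d, e, wk0, wk1)              \
			do {                                                    \
				(e) += rol32((a), 5) + F((b), (c), (d)) + (wk0);    \
				(b)  = rol32((b), 30);                              \
				(d) += rol32((e), 5) + F((a), (b), (c)) + (wk1);    \
				(a)  = rol32((a), 30);                              \
			} while (0)

		After such a pair, the five working variables read in the order D,E,A,B,C, which
		is why the macro invocations below rotate their register arguments every 2 rounds.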

	*/

	.macro	RR0				// input arguments : FN, A, B, C, D, E, i
	$0		$2, $3, $4		// T1 = FN(B,C,D)
	add		WK($6), $5		// E + WK(i)
	rol		$$30, $2		// B = rol(B,30)
	mov		$1, T2			// T2 = A
	add		WK($6+1), $4	// D + WK(i+1)
	rol		$$5, T2			// rol(A,5)
	add		T1, $5			// E = FN(B,C,D) + E + WK(i)
	.endm

	.macro	RR1
	add		$5, T2			// T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
	mov		T2, $5			// E = FN(B,C,D) + E + rol(A,5) + WK(i)
	rol		$$5, T2			// rol(E,5)
	add		T2, $4			// D + WK(i+1) + rol(E,5)
	$0		$1, $2, $3		// FN(A,B,C)
	add		T1, $4			// D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
	rol		$$30, $1		// A = rol(A,30)
	.endm



	/*

		The following macro definitions are used to expand code for the per-block sha1 operation.

			INITIAL_W_PRECALC_ssse3	: BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack memory
			INTERNAL_ssse3 : update W (i=16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
			ENDING : finish updating the digests A/B/C/D/E (i=64:79)

		For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
		into 1 macro definition for software pipelining.

			SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack, and finish updating the digests A/B/C/D/E (i=64:79)

		Assuming cnt (the number of blocks) >= 1, the main code body should look like

		INITIAL_W_PRECALC_ssse3				// W = big_endian_load and pre-compute W+K (i=0:15)
		do {
			INTERNAL_ssse3					// update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
			cnt--;
			if (cnt==0) break;
			BUFFER_PTR += 64;
			SOFTWARE_PIPELINING_ssse3;		// update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
		} while (1);
		ENDING								// update hash digests A/B/C/D/E (i=64:79)

	*/

	#define	W_PRECALC_00_15_0	W_PRECALC_00_15_0_ssse3
	#define	W_PRECALC_00_15_1	W_PRECALC_00_15_1_ssse3
	#define	W_PRECALC_16_31_0	W_PRECALC_16_31_0_ssse3
	#define	W_PRECALC_32_79_0	W_PRECALC_32_79_0_ssse3
	#define	W_PRECALC_32_79_0_i386	W_PRECALC_32_79_0_i386_ssse3


	.macro	INITIAL_W_PRECALC_ssse3			// BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory

	// i=0 	: W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0					// W_TMP = (BUFFER_PTR)
	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W0 + K
	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K

	// i=4	: W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4					// W_TMP = 16(BUFFER_PTR)
	W_PRECALC_00_15_1	W28					// convert W_TMP to big-endian, and save W28 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W28 + K
	W_PRECALC_00_15_3	7					// 16(sp) = W_TMP = W28 + K

	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8					// W_TMP = 32(BUFFER_PTR)
	W_PRECALC_00_15_1	W24					// convert W_TMP to big-endian, and save W24 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W24 + K
	W_PRECALC_00_15_3	11					// 32(sp) = W_TMP = W24 + K

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12					// W_TMP = 48(BUFFER_PTR)
	W_PRECALC_00_15_1	W20					// convert W_TMP to big-endian, and save W20 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W20 + K
	W_PRECALC_00_15_3	15					// 48(sp) = W_TMP = W20 + K

	.endm


	.macro	INTERNAL_ssse3					// updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)

	// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_16_31_0	W0,W28,W24,W20,W16
	RR0					F1,A,B,C,D,E,0
	W_PRECALC_16_31_1	W0,W16
	RR1					F1,A,B,C,D,E,0
	W_PRECALC_16_31_2	W16
	RR0					F1,D,E,A,B,C,2
	W_PRECALC_16_31_3	W16, 2, 0
	RR1					F1,D,E,A,B,C,2

	// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_16_31_0	W28,W24,W20,W16,W12
	RR0					F1,B,C,D,E,A,4
	W_PRECALC_16_31_1	W28,W12
	RR1					F1,B,C,D,E,A,4
	W_PRECALC_16_31_2	W12
	RR0					F1,E,A,B,C,D,6
	W_PRECALC_16_31_3	W12, 6, 16
	RR1					F1,E,A,B,C,D,6

	// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_16_31_0	W24,W20,W16,W12,W8
	RR0					F1,C,D,E,A,B,8
	W_PRECALC_16_31_1	W24,W8
	RR1					F1,C,D,E,A,B,8
	W_PRECALC_16_31_2	W8
	RR0					F1,A,B,C,D,E,10
	W_PRECALC_16_31_3	W8,10,16
	RR1					F1,A,B,C,D,E,10

	// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
	W_PRECALC_16_31_0	W20,W16,W12,W8,W4
	RR0					F1,D,E,A,B,C,12
	W_PRECALC_16_31_1	W20,W4
	RR1					F1,D,E,A,B,C,12
	W_PRECALC_16_31_2	W4
	RR0					F1,B,C,D,E,A,14
	W_PRECALC_16_31_3	W4,14,16
	RR1					F1,B,C,D,E,A,14

	// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0					F1,E,A,B,C,D,16
	W_PRECALC_32_79_1	W16,W0
	RR1					F1,E,A,B,C,D,16
	W_PRECALC_32_79_2	W0
	RR0					F1,C,D,E,A,B,18
	W_PRECALC_32_79_3	W0,18,16
	RR1					F1,C,D,E,A,B,18

	// starting using F2

	// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0					F2,A,B,C,D,E,20
	W_PRECALC_32_79_1	W12,W28
	RR1					F2,A,B,C,D,E,20
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0					F2,D,E,A,B,C,22
	W_PRECALC_32_79_3	W28,22,16
	RR1					F2,D,E,A,B,C,22

	// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
	#undef  K_XMM
    #define K_XMM   32
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0					F2,B,C,D,E,A,24
	W_PRECALC_32_79_1	W8,W24
	RR1					F2,B,C,D,E,A,24
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0					F2,E,A,B,C,D,26
	W_PRECALC_32_79_3	W24,26,K_XMM
	RR1					F2,E,A,B,C,D,26

	// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0					F2,C,D,E,A,B,28
	W_PRECALC_32_79_1	W4,W20
	RR1					F2,C,D,E,A,B,28
	W_PRECALC_32_79_2	W20
	RR0					F2,A,B,C,D,E,30
	W_PRECALC_32_79_3	W20,30,K_XMM
	RR1					F2,A,B,C,D,E,30

	// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_32_79_0	W12,W24,W20,W16
	RR0					F2,D,E,A,B,C,32
	W_PRECALC_32_79_1	W0,W16
	RR1					F2,D,E,A,B,C,32
	W_PRECALC_32_79_2	W16
	RR0					F2,B,C,D,E,A,34
	W_PRECALC_32_79_3	W16,34,K_XMM
	RR1					F2,B,C,D,E,A,34

	// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_32_79_0	W8,W20,W16,W12
	RR0					F2,E,A,B,C,D,36
	W_PRECALC_32_79_1	W28,W12
	RR1					F2,E,A,B,C,D,36
	W_PRECALC_32_79_2	W12
	RR0					F2,C,D,E,A,B,38
	W_PRECALC_32_79_3	W12,38,K_XMM
	RR1					F2,C,D,E,A,B,38

	// starting using F3

	// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_32_79_0	W4,W16,W12,W8
	RR0					F3,A,B,C,D,E,40
	W_PRECALC_32_79_1	W24,W8
	RR1					F3,A,B,C,D,E,40
	W_PRECALC_32_79_2	W8
	RR0					F3,D,E,A,B,C,42
	W_PRECALC_32_79_3	W8,42,K_XMM
	RR1					F3,D,E,A,B,C,42

	// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
	#undef	K_XMM
	#define	K_XMM	48
	W_PRECALC_32_79_0	W0,W12,W8,W4
	RR0					F3,B,C,D,E,A,44
	W_PRECALC_32_79_1	W20,W4
	RR1					F3,B,C,D,E,A,44
	W_PRECALC_32_79_2	W4
	RR0					F3,E,A,B,C,D,46
	W_PRECALC_32_79_3	W4,46,K_XMM
	RR1					F3,E,A,B,C,D,46

	// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0					F3,C,D,E,A,B,48
	W_PRECALC_32_79_1	W16,W0
	RR1					F3,C,D,E,A,B,48
	W_PRECALC_32_79_2	W0
	RR0					F3,A,B,C,D,E,50
	W_PRECALC_32_79_3	W0,50,K_XMM
	RR1					F3,A,B,C,D,E,50

	// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0					F3,D,E,A,B,C,52
	W_PRECALC_32_79_1	W12,W28
	RR1					F3,D,E,A,B,C,52
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0					F3,B,C,D,E,A,54
	W_PRECALC_32_79_3	W28,54,K_XMM
	RR1					F3,B,C,D,E,A,54

	// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0					F3,E,A,B,C,D,56
	W_PRECALC_32_79_1	W8,W24
	RR1					F3,E,A,B,C,D,56
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0					F3,C,D,E,A,B,58
	W_PRECALC_32_79_3	W24,58,K_XMM
	RR1					F3,C,D,E,A,B,58

	// starting using F4

	// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0					F4,A,B,C,D,E,60
	W_PRECALC_32_79_1	W4,W20
	RR1					F4,A,B,C,D,E,60
	W_PRECALC_32_79_2	W20
	RR0					F4,D,E,A,B,C,62
	W_PRECALC_32_79_3	W20,62,K_XMM
	RR1					F4,D,E,A,B,C,62

	.endm

	.macro	SOFTWARE_PIPELINING_ssse3
	// i=0  : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0					// W_TMP = (BUFFER_PTR)
	RR0					F4,B,C,D,E,A,64
	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
	RR1					F4,B,C,D,E,A,64
	W_PRECALC_00_15_2						// W_TMP = W0 + K
	RR0					F4,E,A,B,C,D,66
	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K
	RR1					F4,E,A,B,C,D,66

	// i=4  : W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4					// W_TMP = 16(BUFFER_PTR)
	RR0					F4,C,D,E,A,B,68
	W_PRECALC_00_15_1	W28					// convert W_TMP to big-endian, and save W28 = W_TMP
	RR1					F4,C,D,E,A,B,68
	W_PRECALC_00_15_2						// W_TMP = W28 + K
	RR0					F4,A,B,C,D,E,70
	W_PRECALC_00_15_3	7					// 16(sp) = W_TMP = W28 + K
	RR1					F4,A,B,C,D,E,70

	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8					// W_TMP = 32(BUFFER_PTR)
	RR0					F4,D,E,A,B,C,72
	W_PRECALC_00_15_1	W24					// convert W_TMP to big-endian, and save W24 = W_TMP
	RR1					F4,D,E,A,B,C,72
	W_PRECALC_00_15_2						// W_TMP = W24 + K
	RR0					F4,B,C,D,E,A,74
	W_PRECALC_00_15_3	11					// 32(sp) = W_TMP = W24 + K
	RR1					F4,B,C,D,E,A,74

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12					// W_TMP = 48(BUFFER_PTR)
	RR0					F4,E,A,B,C,D,76
	W_PRECALC_00_15_1	W20					// convert W_TMP to big-endian, and save W20 = W_TMP
	RR1					F4,E,A,B,C,D,76
	W_PRECALC_00_15_2						// W_TMP = W20 + K
	RR0					F4,C,D,E,A,B,78
	W_PRECALC_00_15_3	15					// 48(sp) = W_TMP = W20 + K
	RR1					F4,C,D,E,A,B,78
	.endm


	#undef	W_PRECALC_00_15_0
	#undef	W_PRECALC_00_15_1
	#undef	W_PRECALC_16_31_0
	#undef	W_PRECALC_32_79_0
	#undef	W_PRECALC_32_79_0_i386



	/*

		The following are 3 macro definitions that are no-ssse3 variants of the previous 3 macro definitions.

		INITIAL_W_PRECALC_nossse3
		INTERNAL_nossse3
		SOFTWARE_PIPELINING_nossse3

		They are used in a sha1 main body definition for systems without ssse3 support.

	*/

	#define	W_PRECALC_00_15_0	W_PRECALC_00_15_0_nossse3
	#define	W_PRECALC_00_15_1	W_PRECALC_00_15_1_nossse3
	#define	W_PRECALC_16_31_0	W_PRECALC_16_31_0_nossse3
	#define	W_PRECALC_32_79_0	W_PRECALC_32_79_0_nossse3
	#define	W_PRECALC_32_79_0_i386	W_PRECALC_32_79_0_i386_nossse3


	.macro	INITIAL_W_PRECALC_nossse3

	// i=0 	: W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0					// W_TMP = (BUFFER_PTR)
	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W0 + K
	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K

	// i=4	: W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4					// W_TMP = 16(BUFFER_PTR)
	W_PRECALC_00_15_1	W28					// convert W_TMP to big-endian, and save W28 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W28 + K
	W_PRECALC_00_15_3	7					// 16(sp) = W_TMP = W28 + K

	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8					// W_TMP = 32(BUFFER_PTR)
	W_PRECALC_00_15_1	W24					// convert W_TMP to big-endian, and save W24 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W24 + K
	W_PRECALC_00_15_3	11					// 32(sp) = W_TMP = W24 + K

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12					// W_TMP = 48(BUFFER_PTR)
	W_PRECALC_00_15_1	W20					// convert W_TMP to big-endian, and save W20 = W_TMP
	W_PRECALC_00_15_2						// W_TMP = W20 + K
	W_PRECALC_00_15_3	15					// 48(sp) = W_TMP = W20 + K

	.endm

	.macro	INTERNAL_nossse3
	// i=16
	// circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_16_31_0	W0,W28,W24,W20,W16
	RR0					F1,A,B,C,D,E,0
	W_PRECALC_16_31_1	W0,W16
	RR1					F1,A,B,C,D,E,0
	W_PRECALC_16_31_2	W16
	RR0					F1,D,E,A,B,C,2
	W_PRECALC_16_31_3	W16, 2, 0
	RR1					F1,D,E,A,B,C,2

	// i=20,
	// W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_16_31_0	W28,W24,W20,W16,W12
	RR0					F1,B,C,D,E,A,4
	W_PRECALC_16_31_1	W28,W12
	RR1					F1,B,C,D,E,A,4

	W_PRECALC_16_31_2	W12
	RR0					F1,E,A,B,C,D,6
	W_PRECALC_16_31_3	W12, 6, 16
	RR1					F1,E,A,B,C,D,6

	// i=24,
	// W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_16_31_0	W24,W20,W16,W12,W8
	RR0					F1,C,D,E,A,B,8
	W_PRECALC_16_31_1	W24,W8
	RR1					F1,C,D,E,A,B,8

	W_PRECALC_16_31_2	W8
	RR0					F1,A,B,C,D,E,10
	W_PRECALC_16_31_3	W8,10,16
	RR1					F1,A,B,C,D,E,10

	// i=28
	// W0,W28,W24,W20,W16,W12,W8,W4
	W_PRECALC_16_31_0	W20,W16,W12,W8,W4
	RR0					F1,D,E,A,B,C,12
	W_PRECALC_16_31_1	W20,W4
	RR1					F1,D,E,A,B,C,12

	W_PRECALC_16_31_2	W4
	RR0					F1,B,C,D,E,A,14
	W_PRECALC_16_31_3	W4,14,16
	RR1					F1,B,C,D,E,A,14

	//i=32
	// W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0					F1,E,A,B,C,D,16
	W_PRECALC_32_79_1	W16,W0
	RR1					F1,E,A,B,C,D,16
	W_PRECALC_32_79_2	W0
	RR0					F1,C,D,E,A,B,18
	W_PRECALC_32_79_3	W0,18,16
	RR1					F1,C,D,E,A,B,18

	//i=36
	// W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0					F2,A,B,C,D,E,20
	W_PRECALC_32_79_1	W12,W28
	RR1					F2,A,B,C,D,E,20
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0					F2,D,E,A,B,C,22
	W_PRECALC_32_79_3	W28,22,16
	RR1					F2,D,E,A,B,C,22

	//i=40
	#undef  K_XMM
    #define K_XMM   32
	// W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0					F2,B,C,D,E,A,24
	W_PRECALC_32_79_1	W8,W24
	RR1					F2,B,C,D,E,A,24
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0					F2,E,A,B,C,D,26
	W_PRECALC_32_79_3	W24,26,K_XMM
	RR1					F2,E,A,B,C,D,26

	//i=44
	// W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0					F2,C,D,E,A,B,28
	W_PRECALC_32_79_1	W4,W20
	RR1					F2,C,D,E,A,B,28
	W_PRECALC_32_79_2	W20
	RR0					F2,A,B,C,D,E,30
	W_PRECALC_32_79_3	W20,30,K_XMM
	RR1					F2,A,B,C,D,E,30

	//i=48
	// W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_32_79_0	W12,W24,W20,W16
	RR0					F2,D,E,A,B,C,32
	W_PRECALC_32_79_1	W0,W16
	RR1					F2,D,E,A,B,C,32
	W_PRECALC_32_79_2	W16
	RR0					F2,B,C,D,E,A,34
	W_PRECALC_32_79_3	W16,34,K_XMM
	RR1					F2,B,C,D,E,A,34

	//i=52
	// W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_32_79_0	W8,W20,W16,W12
	RR0					F2,E,A,B,C,D,36
	W_PRECALC_32_79_1	W28,W12
	RR1					F2,E,A,B,C,D,36
	W_PRECALC_32_79_2	W12
	RR0					F2,C,D,E,A,B,38
	W_PRECALC_32_79_3	W12,38,K_XMM
	RR1					F2,C,D,E,A,B,38

	//i=56
	// W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_32_79_0	W4,W16,W12,W8
	RR0					F3,A,B,C,D,E,40
	W_PRECALC_32_79_1	W24,W8
	RR1					F3,A,B,C,D,E,40
	W_PRECALC_32_79_2	W8
	RR0					F3,D,E,A,B,C,42
	W_PRECALC_32_79_3	W8,42,K_XMM
	RR1					F3,D,E,A,B,C,42

	//i=60
	#undef	K_XMM
	#define	K_XMM	48
	// W0,W28,W24,W20,W16,W12,W8,W4
	W_PRECALC_32_79_0	W0,W12,W8,W4
	RR0					F3,B,C,D,E,A,44
	W_PRECALC_32_79_1	W20,W4
	RR1					F3,B,C,D,E,A,44
	W_PRECALC_32_79_2	W4
	RR0					F3,E,A,B,C,D,46
	W_PRECALC_32_79_3	W4,46,K_XMM
	RR1					F3,E,A,B,C,D,46

	//i=64
	// W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0					F3,C,D,E,A,B,48
	W_PRECALC_32_79_1	W16,W0
	RR1					F3,C,D,E,A,B,48
	W_PRECALC_32_79_2	W0
	RR0					F3,A,B,C,D,E,50
	W_PRECALC_32_79_3	W0,50,K_XMM
	RR1					F3,A,B,C,D,E,50

	//i=68
	// W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0					F3,D,E,A,B,C,52
	W_PRECALC_32_79_1	W12,W28
	RR1					F3,D,E,A,B,C,52
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0					F3,B,C,D,E,A,54
	W_PRECALC_32_79_3	W28,54,K_XMM
	RR1					F3,B,C,D,E,A,54

	//i=72
	// W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0					F3,E,A,B,C,D,56
	W_PRECALC_32_79_1	W8,W24
	RR1					F3,E,A,B,C,D,56
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0					F3,C,D,E,A,B,58
	W_PRECALC_32_79_3	W24,58,K_XMM
	RR1					F3,C,D,E,A,B,58

	// starting using F4

	//i=76
	// W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0					F4,A,B,C,D,E,60
	W_PRECALC_32_79_1	W4,W20
	RR1					F4,A,B,C,D,E,60
	W_PRECALC_32_79_2	W20
	RR0					F4,D,E,A,B,C,62
	W_PRECALC_32_79_3	W20,62,K_XMM
	RR1					F4,D,E,A,B,C,62

	.endm

	.macro	SOFTWARE_PIPELINING_nossse3
	// i=0  : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0					// W_TMP = (BUFFER_PTR)
	RR0					F4,B,C,D,E,A,64
	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
	RR1					F4,B,C,D,E,A,64
	W_PRECALC_00_15_2						// W_TMP = W0 + K
	RR0					F4,E,A,B,C,D,66
	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K
	RR1					F4,E,A,B,C,D,66

	// i=4  : W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4					// W_TMP = 16(BUFFER_PTR)
	RR0					F4,C,D,E,A,B,68
	W_PRECALC_00_15_1	W28					// convert W_TMP to big-endian, and save W28 = W_TMP
	RR1					F4,C,D,E,A,B,68
	W_PRECALC_00_15_2						// W_TMP = W28 + K
	RR0					F4,A,B,C,D,E,70
	W_PRECALC_00_15_3	7					// 16(sp) = W_TMP = W28 + K
	RR1					F4,A,B,C,D,E,70

	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8					// W_TMP = 32(BUFFER_PTR)
	RR0					F4,D,E,A,B,C,72
	W_PRECALC_00_15_1	W24					// convert W_TMP to big-endian, and save W24 = W_TMP
	RR1					F4,D,E,A,B,C,72
	W_PRECALC_00_15_2						// W_TMP = W24 + K
	RR0					F4,B,C,D,E,A,74
	W_PRECALC_00_15_3	11					// 32(sp) = W_TMP = W24 + K
	RR1					F4,B,C,D,E,A,74

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12					// W_TMP = 48(BUFFER_PTR)
	RR0					F4,E,A,B,C,D,76
	W_PRECALC_00_15_1	W20					// convert W_TMP to big-endian, and save W20 = W_TMP
	RR1					F4,E,A,B,C,D,76
	W_PRECALC_00_15_2						// W_TMP = W20 + K
	RR0					F4,C,D,E,A,B,78
	W_PRECALC_00_15_3	15					// 48(sp) = W_TMP = W20 + K
	RR1					F4,C,D,E,A,B,78
	.endm

	.macro	ENDING		// finish up updating hash digests (i=64:79)
	//i=80
	RR0					F4,B,C,D,E,A,64
	RR1					F4,B,C,D,E,A,64
	RR0					F4,E,A,B,C,D,66
	RR1					F4,E,A,B,C,D,66

	//i=84
	RR0					F4,C,D,E,A,B,68
	RR1					F4,C,D,E,A,B,68
	RR0					F4,A,B,C,D,E,70
	RR1					F4,A,B,C,D,E,70

	//i=88
	RR0					F4,D,E,A,B,C,72
	RR1					F4,D,E,A,B,C,72
	RR0					F4,B,C,D,E,A,74
	RR1					F4,B,C,D,E,A,74

	//i=92
	RR0					F4,E,A,B,C,D,76
	RR1					F4,E,A,B,C,D,76
	RR0					F4,C,D,E,A,B,78
	RR1					F4,C,D,E,A,B,78
	.endm

	// load hash digests A,B,C,D,E from memory into registers
	.macro	LOAD_HASH
#if defined (__x86_64__)
	mov			(HASH_PTR), A
	mov			4(HASH_PTR), B
	mov			8(HASH_PTR), C
	mov			12(HASH_PTR), D
	mov			16(HASH_PTR), E
#else
    mov         HASH_PTR, T1
    mov         (T1), A
    mov         4(T1), B
    mov         8(T1), C
    mov         12(T1), D
    mov         16(T1), E
#endif
	.endm

	.macro	UPDATE_HASH
	add		$0, $1
	mov		$1, $0
	.endm

	.macro UPDATE_ALL_HASH
#if defined (__x86_64__)
	UPDATE_HASH		(HASH_PTR), A
	UPDATE_HASH		4(HASH_PTR), B
	UPDATE_HASH		8(HASH_PTR), C
	UPDATE_HASH		12(HASH_PTR), D
	UPDATE_HASH		16(HASH_PTR), E
#else
    mov             HASH_PTR, T1
    UPDATE_HASH     (T1), A
    UPDATE_HASH     4(T1), B
    UPDATE_HASH     8(T1), C
    UPDATE_HASH     12(T1), D
    UPDATE_HASH     16(T1), E
#endif
	.endm


	/*
		 main sha1 code for systems without ssse3 support
	*/

	.macro  SHA1_PIPELINED_MAIN_BODY_nossse3
	LOAD_HASH						// load initial hashes into A,B,C,D,E (registers)
	INITIAL_W_PRECALC_nossse3		// big_endian_load(W) and W+K (i=0:15)
	.align	4,0x90
0:
	INTERNAL_nossse3				// update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
	add	$$64, BUFFER_PTR			// BUFFER_PTR+=64;
	sub	$$1, cnt					// pre-decrement cnt by 1
#else
	addl	$$64, BUFFER_PTR			// BUFFER_PTR+=64;
	subl	$$1, cnt					// pre-decrement cnt by 1
#endif
	jbe	1f							// if cnt <= 0, branch to finish off
	SOFTWARE_PIPELINING_nossse3		// update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
	UPDATE_ALL_HASH					// update output hashes
	jmp	0b							// repeat for next block
	.align	4,0x90
1:
#endif
	ENDING							// update ABCDE (i=64:79)
	UPDATE_ALL_HASH					// update output hashes
	.endm

	/*
		 main sha1 code for systems with ssse3 support
	*/

	.macro  SHA1_PIPELINED_MAIN_BODY_ssse3
	LOAD_HASH						// load initial hashes into A,B,C,D,E
	INITIAL_W_PRECALC_ssse3			// big_endian_load(W) and W+K (i=0:15)
	.align	4,0x90
0:
	INTERNAL_ssse3					// update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
	add	$$64, BUFFER_PTR			// BUFFER_PTR+=64;
	sub	$$1, cnt					// pre-decrement cnt by 1
#else
	addl	$$64, BUFFER_PTR			// BUFFER_PTR+=64;
	subl	$$1, cnt					// pre-decrement cnt by 1
#endif
	jbe	1f							// if cnt <= 0, branch to finish off
	SOFTWARE_PIPELINING_ssse3		// update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
	UPDATE_ALL_HASH					// update output hashes
	jmp	0b							// repeat for next block
	.align	4,0x90
1:
#endif
	ENDING							// update ABCDE (i=64:79)
	UPDATE_ALL_HASH					// update output hashes
	.endm

#ifdef	KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

	.text

	.globl _SHA1Transform
	//.private_extern	_SHA1Transform
_SHA1Transform:

	// detect SSSE3 and dispatch appropriate code branch
	#if defined __x86_64__
    	movq    __cpu_capabilities@GOTPCREL(%rip), %rax         // %rax -> __cpu_capabilities
    	mov     (%rax), %eax                                    // %eax = __cpu_capabilities
	#else       // i386
		#if defined KERNEL
    		leal    __cpu_capabilities, %eax                    // %eax -> __cpu_capabilities
    		mov     (%eax), %eax                                // %eax = __cpu_capabilities
		#else
    		mov    _COMM_PAGE_CPU_CAPABILITIES, %eax
		#endif
	#endif
    test    $(kHasSupplementalSSE3), %eax
    je      _SHA1Transform_nossse3                    			// branch to no-ssse3 code


	// start the sha1 code with ssse3 support

	// save callee-save registers
#if defined (__x86_64__)
	push	%rbx
	push	%rbp
#else
    push    %ebx
    push    %ebp
    push    %esi
    push    %edi
#endif

	sub		$stack_size, sp					// allocate stack memory for use

	// save used xmm registers if this is for kernel
#if	KERNEL
	xmov	%xmm0, 4*16(sp)
	xmov	%xmm1, 5*16(sp)
	xmov	%xmm2, 6*16(sp)
	xmov	%xmm3, 7*16(sp)
	xmov	%xmm4, 8*16(sp)
	xmov	%xmm5, 9*16(sp)
	xmov	%xmm6, 10*16(sp)
	xmov	%xmm7, 11*16(sp)
#if defined (__x86_64__)
	xmov	%xmm8, 12*16(sp)
	xmov	%xmm9, 13*16(sp)
	xmov	%xmm10, 14*16(sp)
#endif
#endif

#if defined (__x86_64__)

	// set up registers to free %edx/%edi/%esi for other use (ABCDE)
	mov		ctx, HASH_PTR
	mov		buf, BUFFER_PTR
#if Multiple_Blocks
	mov		%rdx, cnt
#endif
	lea		K_XMM_AR(%rip), K_BASE
	xmov	0x40(K_BASE), XMM_SHUFB_BSWAP

#else	// __i386__

#if	KERNEL
    lea     K_XMM_AR, %eax
#else
	// get the address of label 0 (the current program counter) into %eax, for PIC addressing
           call    0f          // Push program counter onto stack.
        0: pop     %eax      // Get program counter.
		lea	K_XMM_AR-0b(%eax), %eax
#endif
    mov     %eax, K_BASE
    xmov    0x40(%eax), %xmm0
    xmov    %xmm0, XMM_SHUFB_BSWAP

#endif

	SHA1_PIPELINED_MAIN_BODY_ssse3

	// restore used xmm registers if this is for kernel
#if	KERNEL
	xmov	4*16(sp), %xmm0
	xmov	5*16(sp), %xmm1
	xmov	6*16(sp), %xmm2
	xmov	7*16(sp), %xmm3
	xmov	8*16(sp), %xmm4
	xmov	9*16(sp), %xmm5
	xmov	10*16(sp), %xmm6
	xmov	11*16(sp), %xmm7
#if defined (__x86_64__)
	xmov	12*16(sp), %xmm8
	xmov	13*16(sp), %xmm9
	xmov	14*16(sp), %xmm10
#endif
#endif

	add		$stack_size, sp		// deallocate stack memory

	// restore callee-save registers
#if defined (__x86_64__)
	pop		%rbp
	pop		%rbx
#else
    pop     %edi
    pop     %esi
    pop     %ebp
    pop     %ebx
#endif

	ret							// return

	// this is equivalent to the above function _SHA1Transform, but it does not use ssse3 instructions

	.globl _SHA1Transform_nossse3
	.private_extern	_SHA1Transform_nossse3
_SHA1Transform_nossse3:

	// push callee-save registers
#if defined (__x86_64__)
	push	%rbx
	push	%rbp
#else
    push    %ebx
    push    %ebp
    push    %esi
    push    %edi
#endif

	sub		$stack_size, sp			// allocate stack memory for local use

	// save used xmm registers if this is for kernel
#if	KERNEL
	xmov	%xmm0, 4*16(sp)
	xmov	%xmm1, 5*16(sp)
	xmov	%xmm2, 6*16(sp)
	xmov	%xmm3, 7*16(sp)
	xmov	%xmm4, 8*16(sp)
	xmov	%xmm5, 9*16(sp)
	xmov	%xmm6, 10*16(sp)
	xmov	%xmm7, 11*16(sp)
#if defined (__x86_64__)
	xmov	%xmm8, 12*16(sp)
	xmov	%xmm9, 13*16(sp)
#endif
#endif

#if defined (__x86_64__)

	// set up registers to free %edx/%edi/%esi for other use (ABCDE)
	mov		ctx, HASH_PTR
	mov		buf, BUFFER_PTR
#if Multiple_Blocks
	mov		%rdx, cnt
#endif
	lea		K_XMM_AR(%rip), K_BASE

#else	// __i386__

#if	KERNEL
    lea     K_XMM_AR, %eax
#else
	// get the address of label 0 (the current program counter) into %eax, for PIC addressing
           call    0f          // Push program counter onto stack.
        0: pop     %eax      // Get program counter.
		lea	K_XMM_AR-0b(%eax), %eax
#endif
    mov     %eax, K_BASE

#endif

	SHA1_PIPELINED_MAIN_BODY_nossse3

	// restore used xmm registers if this is for kernel
#if	KERNEL
	xmov	4*16(sp), %xmm0
	xmov	5*16(sp), %xmm1
	xmov	6*16(sp), %xmm2
	xmov	7*16(sp), %xmm3
	xmov	8*16(sp), %xmm4
	xmov	9*16(sp), %xmm5
	xmov	10*16(sp), %xmm6
	xmov	11*16(sp), %xmm7
#if defined (__x86_64__)
	xmov	12*16(sp), %xmm8
	xmov	13*16(sp), %xmm9
#endif
#endif

	add		$stack_size, sp		// deallocate stack memory

	// restore callee-save registers
#if defined (__x86_64__)
	pop		%rbp
	pop		%rbx
#else
    pop     %edi
    pop     %esi
    pop     %ebp
    pop     %ebx
#endif

	ret							// return

	.const
	.align	4, 0x90

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

K_XMM_AR:
    .long	K1
	.long	K1
	.long	K1
	.long	K1
    .long	K2
	.long	K2
	.long	K2
	.long	K2
    .long	K3
	.long	K3
	.long	K3
	.long	K3
    .long	K4
	.long	K4
	.long	K4
	.long	K4
// bswap_shufb_ctl : the pshufb byte-swap control, accessed via 0x40(K_BASE)
    .long	0x00010203
    .long	0x04050607
    .long	0x08090a0b
    .long	0x0c0d0e0f



#endif	// architecture x86_64 or i386