/* sha1edp.s : this file provides optimized x86_64 and i386 implementation of the sha1 function
	CoreOS - vector and numerics group
	cclee	6-21-10

	The implementation is based on the principle described in an Intel online article
	"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
	http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/


	Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function

void SHA1( int HASH[], int MESSAGE[] )
{
	int A[81], B[81], C[81], D[81], E[81];
	int W[80];

	int i, FN;

	A[0] = HASH[0];
	B[0] = HASH[1];
	C[0] = HASH[2];
	D[0] = HASH[3];
	E[0] = HASH[4];

	for ( i=0; i<80; ++i )
	{
		if ( i < 16 )
			W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
		else
			W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

		FN = F( i, B[i], C[i], D[i] );

		A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
		B[i+1] = A[i];
		C[i+1] = ROTATE_LEFT( B[i], 30 );
		D[i+1] = C[i];
		E[i+1] = D[i];
	}

	HASH[0] += A[80];
	HASH[1] += B[80];
	HASH[2] += C[80];
	HASH[3] += D[80];
	HASH[4] += E[80];
}

	For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

	The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,

	1. done on 4 consecutive W[i] values in a single XMM register
		W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
		W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
		W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
		W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

	2. this additional calculation unfortunately requires many additional operations
		W[i+3] ^= W[i] rol 1

	3. once we have 4 W[i] values in an XMM register we can also add four K values with one instruction
		W[i:i+3] += {K,K,K,K}

	Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
	The Dean Gaudet approach can be expressed as

	1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
	2. W[i+3] ^= W[i] rol 1
	3. W0 += {K,K,K,K}

	For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to

	1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

	Note:
	1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
	2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
		i=0,	W28,W24,...,W0
		i=4,	W24,W20,...,W28
		i=8,	W20,W16,...,W24
		.
		.
		and so forth.
	3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
		a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
		b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
	4. we probe __cpu_capabilities to detect ssse3 support and dispatch code with ssse3 support when available.
	   If ssse3 is not supported, a suboptimal code (pshufb and palignr workaround) is dispatched.
89 90*/ 91 92/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_blocks to 0 */ 93#define Multiple_Blocks 1 94 95#if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures 96 97#if defined(__x86_64__) 98 99 // set up for x86_64 100#define stack_size (8+16*11+16*4) // 8 (alignedment) + x0-x10 + 4 128-bits for intermediate WK(t) storage 101#define sp %rsp // unifying architectural stack pointer representation 102#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9) 103#define buf %rsi // 2nd input argument, will move to BUFFER_PTR (%r10) 104#define cnt %r11 // will copy from the 3rd input argument (%rdx) 105#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values 106#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E) 107#define BUFFER_PTR %r10 // pointer to input blocks 108 109#else // !__x86_64__ 110 111 // set up for i386 112#define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t)) 113#define sp %esp // unifying architectural stack pointer representation 114#define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp) 115#define BUFFER_PTR stack_size+16+8(sp) // use 2nd input argument from caller function 116#define cnt stack_size+16+12(sp) // use 3rd input argument from caller function 117#define K_BASE stack_size-4(sp) // use for K_BASE 118 119#endif // __x86_64__ 120 121// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support 122 123#define W_TMP %xmm0 124#define W_TMP2 %xmm1 125#define W0 %xmm2 126#define W4 %xmm3 127#define W8 %xmm4 128#define W12 %xmm5 129#define W16 %xmm6 130#define W20 %xmm7 131#if defined(__x86_64__) 132#define W24 %xmm8 133#define W28 %xmm9 134#define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported 
135#else // defined (__i386__) 136#define W24 12*16(sp) 137#define W28 13*16(sp) 138#define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported 139#endif 140 141#define xmov movaps // aligned 16-byte move 142#define xmovu movups // unaligned 16-byte move 143 144// intermediate hash variables 145#define A %ecx 146#define B %esi 147#define C %edi 148#define D %ebp 149#define E %edx 150 151// temp variables 152#define T1 %eax 153#define T2 %ebx 154 155#define WK(t) (t&15)*4(sp) 156 157 // int F1(int B, int C, int D) { return (D ^ ( B & (C ^ D)); } 158 // result in T1 159 .macro F1 160 mov $1, T1 161 xor $2, T1 162 and $0, T1 163 xor $2, T1 164 .endm 165 166 // int F2(int B, int C, int D) { return (D ^ B ^ C); } 167 // result in T1 168 .macro F2 169 mov $2, T1 170 xor $1, T1 171 xor $0, T1 172 .endm 173 174 // int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); } 175 // result in T1 176 .macro F3 177 mov $1, T1 178 mov $0, T2 179 or $0, T1 180 and $1, T2 181 and $2, T1 182 or T2, T1 183 .endm 184 185 // for i=60:79, F4 is identical to F2 186 #define F4 F2 187 188 189 /* 190 i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]); 191 192 with ssse3 support, this is achived via 193 for (i=0;i<16;i+=4) { 194 1. W_TMP = new 16 bytes from MESSAGE[] 195 2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W 196 3. WTMP += {K,K,K,K}; 197 4. 
save quadruple W[i]+K[i] = W_TMP in the stack memory; 198 } 199 200 each step is represented in one of the following 4 macro definitions 201 202 */ 203 204 .macro W_PRECALC_00_15_0_ssse3 // input argument $0 : 0/4/8/12 205#if defined (__x86_64__) // BUFFER_PTR is already an address register in x86_64 206 xmovu $0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned 207#else // BUFFER_PTR is from the argument set up in the caller 208 mov BUFFER_PTR, T1 // T1 = BUFFER_PTR 209 xmovu $0*4(T1), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned 210#endif 211 .endm 212 213 .macro W_PRECALC_00_15_1_ssse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28 214 pshufb XMM_SHUFB_BSWAP, W_TMP // convert W_TMP from little-endian into big-endian 215 xmov W_TMP, $0 // save W_TMP in the circular buffer 216 .endm 217 218 .macro W_PRECALC_00_15_2 // K_BASE points to the current K quadruple. 219#if defined (__x86_64__) // K_BASE is already an address register in x86_64 220 paddd (K_BASE), W_TMP // W_TMP += {K,K,K,K}; 221#else // K_BASE is previously set up in the stack memory 222 mov K_BASE, T1 // T1 = K_BASE 223 paddd (T1), W_TMP // W_TMP += {K,K,K,K}; 224#endif 225 .endm 226 227 .macro W_PRECALC_00_15_3 228 xmov W_TMP, WK($0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E 229 .endm 230 231 /* 232 without ssse3 support, steps 1 and 2 need to be modified 233 1. sequentially load 4 words into T1, bswap T1, and save it to 4-bytes in the stack space 234 2. 
load the 16-bytes from the aligned stack memory into W_TMP 235 */ 236 237 .macro W_PRECALC_00_15_0_nossse3 // input argument $0 : 0/4/8/12 238 239#if defined (__x86_64__) 240 #define BUFFERP BUFFER_PTR 241#else 242 mov BUFFER_PTR, T2 // copy BUFFER_PTR (from caller 2nd argument) to T2 243 #define BUFFERP T2 244#endif 245 246 // load 1st word, bswap it, save it to stack 247 mov $0*4(BUFFERP), T1 248 bswap T1 249 mov T1, 14*16(sp) 250 251 // load 2nd word, bswap it, save it to stack 252 mov 4+$0*4(BUFFERP), T1 253 bswap T1 254 mov T1, 4+14*16(sp) 255 256 // load 3rd word, bswap it, save it to stack 257 mov 8+$0*4(BUFFERP), T1 258 bswap T1 259 mov T1, 8+14*16(sp) 260 261 // load 4th word, bswap it, save it to stack 262 mov 12+$0*4(BUFFERP), T1 263 bswap T1 264 mov T1, 12+14*16(sp) 265 .endm 266 267 .macro W_PRECALC_00_15_1_nossse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28 268 xmov 14*16(sp), W_TMP // load the bswapped 16-bytes from the aligned stack memory 269 xmov W_TMP, $0 // save W = W_TMP in the circular buffer 270 .endm 271 272 // rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet 273 /* 274 W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 275 W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 276 W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 277 W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 278 279 W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1, if we are taking from the intial W before rol 1, we should rol this by 2 280 281 The operation (updating W and W+K) is scheduled as and divided into 4 steps 282 283 0. W_tmp = W3; W = W14 ^ W8 284 1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0); 285 2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W 286 3. 
W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K; 287 288 */ 289 290 .macro W_PRECALC_16_31_0_ssse3 // input arguments : W16,W12,W8,W4,W 291 xmov $1, $4 // W = W12 292 palignr $$8, $0, $4 // W = W14 293 xmov $3, W_TMP // W_TMP = W4 294 psrldq $$4, W_TMP // W_TMP = W3 295 pxor $2, $4 // W = W8 ^ W14 296 .endm 297 298 .macro W_PRECALC_16_31_1 // input arguments : W16,W 299 pxor $0, W_TMP // W_TMP = W3 ^ W16 300 pxor W_TMP, $1 // W = W3 ^ W16 ^ W8 ^ W14 301 xmov $1, W_TMP2 // W_TMP2 = W3 ^ W16 ^ W8 ^ W14 302 xmov $1, W_TMP // W_TMP = W3 ^ W16 ^ W8 ^ W14 303 pslldq $$12, W_TMP2 // W_TMP2 = (W[i] 0 0 0) 304 .endm 305 306 .macro W_PRECALC_16_31_2 // input argument : W 307 psrld $$31, $0 // (W3 ^ W16 ^ W8 ^ W14)>>31 308 pslld $$1, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1 309 por $0, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 310 xmov W_TMP2, $0 // copy W[i] at location of W[i+3] 311 psrld $$30, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2 312 pslld $$2, $0 // W = W[i] higher 30 bits after rol 2 313 .endm 314 315 .macro W_PRECALC_16_31_3 // input arguments: W, i, K_XMM 316#if defined (__i386__) 317 mov K_BASE, T1 // K_BASE is store in the stack memory for i386 318#endif 319 pxor $0, W_TMP 320 pxor W_TMP2, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2 321 xmov W_TMP, $0 // save W = W_TMP in the W circular buffer 322#if defined (__x86_64__) 323 paddd $2(K_BASE), W_TMP // W+K 324#else 325 paddd $2(T1), W_TMP // W+K 326#endif 327 xmov W_TMP, WK($1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E 328 .endm 329 330 // the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for system without ssse3, palignr is replaced with 4 instructions 331 332 .macro W_PRECALC_16_31_0_nossse3 // input arguments : W16,W12,W8,W4,W 333 xmov $1, $4 // W = W12 = (w9 w10 w11 w12) 334 335 // the following is a wrokaround for palignr 336 xmov $0, W_TMP // W16 = (w13 w14 w15 w16) 337 pslldq $$8, $4 // shift left to 
make (w11 w12 0 0) 338 psrldq $$8, W_TMP // shift right to make (0 0 w13 w14) 339 por W_TMP, $4 // W = W14 = (w11 w12 w13 w14) 340 341 xmov $3, W_TMP // W_TMP = W4 = (w1 w2 w3 w4) 342 psrldq $$4, W_TMP // W_TMP = W3 = (0 w1 w2 w3) 343 pxor $2, $4 // W = W8 ^ W14 344 .endm 345 346 /* rounds 32-79 compute W und W+K iusing the vectorization approach from the Intel article 347 348 W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2); 349 350 where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register. 351 352 353 0. W_tmp = W6; W = W28 ^ W32; 354 1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32; 355 2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2; 356 3. W = W_Tmp; WK = W_tmp + K; 357 358 */ 359 360 361 .macro W_PRECALC_32_79_0_ssse3 // inputr arguments : W28,W8,W4,W 362 xmov $2, W_TMP // (w1 w2 w3 w4) 363 pxor $0, $3 // W = W28 ^ W32; 364 palignr $$8, $1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6; 365 .endm 366 367 // the following is a variant and will be used for system without ssse3 support 368 .macro W_PRECALC_32_79_0_nossse3 // input arguments : W28,W8,W4,W 369 xmov $2, W_TMP // (w1 w2 w3 w4) 370 xmov $1, W_TMP2 // (w5 w6 w7 w8) 371 pxor $0, $3 // W = W28 ^ W32 372 pslldq $$8, W_TMP // (w3 w4 0 0) 373 psrldq $$8, W_TMP2 // (0 0 w5 w6) 374 por W_TMP2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6 375 .endm 376 377 // this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers) 378 .macro W_PRECALC_32_79_0_i386_ssse3 // input arguments : W28,W8,W4,W 379 xmov $3, W_TMP // W32 380 pxor $0, W_TMP // W28 ^ W32 381 xmov W_TMP, $3 // W = W28 ^ W32; 382 xmov $2, W_TMP // W4 383 palignr $$8, $1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6; 384 .endm 385 386 // this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers) 387 .macro W_PRECALC_32_79_0_i386_nossse3 // input arguments : W28,W8,W4,W 388 xmov $3, W_TMP // W32 389 pxor $0, W_TMP // W28 ^ W32 390 xmov 
W_TMP, $3 // W = W28 ^ W32 391 xmov $2, W_TMP // W4 = (w1 w2 w3 w4) 392 xmov $1, W_TMP2 // W8 = (w5 w6 w7 w8) 393 pslldq $$8, W_TMP // (w3 w4 0 0) 394 psrldq $$8, W_TMP2 // (0 0 w5 w6) 395 por W_TMP2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6 396 .endm 397 398 .macro W_PRECALC_32_79_1 // input arguments : W16,W 399 pxor $0, W_TMP // W_tmp = W6 ^ W16 400 pxor $1, W_TMP // W_tmp = W6 ^ W16 ^ W28 ^ W32 401 xmov W_TMP, $1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32 402 .endm 403 404 .macro W_PRECALC_32_79_2 // input argument : W 405 psrld $$30, $0 // W >> 30 406 pslld $$2, W_TMP // W << 2 407 por $0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2 408 .endm 409 410 // this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers) 411 // this should be used when the input is either W24 or W28 on i386 architecture 412 .macro W_PRECALC_32_79_2_i386 // input argument : W 413 xmov $0, W_TMP2 // W 414 psrld $$30, W_TMP2 // W >> 30 415 xmov W_TMP2, $0 // save (W >> 30) at W 416 pslld $$2, W_TMP // W_tmp << 2 417 por $0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2 418 .endm 419 420 .macro W_PRECALC_32_79_3 // input argument W, i, K_XMM 421#if defined (__x86_64__) 422 xmov W_TMP, $0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2 423 paddd $2(K_BASE), W_TMP // W + K 424 xmov W_TMP, WK($1&~3) // write W+K 425#else 426 mov K_BASE, T1 // T1 = K_BASE (which is in the caller argument) 427 xmov W_TMP, $0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2 428 paddd $2(T1), W_TMP // W_tmp = W + K 429 xmov W_TMP, WK($1&~3) // write WK 430#endif 431 .endm 432 433 434 /* The hash update operation is completed by the following statements. 435 436 A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i); 437 B[i+1] = A[i]; 438 C[i+1] = ROTATE_LEFT( B[i], 30 ); 439 D[i+1] = C[i]; 440 E[i+1] = D[i]; 441 442 Suppose we start with A0,B0,C0,D0,E0. 
The 1st iteration can be expressed as follows: 443 444 A1 = FN + E0 + rol(A0,5) + WK; 445 B1 = A0; 446 C1 = rol(B0, 30); 447 D1 = C0; 448 E1 = D0; 449 450 to avoid excessive memory movement between registers, 451 1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0, 452 2. C1 = rol(B0,30) can be temporarily saved in B0. 453 454 Therefore, ignoring the time index, the update operation is equivalent to 455 1. E = FN(B,C,D) + E + rol(A,5) + WK(i) 456 2. B = rol(B,30) 457 3. the hashes are now stored in the order of E,A,B,C,D 458 459 460 To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E 461 1. E = FN(B,C,D) + E + rol(A,5) + WK(i) 462 2. B = rol(B,30) 463 // now the hashes are in the order of E,A,B,C,D 464 3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1) 465 4. A = rol(A,30) 466 // now the hashes are in the order of D,E,A,B,C 467 468 These operations are distributed into the following 2 macro definitions RR0 and RR1. 469 470 */ 471 472 .macro RR0 // input arguments : FN, A, B, C, D, E, i 473 $0 $2, $3, $4 // T1 = FN(B,C,D) 474 add WK($6), $5 // E + WK(i) 475 rol $$30, $2 // B = rol(B,30) 476 mov $1, T2 // T2 = A 477 add WK($6+1), $4 // D + WK(i+1) 478 rol $$5, T2 // rol(A,5) 479 add T1, $5 // E = FN(B,C,D) + E + WK(i) 480 .endm 481 482 .macro RR1 483 add $5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i) 484 mov T2, $5 // E = FN(B,C,D) + E + rol(A,5) + WK(i) 485 rol $$5, T2 // rol(E,5) 486 add T2, $4 // D + WK(i+1) + rol(E,5) 487 $0 $1, $2, $3 // FN(A,B,C) 488 add T1, $4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1) 489 rol $$30, $1 // A = rol(A,30) 490 .endm 491 492 493 494 /* 495 496 The following macro definitions are used to expand code for the per-block sha1 operation. 
497 498 INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory 499 INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory) 500 ENDING : finishing up update the digests A/B/C/D/E (i=64:79) 501 502 For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined 503 into 1 macro definition for software pipeling. 504 505 SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up update the digests A/B/C/D/E (i=64:79) 506 507 assume cnt (the number of blocks) >= 1, the main code body should look like 508 509 INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15) 510 do { 511 INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63) 512 cnt--; 513 if (cnt==0) break; 514 BUFFER_PTR += 64; 515 SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15) 516 } 517 ENDING // update hash digests A/B/C/D/E (i=64:79) 518 519 */ 520 521 #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3 522 #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3 523 #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3 524 #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3 525 #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3 526 527 528 .macro INITIAL_W_PRECALC_ssse3 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory 529 530 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0 531 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) 532 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP 533 W_PRECALC_00_15_2 // W_TMP = W0 + K 534 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K 535 536 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 537 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) 538 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = 
W_TMP 539 W_PRECALC_00_15_2 // W_TMP = W28 + K 540 W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K 541 542 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 543 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) 544 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP 545 W_PRECALC_00_15_2 // W_TMP = W24 + K 546 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K 547 548 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 549 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR) 550 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP 551 W_PRECALC_00_15_2 // W_TMP = W20 + K 552 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K 553 554 .endm 555 556 557 .macro INTERNAL_ssse3 // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory) 558 559 // i=16 : W12,W8,W4,W0,W28,W24,W20,W16 560 W_PRECALC_16_31_0 W0,W28,W24,W20,W16 561 RR0 F1,A,B,C,D,E,0 562 W_PRECALC_16_31_1 W0,W16 563 RR1 F1,A,B,C,D,E,0 564 W_PRECALC_16_31_2 W16 565 RR0 F1,D,E,A,B,C,2 566 W_PRECALC_16_31_3 W16, 2, 0 567 RR1 F1,D,E,A,B,C,2 568 569 // i=20 : W8,W4,W0,W28,W24,W20,W16,W12 570 W_PRECALC_16_31_0 W28,W24,W20,W16,W12 571 RR0 F1,B,C,D,E,A,4 572 W_PRECALC_16_31_1 W28,W12 573 RR1 F1,B,C,D,E,A,4 574 W_PRECALC_16_31_2 W12 575 RR0 F1,E,A,B,C,D,6 576 W_PRECALC_16_31_3 W12, 6, 16 577 RR1 F1,E,A,B,C,D,6 578 579 // i=24 : W4,W0,W28,W24,W20,W16,W12,W8 580 W_PRECALC_16_31_0 W24,W20,W16,W12,W8 581 RR0 F1,C,D,E,A,B,8 582 W_PRECALC_16_31_1 W24,W8 583 RR1 F1,C,D,E,A,B,8 584 W_PRECALC_16_31_2 W8 585 RR0 F1,A,B,C,D,E,10 586 W_PRECALC_16_31_3 W8,10,16 587 RR1 F1,A,B,C,D,E,10 588 589 // i=28 : W0,W28,W24,W20,W16,W12,W8,W4 590 W_PRECALC_16_31_0 W20,W16,W12,W8,W4 591 RR0 F1,D,E,A,B,C,12 592 W_PRECALC_16_31_1 W20,W4 593 RR1 F1,D,E,A,B,C,12 594 W_PRECALC_16_31_2 W4 595 RR0 F1,B,C,D,E,A,14 596 W_PRECALC_16_31_3 W4,14,16 597 RR1 F1,B,C,D,E,A,14 598 599 // i=32 : W28,W24,W20,W16,W12,W8,W4,W0 600 W_PRECALC_32_79_0 W28,W8,W4,W0 601 RR0 F1,E,A,B,C,D,16 602 W_PRECALC_32_79_1 W16,W0 
603 RR1 F1,E,A,B,C,D,16 604 W_PRECALC_32_79_2 W0 605 RR0 F1,C,D,E,A,B,18 606 W_PRECALC_32_79_3 W0,18,16 607 RR1 F1,C,D,E,A,B,18 608 609 // starting using F2 610 611 // i=36 : W24,W20,W16,W12,W8,W4,W0,W28 612#if defined (__x86_64__) 613 W_PRECALC_32_79_0 W24,W4,W0,W28 614#else 615 W_PRECALC_32_79_0_i386 W24,W4,W0,W28 616#endif 617 RR0 F2,A,B,C,D,E,20 618 W_PRECALC_32_79_1 W12,W28 619 RR1 F2,A,B,C,D,E,20 620#if defined (__x86_64__) 621 W_PRECALC_32_79_2 W28 622#else 623 W_PRECALC_32_79_2_i386 W28 624#endif 625 RR0 F2,D,E,A,B,C,22 626 W_PRECALC_32_79_3 W28,22,16 627 RR1 F2,D,E,A,B,C,22 628 629 // i=40 : W20,W16,W12,W8,W4,W0,W28,W24 630 #undef K_XMM 631 #define K_XMM 32 632#if defined (__x86_64__) 633 W_PRECALC_32_79_0 W20,W0,W28,W24 634#else 635 W_PRECALC_32_79_0_i386 W20,W0,W28,W24 636#endif 637 RR0 F2,B,C,D,E,A,24 638 W_PRECALC_32_79_1 W8,W24 639 RR1 F2,B,C,D,E,A,24 640#if defined (__x86_64__) 641 W_PRECALC_32_79_2 W24 642#else 643 W_PRECALC_32_79_2_i386 W24 644#endif 645 RR0 F2,E,A,B,C,D,26 646 W_PRECALC_32_79_3 W24,26,K_XMM 647 RR1 F2,E,A,B,C,D,26 648 649 // i=44 : W16,W12,W8,W4,W0,W28,W24,W20 650 W_PRECALC_32_79_0 W16,W28,W24,W20 651 RR0 F2,C,D,E,A,B,28 652 W_PRECALC_32_79_1 W4,W20 653 RR1 F2,C,D,E,A,B,28 654 W_PRECALC_32_79_2 W20 655 RR0 F2,A,B,C,D,E,30 656 W_PRECALC_32_79_3 W20,30,K_XMM 657 RR1 F2,A,B,C,D,E,30 658 659 // i=48 : W12,W8,W4,W0,W28,W24,W20,W16 660 W_PRECALC_32_79_0 W12,W24,W20,W16 661 RR0 F2,D,E,A,B,C,32 662 W_PRECALC_32_79_1 W0,W16 663 RR1 F2,D,E,A,B,C,32 664 W_PRECALC_32_79_2 W16 665 RR0 F2,B,C,D,E,A,34 666 W_PRECALC_32_79_3 W16,34,K_XMM 667 RR1 F2,B,C,D,E,A,34 668 669 // i=52 : W8,W4,W0,W28,W24,W20,W16,W12 670 W_PRECALC_32_79_0 W8,W20,W16,W12 671 RR0 F2,E,A,B,C,D,36 672 W_PRECALC_32_79_1 W28,W12 673 RR1 F2,E,A,B,C,D,36 674 W_PRECALC_32_79_2 W12 675 RR0 F2,C,D,E,A,B,38 676 W_PRECALC_32_79_3 W12,38,K_XMM 677 RR1 F2,C,D,E,A,B,38 678 679 // starting using F3 680 681 // i=56 : W4,W0,W28,W24,W20,W16,W12,W8 682 W_PRECALC_32_79_0 W4,W16,W12,W8 683 RR0 
F3,A,B,C,D,E,40 684 W_PRECALC_32_79_1 W24,W8 685 RR1 F3,A,B,C,D,E,40 686 W_PRECALC_32_79_2 W8 687 RR0 F3,D,E,A,B,C,42 688 W_PRECALC_32_79_3 W8,42,K_XMM 689 RR1 F3,D,E,A,B,C,42 690 691 // i=60 : W0,W28,W24,W20,W16,W12,W8,W4 692 #undef K_XMM 693 #define K_XMM 48 694 W_PRECALC_32_79_0 W0,W12,W8,W4 695 RR0 F3,B,C,D,E,A,44 696 W_PRECALC_32_79_1 W20,W4 697 RR1 F3,B,C,D,E,A,44 698 W_PRECALC_32_79_2 W4 699 RR0 F3,E,A,B,C,D,46 700 W_PRECALC_32_79_3 W4,46,K_XMM 701 RR1 F3,E,A,B,C,D,46 702 703 // i=64 : W28,W24,W20,W16,W12,W8,W4,W0 704 W_PRECALC_32_79_0 W28,W8,W4,W0 705 RR0 F3,C,D,E,A,B,48 706 W_PRECALC_32_79_1 W16,W0 707 RR1 F3,C,D,E,A,B,48 708 W_PRECALC_32_79_2 W0 709 RR0 F3,A,B,C,D,E,50 710 W_PRECALC_32_79_3 W0,50,K_XMM 711 RR1 F3,A,B,C,D,E,50 712 713 // i=68 : W24,W20,W16,W12,W8,W4,W0,W28 714#if defined (__x86_64__) 715 W_PRECALC_32_79_0 W24,W4,W0,W28 716#else 717 W_PRECALC_32_79_0_i386 W24,W4,W0,W28 718#endif 719 RR0 F3,D,E,A,B,C,52 720 W_PRECALC_32_79_1 W12,W28 721 RR1 F3,D,E,A,B,C,52 722#if defined (__x86_64__) 723 W_PRECALC_32_79_2 W28 724#else 725 W_PRECALC_32_79_2_i386 W28 726#endif 727 RR0 F3,B,C,D,E,A,54 728 W_PRECALC_32_79_3 W28,54,K_XMM 729 RR1 F3,B,C,D,E,A,54 730 731 // i=72 : W20,W16,W12,W8,W4,W0,W28,W24 732#if defined (__x86_64__) 733 W_PRECALC_32_79_0 W20,W0,W28,W24 734#else 735 W_PRECALC_32_79_0_i386 W20,W0,W28,W24 736#endif 737 RR0 F3,E,A,B,C,D,56 738 W_PRECALC_32_79_1 W8,W24 739 RR1 F3,E,A,B,C,D,56 740#if defined (__x86_64__) 741 W_PRECALC_32_79_2 W24 742#else 743 W_PRECALC_32_79_2_i386 W24 744#endif 745 RR0 F3,C,D,E,A,B,58 746 W_PRECALC_32_79_3 W24,58,K_XMM 747 RR1 F3,C,D,E,A,B,58 748 749 // starting using F4 750 751 // i=76 : W16,W12,W8,W4,W0,W28,W24,W20 752 W_PRECALC_32_79_0 W16,W28,W24,W20 753 RR0 F4,A,B,C,D,E,60 754 W_PRECALC_32_79_1 W4,W20 755 RR1 F4,A,B,C,D,E,60 756 W_PRECALC_32_79_2 W20 757 RR0 F4,D,E,A,B,C,62 758 W_PRECALC_32_79_3 W20,62,K_XMM 759 RR1 F4,D,E,A,B,C,62 760 761 .endm 762 763 .macro SOFTWARE_PIPELINING_ssse3 764 // i=0 : 
W28,W24,W20,W16,W12,W8,W4,W0 765 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) 766 RR0 F4,B,C,D,E,A,64 767 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP 768 RR1 F4,B,C,D,E,A,64 769 W_PRECALC_00_15_2 // W_TMP = W0 + K 770 RR0 F4,E,A,B,C,D,66 771 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K 772 RR1 F4,E,A,B,C,D,66 773 774 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 775 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) 776 RR0 F4,C,D,E,A,B,68 777 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP 778 RR1 F4,C,D,E,A,B,68 779 W_PRECALC_00_15_2 // W_TMP = W28 + K 780 RR0 F4,A,B,C,D,E,70 781 W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0] 782 RR1 F4,A,B,C,D,E,70 783 784 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 785 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) 786 RR0 F4,D,E,A,B,C,72 787 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP 788 RR1 F4,D,E,A,B,C,72 789 W_PRECALC_00_15_2 // W_TMP = W24 + K 790 RR0 F4,B,C,D,E,A,74 791 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K 792 RR1 F4,B,C,D,E,A,74 793 794 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 795 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR) 796 RR0 F4,E,A,B,C,D,76 797 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP 798 RR1 F4,E,A,B,C,D,76 799 W_PRECALC_00_15_2 // W_TMP = W20 + K 800 RR0 F4,C,D,E,A,B,78 801 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K 802 RR1 F4,C,D,E,A,B,78 803 .endm 804 805 806 #undef W_PRECALC_00_15_0 807 #undef W_PRECALC_00_15_1 808 #undef W_PRECALC_16_31_0 809 #undef W_PRECALC_32_79_0 810 #undef W_PRECALC_32_79_0_i386 811 812 813 814 /* 815 816 The following are 3 macro definitions that are no-ssse3 variants of the previous 3 macro definitions. 817 818 INITIAL_W_PRECALC_nossse3 819 INTERNAL_nossse3 820 SOFTWARE_PIPELINING_nossse3 821 822 They will be used in a sha1 code main body definition that will be used for system without ssse3 support. 
823 824 */ 825 826 #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_nossse3 827 #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_nossse3 828 #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_nossse3 829 #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_nossse3 830 #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_nossse3 831 832 833 .macro INITIAL_W_PRECALC_nossse3 834 835 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0 836 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) 837 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP 838 W_PRECALC_00_15_2 // W_TMP = W0 + K 839 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K 840 841 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 842 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) 843 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP 844 W_PRECALC_00_15_2 // W_TMP = W28 + K 845 W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K 846 847 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 848 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) 849 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP 850 W_PRECALC_00_15_2 // W_TMP = W24 + K 851 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K 852 853 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 854 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR) 855 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP 856 W_PRECALC_00_15_2 // W_TMP = W20 + K 857 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K 858 859 .endm 860 861 862 .macro INTERNAL_nossse3 863 // i=16 864 // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16 865 W_PRECALC_16_31_0 W0,W28,W24,W20,W16 866 RR0 F1,A,B,C,D,E,0 867 W_PRECALC_16_31_1 W0,W16 868 RR1 F1,A,B,C,D,E,0 869 W_PRECALC_16_31_2 W16 870 RR0 F1,D,E,A,B,C,2 871 W_PRECALC_16_31_3 W16, 2, 0 872 RR1 F1,D,E,A,B,C,2 873 874 // i=20, 875 // W8,W4,W0,W28,W24,W20,W16,W12 876 W_PRECALC_16_31_0 W28,W24,W20,W16,W12 877 RR0 F1,B,C,D,E,A,4 878 W_PRECALC_16_31_1 W28,W12 879 RR1 F1,B,C,D,E,A,4 880 881 W_PRECALC_16_31_2 W12 882 RR0 F1,E,A,B,C,D,6 883 W_PRECALC_16_31_3 
W12, 6, 16 884 RR1 F1,E,A,B,C,D,6 885 886 // i=24, 887 // W4,W0,W28,W24,W20,W16,W12,W8 888 W_PRECALC_16_31_0 W24,W20,W16,W12,W8 889 RR0 F1,C,D,E,A,B,8 890 W_PRECALC_16_31_1 W24,W8 891 RR1 F1,C,D,E,A,B,8 892 893 W_PRECALC_16_31_2 W8 894 RR0 F1,A,B,C,D,E,10 895 W_PRECALC_16_31_3 W8,10,16 896 RR1 F1,A,B,C,D,E,10 897 898 // i=28 899 // W0,W28,W24,W20,W16,W12,W8,W4 900 W_PRECALC_16_31_0 W20,W16,W12,W8,W4 901 RR0 F1,D,E,A,B,C,12 902 W_PRECALC_16_31_1 W20,W4 903 RR1 F1,D,E,A,B,C,12 904 905 W_PRECALC_16_31_2 W4 906 RR0 F1,B,C,D,E,A,14 907 W_PRECALC_16_31_3 W4,14,16 908 RR1 F1,B,C,D,E,A,14 909 910 //i=32 911 // W28,W24,W20,W16,W12,W8,W4,W0 912 W_PRECALC_32_79_0 W28,W8,W4,W0 913 RR0 F1,E,A,B,C,D,16 914 W_PRECALC_32_79_1 W16,W0 915 RR1 F1,E,A,B,C,D,16 916 W_PRECALC_32_79_2 W0 917 RR0 F1,C,D,E,A,B,18 918 W_PRECALC_32_79_3 W0,18,16 919 RR1 F1,C,D,E,A,B,18 920 921 //i=36 922 // W24,W20,W16,W12,W8,W4,W0,W28 923#if defined (__x86_64__) 924 W_PRECALC_32_79_0 W24,W4,W0,W28 925#else 926 W_PRECALC_32_79_0_i386 W24,W4,W0,W28 927#endif 928 RR0 F2,A,B,C,D,E,20 929 W_PRECALC_32_79_1 W12,W28 930 RR1 F2,A,B,C,D,E,20 931#if defined (__x86_64__) 932 W_PRECALC_32_79_2 W28 933#else 934 W_PRECALC_32_79_2_i386 W28 935#endif 936 RR0 F2,D,E,A,B,C,22 937 W_PRECALC_32_79_3 W28,22,16 938 RR1 F2,D,E,A,B,C,22 939 940 //i=40 941 #undef K_XMM 942 #define K_XMM 32 943 // W20,W16,W12,W8,W4,W0,W28,W24 944#if defined (__x86_64__) 945 W_PRECALC_32_79_0 W20,W0,W28,W24 946#else 947 W_PRECALC_32_79_0_i386 W20,W0,W28,W24 948#endif 949 RR0 F2,B,C,D,E,A,24 950 W_PRECALC_32_79_1 W8,W24 951 RR1 F2,B,C,D,E,A,24 952#if defined (__x86_64__) 953 W_PRECALC_32_79_2 W24 954#else 955 W_PRECALC_32_79_2_i386 W24 956#endif 957 RR0 F2,E,A,B,C,D,26 958 W_PRECALC_32_79_3 W24,26,K_XMM 959 RR1 F2,E,A,B,C,D,26 960 961 //i=44 962 // W16,W12,W8,W4,W0,W28,W24,W20 963 W_PRECALC_32_79_0 W16,W28,W24,W20 964 RR0 F2,C,D,E,A,B,28 965 W_PRECALC_32_79_1 W4,W20 966 RR1 F2,C,D,E,A,B,28 967 W_PRECALC_32_79_2 W20 968 RR0 F2,A,B,C,D,E,30 969 
W_PRECALC_32_79_3 W20,30,K_XMM 970 RR1 F2,A,B,C,D,E,30 971 972 //i=48 973 // W12,W8,W4,W0,W28,W24,W20,W16 974 W_PRECALC_32_79_0 W12,W24,W20,W16 975 RR0 F2,D,E,A,B,C,32 976 W_PRECALC_32_79_1 W0,W16 977 RR1 F2,D,E,A,B,C,32 978 W_PRECALC_32_79_2 W16 979 RR0 F2,B,C,D,E,A,34 980 W_PRECALC_32_79_3 W16,34,K_XMM 981 RR1 F2,B,C,D,E,A,34 982 983 //i=52 984 // W8,W4,W0,W28,W24,W20,W16,W12 985 W_PRECALC_32_79_0 W8,W20,W16,W12 986 RR0 F2,E,A,B,C,D,36 987 W_PRECALC_32_79_1 W28,W12 988 RR1 F2,E,A,B,C,D,36 989 W_PRECALC_32_79_2 W12 990 RR0 F2,C,D,E,A,B,38 991 W_PRECALC_32_79_3 W12,38,K_XMM 992 RR1 F2,C,D,E,A,B,38 993 994 //i=56 995 // W4,W0,W28,W24,W20,W16,W12,W8 996 W_PRECALC_32_79_0 W4,W16,W12,W8 997 RR0 F3,A,B,C,D,E,40 998 W_PRECALC_32_79_1 W24,W8 999 RR1 F3,A,B,C,D,E,40 1000 W_PRECALC_32_79_2 W8 1001 RR0 F3,D,E,A,B,C,42 1002 W_PRECALC_32_79_3 W8,42,K_XMM 1003 RR1 F3,D,E,A,B,C,42 1004 1005 //i=60 1006 #undef K_XMM 1007 #define K_XMM 48 1008 // W0,W28,W24,W20,W16,W12,W8,W4 1009 W_PRECALC_32_79_0 W0,W12,W8,W4 1010 RR0 F3,B,C,D,E,A,44 1011 W_PRECALC_32_79_1 W20,W4 1012 RR1 F3,B,C,D,E,A,44 1013 W_PRECALC_32_79_2 W4 1014 RR0 F3,E,A,B,C,D,46 1015 W_PRECALC_32_79_3 W4,46,K_XMM 1016 RR1 F3,E,A,B,C,D,46 1017 1018 //i=64 1019 // W28,W24,W20,W16,W12,W8,W4,W0 1020 W_PRECALC_32_79_0 W28,W8,W4,W0 1021 RR0 F3,C,D,E,A,B,48 1022 W_PRECALC_32_79_1 W16,W0 1023 RR1 F3,C,D,E,A,B,48 1024 W_PRECALC_32_79_2 W0 1025 RR0 F3,A,B,C,D,E,50 1026 W_PRECALC_32_79_3 W0,50,K_XMM 1027 RR1 F3,A,B,C,D,E,50 1028 1029 //i=68 1030 // W24,W20,W16,W12,W8,W4,W0,W28 1031#if defined (__x86_64__) 1032 W_PRECALC_32_79_0 W24,W4,W0,W28 1033#else 1034 W_PRECALC_32_79_0_i386 W24,W4,W0,W28 1035#endif 1036 RR0 F3,D,E,A,B,C,52 1037 W_PRECALC_32_79_1 W12,W28 1038 RR1 F3,D,E,A,B,C,52 1039#if defined (__x86_64__) 1040 W_PRECALC_32_79_2 W28 1041#else 1042 W_PRECALC_32_79_2_i386 W28 1043#endif 1044 RR0 F3,B,C,D,E,A,54 1045 W_PRECALC_32_79_3 W28,54,K_XMM 1046 RR1 F3,B,C,D,E,A,54 1047 1048 //i=72 1049 // W20,W16,W12,W8,W4,W0,W28,W24 
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W20,W0,W28,W24
#else
    W_PRECALC_32_79_0_i386  W20,W0,W28,W24
#endif
    RR0     F3,E,A,B,C,D,56
    W_PRECALC_32_79_1   W8,W24
    RR1     F3,E,A,B,C,D,56
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W24
#else
    W_PRECALC_32_79_2_i386  W24
#endif
    RR0     F3,C,D,E,A,B,58
    W_PRECALC_32_79_3   W24,58,K_XMM
    RR1     F3,C,D,E,A,B,58

    // starting using F4 (round function for rounds 60:79)

    //i=76 : last W precalc of this block (W+K for rounds 76:79)
    // W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_32_79_0   W16,W28,W24,W20
    RR0     F4,A,B,C,D,E,60
    W_PRECALC_32_79_1   W4,W20
    RR1     F4,A,B,C,D,E,60
    W_PRECALC_32_79_2   W20
    RR0     F4,D,E,A,B,C,62
    W_PRECALC_32_79_3   W20,62,K_XMM
    RR1     F4,D,E,A,B,C,62

    .endm       // end of the rounds/W-update macro opened earlier in the file

    // SOFTWARE_PIPELINING_nossse3: performs the last 16 digest rounds
    // (i=64:79) of the current block interleaved with the big-endian load
    // and W+K precalc (i=0:15) of the NEXT block, so consecutive blocks
    // overlap in the pipeline.  This is the no-ssse3 variant.
    .macro  SOFTWARE_PIPELINING_nossse3
    // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_00_15_0   0       // W_TMP = (BUFFER_PTR)
    RR0     F4,B,C,D,E,A,64
    W_PRECALC_00_15_1   W0      // convert W_TMP to big-endian, and save W0 = W_TMP
    RR1     F4,B,C,D,E,A,64
    W_PRECALC_00_15_2           // W_TMP = W0 + K
    RR0     F4,E,A,B,C,D,66
    W_PRECALC_00_15_3   3       // (sp) = W_TMP = W0 + K
    RR1     F4,E,A,B,C,D,66

    // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
    W_PRECALC_00_15_0   4       // W_TMP = 16(BUFFER_PTR)
    RR0     F4,C,D,E,A,B,68
    W_PRECALC_00_15_1   W28     // convert W_TMP to big-endian, and save W28 = W_TMP
    RR1     F4,C,D,E,A,B,68
    W_PRECALC_00_15_2           // W_TMP = W28 + K
    RR0     F4,A,B,C,D,E,70
    W_PRECALC_00_15_3   7       // 16(sp) = W_TMP = W28 + K
    RR1     F4,A,B,C,D,E,70

    // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
    W_PRECALC_00_15_0   8       // W_TMP = 32(BUFFER_PTR)
    RR0     F4,D,E,A,B,C,72
    W_PRECALC_00_15_1   W24     // convert W_TMP to big-endian, and save W24 = W_TMP
    RR1     F4,D,E,A,B,C,72
    W_PRECALC_00_15_2           // W_TMP = W24 + K
    RR0     F4,B,C,D,E,A,74
    W_PRECALC_00_15_3   11      // 32(sp) = W_TMP = W24 + K
    RR1     F4,B,C,D,E,A,74

    // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_00_15_0   12      // W_TMP = 48(BUFFER_PTR)
    RR0     F4,E,A,B,C,D,76
    W_PRECALC_00_15_1   W20     // convert W_TMP to big-endian, and save W20 = W_TMP
    RR1     F4,E,A,B,C,D,76
    W_PRECALC_00_15_2           // W_TMP = W20 + K
    RR0     F4,C,D,E,A,B,78
    W_PRECALC_00_15_3   15      // 48(sp) = W_TMP = W20 + K
    RR1     F4,C,D,E,A,B,78
    .endm

    // ENDING: the last 16 digest rounds (i=64:79) of the FINAL block.
    // No W precalc is interleaved because there is no next block.
    // The //i=80..92 labels below are pipeline (precalc) indices, which
    // run 16 ahead of the round index given as the last RR argument.
    .macro  ENDING  // finish up updating hash digests (i=64:79)
    //i=80 (rounds 64:67)
    RR0     F4,B,C,D,E,A,64
    RR1     F4,B,C,D,E,A,64
    RR0     F4,E,A,B,C,D,66
    RR1     F4,E,A,B,C,D,66

    //i=84 (rounds 68:71)
    RR0     F4,C,D,E,A,B,68
    RR1     F4,C,D,E,A,B,68
    RR0     F4,A,B,C,D,E,70
    RR1     F4,A,B,C,D,E,70

    //i=88 (rounds 72:75)
    RR0     F4,D,E,A,B,C,72
    RR1     F4,D,E,A,B,C,72
    RR0     F4,B,C,D,E,A,74
    RR1     F4,B,C,D,E,A,74

    //i=92 (rounds 76:79)
    RR0     F4,E,A,B,C,D,76
    RR1     F4,E,A,B,C,D,76
    RR0     F4,C,D,E,A,B,78
    RR1     F4,C,D,E,A,B,78
    .endm

    // load hash digests A,B,C,D,E from memory into registers.
    // HASH_PTR/A..E/T1 are aliases #defined earlier in the file; on i386
    // the context pointer is first copied into T1 and then dereferenced.
    .macro  LOAD_HASH
#if defined (__x86_64__)
    mov     (HASH_PTR), A
    mov     4(HASH_PTR), B
    mov     8(HASH_PTR), C
    mov     12(HASH_PTR), D
    mov     16(HASH_PTR), E
#else
    mov     HASH_PTR, T1
    mov     (T1), A
    mov     4(T1), B
    mov     8(T1), C
    mov     12(T1), D
    mov     16(T1), E
#endif
    .endm

    // UPDATE_HASH mem, reg :  reg += mem ; mem = reg
    // ($0/$1 are the macro arguments in Apple gas .macro syntax)
    .macro  UPDATE_HASH
    add     $0, $1
    mov     $1, $0
    .endm

    // add the working variables A..E into the 5 output hash words
    // (the HASH[i] += X[80] step of the reference C code in the header)
    .macro  UPDATE_ALL_HASH
#if defined (__x86_64__)
    UPDATE_HASH     (HASH_PTR), A
    UPDATE_HASH     4(HASH_PTR), B
    UPDATE_HASH     8(HASH_PTR), C
    UPDATE_HASH     12(HASH_PTR), D
    UPDATE_HASH     16(HASH_PTR), E
#else
    mov     HASH_PTR, T1        // i386: fetch the context pointer into T1 first
    UPDATE_HASH     (T1), A
    UPDATE_HASH     4(T1), B
    UPDATE_HASH     8(T1), C
    UPDATE_HASH     12(T1), D
    UPDATE_HASH     16(T1), E
#endif
    .endm


    /*
        main sha1 code for system without ssse3 support
    */

    .macro  SHA1_PIPELINED_MAIN_BODY_nossse3
    LOAD_HASH                       // load initial hashes into A,B,C,D,E (registers)
    INITIAL_W_PRECALC_nossse3       // big_endian_load(W) and W+K (i=0:15)
    .align  4,0x90
0:
    INTERNAL_nossse3                // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
    add     $$64, BUFFER_PTR        // BUFFER_PTR += 64 : advance to next 64-byte block
    sub     $$1, cnt                // pre-decrement block count by 1
#else
    addl    $$64, BUFFER_PTR        // BUFFER_PTR += 64 : advance to next 64-byte block
    subl    $$1, cnt                // pre-decrement block count by 1
#endif
    jbe     1f                      // no blocks remain (cnt was <= 1 before the decrement): finish off
    SOFTWARE_PIPELINING_nossse3     // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
    UPDATE_ALL_HASH                 // update output hashes
    jmp     0b                      // repeat for next block
    .align  4,0x90
1:
#endif
    ENDING                          // update ABCDE (i=64:79)
    UPDATE_ALL_HASH                 // update output hashes
    .endm

    /*
        main sha1 code for system with ssse3 support
        (identical structure to the nossse3 body above, but dispatching
        the pshufb/palignr-based precalc macros)
    */

    .macro  SHA1_PIPELINED_MAIN_BODY_ssse3
    LOAD_HASH                       // load initial hashes into A,B,C,D,E
    INITIAL_W_PRECALC_ssse3         // big_endian_load(W) and W+K (i=0:15)
    .align  4,0x90
0:
    INTERNAL_ssse3                  // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
    add     $$64, BUFFER_PTR        // BUFFER_PTR += 64 : advance to next 64-byte block
    sub     $$1, cnt                // pre-decrement block count by 1
#else
    addl    $$64, BUFFER_PTR        // BUFFER_PTR += 64 : advance to next 64-byte block
    subl    $$1, cnt                // pre-decrement block count by 1
#endif
    jbe     1f                      // no blocks remain (cnt was <= 1 before the decrement): finish off
    SOFTWARE_PIPELINING_ssse3       // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
    UPDATE_ALL_HASH                 // update output hashes
    jmp     0b                      // repeat for next block
    .align  4,0x90
1:
#endif
    ENDING                          // update ABCDE (i=64:79)
    UPDATE_ALL_HASH                 // update output hashes
    .endm

#ifdef  KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

    .text

    .globl _SHA1Transform
    //.private_extern   _SHA1Transform

// _SHA1Transform: public entry point.  Updates the 5-word SHA-1 state
// with one or more 64-byte message blocks (see the reference C code in
// the file header; on x86_64 the block count arrives in %rdx).
// Probes __cpu_capabilities for SSSE3 support and either falls through
// to the ssse3 implementation or branches to _SHA1Transform_nossse3.
_SHA1Transform:

    // detect SSSE3 and dispatch appropriate code branch
    #if defined __x86_64__
        movq    __cpu_capabilities@GOTPCREL(%rip), %rax
                                                // %rax -> __cpu_capabilities
        mov     (%rax), %eax                    // %eax = __cpu_capabilities
    #else   // i386
        #if defined KERNEL
            leal    __cpu_capabilities, %eax    // %eax -> __cpu_capabilities
            mov     (%eax), %eax                // %eax = __cpu_capabilities
        #else
            mov     _COMM_PAGE_CPU_CAPABILITIES, %eax   // userland i386: read capability bits from the comm page
        #endif
    #endif
    test    $(kHasSupplementalSSE3), %eax
    je      _SHA1Transform_nossse3              // branch to no-ssse3 code


    // start the sha1 code with ssse3 support

    // save callee-save registers
#if defined (__x86_64__)
    push    %rbx
    push    %rbp
#else
    push    %ebx
    push    %ebp
    push    %esi
    push    %edi
#endif

    sub     $stack_size, sp         // allocate stack memory for use
                                    // (stack_size and sp are #defined earlier in the file)

    // save used xmm register if this is for kernel
    // (xmov is a macro defined earlier in the file, not visible here)
#if KERNEL
    xmov    %xmm0, 4*16(sp)
    xmov    %xmm1, 5*16(sp)
    xmov    %xmm2, 6*16(sp)
    xmov    %xmm3, 7*16(sp)
    xmov    %xmm4, 8*16(sp)
    xmov    %xmm5, 9*16(sp)
    xmov    %xmm6, 10*16(sp)
    xmov    %xmm7, 11*16(sp)
#if defined (__x86_64__)
    xmov    %xmm8, 12*16(sp)
    xmov    %xmm9, 13*16(sp)
    xmov    %xmm10, 14*16(sp)
#endif
#endif

#if defined (__x86_64__)

    // set up registers to free %edx/%edi/%esi for other use (ABCDE)
    // (ctx/buf/cnt/HASH_PTR/BUFFER_PTR/K_BASE are aliases #defined earlier)
    mov     ctx, HASH_PTR
    mov     buf, BUFFER_PTR
#if Multiple_Blocks
    mov     %rdx, cnt               // number of 64-byte blocks to process
#endif
    lea     K_XMM_AR(%rip), K_BASE  // K_BASE -> K constants / bswap-shuffle table (RIP-relative)
    xmov    0x40(K_BASE), XMM_SHUFB_BSWAP   // load the byte-swap shuffle control (used by pshufb, see header)

#else   // __i386__

#if KERNEL
    lea     K_XMM_AR, %eax
#else
    // PIC: materialize the program counter so K_XMM_AR can be addressed
    // relative to it (i386 has no RIP-relative addressing).
    // Get address of 0 in R.
    call    0f                      // Push program counter onto stack.
0:  pop     %eax                    // Get program counter.
    lea     K_XMM_AR-0b(%eax), %eax // %eax -> K_XMM_AR (PC-relative)
#endif
    mov     %eax, K_BASE
    xmov    0x40(%eax), %xmm0       // load byte-swap shuffle control from the table
    xmov    %xmm0, XMM_SHUFB_BSWAP

#endif

    SHA1_PIPELINED_MAIN_BODY_ssse3  // process all block(s)

    // restore used xmm registers if this is for kernel
#if KERNEL
    xmov    4*16(sp), %xmm0
    xmov    5*16(sp), %xmm1
    xmov    6*16(sp), %xmm2
    xmov    7*16(sp), %xmm3
    xmov    8*16(sp), %xmm4
    xmov    9*16(sp), %xmm5
    xmov    10*16(sp), %xmm6
    xmov    11*16(sp), %xmm7
#if defined (__x86_64__)
    xmov    12*16(sp), %xmm8
    xmov    13*16(sp), %xmm9
    xmov    14*16(sp), %xmm10
#endif
#endif

    add     $stack_size, sp         // deallocate stack memory

    // restore callee-save registers (reverse order of the pushes above)
#if defined (__x86_64__)
    pop     %rbp
    pop     %rbx
#else
    pop     %edi
    pop     %esi
    pop     %ebp
    pop     %ebx
#endif

    ret                             // return

    // this is equivalent to the above function _SHA1Transform, but it does not use ssse3 instructions

    .globl _SHA1Transform_nossse3
    .private_extern _SHA1Transform_nossse3
_SHA1Transform_nossse3:

    // push callee-save registers
#if defined (__x86_64__)
    push    %rbx
    push    %rbp
#else
    push    %ebx
    push    %ebp
    push    %esi
    push    %edi
#endif

    sub     $stack_size, sp         // allocate stack memory for local use

    // save used xmm registers if this is for kernel
    // (only xmm0-xmm9 here -- presumably xmm10/XMM_SHUFB_BSWAP is unused
    // on this path since it never loads the pshufb mask; confirm)
#if KERNEL
    xmov    %xmm0, 4*16(sp)
    xmov    %xmm1, 5*16(sp)
    xmov    %xmm2, 6*16(sp)
    xmov    %xmm3, 7*16(sp)
    xmov    %xmm4, 8*16(sp)
    xmov    %xmm5, 9*16(sp)
    xmov    %xmm6, 10*16(sp)
    xmov    %xmm7, 11*16(sp)
#if defined (__x86_64__)
    xmov    %xmm8, 12*16(sp)
    xmov    %xmm9, 13*16(sp)
#endif
#endif

#if defined (__x86_64__)

    // set up registers to free %edx/%edi/%esi for other use (ABCDE)
    mov     ctx, HASH_PTR
    mov     buf, BUFFER_PTR
#if Multiple_Blocks
    mov     %rdx, cnt               // number of 64-byte blocks to process
#endif
    lea     K_XMM_AR(%rip), K_BASE  // note: no XMM_SHUFB_BSWAP load on this path

#else   // __i386__
#if KERNEL
    lea     K_XMM_AR, %eax
#else
    // PIC: materialize the program counter so K_XMM_AR can be addressed
    // relative to it (i386 has no RIP-relative addressing).
    // Get address of 0 in R.
    call    0f                      // Push program counter onto stack.
0:  pop     %eax                    // Get program counter.
    lea     K_XMM_AR-0b(%eax), %eax // %eax -> K_XMM_AR (PC-relative)
#endif
    mov     %eax, K_BASE

#endif

    SHA1_PIPELINED_MAIN_BODY_nossse3    // process all block(s)

    // restore used xmm registers if this is for kernel
#if KERNEL
    xmov    4*16(sp), %xmm0
    xmov    5*16(sp), %xmm1
    xmov    6*16(sp), %xmm2
    xmov    7*16(sp), %xmm3
    xmov    8*16(sp), %xmm4
    xmov    9*16(sp), %xmm5
    xmov    10*16(sp), %xmm6
    xmov    11*16(sp), %xmm7
#if defined (__x86_64__)
    xmov    12*16(sp), %xmm8
    xmov    13*16(sp), %xmm9
#endif
#endif

    add     $stack_size, sp         // deallocate stack memory

    // restore callee-save registers (reverse order of the pushes above)
#if defined (__x86_64__)
    pop     %rbp
    pop     %rbx
#else
    pop     %edi
    pop     %esi
    pop     %ebp
    pop     %ebx
#endif

    ret                             // return

    .const
    .align  4, 0x90

    // SHA-1 round constants, one per 20-round group (rounds 0:19 .. 60:79)
#define K1  0x5a827999
#define K2  0x6ed9eba1
#define K3  0x8f1bbcdc
#define K4  0xca62c1d6

// K constant table: four 16-byte vectors, each K value replicated in all
// 4 lanes so a single vector add applies K to 4 W values at once
// (accessed via K_BASE with the K_XMM byte offset: 0/16/32/48).
K_XMM_AR:
    .long   K1
    .long   K1
    .long   K1
    .long   K1
    .long   K2
    .long   K2
    .long   K2
    .long   K2
    .long   K3
    .long   K3
    .long   K3
    .long   K3
    .long   K4
    .long   K4
    .long   K4
    .long   K4
// bswap_shufb_ctl: invoked thru 0x40(K_XMM_AR)
// shuffle control that byte-swaps each 32-bit lane (big-endian load, see header)
    .long   0x00010203
    .long   0x04050607
    .long   0x08090a0b
    .long   0x0c0d0e0f



#endif  // architecture x86_64 or i386