#if defined __x86_64__

	.text
	.align	4,0x90
.globl	_crc32_vec
_crc32_vec:

	// input :
	//	crc : edi
	//	buf : rsi
	//	len : rdx

	// symbolizing x86_64 registers

	#define	crc	%edi
	#define	buf	%rsi
	#define	len	%rdx
	#define	tab	%rcx

	#define	v0	%xmm0
	#define	v1	%xmm1
	#define	v2	%xmm2
	#define	v3	%xmm3
	#define	v4	%xmm4
	#define	v5	%xmm5

	// push rbp, sp should now be 16-byte aligned
	pushq	%rbp
	movq	%rsp, %rbp

#ifdef	KERNEL
	/*
		allocate 6*16 = 96 bytes of stack space and save %xmm0-%xmm5
	*/
	subq	$96, %rsp
	movaps	v0, -16(%rbp)
	movaps	v1, -32(%rbp)
	movaps	v2, -48(%rbp)
	movaps	v3, -64(%rbp)
	movaps	v4, -80(%rbp)
	movaps	v5, -96(%rbp)
#endif

	/*
		set up the table pointer and use 16-byte data directly in pclmulqdq
		(tried movaps into %xmm7 and using %xmm7; performance was about the same)
	*/
	leaq	L_coefficients(%rip), tab
	#define	K12		(tab)
	#define	K34		16(tab)
	#define	K56		32(tab)
	#define	uPx		48(tab)
	#define	L_shufb	64(tab)

	/* load the initial crc and xor it with the 1st 16-byte vector */
	movd	crc, v0
	movdqu	(buf), v1
	pslldq	$12, v0			// shift up to the most significant word in v0
	pshufb	L_shufb, v1
	pxor	v1, v0

	/* if this is the only vector, we already have the final 128-bit vector */
	add	$16, buf
	sub	$16, len
	jle	L_128bits

	/* make sure there are at least 3 more vectors */
	cmp	$48, len
	jl	L_no_more_4_vectors

	/* read the next 3 vectors */
	movdqu	(buf), v1
	movdqu	16(buf), v2
	movdqu	32(buf), v3
	pshufb	L_shufb, v1
	pshufb	L_shufb, v2
	pshufb	L_shufb, v3

	add	$48, buf

	/* pre-decrement len by 64, to check whether there are at least 4 more vectors */
	sub	$48+64, len
	jl	L_foldv13

	/* -------------------------------------------------
	   the main loop, folding 4 vectors per iteration
	   -------------------------------------------------
	*/
L_FOLD_BY_4:

	movdqa	v0, v4
	movdqa	v1, v5
	pclmulqdq	$0x11, K12, v0
	pclmulqdq	$0x11, K12, v1
	pclmulqdq	$0x00, K12, v4
	pclmulqdq	$0x00, K12, v5
	pxor	v4, v0
	pxor	v5, v1
	movdqu	0(buf), v4
	movdqu	16(buf), v5
	pshufb	L_shufb, v4
	pshufb	L_shufb, v5
	pxor	v4, v0
	pxor	v5, v1
	movdqa	v2, v4
	movdqa	v3, v5
	pclmulqdq	$0x11, K12, v2
	pclmulqdq	$0x11, K12, v3
	pclmulqdq	$0x00, K12, v4
	pclmulqdq	$0x00, K12, v5
	pxor	v4, v2
	pxor	v5, v3
	movdqu	32(buf), v4
	movdqu	48(buf), v5
	pshufb	L_shufb, v4
	pshufb	L_shufb, v5
	pxor	v4, v2
	pxor	v5, v3

	add	$64, buf
	sub	$64, len
	ja	L_FOLD_BY_4


	/*
		now sequentially fold v1, v2, v3 into v0
	*/
L_foldv13:

	.macro	FOLD1
	movdqa	v0, v4			// a copy of v0 = H(x)x^64 + L(x)
	pclmulqdq	$$0x11, K34, v0	// H(x) * {x^[128+64] mod P(x)}
	pclmulqdq	$$0x00, K34, v4	// L(x) * {x^128 mod P(x)}
	pxor	v4, v0			// xor with L(x) * {x^128 mod P(x)}
	pxor	$0, v0			// xor in the next vector ($0 = v1/v2/v3 or v5)
	.endm

	/* FOLD1 of v1-v3 into v0 */
	FOLD1	v1
	FOLD1	v2
	FOLD1	v3

	/* post-increment len by 64 */
	add	$64, len

L_no_more_4_vectors:

	/* pre-decrement len by 16 to detect whether there are still vectors to process */
	sub	$16, len
	jl	L_128bits
L_FOLD_BY_1:
	movdqu	(buf), v5
	pshufb	L_shufb, v5
	FOLD1	v5			/* fold in the new vector */
	add	$16, buf
	sub	$16, len
	jae	L_FOLD_BY_1		/* until there are no more new vectors */

L_128bits:	/* we've arrived at the final 128-bit vector */

	/* reduction from 128 bits to 64 bits */
	movdqa	v0, v1
	pclmulqdq	$0x11, K56, v0	// v0 = H(x) * K5, 96 bits
	pslldq	$8, v1			// v1 = L(x), 64 bits
	psrldq	$4, v1			// v1 = L(x), 64 bits in the right position
	pxor	v1, v0
	movdqa	v0, v1
	pclmulqdq	$0x01, K56, v1
	pxor	v1, v0

	/*
		barrett reduction:

			T1  = floor(R(x)/x^32) * [1/P(x)];	// i.e. R/P
			T2  = floor(T1/x^32) * P(x);		// i.e. int(R/P)*P
			CRC = (R + int(R/P)*P) mod x^32;	// i.e. R - int(R/P)*P
	*/
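	/*
	   For reference: a minimal C sketch of the Barrett step performed by the
	   instructions below, using a bit-by-bit carry-less multiply helper.
	   clmul64 and barrett32 are hypothetical names; the sketch is
	   illustrative only and not part of the build.

		static uint64_t clmul64(uint64_t a, uint64_t b)
		{
			uint64_t r = 0;
			while (b) {			// carry-less (xor) multiply, low 64 bits
				if (b & 1)
					r ^= a;
				a <<= 1;
				b >>= 1;
			}
			return r;
		}

		static uint32_t barrett32(uint64_t R)	// R = 64-bit remainder, low half of v0
		{
			const uint64_t u = 0x104D101DFULL;	// ux = floor(x^64 / P(x))
			const uint64_t P = 0x104C11DB7ULL;	// Px = P(x)
			uint64_t T1 = clmul64(R >> 32, u);	// T1 = floor(R/x^32) * u
			uint64_t T2 = clmul64(T1 >> 32, P);	// T2 = floor(T1/x^32) * P
			return (uint32_t)(R ^ T2);		// CRC = (R + T2) mod x^32
		}
	*/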
	movq	v0, v1
	psrldq	$4, v1			// R/x^32
	pclmulqdq	$0x00, uPx, v1	// T1 = floor(R/x^32)*u
	psrldq	$4, v1			// T1/x^32
	pclmulqdq	$0x10, uPx, v1	// T2 = floor(T1/x^32)*P
	pxor	v1, v0
	movd	v0, %eax


#ifdef	KERNEL
	// restore %xmm0-%xmm5, and deallocate the 96 bytes of stack space
	movaps	-16(%rbp), v0
	movaps	-32(%rbp), v1
	movaps	-48(%rbp), v2
	movaps	-64(%rbp), v3
	movaps	-80(%rbp), v4
	movaps	-96(%rbp), v5
	addq	$96, %rsp
#endif

	leave
	ret

	.const
	.align	4
L_coefficients:		// constants used for vectorizing crc32 computation with pclmulqdq

#define	K1	0x8833794C
#define	K2	0xE6228B11
#define	K3	0xC5B9CD4C
#define	K4	0xE8A45605
#define	K5	0xF200AA66
#define	K6	0x490D678D
#define	ux	0x104D101DF
#define	Px	0x104C11DB7

	.quad	K2
	.quad	K1
	.quad	K4
	.quad	K3
	.quad	K6
	.quad	K5
	.quad	ux
	.quad	Px
	.quad	0x08090a0b0c0d0e0f
	.quad	0x0001020304050607


#endif	// defined __x86_64__
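/*
   For testing/reference: a hedged C sketch of the scalar CRC this routine is
   believed to compute, namely the non-reflected (MSB-first) CRC-32 with
   polynomial P(x) = 0x04C11DB7, seeded with the incoming crc argument and
   with no final xor (any pre/post conditioning is assumed to be the caller's
   job).  The name crc32_scalar_ref is hypothetical and the code is not part
   of the build.

	static uint32_t crc32_scalar_ref(uint32_t crc, const unsigned char *buf, size_t len)
	{
		while (len--) {
			crc ^= (uint32_t)(*buf++) << 24;	// feed the next byte, MSB first
			for (int i = 0; i < 8; i++)		// one polynomial division step per bit
				crc = (crc & 0x80000000) ? (crc << 1) ^ 0x04C11DB7 : (crc << 1);
		}
		return crc;
	}
*/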