/* aesni-mb-x86_64.S revision 290207 */
# $FreeBSD: head/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S 290207 2015-10-30 20:51:33Z jkim $

/*
 * Syntax: GNU as, AT&T operand order (source, destination).
 *
 * Both routines below process up to four independent CBC streams
 * ("lanes") in parallel, one 16-byte block per lane per inner-loop
 * iteration.
 *
 * Inputs, as read by the code on entry (register choice matches the
 * first three integer arguments of the System V AMD64 convention):
 *   rdi  base of an array of 40-byte lane descriptors, consumed four at
 *        a time (160 bytes per outer iteration).  Fields as accessed:
 *          +0   8-byte pointer used for block loads   (input)
 *          +8   8-byte pointer used for block stores  (output)
 *          +16  signed 32-bit count of 16-byte blocks
 *          +24  16-byte chaining value (IV)
 *        NOTE(review): presumably the CIPH_DESC layout of the C caller;
 *        confirm against the caller.
 *   rsi  AES key schedule: 16-byte round keys from offset 0 and a 32-bit
 *        rounds field at offset 240.  The code compares that field
 *        against 11: below 11 applies round keys 0..160, equal to 11
 *        applies 0..192, anything else applies 0..224.
 *   edx  number of outer iterations (groups of four descriptors).
 *
 * The AES-NI instructions are emitted as raw .byte sequences; each one
 * carries its decoded mnemonic as a trailing comment:
 *   102,15,56,220,modrm   aesenc
 *   102,15,56,221,modrm   aesenclast
 *   102,15,56,222,modrm   aesdec
 *   102,15,56,223,modrm   aesdeclast   (a 65 prefix byte selects xmm8+)
 *
 * Stack frame (rsp aligned down to 64 bytes after the pushes):
 *    0(%rsp)  16-byte sink: finished lanes store here
 *   16(%rsp)  caller rsp as it was on entry
 *   24(%rsp)  saved outer-loop counter (edx)
 *   32(%rsp)  four 32-bit remaining-block counters, lanes 0..3
 *
 * Register roles inside the loops:
 *   r8..r11    lane 0..3 input pointers
 *   r12..r15   lane 0..3 output pointers
 *   rbx        byte offset into every lane, advances by 16 per block
 *   rbp        sink pointer pre-biased by -rbx, so (rbp,rbx) = 16(%rsp)
 *   edx        inner-loop count = largest block count of the four lanes
 *   eax        rounds field from the key schedule
 *   xmm0,xmm1  alternating round keys
 *   xmm2..5    lane 0..3 cipher state
 *   xmm10      vector copy of the four remaining-block counters
 *   xmm12      round key 0 (reloaded each iteration)
 */
.text

/*
 * aesni_multi_cbc_encrypt
 *
 * CBC encryption of up to four lanes at once.  For each lane the state
 * is (chaining value XOR plaintext block XOR round key 0), run through
 * the AES rounds, stored as ciphertext, and reused as the next chaining
 * value.  Callee-saved rbx, rbp, r12..r15 are pushed on entry and
 * reloaded before returning; rsp is restored from the saved copy.
 */
.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,@function
.align	32
aesni_multi_cbc_encrypt:
	movq	%rsp,%rax			# rax = rsp on entry
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$48,%rsp
	andq	$-64,%rsp			# 64-byte aligned frame
	movq	%rax,16(%rsp)			# keep entry rsp for the epilogue

.Lenc4x_body:
	movdqu	(%rsi),%xmm12			# xmm12 = round key 0
	leaq	120(%rsi),%rsi			# bias key pointer by 120
	leaq	80(%rdi),%rdi			# bias descriptor pointer by 80

.Lenc4x_loop_grande:
	movl	%edx,24(%rsp)			# save outer counter
	xorl	%edx,%edx			# edx = running max of block counts

	/* lane 0: descriptor at -80(%rdi) */
	movl	-64(%rdi),%ecx			# block count
	movq	-80(%rdi),%r8			# input pointer
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12			# output pointer
	cmovgl	%ecx,%edx			# edx = max(edx, count)
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm2			# chaining value
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8			# count <= 0: read from the stack instead

	/* lane 1: descriptor at -40(%rdi) */
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9

	/* lane 2: descriptor at 0(%rdi) */
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10

	/* lane 3: descriptor at 40(%rdi) */
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11

	testl	%edx,%edx
	jz	.Lenc4x_done			# no lane has a positive count

	/* state = chaining value XOR round key 0 XOR first input block */
	movups	16-120(%rsi),%xmm1		# round key at offset 16
	pxor	%xmm12,%xmm2
	movups	32-120(%rsi),%xmm0		# round key at offset 32
	pxor	%xmm12,%xmm3
	movl	240-120(%rsi),%eax		# rounds field
	pxor	%xmm12,%xmm4
	movdqu	(%r8),%xmm6
	pxor	%xmm12,%xmm5
	movdqu	(%r9),%xmm7
	pxor	%xmm6,%xmm2
	movdqu	(%r10),%xmm8
	pxor	%xmm7,%xmm3
	movdqu	(%r11),%xmm9
	pxor	%xmm8,%xmm4
	pxor	%xmm9,%xmm5
	movdqa	32(%rsp),%xmm10			# four remaining-block counters
	xorq	%rbx,%rbx			# lane offset = 0
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx				# constant 1 for the counter compares
	subq	%rbx,%rbp			# (rbp,rbx) = 16(%rsp)

.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
	prefetcht0	31(%r8,%rbx,1)		# prefetch lane 0 input
	prefetcht0	31(%r9,%rbx,1)		# prefetch lane 1 input
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
	prefetcht0	31(%r10,%rbx,1)		# prefetch lane 2 input
	prefetcht0	31(%r11,%rbx,1)		# prefetch lane 3 input (was a repeat of r10)
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx			# 1 vs lane 0 remaining
.byte	102,15,56,220,208			# aesenc %xmm0,%xmm2
.byte	102,15,56,220,216			# aesenc %xmm0,%xmm3
.byte	102,15,56,220,224			# aesenc %xmm0,%xmm4
	cmovgeq	%rbp,%r8			# remaining <= 1: later loads use the stack
	cmovgq	%rbp,%r12			# remaining <  1: stores go to the sink
.byte	102,15,56,220,232			# aesenc %xmm0,%xmm5
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx			# lane 1
.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx			# lane 2
.byte	102,15,56,220,208			# aesenc %xmm0,%xmm2
.byte	102,15,56,220,216			# aesenc %xmm0,%xmm3
.byte	102,15,56,220,224			# aesenc %xmm0,%xmm4
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
.byte	102,15,56,220,232			# aesenc %xmm0,%xmm5
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx			# lane 3
.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11			# copy counters for the >0 mask
.byte	102,15,56,220,208			# aesenc %xmm0,%xmm2
	prefetcht0	15(%r12,%rbx,1)		# prefetch the four output lines
	prefetcht0	15(%r13,%rbx,1)
.byte	102,15,56,220,216			# aesenc %xmm0,%xmm3
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
.byte	102,15,56,220,224			# aesenc %xmm0,%xmm4
.byte	102,15,56,220,232			# aesenc %xmm0,%xmm5
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12			# xmm12 = 0 (temporarily)

.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
	pcmpgtd	%xmm12,%xmm11			# xmm11 = -1 where counter > 0
	movdqu	-120(%rsi),%xmm12		# reload round key 0
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
	paddd	%xmm11,%xmm10			# decrement the positive counters
	movdqa	%xmm10,32(%rsp)
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movups	144-120(%rsi),%xmm1

	cmpl	$11,%eax			# flags survive until the jb and je below

.byte	102,15,56,220,208			# aesenc %xmm0,%xmm2
.byte	102,15,56,220,216			# aesenc %xmm0,%xmm3
.byte	102,15,56,220,224			# aesenc %xmm0,%xmm4
.byte	102,15,56,220,232			# aesenc %xmm0,%xmm5
	movups	160-120(%rsi),%xmm0

	jb	.Lenc4x_tail			# rounds field < 11: last key is at 160

.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movups	176-120(%rsi),%xmm1

.byte	102,15,56,220,208			# aesenc %xmm0,%xmm2
.byte	102,15,56,220,216			# aesenc %xmm0,%xmm3
.byte	102,15,56,220,224			# aesenc %xmm0,%xmm4
.byte	102,15,56,220,232			# aesenc %xmm0,%xmm5
	movups	192-120(%rsi),%xmm0

	je	.Lenc4x_tail			# rounds field == 11: last key is at 192

.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movups	208-120(%rsi),%xmm1

.byte	102,15,56,220,208			# aesenc %xmm0,%xmm2
.byte	102,15,56,220,216			# aesenc %xmm0,%xmm3
.byte	102,15,56,220,224			# aesenc %xmm0,%xmm4
.byte	102,15,56,220,232			# aesenc %xmm0,%xmm5
	movups	224-120(%rsi),%xmm0		# last key is at 224
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	/* xmm1 = second-to-last round key, xmm0 = last round key */
.byte	102,15,56,220,209			# aesenc %xmm1,%xmm2
.byte	102,15,56,220,217			# aesenc %xmm1,%xmm3
.byte	102,15,56,220,225			# aesenc %xmm1,%xmm4
.byte	102,15,56,220,233			# aesenc %xmm1,%xmm5
	movdqu	(%r8,%rbx,1),%xmm6		# next input block, lane 0
	movdqu	16-120(%rsi),%xmm1		# re-arm xmm1 for the next iteration

.byte	102,15,56,221,208			# aesenclast %xmm0,%xmm2
	movdqu	(%r9,%rbx,1),%xmm7
	pxor	%xmm12,%xmm6			# next input XOR round key 0
.byte	102,15,56,221,216			# aesenclast %xmm0,%xmm3
	movdqu	(%r10,%rbx,1),%xmm8
	pxor	%xmm12,%xmm7
.byte	102,15,56,221,224			# aesenclast %xmm0,%xmm4
	movdqu	(%r11,%rbx,1),%xmm9
	pxor	%xmm12,%xmm8
.byte	102,15,56,221,232			# aesenclast %xmm0,%xmm5
	movdqu	32-120(%rsi),%xmm0		# re-arm xmm0 for the next iteration
	pxor	%xmm12,%xmm9

	/* store the finished block, then chain it into the next state */
	movups	%xmm2,-16(%r12,%rbx,1)
	pxor	%xmm6,%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	pxor	%xmm7,%xmm3
	movups	%xmm4,-16(%r14,%rbx,1)
	pxor	%xmm8,%xmm4
	movups	%xmm5,-16(%r15,%rbx,1)
	pxor	%xmm9,%xmm5

	decl	%edx
	jnz	.Loop_enc4x

	movq	16(%rsp),%rax			# rax = entry rsp again (eax held rounds)
	movl	24(%rsp),%edx			# outer counter

	leaq	160(%rdi),%rdi			# next group of four descriptors
	decl	%edx
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
	/* reload the registers pushed in the prologue, relative to entry rsp */
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lenc4x_epilogue:
	.byte	0xf3,0xc3			# rep ret
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

/*
 * aesni_multi_cbc_decrypt
 *
 * CBC decryption of up to four lanes at once.  Inputs, descriptor
 * layout, stack frame and register roles are the same as for
 * aesni_multi_cbc_encrypt, with these differences:
 *   xmm2..5  lane 0..3 state, loaded from the input block XOR round key 0
 *   xmm6..9  lane 0..3 chaining value; it is XORed with the last round
 *            key and passed as the key operand of aesdeclast, so the
 *            final round and the CBC XOR happen in one instruction.
 *            Afterwards it is reloaded from the input block just
 *            processed.
 */
.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,@function
.align	32
aesni_multi_cbc_decrypt:
	movq	%rsp,%rax			# rax = rsp on entry
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$48,%rsp
	andq	$-64,%rsp			# 64-byte aligned frame
	movq	%rax,16(%rsp)			# keep entry rsp for the epilogue

.Ldec4x_body:
	movdqu	(%rsi),%xmm12			# xmm12 = round key 0
	leaq	120(%rsi),%rsi			# bias key pointer by 120
	leaq	80(%rdi),%rdi			# bias descriptor pointer by 80

.Ldec4x_loop_grande:
	movl	%edx,24(%rsp)			# save outer counter
	xorl	%edx,%edx			# edx = running max of block counts

	/* lane 0: descriptor at -80(%rdi) */
	movl	-64(%rdi),%ecx			# block count
	movq	-80(%rdi),%r8			# input pointer
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12			# output pointer
	cmovgl	%ecx,%edx			# edx = max(edx, count)
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm6			# chaining value
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8			# count <= 0: read from the stack instead

	/* lane 1: descriptor at -40(%rdi) */
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm7
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9

	/* lane 2: descriptor at 0(%rdi) */
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm8
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10

	/* lane 3: descriptor at 40(%rdi) */
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm9
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11

	testl	%edx,%edx
	jz	.Ldec4x_done			# no lane has a positive count

	/* state = first input block XOR round key 0 */
	movups	16-120(%rsi),%xmm1		# round key at offset 16
	movups	32-120(%rsi),%xmm0		# round key at offset 32
	movl	240-120(%rsi),%eax		# rounds field
	movdqu	(%r8),%xmm2
	movdqu	(%r9),%xmm3
	pxor	%xmm12,%xmm2
	movdqu	(%r10),%xmm4
	pxor	%xmm12,%xmm3
	movdqu	(%r11),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5
	movdqa	32(%rsp),%xmm10			# four remaining-block counters
	xorq	%rbx,%rbx			# lane offset = 0
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx				# constant 1 for the counter compares
	subq	%rbx,%rbp			# (rbp,rbx) = 16(%rsp)

.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
	prefetcht0	31(%r8,%rbx,1)		# prefetch lane 0 input
	prefetcht0	31(%r9,%rbx,1)		# prefetch lane 1 input
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
	prefetcht0	31(%r10,%rbx,1)		# prefetch lane 2 input
	prefetcht0	31(%r11,%rbx,1)		# prefetch lane 3 input
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx			# 1 vs lane 0 remaining
.byte	102,15,56,222,208			# aesdec %xmm0,%xmm2
.byte	102,15,56,222,216			# aesdec %xmm0,%xmm3
.byte	102,15,56,222,224			# aesdec %xmm0,%xmm4
	cmovgeq	%rbp,%r8			# remaining <= 1: later loads use the stack
	cmovgq	%rbp,%r12			# remaining <  1: stores go to the sink
.byte	102,15,56,222,232			# aesdec %xmm0,%xmm5
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx			# lane 1
.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx			# lane 2
.byte	102,15,56,222,208			# aesdec %xmm0,%xmm2
.byte	102,15,56,222,216			# aesdec %xmm0,%xmm3
.byte	102,15,56,222,224			# aesdec %xmm0,%xmm4
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
.byte	102,15,56,222,232			# aesdec %xmm0,%xmm5
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx			# lane 3
.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11			# copy counters for the >0 mask
.byte	102,15,56,222,208			# aesdec %xmm0,%xmm2
	prefetcht0	15(%r12,%rbx,1)		# prefetch the four output lines
	prefetcht0	15(%r13,%rbx,1)
.byte	102,15,56,222,216			# aesdec %xmm0,%xmm3
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
.byte	102,15,56,222,224			# aesdec %xmm0,%xmm4
.byte	102,15,56,222,232			# aesdec %xmm0,%xmm5
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12			# xmm12 = 0 (temporarily)

.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
	pcmpgtd	%xmm12,%xmm11			# xmm11 = -1 where counter > 0
	movdqu	-120(%rsi),%xmm12		# reload round key 0
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
	paddd	%xmm11,%xmm10			# decrement the positive counters
	movdqa	%xmm10,32(%rsp)
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movups	144-120(%rsi),%xmm1

	cmpl	$11,%eax			# flags survive until the jb and je below

.byte	102,15,56,222,208			# aesdec %xmm0,%xmm2
.byte	102,15,56,222,216			# aesdec %xmm0,%xmm3
.byte	102,15,56,222,224			# aesdec %xmm0,%xmm4
.byte	102,15,56,222,232			# aesdec %xmm0,%xmm5
	movups	160-120(%rsi),%xmm0

	jb	.Ldec4x_tail			# rounds field < 11: last key is at 160

.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movups	176-120(%rsi),%xmm1

.byte	102,15,56,222,208			# aesdec %xmm0,%xmm2
.byte	102,15,56,222,216			# aesdec %xmm0,%xmm3
.byte	102,15,56,222,224			# aesdec %xmm0,%xmm4
.byte	102,15,56,222,232			# aesdec %xmm0,%xmm5
	movups	192-120(%rsi),%xmm0

	je	.Ldec4x_tail			# rounds field == 11: last key is at 192

.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movups	208-120(%rsi),%xmm1

.byte	102,15,56,222,208			# aesdec %xmm0,%xmm2
.byte	102,15,56,222,216			# aesdec %xmm0,%xmm3
.byte	102,15,56,222,224			# aesdec %xmm0,%xmm4
.byte	102,15,56,222,232			# aesdec %xmm0,%xmm5
	movups	224-120(%rsi),%xmm0		# last key is at 224
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
	/* xmm1 = second-to-last round key, xmm0 = last round key */
.byte	102,15,56,222,209			# aesdec %xmm1,%xmm2
.byte	102,15,56,222,217			# aesdec %xmm1,%xmm3
.byte	102,15,56,222,225			# aesdec %xmm1,%xmm4
	pxor	%xmm0,%xmm6			# chaining value XOR last round key
	pxor	%xmm0,%xmm7
.byte	102,15,56,222,233			# aesdec %xmm1,%xmm5
	movdqu	16-120(%rsi),%xmm1		# re-arm xmm1 for the next iteration
	pxor	%xmm0,%xmm8
	pxor	%xmm0,%xmm9
	movdqu	32-120(%rsi),%xmm0		# re-arm xmm0 for the next iteration

.byte	102,15,56,223,214			# aesdeclast %xmm6,%xmm2
.byte	102,15,56,223,223			# aesdeclast %xmm7,%xmm3
	movdqu	-16(%r8,%rbx,1),%xmm6		# block just processed = next chaining value
	movdqu	-16(%r9,%rbx,1),%xmm7
.byte	102,65,15,56,223,224			# aesdeclast %xmm8,%xmm4
.byte	102,65,15,56,223,233			# aesdeclast %xmm9,%xmm5
	movdqu	-16(%r10,%rbx,1),%xmm8
	movdqu	-16(%r11,%rbx,1),%xmm9

	/* store the finished block, then load next input XOR round key 0 */
	movups	%xmm2,-16(%r12,%rbx,1)
	movdqu	(%r8,%rbx,1),%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	movdqu	(%r9,%rbx,1),%xmm3
	pxor	%xmm12,%xmm2
	movups	%xmm4,-16(%r14,%rbx,1)
	movdqu	(%r10,%rbx,1),%xmm4
	pxor	%xmm12,%xmm3
	movups	%xmm5,-16(%r15,%rbx,1)
	movdqu	(%r11,%rbx,1),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5

	decl	%edx
	jnz	.Loop_dec4x

	movq	16(%rsp),%rax			# rax = entry rsp again (eax held rounds)
	movl	24(%rsp),%edx			# outer counter

	leaq	160(%rdi),%rdi			# next group of four descriptors
	decl	%edx
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
	/* reload the registers pushed in the prologue, relative to entry rsp */
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Ldec4x_epilogue:
	.byte	0xf3,0xc3			# rep ret
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt