chacha-x86.S revision 1.1.1.1
/*
 * ChaCha20 for 32-bit x86, GAS/AT&T syntax, ELF, position-independent.
 *
 * ChaCha20_ctr32 -- integer-only code path plus run-time dispatch.
 *
 * Stack arguments (cdecl), as seen after the four register pushes below:
 *	20(%esp)  out      destination pointer
 *	24(%esp)  inp      source pointer
 *	28(%esp)  len      byte count; the function returns at once if zero
 *	32(%esp)  key      8 dwords are read
 *	36(%esp)  counter  4 dwords are read
 * NOTE(review): argument names follow OpenSSL's ChaCha20_ctr32 prototype;
 * confirm against the C caller.
 *
 * Preserved: %ebp %ebx %esi %edi (pushed/popped).  Scratch: %eax %ecx %edx
 * and flags.  The out/inp/len argument slots on the caller's stack are
 * overwritten with running values by the outer loop.
 *
 * Dispatch: if bit 24 of OPENSSL_ia32cap_P[0] and bit 9 of
 * OPENSSL_ia32cap_P[1] are both set (FXSR and SSSE3 in OpenSSL's capability
 * vector -- TODO confirm against OPENSSL_ia32cap docs), control jumps to
 * .Lssse3_shortcut inside ChaCha20_ssse3 with the four registers already
 * pushed and %eax = run-time address of .Lpic_point.
 *
 * Frame of the integer path after "subl $132,%esp":
 *	  0.. 60(%esp)  working state words x0..x15 (spill slots)
 *	 64.. 76(%esp)  not referenced
 *	 80..108(%esp)  copy of the 8 key dwords
 *	112..124(%esp)  copy of the 4 counter dwords (word 0 is bumped per block)
 *	128(%esp)       loop counter for .L004loop
 *	152/156/160     out / inp / len (the caller's argument slots)
 */
#include <machine/asm.h>
.text
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	16
ChaCha20_ctr32:
.L_ChaCha20_ctr32_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax
	cmpl	28(%esp),%eax			/* len == 0 ? nothing to do */
	je	.L000no_data
	call	.Lpic_point			/* call/pop: %eax = address of .Lpic_point */
.Lpic_point:
	popl	%eax
	leal	OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
	testl	$16777216,(%ebp)		/* capability word 0, bit 24 */
	jz	.L001x86
	testl	$512,4(%ebp)			/* capability word 1, bit 9 */
	jz	.L001x86
	jmp	.Lssse3_shortcut		/* %eax must still hold .Lpic_point here */
.L001x86:
	movl	32(%esp),%esi			/* key */
	movl	36(%esp),%edi			/* counter */
	subl	$132,%esp
	/* Copy the 8 key dwords to 80..108(%esp). */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	/* Copy the 4 counter dwords to 112..124(%esp). */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	subl	$1,%eax				/* pre-decrement: .L002entry adds 1 for every block */
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	.L002entry
.align	16
.L003outer_loop:
	/* Write the advanced pointers and remaining length back to the arg slots. */
	movl	%ebx,156(%esp)			/* inp */
	movl	%eax,152(%esp)			/* out */
	movl	%ecx,160(%esp)			/* len */
.L002entry:
	/*
	 * Build the working state for one 64-byte block.  Words 0..3 are the
	 * constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 (the ASCII
	 * bytes of "expand 32-byte k"); the rest come from the key/counter copy.
	 * Register-resident at loop entry: %eax=x0 %ebp=x4 %ecx=x8 %esi=x9 %edx=x12.
	 */
	movl	$1634760805,%eax
	movl	$857760878,4(%esp)
	movl	$2036477234,8(%esp)
	movl	$1797285236,12(%esp)
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	addl	$1,%edx				/* block counter for this block ... */
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)			/* ... kept for the final addition and the next block */
	movl	$10,%ebx			/* 10 loop iterations */
	jmp	.L004loop
.align	16
.L004loop:
	/*
	 * One iteration runs the add / xor / rotate-left-by-16,12,8,7 sequence
	 * eight times (in ChaCha terms: four column quarter-rounds followed by
	 * four diagonal quarter-rounds), interleaved with the loads and spills
	 * needed to work in six general-purpose registers.  The instruction
	 * order is deliberate scheduling; do not reorder.
	 */
	addl	%ebp,%eax
	movl	%ebx,128(%esp)			/* park the loop counter; %ebx is needed as scratch */
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx			/* reload the loop counter */
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	.L004loop
	/*
	 * Rounds done.  Register-resident: %eax=x0 %ebp=x4 %ecx=x8 %esi=x9
	 * %edx=x12 %edi=x14.  Add the input block back in (feed-forward).
	 */
	movl	160(%esp),%ebx			/* remaining length */
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	.L005tail			/* unsigned: fewer than 64 bytes left */
	/* Full block: XOR 64 bytes of keystream with inp and store to out. */
	movl	156(%esp),%ebx			/* inp */
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)			/* hold output word 0; %eax is needed for out */
	movl	152(%esp),%eax			/* out */
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx			/* inp += 64 */
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp			/* output word 0 parked above */
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx			/* len */
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax			/* out += 64 */
	subl	$64,%ecx
	jnz	.L003outer_loop
	jmp	.L006done
.L005tail:
	/*
	 * Partial final block (1..63 bytes; %ebx = remaining length, non-zero
	 * because len was checked on entry and the outer loop exits at zero).
	 * Materialise the whole keystream block at 0..60(%esp), then XOR it
	 * byte by byte.
	 */
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp			/* inp */
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx			/* out */
	movl	%esi,44(%esp)
	xorl	%esi,%esi			/* byte index */
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
.L007tail_loop:
	movb	(%esi,%ebp,1),%al		/* inp[i] */
	movb	(%esp,%esi,1),%dl		/* keystream[i] */
	leal	1(%esi),%esi			/* i++ without touching flags */
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)		/* out[i-1] */
	decl	%ebx
	jnz	.L007tail_loop
.L006done:
	addl	$132,%esp
.L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
.globl	ChaCha20_ssse3
.type	ChaCha20_ssse3,@function
.align	16
ChaCha20_ssse3:
/*
 * ChaCha20_ssse3 -- SSE2/SSSE3 path, one 64-byte block per iteration.
 *
 * Takes the same five cdecl stack arguments at the same offsets as
 * ChaCha20_ctr32 (after the four pushes: 20 out, 24 inp, 28 len, 32 key,
 * 36 counter).  16 bytes are loaded from counter, 32 bytes from key.
 *
 * Two ways in:
 *   - .Lssse3_shortcut, reached by a jump from ChaCha20_ctr32 with the four
 *     callee-saved registers already pushed, len known to be non-zero and
 *     %eax = run-time address of .Lpic_point;
 *   - the global entry below, which establishes exactly those conditions
 *     itself (zero-length check and PIC base) before falling through.
 *
 * Preserved: %ebp %ebx %esi %edi.  Scratch: %eax %ecx %edx, %xmm0-%xmm7, flags.
 *
 * Frame: 524 bytes are reserved and %esp is then rounded down to a multiple
 * of 64; the pre-frame %esp is kept at 512(%esp).  0..63(%esp) holds the
 * input block (and, in the tail, the keystream block).
 */
.L_ChaCha20_ssse3_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax
	cmpl	28(%esp),%eax			/* len == 0: the dec/jnz tail loop would wrap */
	je	.Lssse3_no_data
	call	.Lssse3_pic_point		/* call/pop: %eax = address of .Lssse3_pic_point */
.Lssse3_pic_point:
	popl	%eax
	leal	.Lpic_point-.Lssse3_pic_point(%eax),%eax	/* rebase to .Lpic_point, as the shortcut expects */
.Lssse3_shortcut:
	movl	20(%esp),%edi			/* out */
	movl	24(%esp),%esi			/* inp */
	movl	28(%esp),%ecx			/* len */
	movl	32(%esp),%edx			/* key */
	movl	36(%esp),%ebx			/* counter */
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp			/* 64-byte aligned frame, required by the movdqa stores */
	movl	%ebp,512(%esp)			/* remember the pre-frame %esp */
	leal	.Lssse3_data-.Lpic_point(%eax),%eax	/* %eax = &.Lssse3_data */
	movdqu	(%ebx),%xmm3			/* row 3: counter block */
.L0081x:
	movdqa	32(%eax),%xmm0			/* row 0: constants */
	movdqu	(%edx),%xmm1			/* row 1: key[0..3] */
	movdqu	16(%edx),%xmm2			/* row 2: key[4..7] */
	movdqa	(%eax),%xmm6			/* pshufb mask: rotate each dword left by 16 */
	movdqa	16(%eax),%xmm7			/* pshufb mask: rotate each dword left by 8 */
	movl	%ebp,48(%esp)			/* overwritten by the %xmm3 store below before any read */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx			/* 10 loop iterations */
	jmp	.L009loop1x
.align	16
.L010outer1x:
	/* Next block: reload rows 0..2 and add {1,0,0,0} to the saved row 3. */
	movdqa	80(%eax),%xmm3
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	.L009loop1x
.align	16
.L009loop1x:
	/*
	 * Each iteration applies the add / xor / rotate 16,12,8,7 step to all
	 * four lanes twice, with pshufd lane rotations in between so that the
	 * second pass works on the diagonals.  Rotations by 16 and 8 use pshufb
	 * (hand-encoded below); rotations by 12 and 7 use a shift pair plus por.
	 */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2			/* rotate rows 1..3 by 1, 2, 3 lanes */
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2			/* undo the lane rotation */
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	.L009loop1x
	/* Feed-forward: add the saved input block. */
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	.L011tail			/* unsigned: fewer than 64 bytes left */
	/* Full block: XOR with 64 input bytes (unaligned loads/stores). */
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi			/* inp += 64 */
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi			/* out += 64 */
	subl	$64,%ecx
	jnz	.L010outer1x
	jmp	.L012done
.L011tail:
	/*
	 * Partial final block (1..63 bytes, %ecx non-zero here): spill the
	 * keystream to 0..63(%esp) and XOR byte by byte.  %eax (table base) and
	 * %ebp are free to reuse because no further block follows.
	 */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp			/* byte index */
.L013tail_loop:
	movb	(%esp,%ebp,1),%al		/* keystream[i] */
	movb	(%esi,%ebp,1),%dl		/* inp[i] */
	leal	1(%ebp),%ebp			/* i++ without touching flags */
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)		/* out[i-1] */
	decl	%ecx
	jnz	.L013tail_loop
.L012done:
	movl	512(%esp),%esp			/* drop the aligned frame */
.Lssse3_no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
/*
 * Constant table, addressed PC-relative via .Lpic_point (so it has to stay
 * in the same section as the code).  Offsets from .Lssse3_data:
 *	  0  pshufb mask, rotate dwords left by 16
 *	 16  pshufb mask, rotate dwords left by 8
 *	 32  the four constant state words ("expand 32-byte k")
 *	 80  {1,0,0,0}: per-block counter increment
 * The rows at 48, 64, 96 and 112 are not referenced by the code in this
 * file section.
 */
.align	64
.Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
.long	1634760805,857760878,2036477234,1797285236
.long	0,1,2,3
.long	4,4,4,4
.long	1,0,0,0
.long	4,0,0,0
.long	0,-1,-1,-1
.align	64
/* ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0
.comm	OPENSSL_ia32cap_P,16,4