# x86-mont.S revision 298998
	# $FreeBSD: head/secure/lib/libcrypto/i386/x86-mont.s 298998 2016-05-03 18:50:10Z jkim $
#
# Syntax: GNU as, AT&T operand order (op src,dst).  Target: 32-bit x86.
#
#-----------------------------------------------------------------------
# bn_mul_mont -- word-serial Montgomery multiplication on 32-bit words.
# (The ident string at the end of this file decodes to "Montgomery
# Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>".)
#
# Presumed C prototype (not visible in this file -- confirm against the
# header used by the callers):
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# ABI:  stack-passed arguments, result in %eax.
# In (offsets from %esp once the four registers below are pushed):
#   20  rp   result vector, num words (written by the common tail)
#   24  ap   first operand, num words
#   28  bp   second operand, num words
#   32  np   vector subtracted in the common tail (the modulus)
#   36  n0   pointer; only the first word is read, it is multiplied by
#            tp[0] to derive the per-round reduction multiplier
#   40  num  word count, compared as a signed value
# Out:  %eax = 0 when num < 4 (nothing is read or written),
#       %eax = 1 otherwise, with the result stored to rp[0..num-1].
# Clobbers: %eax, %ecx, %edx, flags; %mm0-%mm7 on the SSE2 path (emms is
#       executed before that path joins the common tail).
#       %ebp, %ebx, %esi, %edi are saved and restored.
#
# Private frame (offsets from the new, 64-byte aligned %esp):
#    0  num-1                        (squaring path only)
#    4  rp
#    8  ap
#   12  bp; the integer multiply path advances it as "current bp word",
#       the squaring path reuses the slot for the outer index i
#   16  np
#   20  n0 value (*n0)
#   24  caller's %esp, restored on exit
#   28  &bp[num], end marker         (integer multiply path only)
#   32  tp[0 .. num+1], temporary accumulator
#-----------------------------------------------------------------------
.file	"x86-mont.s"
.text
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.L_bn_mul_mont_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax			# return value 0 for the early exit
	movl	40(%esp),%edi			# edi = num
	cmpl	$4,%edi
	jl	.L000just_leave			# num < 4: leave with eax = 0
	leal	20(%esp),%esi			# esi = address of the argument block
	leal	24(%esp),%edx			# edx = address of the 2nd argument slot
	movl	%esp,%ebp			# ebp = stack pointer to restore later
	addl	$2,%edi				# tp needs two words more than num
	negl	%edi
	leal	-32(%esp,%edi,4),%esp		# esp -= 32 + 4*(num+2)
	negl	%edi				# edi = num+2 again

	# Adjust the frame relative to the address in edx: first make both
	# equal modulo 2048, then force bit 11 to differ, then align to 64.
	# NOTE(review): presumably this keeps tp off the cache sets used by
	# the inputs -- the motivation is not stated in this file.
	movl	%esp,%eax
	subl	%edx,%eax
	andl	$2047,%eax
	subl	%eax,%esp			# esp == edx (mod 2048)
	xorl	%esp,%edx
	andl	$2048,%edx
	xorl	$2048,%edx
	subl	%edx,%esp			# make bit 11 of esp differ from edx
	andl	$-64,%esp			# 64-byte alignment

	# Touch one word in every 4096-byte step between the old and the new
	# stack pointer, highest address first.
	movl	%ebp,%eax
	subl	%esp,%eax
	andl	$-4096,%eax			# eax = frame span, multiple of 4096
.L001page_walk:
	movl	(%esp,%eax,1),%edx		# probe; the loaded value is unused
	subl	$4096,%eax
.byte	46					# 0x2E prefix for the jnc below (static
						# "not taken" hint per the Intel SDM --
						# TODO confirm intent)
	jnc	.L001page_walk			# until the subtraction borrows

	# Copy the arguments into the private frame.
	movl	(%esi),%eax			# rp
	movl	4(%esi),%ebx			# ap
	movl	8(%esi),%ecx			# bp
	movl	12(%esi),%edx			# np
	movl	16(%esi),%esi			# n0 pointer
	movl	(%esi),%esi			# n0 value
	movl	%eax,4(%esp)
	movl	%ebx,8(%esp)
	movl	%ecx,12(%esp)
	movl	%edx,16(%esp)
	movl	%esi,20(%esp)
	leal	-3(%edi),%ebx			# ebx = (num+2)-3 = num-1
	movl	%ebp,24(%esp)			# remember caller's esp
	leal	OPENSSL_ia32cap_P,%eax
	btl	$26,(%eax)			# bit 26 of the first capability word
	jnc	.L002non_sse2			# clear: use the integer-only code

	#---------------------------------------------------------------
	# SSE2 path (pmuludq on MMX registers).
	#   esi = ap, edi = bp, ebp = np, edx = i (outer), ecx = j (inner)
	#   mm4 = bp[i], mm5 = reduction multiplier m, mm7 = 0xffffffff
	#   mm2 = carry of the ap*bp[i] chain, mm3 = carry of the np*m chain
	#---------------------------------------------------------------
	movl	$-1,%eax
	movd	%eax,%mm7			# mask selecting the low 32 bits
	movl	8(%esp),%esi
	movl	12(%esp),%edi
	movl	16(%esp),%ebp
	xorl	%edx,%edx			# i = 0
	xorl	%ecx,%ecx			# j = 0
	movd	(%edi),%mm4			# bp[0]
	movd	(%esi),%mm5			# ap[0]
	movd	(%ebp),%mm3			# np[0]
	pmuludq	%mm4,%mm5			# ap[0]*bp[0]
	movq	%mm5,%mm2
	movq	%mm5,%mm0
	pand	%mm7,%mm0			# low word of the product
	pmuludq	20(%esp),%mm5			# m = low(product)*n0
	pmuludq	%mm5,%mm3			# np[0]*m
	paddq	%mm0,%mm3
	movd	4(%ebp),%mm1			# np[1]
	movd	4(%esi),%mm0			# ap[1]
	psrlq	$32,%mm2			# keep only the carries
	psrlq	$32,%mm3
	incl	%ecx				# j = 1
.align	16
.L0031st:					# first pass: tp = (ap*bp[0] + np*m) >> 32
	pmuludq	%mm4,%mm0			# ap[j]*bp[0]
	pmuludq	%mm5,%mm1			# np[j]*m
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	movd	4(%ebp,%ecx,4),%mm1		# np[j+1]
	paddq	%mm0,%mm3
	movd	4(%esi,%ecx,4),%mm0		# ap[j+1]
	psrlq	$32,%mm2
	movd	%mm3,28(%esp,%ecx,4)		# tp[j-1]
	psrlq	$32,%mm3
	leal	1(%ecx),%ecx			# j++
	cmpl	%ebx,%ecx
	jl	.L0031st			# while j < num-1
	pmuludq	%mm4,%mm0			# last word, j = num-1
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	paddq	%mm0,%mm3
	movd	%mm3,28(%esp,%ecx,4)		# tp[num-2]
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	paddq	%mm2,%mm3
	movq	%mm3,32(%esp,%ebx,4)		# tp[num-1] and tp[num] (64-bit store)
	incl	%edx				# i = 1
.L004outer:					# tp = (tp + ap*bp[i] + np*m) >> 32
	xorl	%ecx,%ecx			# j = 0
	movd	(%edi,%edx,4),%mm4		# bp[i]
	movd	(%esi),%mm5			# ap[0]
	movd	32(%esp),%mm6			# tp[0]
	movd	(%ebp),%mm3			# np[0]
	pmuludq	%mm4,%mm5			# ap[0]*bp[i]
	paddq	%mm6,%mm5			# + tp[0]
	movq	%mm5,%mm0
	movq	%mm5,%mm2
	pand	%mm7,%mm0
	pmuludq	20(%esp),%mm5			# m = low(sum)*n0
	pmuludq	%mm5,%mm3			# np[0]*m
	paddq	%mm0,%mm3
	movd	36(%esp),%mm6			# tp[1]
	movd	4(%ebp),%mm1			# np[1]
	movd	4(%esi),%mm0			# ap[1]
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	paddq	%mm6,%mm2
	incl	%ecx				# j = 1
	decl	%ebx				# ebx = num-2, inner iteration count
.L005inner:
	pmuludq	%mm4,%mm0			# ap[j]*bp[i]
	pmuludq	%mm5,%mm1			# np[j]*m
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	movd	36(%esp,%ecx,4),%mm6		# tp[j+1]
	pand	%mm7,%mm0
	movd	4(%ebp,%ecx,4),%mm1		# np[j+1]
	paddq	%mm0,%mm3
	movd	4(%esi,%ecx,4),%mm0		# ap[j+1]
	psrlq	$32,%mm2
	movd	%mm3,28(%esp,%ecx,4)		# tp[j-1]
	psrlq	$32,%mm3
	paddq	%mm6,%mm2
	decl	%ebx				# sets ZF for the jnz below ...
	leal	1(%ecx),%ecx			# ... lea leaves the flags alone
	jnz	.L005inner
	movl	%ecx,%ebx			# ecx is num-1 here: restore ebx
	pmuludq	%mm4,%mm0			# last word
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	paddq	%mm0,%mm3
	movd	%mm3,28(%esp,%ecx,4)		# tp[num-2]
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	movd	36(%esp,%ebx,4),%mm6		# previous tp[num]
	paddq	%mm2,%mm3
	paddq	%mm6,%mm3
	movq	%mm3,32(%esp,%ebx,4)		# tp[num-1] and tp[num]
	leal	1(%edx),%edx			# i++
	cmpl	%ebx,%edx
	jle	.L004outer			# while i <= num-1
	emms					# leave MMX state before returning
	jmp	.L006common_tail

	#---------------------------------------------------------------
	# Integer-only path.  Takes the squaring shortcut when ap == bp
	# and num is even, otherwise the generic multiply.
	#---------------------------------------------------------------
.align	16
.L002non_sse2:
	movl	8(%esp),%esi			# ap
	leal	1(%ebx),%ebp			# ebp = num
	movl	12(%esp),%edi			# bp
	xorl	%ecx,%ecx			# j = 0
	movl	%esi,%edx
	andl	$1,%ebp				# num & 1
	subl	%edi,%edx			# ap - bp
	leal	4(%edi,%ebx,4),%eax		# eax = &bp[num]
	orl	%edx,%ebp			# ZF <=> ap == bp and num even
	movl	(%edi),%edi			# bp[0]; mov keeps the flags
	jz	.L007bn_sqr_mont
	movl	%eax,28(%esp)			# end-of-bp marker
	movl	(%esi),%eax			# ap[0]
	xorl	%edx,%edx			# carry = 0
.align	16
.L008mull:					# tp = ap * bp[0]
	movl	%edx,%ebp			# carry in
	mull	%edi				# edx:eax = ap[j]*bp[0]
	addl	%eax,%ebp
	leal	1(%ecx),%ecx			# j++
	adcl	$0,%edx
	movl	(%esi,%ecx,4),%eax		# next ap word
	cmpl	%ebx,%ecx
	movl	%ebp,28(%esp,%ecx,4)		# tp[j-1] (j already advanced)
	jl	.L008mull			# while j < num-1
	movl	%edx,%ebp
	mull	%edi				# ap[num-1]*bp[0]
	movl	20(%esp),%edi			# n0
	addl	%ebp,%eax
	movl	16(%esp),%esi			# esi = np from here on
	adcl	$0,%edx
	imull	32(%esp),%edi			# m = n0*tp[0]
	movl	%eax,32(%esp,%ebx,4)		# tp[num-1]
	xorl	%ecx,%ecx
	movl	%edx,36(%esp,%ebx,4)		# tp[num]
	movl	%ecx,40(%esp,%ebx,4)		# tp[num+1] = 0
	movl	(%esi),%eax			# np[0]
	mull	%edi				# np[0]*m
	addl	32(%esp),%eax			# only the carry out is needed
	movl	4(%esi),%eax			# np[1]
	adcl	$0,%edx
	incl	%ecx				# j = 1
	jmp	.L0092ndmadd
.align	16
.L0101stmadd:					# tp += ap * bp[i]
	movl	%edx,%ebp			# carry in
	mull	%edi				# ap[j]*bp[i]
	addl	32(%esp,%ecx,4),%ebp		# + tp[j]
	leal	1(%ecx),%ecx			# j++
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax		# next ap word
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,28(%esp,%ecx,4)		# tp[j-1]
	jl	.L0101stmadd
	movl	%edx,%ebp
	mull	%edi				# ap[num-1]*bp[i]
	addl	32(%esp,%ebx,4),%eax		# + tp[num-1]
	movl	20(%esp),%edi			# n0
	adcl	$0,%edx
	movl	16(%esp),%esi			# np
	addl	%eax,%ebp
	adcl	$0,%edx
	imull	32(%esp),%edi			# m = n0*tp[0]
	xorl	%ecx,%ecx
	addl	36(%esp,%ebx,4),%edx		# + tp[num]
	movl	%ebp,32(%esp,%ebx,4)		# tp[num-1]
	adcl	$0,%ecx				# carry out of the top word
	movl	(%esi),%eax			# np[0]
	movl	%edx,36(%esp,%ebx,4)		# tp[num]
	movl	%ecx,40(%esp,%ebx,4)		# tp[num+1]
	mull	%edi				# np[0]*m
	addl	32(%esp),%eax			# only the carry out is needed
	movl	4(%esi),%eax			# np[1]
	adcl	$0,%edx
	movl	$1,%ecx				# j = 1
.align	16
.L0092ndmadd:					# tp = (tp + np*m) >> 32
	movl	%edx,%ebp			# carry in
	mull	%edi				# np[j]*m
	addl	32(%esp,%ecx,4),%ebp		# + tp[j]
	leal	1(%ecx),%ecx			# j++
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax		# next np word
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,24(%esp,%ecx,4)		# stored one word lower: tp[j-2]
	jl	.L0092ndmadd
	movl	%edx,%ebp
	mull	%edi				# np[num-1]*m
	addl	32(%esp,%ebx,4),%ebp		# + tp[num-1]
	adcl	$0,%edx
	addl	%eax,%ebp
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ebx,4)		# tp[num-2]
	xorl	%eax,%eax
	movl	12(%esp),%ecx			# current bp pointer
	addl	36(%esp,%ebx,4),%edx		# + tp[num]
	adcl	40(%esp,%ebx,4),%eax		# + tp[num+1] + carry
	leal	4(%ecx),%ecx			# advance to the next bp word
	movl	%edx,32(%esp,%ebx,4)		# tp[num-1]
	cmpl	28(%esp),%ecx			# reached &bp[num]?
	movl	%eax,36(%esp,%ebx,4)		# tp[num]
	je	.L006common_tail
	movl	(%ecx),%edi			# bp[i]
	movl	8(%esp),%esi			# esi = ap again
	movl	%ecx,12(%esp)			# save the bp pointer
	xorl	%ecx,%ecx			# j = 0
	xorl	%edx,%edx			# carry = 0
	movl	(%esi),%eax			# ap[0]
	jmp	.L0101stmadd

	#---------------------------------------------------------------
	# Squaring shortcut (ap == bp, num even).  Cross products are
	# doubled on the fly; ebx carries the bit shifted out of the
	# previous doubled word.  12(%esp) holds the outer index i and
	# 0(%esp) holds num-1.
	#---------------------------------------------------------------
.align	16
.L007bn_sqr_mont:
	movl	%ebx,(%esp)			# num-1
	movl	%ecx,12(%esp)			# i = 0
	movl	%edi,%eax			# ap[0] (edi was loaded from bp == ap)
	mull	%edi				# ap[0]^2
	movl	%eax,32(%esp)			# tp[0]
	movl	%edx,%ebx
	shrl	$1,%edx
	andl	$1,%ebx				# bit 0 of the high word, kept for doubling
	incl	%ecx				# j = 1
.align	16
.L011sqr:
	movl	(%esi,%ecx,4),%eax		# ap[j]
	movl	%edx,%ebp
	mull	%edi				# ap[j]*ap[0]
	addl	%ebp,%eax
	leal	1(%ecx),%ecx			# j++
	adcl	$0,%edx
	leal	(%ebx,%eax,2),%ebp		# 2*low + bit carried from below
	shrl	$31,%eax			# bit shifted out by the doubling
	cmpl	(%esp),%ecx
	movl	%eax,%ebx
	movl	%ebp,28(%esp,%ecx,4)		# tp[j-1]
	jl	.L011sqr			# while j < num-1
	movl	(%esi,%ecx,4),%eax		# ap[num-1]
	movl	%edx,%ebp
	mull	%edi				# ap[num-1]*ap[0]
	addl	%ebp,%eax
	movl	20(%esp),%edi			# n0
	adcl	$0,%edx
	movl	16(%esp),%esi			# np
	leal	(%ebx,%eax,2),%ebp
	imull	32(%esp),%edi			# m = n0*tp[0]
	shrl	$31,%eax
	movl	%ebp,32(%esp,%ecx,4)		# tp[num-1]
	leal	(%eax,%edx,2),%ebp
	movl	(%esi),%eax			# np[0]
	shrl	$31,%edx
	movl	%ebp,36(%esp,%ecx,4)		# tp[num]
	movl	%edx,40(%esp,%ecx,4)		# tp[num+1]
	mull	%edi				# np[0]*m
	addl	32(%esp),%eax			# only the carry out is needed
	movl	%ecx,%ebx			# ebx = num-1
	adcl	$0,%edx
	movl	4(%esi),%eax			# np[1]
	movl	$1,%ecx				# j = 1
.align	16
.L0123rdmadd:					# tp = (tp + np*m) >> 32, two words per pass
	movl	%edx,%ebp
	mull	%edi				# np[j]*m
	addl	32(%esp,%ecx,4),%ebp		# + tp[j]
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	4(%esi,%ecx,4),%eax		# np[j+1]
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ecx,4)		# tp[j-1]
	movl	%edx,%ebp
	mull	%edi				# np[j+1]*m
	addl	36(%esp,%ecx,4),%ebp		# + tp[j+1]
	leal	2(%ecx),%ecx			# j += 2
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax		# next np word
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,24(%esp,%ecx,4)		# tp[j-2] with the advanced j
	jl	.L0123rdmadd
	movl	%edx,%ebp
	mull	%edi				# np[num-1]*m
	addl	32(%esp,%ebx,4),%ebp		# + tp[num-1]
	adcl	$0,%edx
	addl	%eax,%ebp
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ebx,4)		# tp[num-2]
	movl	12(%esp),%ecx			# i
	xorl	%eax,%eax
	movl	8(%esp),%esi			# ap
	addl	36(%esp,%ebx,4),%edx		# + tp[num]
	adcl	40(%esp,%ebx,4),%eax		# + tp[num+1] + carry
	movl	%edx,32(%esp,%ebx,4)		# tp[num-1]
	cmpl	%ebx,%ecx			# i == num-1 ?
	movl	%eax,36(%esp,%ebx,4)		# tp[num]
	je	.L006common_tail
	movl	4(%esi,%ecx,4),%edi		# ap[i+1]
	leal	1(%ecx),%ecx			# i++
	movl	%edi,%eax
	movl	%ecx,12(%esp)			# save i
	mull	%edi				# ap[i]^2
	addl	32(%esp,%ecx,4),%eax		# + tp[i]
	adcl	$0,%edx
	movl	%eax,32(%esp,%ecx,4)		# tp[i]
	xorl	%ebp,%ebp
	cmpl	%ebx,%ecx			# was that the last word?
	leal	1(%ecx),%ecx			# j = i+1; lea keeps the flags
	je	.L013sqrlast
	movl	%edx,%ebx
	shrl	$1,%edx
	andl	$1,%ebx				# bit 0 of the high word, kept for doubling
.align	16
.L014sqradd:					# tp[j] += 2*ap[j]*ap[i]
	movl	(%esi,%ecx,4),%eax		# ap[j]
	movl	%edx,%ebp
	mull	%edi				# ap[j]*ap[i]
	addl	%ebp,%eax
	leal	(%eax,%eax,1),%ebp		# doubled low word
	adcl	$0,%edx
	shrl	$31,%eax			# bit shifted out by the doubling
	addl	32(%esp,%ecx,4),%ebp		# + tp[j]
	leal	1(%ecx),%ecx			# j++
	adcl	$0,%eax
	addl	%ebx,%ebp			# + bit carried from the previous word
	adcl	$0,%eax
	cmpl	(%esp),%ecx
	movl	%ebp,28(%esp,%ecx,4)		# tp[j-1]
	movl	%eax,%ebx
	jle	.L014sqradd			# while j <= num-1
	movl	%edx,%ebp
	addl	%edx,%edx			# double the final high word
	shrl	$31,%ebp
	addl	%ebx,%edx
	adcl	$0,%ebp
.L013sqrlast:
	movl	20(%esp),%edi			# n0
	movl	16(%esp),%esi			# np
	imull	32(%esp),%edi			# m = n0*tp[0]
	addl	32(%esp,%ecx,4),%edx		# + tp[num]
	movl	(%esi),%eax			# np[0]
	adcl	$0,%ebp
	movl	%edx,32(%esp,%ecx,4)		# tp[num]
	movl	%ebp,36(%esp,%ecx,4)		# tp[num+1]
	mull	%edi				# np[0]*m
	addl	32(%esp),%eax			# only the carry out is needed
	leal	-1(%ecx),%ebx			# ebx = num-1
	adcl	$0,%edx
	movl	$1,%ecx				# j = 1
	movl	4(%esi),%eax			# np[1]
	jmp	.L0123rdmadd

	#---------------------------------------------------------------
	# Common tail: rp = tp - np, then keep either that difference or
	# tp itself depending on the final borrow, using a mask instead of
	# a branch.  tp is overwritten while it is copied out.
	# Entry: ebx = num-1.
	#---------------------------------------------------------------
.align	16
.L006common_tail:
	movl	16(%esp),%ebp			# np
	movl	4(%esp),%edi			# rp
	leal	32(%esp),%esi			# tp
	movl	(%esi),%eax			# tp[0]
	movl	%ebx,%ecx			# countdown from num-1
	xorl	%edx,%edx			# i = 0, also clears CF for sbbl
.align	16
.L015sub:
	sbbl	(%ebp,%edx,4),%eax		# tp[i] - np[i] - borrow
	movl	%eax,(%edi,%edx,4)		# rp[i]
	decl	%ecx				# dec leaves CF (the borrow) intact
	movl	4(%esi,%edx,4),%eax		# tp[i+1]
	leal	1(%edx),%edx			# i++ without touching the flags
	jge	.L015sub			# until ecx has gone negative
	sbbl	$0,%eax				# tp[num] - borrow: 0 or all ones
	andl	%eax,%esi			# tp if a borrow remained, else 0
	notl	%eax
	movl	%edi,%ebp
	andl	%eax,%ebp			# rp if no borrow remained, else 0
	orl	%ebp,%esi			# esi = selected source vector
.align	16
.L016copy:
	movl	(%esi,%ebx,4),%eax
	movl	%eax,(%edi,%ebx,4)		# rp[i] = source[i]
	movl	%ecx,32(%esp,%ebx,4)		# overwrite tp[i] (ecx is -1 here)
	decl	%ebx
	jge	.L016copy
	movl	24(%esp),%esp			# back to the caller's stack
	movl	$1,%eax				# success
.L000just_leave:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	bn_mul_mont,.-.L_bn_mul_mont_begin
# NUL-terminated ident string:
# "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte	111,114,103,62,0
# Capability vector tested above; common symbol, 16 bytes, 4-byte aligned.
.comm	OPENSSL_ia32cap_P,16,4