# $FreeBSD: releng/10.3/secure/lib/libcrypto/i386/x86-mont.s 238405 2012-07-12 19:30:53Z jkim $
#
# Syntax: GNU as, AT&T operand order, 32-bit x86.
#
# bn_mul_mont -- word-serial Montgomery multiplication on 32-bit limbs
# (see the ident string emitted after the function).
#
# Stack arguments, as the code uses them (the C prototype is not part of
# this file; the names rp/ap/bp/np/n0/num below are role names chosen for
# these comments -- confirm against the OpenSSL bn headers):
#   arg A (rp)  : destination vector, written in the common tail
#   arg B (ap)  : first multiplicand vector
#   arg C (bp)  : second multiplicand vector, consumed one word per pass
#   arg D (np)  : modulus vector, multiplied by m and subtracted at the end
#   arg E (n0)  : pointer; only the first word behind it is read
#   arg F (num) : number of 32-bit words in each vector
#
# Returns in %eax: zero when num is below four (nothing is computed and
# rp is not touched), one otherwise.
# %ebp, %ebx, %esi and %edi are pushed on entry and popped before ret.
#
# Private frame after the prologue (t[k] is the word at 32+4*k(%esp)):
#   esp+0   num-1                       (squaring path only)
#   esp+4   rp
#   esp+8   ap
#   esp+12  bp cursor; the squaring path keeps the outer index i here
#   esp+16  np
#   esp+20  the word loaded through n0
#   esp+24  caller's %esp, restored in the tail
#   esp+28  address of bp[num]          (generic integer path only)
#   esp+32  t[0] .. t[num+1], scratch accumulator
#
.file	"x86-mont.s"
.text
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.L_bn_mul_mont_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax			# return value if we bail out
	movl	40(%esp),%edi			# %edi = num
	cmpl	$4,%edi
	jl	.L000just_leave			# signed num < four: return zero
	leal	20(%esp),%esi			# %esi = address of the first argument
	leal	24(%esp),%edx			# %edx = address of the second argument
	movl	%esp,%ebp			# remember caller's %esp
	addl	$2,%edi
	negl	%edi
	leal	-32(%esp,%edi,4),%esp		# reserve 32 + 4*(num+2) bytes
	negl	%edi				# %edi = num+2 again
	# Frame placement: make %esp congruent to %edx modulo 2048, then
	# make bit 11 of %esp differ from bit 11 of that address, then align
	# down to 64 bytes.
	# NOTE(review): presumably arranged to limit cache aliasing between
	# the scratch frame and the input vectors -- confirm against the
	# upstream perlasm source.
	movl	%esp,%eax
	subl	%edx,%eax
	andl	$2047,%eax
	subl	%eax,%esp
	xorl	%esp,%edx
	andl	$2048,%edx
	xorl	$2048,%edx
	subl	%edx,%esp
	andl	$-64,%esp
	# Copy the arguments into the new frame.
	movl	(%esi),%eax			# rp
	movl	4(%esi),%ebx			# ap
	movl	8(%esi),%ecx			# bp
	movl	12(%esi),%edx			# np
	movl	16(%esi),%esi			# n0 pointer
	movl	(%esi),%esi			# first word behind n0
	movl	%eax,4(%esp)
	movl	%ebx,8(%esp)
	movl	%ecx,12(%esp)
	movl	%edx,16(%esp)
	movl	%esi,20(%esp)
	leal	-3(%edi),%ebx			# %ebx = num-1
	movl	%ebp,24(%esp)			# saved caller %esp
	# Dispatch on bit 26 of the first word of OPENSSL_ia32cap_P.
	leal	OPENSSL_ia32cap_P,%eax
	btl	$26,(%eax)
	jnc	.L001non_sse2

	# ---- pmuludq path (MMX registers, 32x32->64 multiplies) ----------
	# %esi = ap, %edi = bp, %ebp = np, %edx = outer index i,
	# %ecx = inner index j, %mm7 = mask selecting the low 32 bits,
	# %mm4 = bp[i], %mm5 = m, %mm2 / %mm3 = the two running carries.
	movl	$-1,%eax
	movd	%eax,%mm7
	movl	8(%esp),%esi
	movl	12(%esp),%edi
	movl	16(%esp),%ebp
	xorl	%edx,%edx
	xorl	%ecx,%ecx
	movd	(%edi),%mm4			# bp[0]
	movd	(%esi),%mm5			# ap[0]
	movd	(%ebp),%mm3			# np[0]
	pmuludq	%mm4,%mm5			# ap[0]*bp[0]
	movq	%mm5,%mm2
	movq	%mm5,%mm0
	pand	%mm7,%mm0			# low word of the product
	pmuludq	20(%esp),%mm5			# m = low word * n0 word
	pmuludq	%mm5,%mm3			# np[0]*m
	paddq	%mm0,%mm3
	movd	4(%ebp),%mm1			# preload np[1]
	movd	4(%esi),%mm0			# preload ap[1]
	psrlq	$32,%mm2			# carry of the ap*bp chain
	psrlq	$32,%mm3			# carry of the np*m chain
	incl	%ecx
.align	16
.L0021st:
	# First pass, j = 1 .. num-2: t[j-1] = low(ap[j]*bp[0] + np[j]*m + carries)
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	movd	4(%ebp,%ecx,4),%mm1		# preload np[j+1]
	paddq	%mm0,%mm3
	movd	4(%esi,%ecx,4),%mm0		# preload ap[j+1]
	psrlq	$32,%mm2
	movd	%mm3,28(%esp,%ecx,4)		# t[j-1]
	psrlq	$32,%mm3
	leal	1(%ecx),%ecx
	cmpl	%ebx,%ecx
	jl	.L0021st
	# Last word of the first pass (j = num-1) and the two top words.
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	paddq	%mm0,%mm3
	movd	%mm3,28(%esp,%ecx,4)		# t[num-2]
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	paddq	%mm2,%mm3
	movq	%mm3,32(%esp,%ebx,4)		# t[num-1] and t[num] in one store
	incl	%edx				# i = 1
.L003outer:
	# Pass i: t = (t + ap*bp[i] + np*m) shifted down by one word.
	xorl	%ecx,%ecx
	movd	(%edi,%edx,4),%mm4		# bp[i]
	movd	(%esi),%mm5			# ap[0]
	movd	32(%esp),%mm6			# t[0]
	movd	(%ebp),%mm3			# np[0]
	pmuludq	%mm4,%mm5
	paddq	%mm6,%mm5			# ap[0]*bp[i] + t[0]
	movq	%mm5,%mm0
	movq	%mm5,%mm2
	pand	%mm7,%mm0
	pmuludq	20(%esp),%mm5			# m for this pass
	pmuludq	%mm5,%mm3
	paddq	%mm0,%mm3
	movd	36(%esp),%mm6			# t[1]
	movd	4(%ebp),%mm1
	movd	4(%esi),%mm0
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	paddq	%mm6,%mm2
	incl	%ecx
	decl	%ebx				# %ebx doubles as the inner countdown
.L004inner:
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	movd	36(%esp,%ecx,4),%mm6		# t[j+1]
	pand	%mm7,%mm0
	movd	4(%ebp,%ecx,4),%mm1
	paddq	%mm0,%mm3
	movd	4(%esi,%ecx,4),%mm0
	psrlq	$32,%mm2
	movd	%mm3,28(%esp,%ecx,4)		# t[j-1]
	psrlq	$32,%mm3
	paddq	%mm6,%mm2
	decl	%ebx
	leal	1(%ecx),%ecx			# lea keeps the flags set by decl
	jnz	.L004inner
	movl	%ecx,%ebx			# %ebx = num-1 again
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	paddq	%mm0,%mm3
	movd	%mm3,28(%esp,%ecx,4)		# t[num-2]
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	movd	36(%esp,%ebx,4),%mm6		# old t[num]
	paddq	%mm2,%mm3
	paddq	%mm6,%mm3
	movq	%mm3,32(%esp,%ebx,4)		# new t[num-1] and t[num]
	leal	1(%edx),%edx
	cmpl	%ebx,%edx
	jle	.L003outer			# repeat while i <= num-1
	emms					# leave MMX state before returning
	jmp	.L005common_tail

	# ---- integer-only path -------------------------------------------
.align	16
.L001non_sse2:
	movl	8(%esp),%esi			# ap
	leal	1(%ebx),%ebp			# num
	movl	12(%esp),%edi			# bp
	xorl	%ecx,%ecx
	movl	%esi,%edx
	andl	$1,%ebp				# num & 1
	subl	%edi,%edx			# ap - bp
	leal	4(%edi,%ebx,4),%eax		# address of bp[num]
	orl	%edx,%ebp			# zero iff ap == bp and num is even
	movl	(%edi),%edi			# bp[0]; mov leaves the flags alone
	jz	.L006bn_sqr_mont
	movl	%eax,28(%esp)			# end marker for the bp cursor
	movl	(%esi),%eax
	xorl	%edx,%edx
.align	16
.L007mull:
	# t[j] = low(ap[j]*bp[0] + carry), carry kept in %edx
	movl	%edx,%ebp
	mull	%edi
	addl	%eax,%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	movl	(%esi,%ecx,4),%eax
	cmpl	%ebx,%ecx
	movl	%ebp,28(%esp,%ecx,4)
	jl	.L007mull
	movl	%edx,%ebp
	mull	%edi
	movl	20(%esp),%edi
	addl	%ebp,%eax
	movl	16(%esp),%esi			# switch to np
	adcl	$0,%edx
	imull	32(%esp),%edi			# m = n0 word * t[0]
	movl	%eax,32(%esp,%ebx,4)		# t[num-1]
	xorl	%ecx,%ecx
	movl	%edx,36(%esp,%ebx,4)		# t[num]
	movl	%ecx,40(%esp,%ebx,4)		# t[num+1] = 0
	movl	(%esi),%eax
	mull	%edi
	addl	32(%esp),%eax			# only the carry out is used
	movl	4(%esi),%eax
	adcl	$0,%edx
	incl	%ecx
	jmp	.L0082ndmadd
.align	16
.L0091stmadd:
	# t[j] = low(t[j] + ap[j]*bp[i] + carry)
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ecx,4),%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,28(%esp,%ecx,4)
	jl	.L0091stmadd
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ebx,4),%eax
	movl	20(%esp),%edi
	adcl	$0,%edx
	movl	16(%esp),%esi			# switch to np
	addl	%eax,%ebp
	adcl	$0,%edx
	imull	32(%esp),%edi			# m = n0 word * t[0]
	xorl	%ecx,%ecx
	addl	36(%esp,%ebx,4),%edx
	movl	%ebp,32(%esp,%ebx,4)
	adcl	$0,%ecx
	movl	(%esi),%eax
	movl	%edx,36(%esp,%ebx,4)
	movl	%ecx,40(%esp,%ebx,4)
	mull	%edi
	addl	32(%esp),%eax			# only the carry out is used
	movl	4(%esi),%eax
	adcl	$0,%edx
	movl	$1,%ecx
.align	16
.L0082ndmadd:
	# t[j-1] = low(t[j] + np[j]*m + carry): add and shift down one word
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ecx,4),%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,24(%esp,%ecx,4)
	jl	.L0082ndmadd
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ebx,4),%ebp
	adcl	$0,%edx
	addl	%eax,%ebp
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ebx,4)		# t[num-2]
	xorl	%eax,%eax
	movl	12(%esp),%ecx			# bp cursor
	addl	36(%esp,%ebx,4),%edx
	adcl	40(%esp,%ebx,4),%eax
	leal	4(%ecx),%ecx			# advance to the next bp word
	movl	%edx,32(%esp,%ebx,4)		# t[num-1]
	cmpl	28(%esp),%ecx			# reached bp[num]?
	movl	%eax,36(%esp,%ebx,4)		# t[num]
	je	.L005common_tail
	movl	(%ecx),%edi			# next bp word
	movl	8(%esp),%esi			# back to ap
	movl	%ecx,12(%esp)
	xorl	%ecx,%ecx
	xorl	%edx,%edx
	movl	(%esi),%eax
	jmp	.L0091stmadd

	# ---- squaring path: taken when ap == bp and num is even ----------
	# Cross products are doubled on the way out: lea forms
	# %ebx + 2*%eax and shr by 31 keeps the bit pushed out by the
	# doubling in %ebx for the next word.
.align	16
.L006bn_sqr_mont:
	movl	%ebx,(%esp)			# keep num-1 in the frame
	movl	%ecx,12(%esp)			# outer index i = 0
	movl	%edi,%eax
	mull	%edi				# ap[0] squared
	movl	%eax,32(%esp)			# t[0]
	movl	%edx,%ebx
	shrl	$1,%edx
	andl	$1,%ebx
	incl	%ecx
.align	16
.L010sqr:
	movl	(%esi,%ecx,4),%eax
	movl	%edx,%ebp
	mull	%edi
	addl	%ebp,%eax
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	leal	(%ebx,%eax,2),%ebp
	shrl	$31,%eax
	cmpl	(%esp),%ecx
	movl	%eax,%ebx
	movl	%ebp,28(%esp,%ecx,4)
	jl	.L010sqr
	movl	(%esi,%ecx,4),%eax
	movl	%edx,%ebp
	mull	%edi
	addl	%ebp,%eax
	movl	20(%esp),%edi
	adcl	$0,%edx
	movl	16(%esp),%esi			# switch to np
	leal	(%ebx,%eax,2),%ebp
	imull	32(%esp),%edi			# m = n0 word * t[0]
	shrl	$31,%eax
	movl	%ebp,32(%esp,%ecx,4)
	leal	(%eax,%edx,2),%ebp
	movl	(%esi),%eax
	shrl	$31,%edx
	movl	%ebp,36(%esp,%ecx,4)
	movl	%edx,40(%esp,%ecx,4)
	mull	%edi
	addl	32(%esp),%eax			# only the carry out is used
	movl	%ecx,%ebx			# %ebx = num-1
	adcl	$0,%edx
	movl	4(%esi),%eax
	movl	$1,%ecx
.align	16
.L0113rdmadd:
	# Same reduction step as .L0082ndmadd, two words per iteration.
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ecx,4),%ebp
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	4(%esi,%ecx,4),%eax
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ecx,4)
	movl	%edx,%ebp
	mull	%edi
	addl	36(%esp,%ecx,4),%ebp
	leal	2(%ecx),%ecx
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,24(%esp,%ecx,4)
	jl	.L0113rdmadd
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ebx,4),%ebp
	adcl	$0,%edx
	addl	%eax,%ebp
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ebx,4)
	movl	12(%esp),%ecx			# outer index i
	xorl	%eax,%eax
	movl	8(%esp),%esi			# back to ap
	addl	36(%esp,%ebx,4),%edx
	adcl	40(%esp,%ebx,4),%eax
	movl	%edx,32(%esp,%ebx,4)
	cmpl	%ebx,%ecx			# i == num-1: all passes done
	movl	%eax,36(%esp,%ebx,4)
	je	.L005common_tail
	movl	4(%esi,%ecx,4),%edi		# ap[i+1]
	leal	1(%ecx),%ecx
	movl	%edi,%eax
	movl	%ecx,12(%esp)			# i = i+1
	mull	%edi				# diagonal term, ap[i] squared
	addl	32(%esp,%ecx,4),%eax
	adcl	$0,%edx
	movl	%eax,32(%esp,%ecx,4)
	xorl	%ebp,%ebp
	cmpl	%ebx,%ecx
	leal	1(%ecx),%ecx			# lea keeps the flags set by cmpl
	je	.L012sqrlast			# no cross products left on the last pass
	movl	%edx,%ebx
	shrl	$1,%edx
	andl	$1,%ebx
.align	16
.L013sqradd:
	# t[j] += doubled ap[j]*ap[i], for j above i
	movl	(%esi,%ecx,4),%eax
	movl	%edx,%ebp
	mull	%edi
	addl	%ebp,%eax
	leal	(%eax,%eax,1),%ebp
	adcl	$0,%edx
	shrl	$31,%eax
	addl	32(%esp,%ecx,4),%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%eax
	addl	%ebx,%ebp
	adcl	$0,%eax
	cmpl	(%esp),%ecx
	movl	%ebp,28(%esp,%ecx,4)
	movl	%eax,%ebx
	jle	.L013sqradd
	movl	%edx,%ebp
	addl	%edx,%edx
	shrl	$31,%ebp
	addl	%ebx,%edx
	adcl	$0,%ebp
.L012sqrlast:
	movl	20(%esp),%edi
	movl	16(%esp),%esi			# switch to np
	imull	32(%esp),%edi			# m = n0 word * t[0]
	addl	32(%esp,%ecx,4),%edx
	movl	(%esi),%eax
	adcl	$0,%ebp
	movl	%edx,32(%esp,%ecx,4)
	movl	%ebp,36(%esp,%ecx,4)
	mull	%edi
	addl	32(%esp),%eax			# only the carry out is used
	leal	-1(%ecx),%ebx			# %ebx = num-1
	adcl	$0,%edx
	movl	$1,%ecx
	movl	4(%esi),%eax
	jmp	.L0113rdmadd

	# ---- final subtraction and copy-out; expects %ebx = num-1 --------
.align	16
.L005common_tail:
	movl	16(%esp),%ebp			# np
	movl	4(%esp),%edi			# rp
	leal	32(%esp),%esi			# address of t[0]
	movl	(%esi),%eax
	movl	%ebx,%ecx
	xorl	%edx,%edx			# j = 0, and clears the carry flag
.align	16
.L014sub:
	# rp[j] = t[j] - np[j] - borrow; decl and lea leave the carry intact
	sbbl	(%ebp,%edx,4),%eax
	movl	%eax,(%edi,%edx,4)
	decl	%ecx
	movl	4(%esi,%edx,4),%eax
	leal	1(%edx),%edx
	jge	.L014sub
	# %eax = t[num] - borrow, used as a mask: select t when it is
	# nonzero and rp (which already holds t - np) when it is zero.
	# The selection is done with and/or instead of a branch.
	sbbl	$0,%eax
	andl	%eax,%esi
	notl	%eax
	movl	%edi,%ebp
	andl	%eax,%ebp
	orl	%ebp,%esi
.align	16
.L015copy:
	# Copy the selected vector to rp from the top word down, and
	# overwrite each scratch word t[j] with %ecx (minus one after the
	# subtract loop).
	movl	(%esi,%ebx,4),%eax
	movl	%eax,(%edi,%ebx,4)
	movl	%ecx,32(%esp,%ebx,4)
	decl	%ebx
	jge	.L015copy
	movl	24(%esp),%esp			# back to the caller's stack
	movl	$1,%eax
.L000just_leave:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	bn_mul_mont,.-.L_bn_mul_mont_begin
# Ident string: "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte	111,114,103,62,0
.comm	OPENSSL_ia32cap_P,8,4