# x86-mont.s revision 285830
# $FreeBSD: releng/10.2/secure/lib/libcrypto/i386/x86-mont.s 238405 2012-07-12 19:30:53Z jkim $
#
# Syntax: GNU as, AT&T operand order (source, destination).
# Target: 32-bit x86, arguments passed on the stack (cdecl style).
#
#-----------------------------------------------------------------------
# bn_mul_mont -- word-serial Montgomery multiplication.
#
# Takes six stack arguments.  The names below follow the usual OpenSSL
# prototype for this symbol (NOTE(review): confirm against the C header):
#
#   arg0  rp    result vector, num 32-bit words (written)
#   arg1  ap    first operand, num words
#   arg2  bp    second operand, num words
#   arg3  np    modulus, num words
#   arg4  n0    pointer; only the first 32-bit word it points to is read
#   arg5  num   number of 32-bit words
#
# Returns (in %eax):
#   zero   if num < 4; nothing is read or written in that case
#   one    otherwise, after rp[0..num-1] has been stored
#
# Saves and restores %ebp, %ebx, %esi, %edi; the SSE2 path executes emms
# before leaving.  The path taken depends on bit 26 of the first word of
# OPENSSL_ia32cap_P (set -> MMX/SSE2 code, clear -> integer-only code).
#
# Private frame, relative to the realigned %esp:
#
#   esp+0    squaring path only: saved num-1
#   esp+4    arg0 (rp)
#   esp+8    arg1 (ap)
#   esp+12   arg2 (bp); the integer paths reuse this slot as the current
#            bp pointer (multiply) or the outer index (squaring)
#   esp+16   arg3 (np)
#   esp+20   the word loaded from arg4 (n0)
#   esp+24   stack pointer to restore on exit
#   esp+28   integer multiply path only: address of bp[num] (end marker)
#   esp+32   tp[], the running accumulator (num+2 words are reserved)
#-----------------------------------------------------------------------
.file	"x86-mont.s"
.text
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.L_bn_mul_mont_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax		# return value for the early exit below
	movl	40(%esp),%edi		# edi = num (arg5; four pushes + return address)
	cmpl	$4,%edi
	jl	.L000just_leave		# num < 4: return 0 without touching memory
	leal	20(%esp),%esi		# esi = address of arg0 (argument block)
	leal	24(%esp),%edx		# edx = address of arg1, reference for frame placement
	movl	%esp,%ebp		# ebp = stack pointer to restore at exit
	addl	$2,%edi			# edi = num+2
	negl	%edi
	leal	-32(%esp,%edi,4),%esp	# reserve 32 bytes + (num+2) words
	negl	%edi			# edi = num+2 again
	# Slide the frame down relative to the argument block, then align it.
	# NOTE(review): presumably done to limit cache/page aliasing between
	# the frame and the caller's data -- confirm against upstream.
	movl	%esp,%eax
	subl	%edx,%eax
	andl	$2047,%eax
	subl	%eax,%esp		# esp -= (esp - &arg1) mod 2048
	xorl	%esp,%edx
	andl	$2048,%edx
	xorl	$2048,%edx
	subl	%edx,%esp		# esp -= 2048 when bit 11 of esp and &arg1 agree
	andl	$-64,%esp		# round down to a 64-byte boundary
	# Copy the arguments into the frame (see layout in the header).
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	16(%esi),%esi
	movl	(%esi),%esi		# dereference arg4: keep the word, not the pointer
	movl	%eax,4(%esp)
	movl	%ebx,8(%esp)
	movl	%ecx,12(%esp)
	movl	%edx,16(%esp)
	movl	%esi,20(%esp)
	leal	-3(%edi),%ebx		# ebx = (num+2)-3 = num-1, index of the last word
	movl	%ebp,24(%esp)		# remember the stack pointer to restore
	leal	OPENSSL_ia32cap_P,%eax
	btl	$26,(%eax)		# capability bit 26 selects the SSE2 path
	jnc	.L001non_sse2

	#---------------------------------------------------------------
	# MMX/SSE2 path.
	#   esi = ap, edi = bp, ebp = np, edx = outer index i, ecx = inner index j
	#   mm7 = mask of the low 32 bits
	#   mm4 = bp[i]
	#   mm5 = m = low32(ap[0]*bp[i] + tp[0]) * n0   (low 32 bits are used)
	#   mm2 = running sum / carry of the ap[j]*bp[i] chain
	#   mm3 = running sum / carry of the np[j]*m chain
	#---------------------------------------------------------------
	movl	$-1,%eax
	movd	%eax,%mm7		# mm7 = low-32-bit mask (movd zero-extends)
	movl	8(%esp),%esi
	movl	12(%esp),%edi
	movl	16(%esp),%ebp
	xorl	%edx,%edx		# i = 0
	xorl	%ecx,%ecx		# j = 0
	movd	(%edi),%mm4		# mm4 = bp[0]
	movd	(%esi),%mm5		# mm5 = ap[0]
	movd	(%ebp),%mm3		# mm3 = np[0]
	pmuludq	%mm4,%mm5		# mm5 = ap[0]*bp[0]
	movq	%mm5,%mm2
	movq	%mm5,%mm0
	pand	%mm7,%mm0		# mm0 = low word of the product
	pmuludq	20(%esp),%mm5		# mm5 = m = low word * n0
	pmuludq	%mm5,%mm3		# mm3 = np[0]*m
	paddq	%mm0,%mm3		# low word of this sum is discarded by the shift
	movd	4(%ebp),%mm1		# mm1 = np[1]
	movd	4(%esi),%mm0		# mm0 = ap[1]
	psrlq	$32,%mm2		# keep only the carries
	psrlq	$32,%mm3
	incl	%ecx			# j = 1
.align	16
.L0021st:
	# First pass (i = 0): tp[j-1] = low32(ap[j]*bp[0] + np[j]*m + carries)
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	movd	4(%ebp,%ecx,4),%mm1	# preload np[j+1]
	paddq	%mm0,%mm3
	movd	4(%esi,%ecx,4),%mm0	# preload ap[j+1]
	psrlq	$32,%mm2
	movd	%mm3,28(%esp,%ecx,4)	# store tp[j-1]
	psrlq	$32,%mm3
	leal	1(%ecx),%ecx		# j++ without touching flags state needed below
	cmpl	%ebx,%ecx
	jl	.L0021st
	# Last word of the first pass, then store the two top words.
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	paddq	%mm0,%mm3
	movd	%mm3,28(%esp,%ecx,4)
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	paddq	%mm2,%mm3
	movq	%mm3,32(%esp,%ebx,4)	# tp[num-1], tp[num] = combined carries
	incl	%edx			# i = 1
.L003outer:
	# Outer loop: fold ap[]*bp[i] and np[]*m into tp[], shifting tp down
	# by one word per iteration.
	xorl	%ecx,%ecx
	movd	(%edi,%edx,4),%mm4	# mm4 = bp[i]
	movd	(%esi),%mm5
	movd	32(%esp),%mm6		# mm6 = tp[0]
	movd	(%ebp),%mm3
	pmuludq	%mm4,%mm5
	paddq	%mm6,%mm5		# mm5 = ap[0]*bp[i] + tp[0]
	movq	%mm5,%mm0
	movq	%mm5,%mm2
	pand	%mm7,%mm0
	pmuludq	20(%esp),%mm5		# mm5 = m for this iteration
	pmuludq	%mm5,%mm3
	paddq	%mm0,%mm3
	movd	36(%esp),%mm6		# mm6 = tp[1]
	movd	4(%ebp),%mm1
	movd	4(%esi),%mm0
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	paddq	%mm6,%mm2
	incl	%ecx			# j = 1
	decl	%ebx			# ebx becomes the inner-loop down counter
.L004inner:
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	movd	36(%esp,%ecx,4),%mm6	# mm6 = tp[j+1]
	pand	%mm7,%mm0
	movd	4(%ebp,%ecx,4),%mm1
	paddq	%mm0,%mm3
	movd	4(%esi,%ecx,4),%mm0
	psrlq	$32,%mm2
	movd	%mm3,28(%esp,%ecx,4)	# store tp[j-1]
	psrlq	$32,%mm3
	paddq	%mm6,%mm2
	decl	%ebx			# sets ZF for the jnz below ...
	leal	1(%ecx),%ecx		# ... lea leaves the flags alone
	jnz	.L004inner
	movl	%ecx,%ebx		# ebx = last index again
	pmuludq	%mm4,%mm0
	pmuludq	%mm5,%mm1
	paddq	%mm0,%mm2
	paddq	%mm1,%mm3
	movq	%mm2,%mm0
	pand	%mm7,%mm0
	paddq	%mm0,%mm3
	movd	%mm3,28(%esp,%ecx,4)
	psrlq	$32,%mm2
	psrlq	$32,%mm3
	movd	36(%esp,%ebx,4),%mm6	# previous top word
	paddq	%mm2,%mm3
	paddq	%mm6,%mm3
	movq	%mm3,32(%esp,%ebx,4)	# new top two words
	leal	1(%edx),%edx		# i++
	cmpl	%ebx,%edx
	jle	.L003outer
	emms				# leave MMX state before returning to the caller
	jmp	.L005common_tail

	#---------------------------------------------------------------
	# Integer-only path.
	#   esi = ap (later np), edi = bp[i] (later m), ebx = num-1,
	#   ecx = inner index, edx:eax = product, ebp = carry / scratch
	#---------------------------------------------------------------
.align	16
.L001non_sse2:
	movl	8(%esp),%esi		# esi = ap
	leal	1(%ebx),%ebp		# ebp = num
	movl	12(%esp),%edi		# edi = bp
	xorl	%ecx,%ecx
	movl	%esi,%edx
	andl	$1,%ebp			# ebp = num & 1
	subl	%edi,%edx		# edx = ap - bp
	leal	4(%edi,%ebx,4),%eax	# eax = address of bp[num]
	orl	%edx,%ebp		# ZF set only when ap == bp and num is even
	movl	(%edi),%edi		# edi = bp[0] (mov keeps the flags)
	jz	.L006bn_sqr_mont	# take the dedicated squaring code
	movl	%eax,28(%esp)		# end marker for the outer loop
	movl	(%esi),%eax
	xorl	%edx,%edx
.align	16
.L007mull:
	# tp[j] = low32(ap[j]*bp[0] + carry)
	movl	%edx,%ebp
	mull	%edi
	addl	%eax,%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	movl	(%esi,%ecx,4),%eax
	cmpl	%ebx,%ecx
	movl	%ebp,28(%esp,%ecx,4)
	jl	.L007mull
	movl	%edx,%ebp
	mull	%edi
	movl	20(%esp),%edi		# edi = n0
	addl	%ebp,%eax
	movl	16(%esp),%esi		# esi = np
	adcl	$0,%edx
	imull	32(%esp),%edi		# edi = m = tp[0]*n0 (low 32 bits)
	movl	%eax,32(%esp,%ebx,4)	# tp[num-1]
	xorl	%ecx,%ecx
	movl	%edx,36(%esp,%ebx,4)	# tp[num]
	movl	%ecx,40(%esp,%ebx,4)	# tp[num+1] = 0
	movl	(%esi),%eax
	mull	%edi			# np[0]*m
	addl	32(%esp),%eax		# only the carry out of this add is kept
	movl	4(%esi),%eax
	adcl	$0,%edx
	incl	%ecx
	jmp	.L0082ndmadd
.align	16
.L0091stmadd:
	# tp[j] = low32(tp[j] + ap[j]*bp[i] + carry)
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ecx,4),%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,28(%esp,%ecx,4)
	jl	.L0091stmadd
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ebx,4),%eax
	movl	20(%esp),%edi		# edi = n0
	adcl	$0,%edx
	movl	16(%esp),%esi		# esi = np
	addl	%eax,%ebp
	adcl	$0,%edx
	imull	32(%esp),%edi		# edi = m = tp[0]*n0
	xorl	%ecx,%ecx
	addl	36(%esp,%ebx,4),%edx
	movl	%ebp,32(%esp,%ebx,4)
	adcl	$0,%ecx
	movl	(%esi),%eax
	movl	%edx,36(%esp,%ebx,4)
	movl	%ecx,40(%esp,%ebx,4)	# top carry word
	mull	%edi
	addl	32(%esp),%eax		# only the carry out of this add is kept
	movl	4(%esi),%eax
	adcl	$0,%edx
	movl	$1,%ecx
.align	16
.L0082ndmadd:
	# tp[j-1] = low32(tp[j] + np[j]*m + carry): reduce and shift down one word
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ecx,4),%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,24(%esp,%ecx,4)
	jl	.L0082ndmadd
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ebx,4),%ebp
	adcl	$0,%edx
	addl	%eax,%ebp
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ebx,4)
	xorl	%eax,%eax
	movl	12(%esp),%ecx		# ecx = current bp pointer
	addl	36(%esp,%ebx,4),%edx
	adcl	40(%esp,%ebx,4),%eax
	leal	4(%ecx),%ecx		# advance to the next word of bp
	movl	%edx,32(%esp,%ebx,4)
	cmpl	28(%esp),%ecx		# reached &bp[num]?
	movl	%eax,36(%esp,%ebx,4)
	je	.L005common_tail
	movl	(%ecx),%edi		# edi = bp[i]
	movl	8(%esp),%esi		# esi = ap
	movl	%ecx,12(%esp)
	xorl	%ecx,%ecx
	xorl	%edx,%edx
	movl	(%esi),%eax
	jmp	.L0091stmadd

	#---------------------------------------------------------------
	# Integer-only squaring path (entered when ap == bp and num is even).
	# Cross products ap[j]*ap[i] are doubled on the fly; ebx carries the
	# bit shifted out of the previous doubled word.
	#   (%esp) = num-1, 12(%esp) = outer index i
	#---------------------------------------------------------------
.align	16
.L006bn_sqr_mont:
	movl	%ebx,(%esp)		# save num-1; ebx is reused below
	movl	%ecx,12(%esp)		# i = 0 (ecx was cleared above)
	movl	%edi,%eax
	mull	%edi			# ap[0]*ap[0]
	movl	%eax,32(%esp)		# tp[0]
	movl	%edx,%ebx
	shrl	$1,%edx
	andl	$1,%ebx
	incl	%ecx
.align	16
.L010sqr:
	movl	(%esi,%ecx,4),%eax
	movl	%edx,%ebp
	mull	%edi
	addl	%ebp,%eax
	leal	1(%ecx),%ecx
	adcl	$0,%edx
	leal	(%ebx,%eax,2),%ebp	# doubled low word plus previous shifted-out bit
	shrl	$31,%eax		# bit shifted out by the doubling
	cmpl	(%esp),%ecx
	movl	%eax,%ebx
	movl	%ebp,28(%esp,%ecx,4)
	jl	.L010sqr
	movl	(%esi,%ecx,4),%eax
	movl	%edx,%ebp
	mull	%edi
	addl	%ebp,%eax
	movl	20(%esp),%edi		# edi = n0
	adcl	$0,%edx
	movl	16(%esp),%esi		# esi = np
	leal	(%ebx,%eax,2),%ebp
	imull	32(%esp),%edi		# edi = m = tp[0]*n0
	shrl	$31,%eax
	movl	%ebp,32(%esp,%ecx,4)
	leal	(%eax,%edx,2),%ebp
	movl	(%esi),%eax
	shrl	$31,%edx
	movl	%ebp,36(%esp,%ecx,4)
	movl	%edx,40(%esp,%ecx,4)
	mull	%edi
	addl	32(%esp),%eax		# only the carry out of this add is kept
	movl	%ecx,%ebx		# ebx = last index
	adcl	$0,%edx
	movl	4(%esi),%eax
	movl	$1,%ecx
.align	16
.L0113rdmadd:
	# Reduction step, two words per iteration:
	# tp[j-1] = low32(tp[j] + np[j]*m + carry)
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ecx,4),%ebp
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	4(%esi,%ecx,4),%eax
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ecx,4)
	movl	%edx,%ebp
	mull	%edi
	addl	36(%esp,%ecx,4),%ebp
	leal	2(%ecx),%ecx
	adcl	$0,%edx
	addl	%eax,%ebp
	movl	(%esi,%ecx,4),%eax
	adcl	$0,%edx
	cmpl	%ebx,%ecx
	movl	%ebp,24(%esp,%ecx,4)
	jl	.L0113rdmadd
	movl	%edx,%ebp
	mull	%edi
	addl	32(%esp,%ebx,4),%ebp
	adcl	$0,%edx
	addl	%eax,%ebp
	adcl	$0,%edx
	movl	%ebp,28(%esp,%ebx,4)
	movl	12(%esp),%ecx		# ecx = outer index i
	xorl	%eax,%eax
	movl	8(%esp),%esi		# esi = ap
	addl	36(%esp,%ebx,4),%edx
	adcl	40(%esp,%ebx,4),%eax
	movl	%edx,32(%esp,%ebx,4)
	cmpl	%ebx,%ecx		# was that the last outer iteration?
	movl	%eax,36(%esp,%ebx,4)
	je	.L005common_tail
	movl	4(%esi,%ecx,4),%edi	# edi = ap[i+1]
	leal	1(%ecx),%ecx
	movl	%edi,%eax
	movl	%ecx,12(%esp)		# i++
	mull	%edi			# ap[i]*ap[i]
	addl	32(%esp,%ecx,4),%eax
	adcl	$0,%edx
	movl	%eax,32(%esp,%ecx,4)
	xorl	%ebp,%ebp
	cmpl	%ebx,%ecx
	leal	1(%ecx),%ecx		# lea keeps the flags of the cmp above
	je	.L012sqrlast
	movl	%edx,%ebx
	shrl	$1,%edx
	andl	$1,%ebx
.align	16
.L013sqradd:
	# tp[j] += 2*ap[j]*ap[i] (+ carries) for the remaining j > i
	movl	(%esi,%ecx,4),%eax
	movl	%edx,%ebp
	mull	%edi
	addl	%ebp,%eax
	leal	(%eax,%eax,1),%ebp	# doubled low word
	adcl	$0,%edx
	shrl	$31,%eax		# bit shifted out by the doubling
	addl	32(%esp,%ecx,4),%ebp
	leal	1(%ecx),%ecx
	adcl	$0,%eax
	addl	%ebx,%ebp
	adcl	$0,%eax
	cmpl	(%esp),%ecx
	movl	%ebp,28(%esp,%ecx,4)
	movl	%eax,%ebx
	jle	.L013sqradd
	movl	%edx,%ebp
	addl	%edx,%edx
	shrl	$31,%ebp
	addl	%ebx,%edx
	adcl	$0,%ebp
.L012sqrlast:
	movl	20(%esp),%edi		# edi = n0
	movl	16(%esp),%esi		# esi = np
	imull	32(%esp),%edi		# edi = m = tp[0]*n0
	addl	32(%esp,%ecx,4),%edx
	movl	(%esi),%eax
	adcl	$0,%ebp
	movl	%edx,32(%esp,%ecx,4)
	movl	%ebp,36(%esp,%ecx,4)
	mull	%edi
	addl	32(%esp),%eax		# only the carry out of this add is kept
	leal	-1(%ecx),%ebx		# ebx = last index
	adcl	$0,%edx
	movl	$1,%ecx
	movl	4(%esi),%eax
	jmp	.L0113rdmadd

	#---------------------------------------------------------------
	# Common tail: conditional final subtraction and copy-out.
	# ebx is expected to hold num-1 here (every path above re-establishes it).
	#---------------------------------------------------------------
.align	16
.L005common_tail:
	movl	16(%esp),%ebp		# ebp = np
	movl	4(%esp),%edi		# edi = rp
	leal	32(%esp),%esi		# esi = tp
	movl	(%esi),%eax		# eax = tp[0]
	movl	%ebx,%ecx		# ecx = num-1, loop down counter
	xorl	%edx,%edx		# index = 0; also clears CF for the first sbb
.align	16
.L014sub:
	# rp[i] = tp[i] - np[i] - borrow.  dec, mov and lea leave CF intact,
	# so the borrow survives from one sbb to the next.
	sbbl	(%ebp,%edx,4),%eax
	movl	%eax,(%edi,%edx,4)
	decl	%ecx
	movl	4(%esi,%edx,4),%eax	# next tp word (the top word after the last pass)
	leal	1(%edx),%edx
	jge	.L014sub		# branch on the flags set by decl
	sbbl	$0,%eax			# fold the final borrow into the top word
	# eax is now the selector mask: pick tp when it is all ones, rp when
	# it is zero, without branching.
	andl	%eax,%esi
	notl	%eax
	movl	%edi,%ebp
	andl	%eax,%ebp
	orl	%ebp,%esi		# esi = selected source vector
.align	16
.L015copy:
	movl	(%esi,%ebx,4),%eax
	movl	%eax,(%edi,%ebx,4)	# rp[i] = selected[i]
	movl	%ecx,32(%esp,%ebx,4)	# overwrite tp[i] with the leftover counter value
	decl	%ebx
	jge	.L015copy
	movl	24(%esp),%esp		# drop the private frame
	movl	$1,%eax			# report success
.L000just_leave:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	bn_mul_mont,.-.L_bn_mul_mont_begin
# NUL-terminated ASCII tag:
# "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte	111,114,103,62,0
.comm	OPENSSL_ia32cap_P,8,4