1183234Ssimon#!/usr/bin/env perl 2183234Ssimon 3183234Ssimon# ==================================================================== 4238405Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5183234Ssimon# project. The module is, however, dual licensed under OpenSSL and 6183234Ssimon# CRYPTOGAMS licenses depending on where you obtain it. For further 7183234Ssimon# details see http://www.openssl.org/~appro/cryptogams/. 8183234Ssimon# ==================================================================== 9183234Ssimon 10183234Ssimon# October 2005. 11183234Ssimon# 12183234Ssimon# Montgomery multiplication routine for x86_64. While it gives modest 13183234Ssimon# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more 14183234Ssimon# than twice, >2x, as fast. Most common rsa1024 sign is improved by 15183234Ssimon# respectful 50%. It remains to be seen if loop unrolling and 16183234Ssimon# dedicated squaring routine can provide further improvement... 17183234Ssimon 18238405Sjkim# July 2011. 19238405Sjkim# 20238405Sjkim# Add dedicated squaring procedure. Performance improvement varies 21238405Sjkim# from platform to platform, but in average it's ~5%/15%/25%/33% 22238405Sjkim# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. 23183234Ssimon 24238405Sjkim# August 2011. 25238405Sjkim# 26238405Sjkim# Unroll and modulo-schedule inner loops in such manner that they 27238405Sjkim# are "fallen through" for input lengths of 8, which is critical for 28238405Sjkim# 1024-bit RSA *sign*. Average performance improvement in comparison 29238405Sjkim# to *initial* version of this module from 2005 is ~0%/30%/40%/45% 30238405Sjkim# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. 

# Command-line handling: invoked either as "script flavour output" or as
# "script output" (flavour deduced elsewhere from the file extension).
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script's directory,
# falling back to the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything printed to STDOUT through the translator.  The open is
# error-checked: without "or die" a failed pipe would make the script emit
# nothing (or emit to the wrong place) while still exiting successfully.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
# Scratch register roles used throughout the mul_mont loops:
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

# Entry point: dispatches to the 4x-unrolled multiply when num%4==0 and
# num>=8 (ap!=bp), to the dedicated squaring routine when ap==bp, and to
# the generic loop otherwise.
$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	jmp	.Lsqr4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
# From here on $bp lives in %r12: %rdx (its ABI register) is clobbered by
# every mulq below, so the pointer must move out of the way.
			$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
# 4x-unrolled path, bn_mul4x_mont: reached from the dispatch above when num
# is divisible by 4, num>=8 and ap!=bp.  @A accumulates the ap[j]*bp[i]
# partial products, @N the np[j]*m1 reduction terms (see in-line comments).
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.Lmul4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	4($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
# As in bn_mul_mont, the bp pointer is moved out of %rdx before mulq use.
			$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
# Final conditional-subtract and copy phase of bn_mul4x_mont; @ri are
# scratch registers reused from the loops above.
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
# Dedicated squaring procedure (bn_sqr4x_mont, target of .Lsqr4x_enter)
# follows in the next scoped section.
{{{
690238405Sjkim###################################################################### 691238405Sjkim# void bn_sqr4x_mont( 692238405Sjkimmy $rptr="%rdi"; # const BN_ULONG *rptr, 693238405Sjkimmy $aptr="%rsi"; # const BN_ULONG *aptr, 694238405Sjkimmy $bptr="%rdx"; # not used 695238405Sjkimmy $nptr="%rcx"; # const BN_ULONG *nptr, 696238405Sjkimmy $n0 ="%r8"; # const BN_ULONG *n0); 697238405Sjkimmy $num ="%r9"; # int num, has to be divisible by 4 and 698238405Sjkim # not less than 8 699238405Sjkim 700238405Sjkimmy ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 701238405Sjkimmy @A0=("%r10","%r11"); 702238405Sjkimmy @A1=("%r12","%r13"); 703238405Sjkimmy ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 704238405Sjkim 705238405Sjkim$code.=<<___; 706238405Sjkim.type bn_sqr4x_mont,\@function,6 707238405Sjkim.align 16 708238405Sjkimbn_sqr4x_mont: 709238405Sjkim.Lsqr4x_enter: 710238405Sjkim push %rbx 711238405Sjkim push %rbp 712238405Sjkim push %r12 713238405Sjkim push %r13 714238405Sjkim push %r14 715238405Sjkim push %r15 716238405Sjkim 717238405Sjkim shl \$3,${num}d # convert $num to bytes 718238405Sjkim xor %r10,%r10 719238405Sjkim mov %rsp,%r11 # put aside %rsp 720238405Sjkim sub $num,%r10 # -$num 721238405Sjkim mov ($n0),$n0 # *n0 722238405Sjkim lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) 723238405Sjkim and \$-1024,%rsp # minimize TLB usage 724238405Sjkim ############################################################## 725238405Sjkim # Stack layout 726238405Sjkim # 727238405Sjkim # +0 saved $num, used in reduction section 728238405Sjkim # +8 &t[2*$num], used in reduction section 729238405Sjkim # +32 saved $rptr 730238405Sjkim # +40 saved $nptr 731238405Sjkim # +48 saved *n0 732238405Sjkim # +56 saved %rsp 733238405Sjkim # +64 t[2*$num] 734238405Sjkim # 735238405Sjkim mov $rptr,32(%rsp) # save $rptr 736238405Sjkim mov $nptr,40(%rsp) 737238405Sjkim mov $n0, 48(%rsp) 738238405Sjkim mov %r11, 56(%rsp) # save original %rsp 739238405Sjkim.Lsqr4x_body: 740238405Sjkim 
############################################################## 741238405Sjkim # Squaring part: 742238405Sjkim # 743238405Sjkim # a) multiply-n-add everything but a[i]*a[i]; 744238405Sjkim # b) shift result of a) by 1 to the left and accumulate 745238405Sjkim # a[i]*a[i] products; 746238405Sjkim # 747238405Sjkim lea 32(%r10),$i # $i=-($num-32) 748238405Sjkim lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 749238405Sjkim 750238405Sjkim mov $num,$j # $j=$num 751238405Sjkim 752238405Sjkim # comments apply to $num==8 case 753238405Sjkim mov -32($aptr,$i),$a0 # a[0] 754238405Sjkim lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 755238405Sjkim mov -24($aptr,$i),%rax # a[1] 756238405Sjkim lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 757238405Sjkim mov -16($aptr,$i),$ai # a[2] 758238405Sjkim mov %rax,$a1 759238405Sjkim 760238405Sjkim mul $a0 # a[1]*a[0] 761238405Sjkim mov %rax,$A0[0] # a[1]*a[0] 762238405Sjkim mov $ai,%rax # a[2] 763238405Sjkim mov %rdx,$A0[1] 764238405Sjkim mov $A0[0],-24($tptr,$i) # t[1] 765238405Sjkim 766238405Sjkim xor $A0[0],$A0[0] 767238405Sjkim mul $a0 # a[2]*a[0] 768238405Sjkim add %rax,$A0[1] 769238405Sjkim mov $ai,%rax 770238405Sjkim adc %rdx,$A0[0] 771238405Sjkim mov $A0[1],-16($tptr,$i) # t[2] 772238405Sjkim 773238405Sjkim lea -16($i),$j # j=-16 774238405Sjkim 775238405Sjkim 776238405Sjkim mov 8($aptr,$j),$ai # a[3] 777238405Sjkim mul $a1 # a[2]*a[1] 778238405Sjkim mov %rax,$A1[0] # a[2]*a[1]+t[3] 779238405Sjkim mov $ai,%rax 780238405Sjkim mov %rdx,$A1[1] 781238405Sjkim 782238405Sjkim xor $A0[1],$A0[1] 783238405Sjkim add $A1[0],$A0[0] 784238405Sjkim lea 16($j),$j 785238405Sjkim adc \$0,$A0[1] 786238405Sjkim mul $a0 # a[3]*a[0] 787238405Sjkim add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 788238405Sjkim mov $ai,%rax 789238405Sjkim adc %rdx,$A0[1] 790238405Sjkim mov $A0[0],-8($tptr,$j) # t[3] 791238405Sjkim jmp .Lsqr4x_1st 792238405Sjkim 793238405Sjkim.align 16 794238405Sjkim.Lsqr4x_1st: 795238405Sjkim mov 
($aptr,$j),$ai # a[4] 796238405Sjkim xor $A1[0],$A1[0] 797238405Sjkim mul $a1 # a[3]*a[1] 798238405Sjkim add %rax,$A1[1] # a[3]*a[1]+t[4] 799238405Sjkim mov $ai,%rax 800238405Sjkim adc %rdx,$A1[0] 801238405Sjkim 802238405Sjkim xor $A0[0],$A0[0] 803238405Sjkim add $A1[1],$A0[1] 804238405Sjkim adc \$0,$A0[0] 805238405Sjkim mul $a0 # a[4]*a[0] 806238405Sjkim add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 807238405Sjkim mov $ai,%rax # a[3] 808238405Sjkim adc %rdx,$A0[0] 809238405Sjkim mov $A0[1],($tptr,$j) # t[4] 810238405Sjkim 811238405Sjkim 812238405Sjkim mov 8($aptr,$j),$ai # a[5] 813238405Sjkim xor $A1[1],$A1[1] 814238405Sjkim mul $a1 # a[4]*a[3] 815238405Sjkim add %rax,$A1[0] # a[4]*a[3]+t[5] 816238405Sjkim mov $ai,%rax 817238405Sjkim adc %rdx,$A1[1] 818238405Sjkim 819238405Sjkim xor $A0[1],$A0[1] 820238405Sjkim add $A1[0],$A0[0] 821238405Sjkim adc \$0,$A0[1] 822238405Sjkim mul $a0 # a[5]*a[2] 823238405Sjkim add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 824238405Sjkim mov $ai,%rax 825238405Sjkim adc %rdx,$A0[1] 826238405Sjkim mov $A0[0],8($tptr,$j) # t[5] 827238405Sjkim 828238405Sjkim mov 16($aptr,$j),$ai # a[6] 829238405Sjkim xor $A1[0],$A1[0] 830238405Sjkim mul $a1 # a[5]*a[3] 831238405Sjkim add %rax,$A1[1] # a[5]*a[3]+t[6] 832238405Sjkim mov $ai,%rax 833238405Sjkim adc %rdx,$A1[0] 834238405Sjkim 835238405Sjkim xor $A0[0],$A0[0] 836238405Sjkim add $A1[1],$A0[1] 837238405Sjkim adc \$0,$A0[0] 838238405Sjkim mul $a0 # a[6]*a[2] 839238405Sjkim add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 840238405Sjkim mov $ai,%rax # a[3] 841238405Sjkim adc %rdx,$A0[0] 842238405Sjkim mov $A0[1],16($tptr,$j) # t[6] 843238405Sjkim 844238405Sjkim 845238405Sjkim mov 24($aptr,$j),$ai # a[7] 846238405Sjkim xor $A1[1],$A1[1] 847238405Sjkim mul $a1 # a[6]*a[5] 848238405Sjkim add %rax,$A1[0] # a[6]*a[5]+t[7] 849238405Sjkim mov $ai,%rax 850238405Sjkim adc %rdx,$A1[1] 851238405Sjkim 852238405Sjkim xor $A0[1],$A0[1] 853238405Sjkim add $A1[0],$A0[0] 854238405Sjkim lea 32($j),$j 855238405Sjkim adc 
\$0,$A0[1] 856238405Sjkim mul $a0 # a[7]*a[4] 857238405Sjkim add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 858238405Sjkim mov $ai,%rax 859238405Sjkim adc %rdx,$A0[1] 860238405Sjkim mov $A0[0],-8($tptr,$j) # t[7] 861238405Sjkim 862238405Sjkim cmp \$0,$j 863238405Sjkim jne .Lsqr4x_1st 864238405Sjkim 865238405Sjkim xor $A1[0],$A1[0] 866238405Sjkim add $A0[1],$A1[1] 867238405Sjkim adc \$0,$A1[0] 868238405Sjkim mul $a1 # a[7]*a[5] 869238405Sjkim add %rax,$A1[1] 870238405Sjkim adc %rdx,$A1[0] 871238405Sjkim 872238405Sjkim mov $A1[1],($tptr) # t[8] 873238405Sjkim lea 16($i),$i 874238405Sjkim mov $A1[0],8($tptr) # t[9] 875238405Sjkim jmp .Lsqr4x_outer 876238405Sjkim 877238405Sjkim.align 16 878238405Sjkim.Lsqr4x_outer: # comments apply to $num==6 case 879238405Sjkim mov -32($aptr,$i),$a0 # a[0] 880238405Sjkim lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 881238405Sjkim mov -24($aptr,$i),%rax # a[1] 882238405Sjkim lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 883238405Sjkim mov -16($aptr,$i),$ai # a[2] 884238405Sjkim mov %rax,$a1 885238405Sjkim 886238405Sjkim mov -24($tptr,$i),$A0[0] # t[1] 887238405Sjkim xor $A0[1],$A0[1] 888238405Sjkim mul $a0 # a[1]*a[0] 889238405Sjkim add %rax,$A0[0] # a[1]*a[0]+t[1] 890238405Sjkim mov $ai,%rax # a[2] 891238405Sjkim adc %rdx,$A0[1] 892238405Sjkim mov $A0[0],-24($tptr,$i) # t[1] 893238405Sjkim 894238405Sjkim xor $A0[0],$A0[0] 895238405Sjkim add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 896238405Sjkim adc \$0,$A0[0] 897238405Sjkim mul $a0 # a[2]*a[0] 898238405Sjkim add %rax,$A0[1] 899238405Sjkim mov $ai,%rax 900238405Sjkim adc %rdx,$A0[0] 901238405Sjkim mov $A0[1],-16($tptr,$i) # t[2] 902238405Sjkim 903238405Sjkim lea -16($i),$j # j=-16 904238405Sjkim xor $A1[0],$A1[0] 905238405Sjkim 906238405Sjkim 907238405Sjkim mov 8($aptr,$j),$ai # a[3] 908238405Sjkim xor $A1[1],$A1[1] 909238405Sjkim add 8($tptr,$j),$A1[0] 910238405Sjkim adc \$0,$A1[1] 911238405Sjkim mul $a1 # a[2]*a[1] 912238405Sjkim add %rax,$A1[0] # 
a[2]*a[1]+t[3] 913238405Sjkim mov $ai,%rax 914238405Sjkim adc %rdx,$A1[1] 915238405Sjkim 916238405Sjkim xor $A0[1],$A0[1] 917238405Sjkim add $A1[0],$A0[0] 918238405Sjkim adc \$0,$A0[1] 919238405Sjkim mul $a0 # a[3]*a[0] 920238405Sjkim add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 921238405Sjkim mov $ai,%rax 922238405Sjkim adc %rdx,$A0[1] 923238405Sjkim mov $A0[0],8($tptr,$j) # t[3] 924238405Sjkim 925238405Sjkim lea 16($j),$j 926238405Sjkim jmp .Lsqr4x_inner 927238405Sjkim 928238405Sjkim.align 16 929238405Sjkim.Lsqr4x_inner: 930238405Sjkim mov ($aptr,$j),$ai # a[4] 931238405Sjkim xor $A1[0],$A1[0] 932238405Sjkim add ($tptr,$j),$A1[1] 933238405Sjkim adc \$0,$A1[0] 934238405Sjkim mul $a1 # a[3]*a[1] 935238405Sjkim add %rax,$A1[1] # a[3]*a[1]+t[4] 936238405Sjkim mov $ai,%rax 937238405Sjkim adc %rdx,$A1[0] 938238405Sjkim 939238405Sjkim xor $A0[0],$A0[0] 940238405Sjkim add $A1[1],$A0[1] 941238405Sjkim adc \$0,$A0[0] 942238405Sjkim mul $a0 # a[4]*a[0] 943238405Sjkim add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 944238405Sjkim mov $ai,%rax # a[3] 945238405Sjkim adc %rdx,$A0[0] 946238405Sjkim mov $A0[1],($tptr,$j) # t[4] 947238405Sjkim 948238405Sjkim mov 8($aptr,$j),$ai # a[5] 949238405Sjkim xor $A1[1],$A1[1] 950238405Sjkim add 8($tptr,$j),$A1[0] 951238405Sjkim adc \$0,$A1[1] 952238405Sjkim mul $a1 # a[4]*a[3] 953238405Sjkim add %rax,$A1[0] # a[4]*a[3]+t[5] 954238405Sjkim mov $ai,%rax 955238405Sjkim adc %rdx,$A1[1] 956238405Sjkim 957238405Sjkim xor $A0[1],$A0[1] 958238405Sjkim add $A1[0],$A0[0] 959238405Sjkim lea 16($j),$j # j++ 960238405Sjkim adc \$0,$A0[1] 961238405Sjkim mul $a0 # a[5]*a[2] 962238405Sjkim add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 963238405Sjkim mov $ai,%rax 964238405Sjkim adc %rdx,$A0[1] 965238405Sjkim mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 966238405Sjkim 967238405Sjkim cmp \$0,$j 968238405Sjkim jne .Lsqr4x_inner 969238405Sjkim 970238405Sjkim xor $A1[0],$A1[0] 971238405Sjkim add $A0[1],$A1[1] 972238405Sjkim adc \$0,$A1[0] 973238405Sjkim mul $a1 
# a[5]*a[3] 974238405Sjkim add %rax,$A1[1] 975238405Sjkim adc %rdx,$A1[0] 976238405Sjkim 977238405Sjkim mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 978238405Sjkim mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below 979238405Sjkim 980238405Sjkim add \$16,$i 981238405Sjkim jnz .Lsqr4x_outer 982238405Sjkim 983238405Sjkim # comments apply to $num==4 case 984238405Sjkim mov -32($aptr),$a0 # a[0] 985238405Sjkim lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 986238405Sjkim mov -24($aptr),%rax # a[1] 987238405Sjkim lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 988238405Sjkim mov -16($aptr),$ai # a[2] 989238405Sjkim mov %rax,$a1 990238405Sjkim 991238405Sjkim xor $A0[1],$A0[1] 992238405Sjkim mul $a0 # a[1]*a[0] 993238405Sjkim add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 994238405Sjkim mov $ai,%rax # a[2] 995238405Sjkim adc %rdx,$A0[1] 996238405Sjkim mov $A0[0],-24($tptr) # t[1] 997238405Sjkim 998238405Sjkim xor $A0[0],$A0[0] 999238405Sjkim add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1000238405Sjkim adc \$0,$A0[0] 1001238405Sjkim mul $a0 # a[2]*a[0] 1002238405Sjkim add %rax,$A0[1] 1003238405Sjkim mov $ai,%rax 1004238405Sjkim adc %rdx,$A0[0] 1005238405Sjkim mov $A0[1],-16($tptr) # t[2] 1006238405Sjkim 1007238405Sjkim mov -8($aptr),$ai # a[3] 1008238405Sjkim mul $a1 # a[2]*a[1] 1009238405Sjkim add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1010238405Sjkim mov $ai,%rax 1011238405Sjkim adc \$0,%rdx 1012238405Sjkim 1013238405Sjkim xor $A0[1],$A0[1] 1014238405Sjkim add $A1[0],$A0[0] 1015238405Sjkim mov %rdx,$A1[1] 1016238405Sjkim adc \$0,$A0[1] 1017238405Sjkim mul $a0 # a[3]*a[0] 1018238405Sjkim add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1019238405Sjkim mov $ai,%rax 1020238405Sjkim adc %rdx,$A0[1] 1021238405Sjkim mov $A0[0],-8($tptr) # t[3] 1022238405Sjkim 1023238405Sjkim xor $A1[0],$A1[0] 1024238405Sjkim add $A0[1],$A1[1] 1025238405Sjkim adc \$0,$A1[0] 1026238405Sjkim mul $a1 # a[3]*a[1] 1027238405Sjkim add %rax,$A1[1] 
1028238405Sjkim mov -16($aptr),%rax # a[2] 1029238405Sjkim adc %rdx,$A1[0] 1030238405Sjkim 1031238405Sjkim mov $A1[1],($tptr) # t[4] 1032238405Sjkim mov $A1[0],8($tptr) # t[5] 1033238405Sjkim 1034238405Sjkim mul $ai # a[2]*a[3] 1035238405Sjkim___ 1036238405Sjkim{ 1037238405Sjkimmy ($shift,$carry)=($a0,$a1); 1038238405Sjkimmy @S=(@A1,$ai,$n0); 1039238405Sjkim$code.=<<___; 1040238405Sjkim add \$16,$i 1041238405Sjkim xor $shift,$shift 1042238405Sjkim sub $num,$i # $i=16-$num 1043238405Sjkim xor $carry,$carry 1044238405Sjkim 1045238405Sjkim add $A1[0],%rax # t[5] 1046238405Sjkim adc \$0,%rdx 1047238405Sjkim mov %rax,8($tptr) # t[5] 1048238405Sjkim mov %rdx,16($tptr) # t[6] 1049238405Sjkim mov $carry,24($tptr) # t[7] 1050238405Sjkim 1051238405Sjkim mov -16($aptr,$i),%rax # a[0] 1052238405Sjkim lea 64(%rsp,$num,2),$tptr 1053238405Sjkim xor $A0[0],$A0[0] # t[0] 1054238405Sjkim mov -24($tptr,$i,2),$A0[1] # t[1] 1055238405Sjkim 1056238405Sjkim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1057238405Sjkim shr \$63,$A0[0] 1058238405Sjkim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1059238405Sjkim shr \$63,$A0[1] 1060238405Sjkim or $A0[0],$S[1] # | t[2*i]>>63 1061238405Sjkim mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1062238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1063238405Sjkim mul %rax # a[i]*a[i] 1064238405Sjkim neg $carry # mov $carry,cf 1065238405Sjkim mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1066238405Sjkim adc %rax,$S[0] 1067238405Sjkim mov -8($aptr,$i),%rax # a[i+1] # prefetch 1068238405Sjkim mov $S[0],-32($tptr,$i,2) 1069238405Sjkim adc %rdx,$S[1] 1070238405Sjkim 1071238405Sjkim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1072238405Sjkim mov $S[1],-24($tptr,$i,2) 1073238405Sjkim sbb $carry,$carry # mov cf,$carry 1074238405Sjkim shr \$63,$A0[0] 1075238405Sjkim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1076238405Sjkim shr \$63,$A0[1] 1077238405Sjkim or $A0[0],$S[3] # | t[2*i]>>63 1078238405Sjkim mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 
1079238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1080238405Sjkim mul %rax # a[i]*a[i] 1081238405Sjkim neg $carry # mov $carry,cf 1082238405Sjkim mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1083238405Sjkim adc %rax,$S[2] 1084238405Sjkim mov 0($aptr,$i),%rax # a[i+1] # prefetch 1085238405Sjkim mov $S[2],-16($tptr,$i,2) 1086238405Sjkim adc %rdx,$S[3] 1087238405Sjkim lea 16($i),$i 1088238405Sjkim mov $S[3],-40($tptr,$i,2) 1089238405Sjkim sbb $carry,$carry # mov cf,$carry 1090238405Sjkim jmp .Lsqr4x_shift_n_add 1091238405Sjkim 1092238405Sjkim.align 16 1093238405Sjkim.Lsqr4x_shift_n_add: 1094238405Sjkim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1095238405Sjkim shr \$63,$A0[0] 1096238405Sjkim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1097238405Sjkim shr \$63,$A0[1] 1098238405Sjkim or $A0[0],$S[1] # | t[2*i]>>63 1099238405Sjkim mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1100238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1101238405Sjkim mul %rax # a[i]*a[i] 1102238405Sjkim neg $carry # mov $carry,cf 1103238405Sjkim mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1104238405Sjkim adc %rax,$S[0] 1105238405Sjkim mov -8($aptr,$i),%rax # a[i+1] # prefetch 1106238405Sjkim mov $S[0],-32($tptr,$i,2) 1107238405Sjkim adc %rdx,$S[1] 1108238405Sjkim 1109238405Sjkim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1110238405Sjkim mov $S[1],-24($tptr,$i,2) 1111238405Sjkim sbb $carry,$carry # mov cf,$carry 1112238405Sjkim shr \$63,$A0[0] 1113238405Sjkim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1114238405Sjkim shr \$63,$A0[1] 1115238405Sjkim or $A0[0],$S[3] # | t[2*i]>>63 1116238405Sjkim mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1117238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1118238405Sjkim mul %rax # a[i]*a[i] 1119238405Sjkim neg $carry # mov $carry,cf 1120238405Sjkim mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1121238405Sjkim adc %rax,$S[2] 1122238405Sjkim mov 0($aptr,$i),%rax # a[i+1] # prefetch 1123238405Sjkim mov $S[2],-16($tptr,$i,2) 
1124238405Sjkim adc %rdx,$S[3] 1125238405Sjkim 1126238405Sjkim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1127238405Sjkim mov $S[3],-8($tptr,$i,2) 1128238405Sjkim sbb $carry,$carry # mov cf,$carry 1129238405Sjkim shr \$63,$A0[0] 1130238405Sjkim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1131238405Sjkim shr \$63,$A0[1] 1132238405Sjkim or $A0[0],$S[1] # | t[2*i]>>63 1133238405Sjkim mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1134238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1135238405Sjkim mul %rax # a[i]*a[i] 1136238405Sjkim neg $carry # mov $carry,cf 1137238405Sjkim mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1138238405Sjkim adc %rax,$S[0] 1139238405Sjkim mov 8($aptr,$i),%rax # a[i+1] # prefetch 1140238405Sjkim mov $S[0],0($tptr,$i,2) 1141238405Sjkim adc %rdx,$S[1] 1142238405Sjkim 1143238405Sjkim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1144238405Sjkim mov $S[1],8($tptr,$i,2) 1145238405Sjkim sbb $carry,$carry # mov cf,$carry 1146238405Sjkim shr \$63,$A0[0] 1147238405Sjkim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1148238405Sjkim shr \$63,$A0[1] 1149238405Sjkim or $A0[0],$S[3] # | t[2*i]>>63 1150238405Sjkim mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch 1151238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1152238405Sjkim mul %rax # a[i]*a[i] 1153238405Sjkim neg $carry # mov $carry,cf 1154238405Sjkim mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch 1155238405Sjkim adc %rax,$S[2] 1156238405Sjkim mov 16($aptr,$i),%rax # a[i+1] # prefetch 1157238405Sjkim mov $S[2],16($tptr,$i,2) 1158238405Sjkim adc %rdx,$S[3] 1159238405Sjkim mov $S[3],24($tptr,$i,2) 1160238405Sjkim sbb $carry,$carry # mov cf,$carry 1161238405Sjkim add \$32,$i 1162238405Sjkim jnz .Lsqr4x_shift_n_add 1163238405Sjkim 1164238405Sjkim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1165238405Sjkim shr \$63,$A0[0] 1166238405Sjkim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1167238405Sjkim shr \$63,$A0[1] 1168238405Sjkim or $A0[0],$S[1] # | t[2*i]>>63 1169238405Sjkim mov -16($tptr),$A0[0] 
# t[2*i+2] # prefetch 1170238405Sjkim mov $A0[1],$shift # shift=t[2*i+1]>>63 1171238405Sjkim mul %rax # a[i]*a[i] 1172238405Sjkim neg $carry # mov $carry,cf 1173238405Sjkim mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1174238405Sjkim adc %rax,$S[0] 1175238405Sjkim mov -8($aptr),%rax # a[i+1] # prefetch 1176238405Sjkim mov $S[0],-32($tptr) 1177238405Sjkim adc %rdx,$S[1] 1178238405Sjkim 1179238405Sjkim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1180238405Sjkim mov $S[1],-24($tptr) 1181238405Sjkim sbb $carry,$carry # mov cf,$carry 1182238405Sjkim shr \$63,$A0[0] 1183238405Sjkim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1184238405Sjkim shr \$63,$A0[1] 1185238405Sjkim or $A0[0],$S[3] # | t[2*i]>>63 1186238405Sjkim mul %rax # a[i]*a[i] 1187238405Sjkim neg $carry # mov $carry,cf 1188238405Sjkim adc %rax,$S[2] 1189238405Sjkim adc %rdx,$S[3] 1190238405Sjkim mov $S[2],-16($tptr) 1191238405Sjkim mov $S[3],-8($tptr) 1192238405Sjkim___ 1193238405Sjkim} 1194238405Sjkim############################################################## 1195238405Sjkim# Montgomery reduction part, "word-by-word" algorithm. 
#
# Reduces the 2*$num-word product in tp[] modulo n[] one word at a time:
# for each tp word compute m0 = t[0]*n0 mod 2^64, add m0*n[] to tp[],
# and slide the window up by one word.  The loop is 4x unrolled and
# modulo-scheduled ("# modsched #" marks work hoisted from the next
# iteration); the carry chain must not be reordered.
{
my ($topbit,$nptr)=("%rbp",$aptr);
my ($m0,$m1)=($a0,$a1);
my @Ni=("%rbx","%r9");
$code.=<<___;
	mov	40(%rsp),$nptr		# restore $nptr
	mov	48(%rsp),$n0		# restore *n0
	xor	$j,$j
	mov	$num,0(%rsp)		# save $num
	sub	$num,$j			# $j=-$num
	mov	64(%rsp),$A0[0]		# t[0]		# modsched #
	mov	$n0,$m0			#		# modsched #
	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
	lea	64(%rsp,$num),$tptr	# end of t[] window
	mov	%rax,8(%rsp)		# save end of t[] buffer
	lea	($nptr,$num),$nptr	# end of n[] buffer
	xor	$topbit,$topbit		# $topbit=0

	mov	0($nptr,$j),%rax	# n[0]		# modsched #
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
	mov	%rax,$Ni[0]		#		# modsched #
	jmp	.Lsqr4x_mont_outer

.align	16
.Lsqr4x_mont_outer:
	xor	$A0[1],$A0[1]
	mul	$m0			# n[0]*m0
	add	%rax,$A0[0]		# n[0]*m0+t[0]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]
	mov	$n0,$m1

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[1]*m0
	add	%rax,$A0[1]		# n[1]*m0+t[1]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]

	imulq	$A0[1],$m1

	mov	16($nptr,$j),$Ni[0]	# n[2]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[0]*m1
	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[1]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[2]*m0
	add	%rax,$A0[0]		# n[2]*m0+t[2]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[3]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[1]*m1
	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[2]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[3]*m0
	add	%rax,$A0[1]		# n[3]*m0+t[3]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	jmp	.Lsqr4x_mont_inner

.align	16
.Lsqr4x_mont_inner:
	mov	($nptr,$j),$Ni[0]	# n[4]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[2]*m1
	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr,$j)	# "t[3]"

	xor	$A0[1],$A0[1]
	add	($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[4]*m0
	add	%rax,$A0[0]		# n[4]*m0+t[4]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	8($nptr,$j),$Ni[1]	# n[5]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[3]*m1
	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr,$j)	# "t[4]"

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[5]*m0
	add	%rax,$A0[1]		# n[5]*m0+t[5]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]


	mov	16($nptr,$j),$Ni[0]	# n[6]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[4]*m1
	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[5]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[6]*m0
	add	%rax,$A0[0]		# n[6]*m0+t[6]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[7]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[5]*m1
	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[6]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[7]*m0
	add	%rax,$A0[1]		# n[7]*m0+t[7]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	cmp	\$0,$j
	jne	.Lsqr4x_mont_inner

	sub	0(%rsp),$j		# $j=-$num	# modsched #
	mov	$n0,$m0			#		# modsched #

	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[6]*m1
	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr)	# "t[7]"

	xor	$A0[1],$A0[1]
	add	($tptr),$A0[0]		# +t[8]
	adc	\$0,$A0[1]
	mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
	add	$topbit,$A0[0]
	adc	\$0,$A0[1]

	imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
	xor	$A1[0],$A1[0]
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	add	$A0[0],$A1[1]
	mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
	adc	\$0,$A1[0]
	mul	$m1			# n[7]*m1
	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
	mov	$Ni[0],%rax		#		# modsched #
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr)		# "t[8]"

	xor	$topbit,$topbit
	add	8($tptr),$A1[0]		# +t[9]
	adc	$topbit,$topbit
	add	$A0[1],$A1[0]
	lea	16($tptr),$tptr		# "t[$num]>>128"
	adc	\$0,$topbit
	mov	$A1[0],-8($tptr)	# "t[9]"
	cmp	8(%rsp),$tptr		# are we done?
	jb	.Lsqr4x_mont_outer

	mov	0(%rsp),$num		# restore $num
	mov	$topbit,($tptr)		# save $topbit
___
}
##############################################################
# Post-condition, 4x unrolled copy from bn_mul_mont
#
{
my ($tptr,$nptr)=("%rbx",$aptr);
my @ri=("%rax","%rdx","%r10","%r11");
$code.=<<___;
	mov	64(%rsp,$num),@ri[0]	# tp[0]
	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
	mov	40(%rsp),$nptr		# restore $nptr
	shr	\$5,$num		# num/4
	mov	8($tptr),@ri[1]		# t[1]
	xor	$i,$i			# i=0 and clear CF!
1414238405Sjkim 1415238405Sjkim mov 32(%rsp),$rptr # restore $rptr 1416238405Sjkim sub 0($nptr),@ri[0] 1417238405Sjkim mov 16($tptr),@ri[2] # t[2] 1418238405Sjkim mov 24($tptr),@ri[3] # t[3] 1419238405Sjkim sbb 8($nptr),@ri[1] 1420238405Sjkim lea -1($num),$j # j=num/4-1 1421238405Sjkim jmp .Lsqr4x_sub 1422238405Sjkim.align 16 1423238405Sjkim.Lsqr4x_sub: 1424238405Sjkim mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] 1425238405Sjkim mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] 1426238405Sjkim sbb 16($nptr,$i,8),@ri[2] 1427238405Sjkim mov 32($tptr,$i,8),@ri[0] # tp[i+1] 1428238405Sjkim mov 40($tptr,$i,8),@ri[1] 1429238405Sjkim sbb 24($nptr,$i,8),@ri[3] 1430238405Sjkim mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] 1431238405Sjkim mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] 1432238405Sjkim sbb 32($nptr,$i,8),@ri[0] 1433238405Sjkim mov 48($tptr,$i,8),@ri[2] 1434238405Sjkim mov 56($tptr,$i,8),@ri[3] 1435238405Sjkim sbb 40($nptr,$i,8),@ri[1] 1436238405Sjkim lea 4($i),$i # i++ 1437238405Sjkim dec $j # doesn't affect CF! 
1438238405Sjkim jnz .Lsqr4x_sub 1439238405Sjkim 1440238405Sjkim mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] 1441238405Sjkim mov 32($tptr,$i,8),@ri[0] # load overflow bit 1442238405Sjkim sbb 16($nptr,$i,8),@ri[2] 1443238405Sjkim mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] 1444238405Sjkim sbb 24($nptr,$i,8),@ri[3] 1445238405Sjkim mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] 1446238405Sjkim 1447238405Sjkim sbb \$0,@ri[0] # handle upmost overflow bit 1448238405Sjkim mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] 1449238405Sjkim xor $i,$i # i=0 1450238405Sjkim and @ri[0],$tptr 1451238405Sjkim not @ri[0] 1452238405Sjkim mov $rptr,$nptr 1453238405Sjkim and @ri[0],$nptr 1454238405Sjkim lea -1($num),$j 1455238405Sjkim or $nptr,$tptr # tp=borrow?tp:rp 1456238405Sjkim 1457238405Sjkim pxor %xmm0,%xmm0 1458238405Sjkim lea 64(%rsp,$num,8),$nptr 1459238405Sjkim movdqu ($tptr),%xmm1 1460238405Sjkim lea ($nptr,$num,8),$nptr 1461238405Sjkim movdqa %xmm0,64(%rsp) # zap lower half of temporary vector 1462238405Sjkim movdqa %xmm0,($nptr) # zap upper half of temporary vector 1463238405Sjkim movdqu %xmm1,($rptr) 1464238405Sjkim jmp .Lsqr4x_copy 1465238405Sjkim.align 16 1466238405Sjkim.Lsqr4x_copy: # copy or in-place refresh 1467238405Sjkim movdqu 16($tptr,$i),%xmm2 1468238405Sjkim movdqu 32($tptr,$i),%xmm1 1469238405Sjkim movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector 1470238405Sjkim movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector 1471238405Sjkim movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector 1472238405Sjkim movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector 1473238405Sjkim movdqu %xmm2,16($rptr,$i) 1474238405Sjkim movdqu %xmm1,32($rptr,$i) 1475238405Sjkim lea 32($i),$i 1476238405Sjkim dec $j 1477238405Sjkim jnz .Lsqr4x_copy 1478238405Sjkim 1479238405Sjkim movdqu 16($tptr,$i),%xmm2 1480238405Sjkim movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector 1481238405Sjkim movdqa %xmm0,16($nptr,$i) # zap upper half of temporary 
vector 1482238405Sjkim movdqu %xmm2,16($rptr,$i) 1483238405Sjkim___ 1484238405Sjkim} 1485238405Sjkim$code.=<<___; 1486238405Sjkim mov 56(%rsp),%rsi # restore %rsp 1487238405Sjkim mov \$1,%rax 1488238405Sjkim mov 0(%rsi),%r15 1489238405Sjkim mov 8(%rsi),%r14 1490238405Sjkim mov 16(%rsi),%r13 1491238405Sjkim mov 24(%rsi),%r12 1492238405Sjkim mov 32(%rsi),%rbp 1493238405Sjkim mov 40(%rsi),%rbx 1494238405Sjkim lea 48(%rsi),%rsp 1495238405Sjkim.Lsqr4x_epilogue: 1496238405Sjkim ret 1497238405Sjkim.size bn_sqr4x_mont,.-bn_sqr4x_mont 1498238405Sjkim___ 1499238405Sjkim}}} 1500238405Sjkim$code.=<<___; 1501238405Sjkim.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1502238405Sjkim.align 16 1503238405Sjkim___ 1504238405Sjkim 1505238405Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1506238405Sjkim# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1507238405Sjkimif ($win64) { 1508238405Sjkim$rec="%rcx"; 1509238405Sjkim$frame="%rdx"; 1510238405Sjkim$context="%r8"; 1511238405Sjkim$disp="%r9"; 1512238405Sjkim 1513238405Sjkim$code.=<<___; 1514238405Sjkim.extern __imp_RtlVirtualUnwind 1515238405Sjkim.type mul_handler,\@abi-omnipotent 1516238405Sjkim.align 16 1517238405Sjkimmul_handler: 1518238405Sjkim push %rsi 1519238405Sjkim push %rdi 1520238405Sjkim push %rbx 1521238405Sjkim push %rbp 1522238405Sjkim push %r12 1523238405Sjkim push %r13 1524238405Sjkim push %r14 1525238405Sjkim push %r15 1526238405Sjkim pushfq 1527238405Sjkim sub \$64,%rsp 1528238405Sjkim 1529238405Sjkim mov 120($context),%rax # pull context->Rax 1530238405Sjkim mov 248($context),%rbx # pull context->Rip 1531238405Sjkim 1532238405Sjkim mov 8($disp),%rsi # disp->ImageBase 1533238405Sjkim mov 56($disp),%r11 # disp->HandlerData 1534238405Sjkim 1535238405Sjkim mov 0(%r11),%r10d # HandlerData[0] 1536238405Sjkim lea (%rsi,%r10),%r10 # end of prologue label 1537238405Sjkim cmp %r10,%rbx # context->Rip<end of prologue label 1538238405Sjkim jb .Lcommon_seh_tail 
1539238405Sjkim 1540238405Sjkim mov 152($context),%rax # pull context->Rsp 1541238405Sjkim 1542238405Sjkim mov 4(%r11),%r10d # HandlerData[1] 1543238405Sjkim lea (%rsi,%r10),%r10 # epilogue label 1544238405Sjkim cmp %r10,%rbx # context->Rip>=epilogue label 1545238405Sjkim jae .Lcommon_seh_tail 1546238405Sjkim 1547238405Sjkim mov 192($context),%r10 # pull $num 1548238405Sjkim mov 8(%rax,%r10,8),%rax # pull saved stack pointer 1549238405Sjkim lea 48(%rax),%rax 1550238405Sjkim 1551238405Sjkim mov -8(%rax),%rbx 1552238405Sjkim mov -16(%rax),%rbp 1553238405Sjkim mov -24(%rax),%r12 1554238405Sjkim mov -32(%rax),%r13 1555238405Sjkim mov -40(%rax),%r14 1556238405Sjkim mov -48(%rax),%r15 1557238405Sjkim mov %rbx,144($context) # restore context->Rbx 1558238405Sjkim mov %rbp,160($context) # restore context->Rbp 1559238405Sjkim mov %r12,216($context) # restore context->R12 1560238405Sjkim mov %r13,224($context) # restore context->R13 1561238405Sjkim mov %r14,232($context) # restore context->R14 1562238405Sjkim mov %r15,240($context) # restore context->R15 1563238405Sjkim 1564238405Sjkim jmp .Lcommon_seh_tail 1565238405Sjkim.size mul_handler,.-mul_handler 1566238405Sjkim 1567238405Sjkim.type sqr_handler,\@abi-omnipotent 1568238405Sjkim.align 16 1569238405Sjkimsqr_handler: 1570238405Sjkim push %rsi 1571238405Sjkim push %rdi 1572238405Sjkim push %rbx 1573238405Sjkim push %rbp 1574238405Sjkim push %r12 1575238405Sjkim push %r13 1576238405Sjkim push %r14 1577238405Sjkim push %r15 1578238405Sjkim pushfq 1579238405Sjkim sub \$64,%rsp 1580238405Sjkim 1581238405Sjkim mov 120($context),%rax # pull context->Rax 1582238405Sjkim mov 248($context),%rbx # pull context->Rip 1583238405Sjkim 1584238405Sjkim lea .Lsqr4x_body(%rip),%r10 1585238405Sjkim cmp %r10,%rbx # context->Rip<.Lsqr_body 1586238405Sjkim jb .Lcommon_seh_tail 1587238405Sjkim 1588238405Sjkim mov 152($context),%rax # pull context->Rsp 1589238405Sjkim 1590238405Sjkim lea .Lsqr4x_epilogue(%rip),%r10 1591238405Sjkim cmp %r10,%rbx # 
context->Rip>=.Lsqr_epilogue 1592238405Sjkim jae .Lcommon_seh_tail 1593238405Sjkim 1594238405Sjkim mov 56(%rax),%rax # pull saved stack pointer 1595238405Sjkim lea 48(%rax),%rax 1596238405Sjkim 1597238405Sjkim mov -8(%rax),%rbx 1598238405Sjkim mov -16(%rax),%rbp 1599238405Sjkim mov -24(%rax),%r12 1600238405Sjkim mov -32(%rax),%r13 1601238405Sjkim mov -40(%rax),%r14 1602238405Sjkim mov -48(%rax),%r15 1603238405Sjkim mov %rbx,144($context) # restore context->Rbx 1604238405Sjkim mov %rbp,160($context) # restore context->Rbp 1605238405Sjkim mov %r12,216($context) # restore context->R12 1606238405Sjkim mov %r13,224($context) # restore context->R13 1607238405Sjkim mov %r14,232($context) # restore context->R14 1608238405Sjkim mov %r15,240($context) # restore context->R15 1609238405Sjkim 1610238405Sjkim.Lcommon_seh_tail: 1611238405Sjkim mov 8(%rax),%rdi 1612238405Sjkim mov 16(%rax),%rsi 1613238405Sjkim mov %rax,152($context) # restore context->Rsp 1614238405Sjkim mov %rsi,168($context) # restore context->Rsi 1615238405Sjkim mov %rdi,176($context) # restore context->Rdi 1616238405Sjkim 1617238405Sjkim mov 40($disp),%rdi # disp->ContextRecord 1618238405Sjkim mov $context,%rsi # context 1619238405Sjkim mov \$154,%ecx # sizeof(CONTEXT) 1620238405Sjkim .long 0xa548f3fc # cld; rep movsq 1621238405Sjkim 1622238405Sjkim mov $disp,%rsi 1623238405Sjkim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1624238405Sjkim mov 8(%rsi),%rdx # arg2, disp->ImageBase 1625238405Sjkim mov 0(%rsi),%r8 # arg3, disp->ControlPc 1626238405Sjkim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1627238405Sjkim mov 40(%rsi),%r10 # disp->ContextRecord 1628238405Sjkim lea 56(%rsi),%r11 # &disp->HandlerData 1629238405Sjkim lea 24(%rsi),%r12 # &disp->EstablisherFrame 1630238405Sjkim mov %r10,32(%rsp) # arg5 1631238405Sjkim mov %r11,40(%rsp) # arg6 1632238405Sjkim mov %r12,48(%rsp) # arg7 1633238405Sjkim mov %rcx,56(%rsp) # arg8, (NULL) 1634238405Sjkim call *__imp_RtlVirtualUnwind(%rip) 1635238405Sjkim 1636238405Sjkim 
mov \$1,%eax # ExceptionContinueSearch 1637238405Sjkim add \$64,%rsp 1638238405Sjkim popfq 1639183234Ssimon pop %r15 1640183234Ssimon pop %r14 1641183234Ssimon pop %r13 1642183234Ssimon pop %r12 1643183234Ssimon pop %rbp 1644183234Ssimon pop %rbx 1645238405Sjkim pop %rdi 1646238405Sjkim pop %rsi 1647183234Ssimon ret 1648238405Sjkim.size sqr_handler,.-sqr_handler 1649238405Sjkim 1650238405Sjkim.section .pdata 1651238405Sjkim.align 4 1652238405Sjkim .rva .LSEH_begin_bn_mul_mont 1653238405Sjkim .rva .LSEH_end_bn_mul_mont 1654238405Sjkim .rva .LSEH_info_bn_mul_mont 1655238405Sjkim 1656238405Sjkim .rva .LSEH_begin_bn_mul4x_mont 1657238405Sjkim .rva .LSEH_end_bn_mul4x_mont 1658238405Sjkim .rva .LSEH_info_bn_mul4x_mont 1659238405Sjkim 1660238405Sjkim .rva .LSEH_begin_bn_sqr4x_mont 1661238405Sjkim .rva .LSEH_end_bn_sqr4x_mont 1662238405Sjkim .rva .LSEH_info_bn_sqr4x_mont 1663238405Sjkim 1664238405Sjkim.section .xdata 1665238405Sjkim.align 8 1666238405Sjkim.LSEH_info_bn_mul_mont: 1667238405Sjkim .byte 9,0,0,0 1668238405Sjkim .rva mul_handler 1669238405Sjkim .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 1670238405Sjkim.LSEH_info_bn_mul4x_mont: 1671238405Sjkim .byte 9,0,0,0 1672238405Sjkim .rva mul_handler 1673238405Sjkim .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 1674238405Sjkim.LSEH_info_bn_sqr4x_mont: 1675238405Sjkim .byte 9,0,0,0 1676238405Sjkim .rva sqr_handler 1677183234Ssimon___ 1678238405Sjkim} 1679183234Ssimon 1680183234Ssimonprint $code; 1681183234Ssimonclose STDOUT; 1682