#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x, not only because it doesn't support the new ABIs, but also
# because 5.x kernels put the R4x00 CPU into 32-bit mode and all those
# 64-bit instructions (daddu, dmultu, etc.) found below would only
# cause an illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by the
# MIPSpro compiler driver (either as or cc) and therefore (probably?)
# can't be compiled by the GNU assembler. The GNU C driver manages fine
# though... I mean as long as -mmips-as is specified or is the default
# option, because then it simply invokes /usr/bin/as, which in turn
# takes perfect care of the preprocessor definitions. Another neat
# feature offered by the MIPSpro assembler is an optimization pass.
# This gave me the opportunity to have the code look more regular, as
# all those architecture-dependent instruction rescheduling details
# were left to the assembler. Cool, huh?
#
# The performance improvement is astonishing! 'apps/openssl speed rsa
# dsa' runs well over 3 times faster!
#
# <appro@fy.chalmers.se>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and by
# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction a 32-bit compiler can't use]; one
# has to be content with a 40-85% improvement depending on benchmark
# and key length, more for longer keys.

$flavour = shift || "o32";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set	mips2\n";
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).
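#
# For orientation [a reference sketch only, nothing in this comment is
# emitted], the word-level primitive that the routines below spell out
# with $MULTU/mflo/mfhi plus sltu-based carry propagation is, in rough
# C terms, the mul_add step of crypto/bn:
#
#	t     = (double-width)a[i]*w + r[i] + carry;
#	r[i]  = low_half(t);
#	carry = high_half(t);
#
# with BN_ULONG words of $BNSZ bytes each.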

$gp=$v1 if ($flavour =~ /nubi/i);

$minus4=$v1;

$code.=<<___;
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	$MULTU	$t2,$a3
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	$at
	mfhi	$t2
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	$MULTU	$ta0,$a3
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	$at
	mfhi	$ta0
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	$MULTU	$ta2,$a3
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	$at
	mfhi	$ta2
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
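	# bn_mul_words(rp,ap,num,w): for each of the num words,
	# rp[i] = lo(ap[i]*w + carry), carry = hi(ap[i]*w + carry),
	# and the final carry is the return value.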
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$MULTU	$t2,$a3
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$at
	mfhi	$t2
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	$MULTU	$ta0,$a3
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	$at
	mfhi	$ta0
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	$MULTU	$ta2,$a3
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	$at
	mfhi	$ta2
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	$t2,$t2
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$t3
	mfhi	$t2
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

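	# squares of the third and fourth words of the unrolled group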
	$MULTU	$ta0,$ta0
	mflo	$ta1
	mfhi	$ta0
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	$ta2,$ta2
	and	$ta0,$a2,$minus4
	mflo	$ta3
	mfhi	$ta2
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$t0
	mflo	$t1
	mfhi	$t0
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

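	# at most three tail words exist, so after the third one simply
	# fall through to the common return path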
.L_bn_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

.align	5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
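	# leading dividend word equal to the divisor word: the quotient
	# estimate saturates, so the all-ones value loaded above is returned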
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	$ta2,$v0
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	$t1
	mflo	$t0
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div1:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
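	# second round of the schoolbook division: estimate the low half
	# of the quotient exactly as the high half was estimated above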
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div2:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9		# restore $a2

	.set	noreorder
	move	$a1,$v1
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
undef $HH; undef $QT; undef $DH;

($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1);	# once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2);	# once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);

$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				# <appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0	# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1	# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0	# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0	# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1	# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2	# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3	# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2	# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1	# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0	# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_0	# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1	# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2	# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3	# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4	# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_5	# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4	# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3	# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2	# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1	# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0	# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_0	# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1	# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2	# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3	# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4	# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5	# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6	# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_7	# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6	# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5	# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4	# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3	# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2	# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1	# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0	# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_1	# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2	# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3	# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4	# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5	# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6	# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7	# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_7	# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6	# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5	# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4	# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3	# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2	# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_3	# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4	# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5	# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6	# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7	# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7	# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6	# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5	# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4	# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_5	# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6	# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7	# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_7	# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6	# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7	# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0	# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1	# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0	# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0	# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1	# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2	# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3	# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2	# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1	# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0	# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_1	# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2	# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3	# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_3	# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2	# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_3	# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,	# !$warm denotes first call with specific sequence of
		# $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn	# these two are arguments for multiplication whose
		# result is used in the *next* step [which is why it's
		# commented as "forward multiplication" below];
    )=@_;
$code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	$an,$bn		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
$code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
}

$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0	# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1	# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0	# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3	# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$a_5	# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$a_7	# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$a_7	# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$a_7	# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$a_7	# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0	# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1	# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0	# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3	# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$a_3	# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
___
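# add_c2 below folds the pending a[2]*a[3] product into the accumulator
# twice (cross products are doubled when squaring) and issues the
# forward a[3]*a[3] multiplication named in its last two arguments.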
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
print $code;
close STDOUT;