#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.

# This script takes no arguments. It interpolates the register names
# below into an assembly template and writes the result to STDOUT.
#
# Behaviour of the emitted bn_mul_mont routine, as visible in the
# template:
#   - returns 0 in v0 without doing any work when num < 4;
#   - otherwise returns 1 in v0 after storing num 64-bit words at rp;
#   - saves/restores s0-s7 in a 64-byte area addressed through $fp;
#   - carves num*8+16 bytes of scratch below that area and rounds sp
#     down to a 4096-byte boundary;
#   - n0 is passed by pointer and dereferenced once on entry;
#   - the final copy loop (.Lcopy) overwrites the scratch words with
#     zero while copying the result out.

use strict;
use warnings;

# int bn_mul_mont(
my $rp  = "a0";	# BN_ULONG *rp,
my $ap  = "a1";	# const BN_ULONG *ap,
my $bp  = "a2";	# const BN_ULONG *bp,
my $np  = "a3";	# const BN_ULONG *np,
my $n0  = "a4";	# const BN_ULONG *n0,
my $num = "a5";	# int num);

# Working registers. s7 and AT are used directly by name in the
# template as carry/condition temporaries.
my $lo0 = "a6";
my $hi0 = "a7";
my $lo1 = "v0";
my $hi1 = "v1";
my $aj  = "t0";
my $bi  = "t1";
my $nj  = "t2";
my $tp  = "t3";
my $alo = "s0";
my $ahi = "s1";
my $nlo = "s2";
my $nhi = "s3";
my $tj  = "s4";
my $i   = "s5";
my $j   = "s6";
my $fp  = "t8";
my $m1  = "t9";

# NOTE: $FRAME is not referenced by the template below, which
# hard-codes a 64-byte save area (8 registers * 8 bytes). It is kept
# only for fidelity with the original script; do not substitute it for
# the literal 64 without auditing the prologue/epilogue.
my $FRAME = 8*(2+8);

# The template is an interpolating heredoc: every $name below expands
# to one of the register strings above, and '@' must stay escaped.
# Do not add '#'-style notes inside it casually: the output carries
# #include lines, i.e. it is meant to be run through the C
# preprocessor.
my $code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	reorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
	.set	noreorder
	PTR_SUB	sp,64
	move	$fp,sp
	.frame	$fp,64,ra
	slt	AT,$num,4
	li	v0,0
	beqzl	AT,.Lproceed
	nop
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
.align	5
.Lproceed:
	ld	$n0,0($n0)
	ld	$bi,0($bp)	# bp[0]
	ld	$aj,0($ap)	# ap[0]
	ld	$nj,0($np)	# np[0]
	PTR_SUB	sp,16		# place for two extra words
	sll	$num,3
	li	AT,-4096
	PTR_SUB	sp,$num
	and	sp,AT

	sd	s0,0($fp)
	sd	s1,8($fp)
	sd	s2,16($fp)
	sd	s3,24($fp)
	sd	s4,32($fp)
	sd	s5,40($fp)
	sd	s6,48($fp)
	sd	s7,56($fp)

	dmultu	$aj,$bi
	ld	$alo,8($ap)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	dmultu	$lo0,$n0
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
.align	4
.L1st:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	dmultu	$nj,$m1
	daddu	$hi1,AT
	addu	$j,8
	sd	$lo1,($tp)
	sltu	s7,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	s7,.L1st
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT

	daddu	$lo1,$nlo,$hi1
	sltu	s7,$lo1,$hi1
	daddu	$hi1,$nhi,s7
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT

	sd	$lo1,($tp)

	daddu	$hi1,$hi0
	sltu	AT,$hi1,$hi0
	sd	$hi1,8($tp)
	sd	AT,16($tp)

	li	$i,8
.align	4
.Louter:
	PTR_ADD	$bi,$bp,$i
	ld	$bi,($bi)
	ld	$aj,($ap)
	ld	$alo,8($ap)
	ld	$tj,(sp)

	dmultu	$aj,$bi
	ld	$nj,($np)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	daddu	$lo0,$tj
	dmultu	$lo0,$n0
	sltu	AT,$lo0,$tj
	daddu	$hi0,AT
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
	ld	$tj,8($tp)
.align	4
.Linner:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo0,$tj
	addu	$j,8
	dmultu	$nj,$m1
	sltu	AT,$lo0,$tj
	daddu	$lo1,$lo0
	daddu	$hi0,AT
	sltu	s7,$lo1,$lo0
	ld	$tj,16($tp)
	daddu	$hi1,s7
	sltu	AT,$j,$num
	mflo	$nlo
	mfhi	$nhi
	sd	$lo1,($tp)
	bnez	AT,.Linner
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT
	daddu	$lo0,$tj
	sltu	s7,$lo0,$tj
	daddu	$hi0,s7

	ld	$tj,16($tp)
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo1,$hi1
	daddu	$hi1,$nhi,AT
	daddu	$lo1,$lo0
	sltu	s7,$lo1,$lo0
	daddu	$hi1,s7
	sd	$lo1,($tp)

	daddu	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	daddu	$lo1,$tj
	sltu	AT,$lo1,$tj
	daddu	$hi1,AT
	sd	$lo1,8($tp)
	sd	$hi1,16($tp)

	addu	$i,8
	sltu	s7,$i,$num
	bnez	s7,.Louter

	.set	noreorder
	PTR_ADD	$tj,sp,$num	# &tp[num]
	move	$tp,sp
	move	$ap,sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	ld	$lo0,($tp)
	ld	$lo1,($np)
	PTR_ADD	$tp,8
	PTR_ADD	$np,8
	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	AT,$lo1,$lo0
	dsubu	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	sd	$lo0,($rp)
	or	$hi0,AT
	sltu	AT,$tp,$tj
	bnez	AT,.Lsub
	PTR_ADD	$rp,8

	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,sp
	PTR_SUB	$rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	ld	$aj,($ap)
	PTR_ADD	$ap,8
	PTR_ADD	$tp,8
	sd	zero,-8($tp)
	sltu	AT,$tp,$tj
	sd	$aj,($rp)
	bnez	AT,.Lcopy
	PTR_ADD	$rp,8

	ld	s0,0($fp)
	ld	s1,8($fp)
	ld	s2,16($fp)
	ld	s3,24($fp)
	ld	s4,32($fp)
	ld	s5,40($fp)
	ld	s6,48($fp)
	ld	s7,56($fp)
	li	v0,1
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
END(bn_mul_mont)
.rdata
.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___

# Output is buffered, so a short write (full disk, closed pipe) may
# only be reported when the handle is closed. Fail loudly in either
# case so the build never consumes a truncated assembly file.
print $code or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $!";