#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for MIPS.

# Performance improvement is 30% on unaligned input. The "secret" is
# to deploy lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
# compatible subroutine. There is room for minor optimization on
# little-endian platforms...

# September 2012.
#
# Add MIPS32r2 code (>25% less instructions).

use strict;
use warnings;

######################################################################
# There are a number of MIPS ABIs in use, O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
my ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
my ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
my $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

# Pointer-sized instruction mnemonics and register width for the chosen
# flavour. The non-trapping "unsigned" forms are used for all pointer
# arithmetic: plain add/sub/dadd/dsub raise an Integer Overflow exception
# on signed overflow, which is meaningless for addresses.
my ($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$PTR_SLL,$SZREG);
if ($flavour =~ /64|n32/i) {
    $PTR_ADD="daddu";   # incidentally works even on n32
    $PTR_SUB="dsubu";   # incidentally works even on n32
    $REG_S="sd";
    $REG_L="ld";
    $PTR_SLL="dsll";    # incidentally works even on n32
    $SZREG=8;
}
else {
    $PTR_ADD="addu";
    $PTR_SUB="subu";
    $REG_S="sw";
    $REG_L="lw";
    $PTR_SLL="sll";
    $SZREG=4;
}
#
# <appro@openssl.org>
#
######################################################################

# Endianness of the target. When CC is set, its preprocessor is asked
# whether the token MIPSEL survives preprocessing (presumably
# little-endian toolchains predefine MIPSEL, so the token is expanded
# away there - confirm against the toolchains in use). $ENV{CC} is
# interpolated into a shell command, so it must come from a trusted
# build environment.
# NB: declared separately on purpose; "my $x = ... if ..." is undefined
# behaviour in Perl.
my $big_endian;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});

# The output file is the last remaining argument that looks like
# "name.ext"; without one the code goes to the inherited STDOUT.
my $output;
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# No compiler to ask: fall back to the endianness of the build host.
if (!defined($big_endian)) {
    $big_endian=(unpack('L',pack('N',1))==1);
}

# offsets of the Most and Least Significant Bytes
my $MSB=$big_endian?0:3;
my $LSB=3&~$MSB;

my @X=map("\$$_",(8..23));      # a4-a7,s0-s11

my $ctx=$a0;
my $inp=$a1;
my $num=$a2;
my $A="\$1";
my $B="\$2";
my $C="\$3";
my $D="\$7";
my $E="\$24";
my @V=($A,$B,$C,$D,$E);
$t0="\$25";
$t1=$num;       # $num is offloaded to stack
$t2="\$30";     # fp
my $K="\$31";   # ra

# Accumulates the generated assembly; filled in by the BODY_* generators
# and the top-level templates below, printed at the very end.
my $code;

# BODY_00_14($i,$a,$b,$c,$d,$e) - append round $i (0..14) to $code.
#
# On little-endian targets X[$i] is byte-swapped first. The round then
# preloads the next input word X[$i+1] with an lwl/lwr pair and computes
#   e += K + rol(a,5) + X[i] + (((c^d)&b)^d);  b = rol(b,30)
# Both a MIPS32R2/MIPS64R2 variant (wsbh/rotr) and a generic variant are
# emitted, selected by the C preprocessor at assembly time.
sub BODY_00_14 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    $code.=<<___ if (!$big_endian);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	wsbh	$X[$i],$X[$i]	# byte swap($i)
	rotr	$X[$i],$X[$i],16
#else
	srl	$t0,$X[$i],24	# byte swap($i)
	srl	$t1,$X[$i],8
	andi	$t2,$X[$i],0xFF00
	sll	$X[$i],$X[$i],24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	$X[$i],$t0
	or	$t1,$t2
	or	$X[$i],$t1
#endif
___
    $code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	addu	$e,$K		# $i
	xor	$t0,$c,$d
	rotr	$t1,$a,27
	lwl	$X[$j],$j*4+$MSB($inp)
	and	$t0,$b
	addu	$e,$t1
	lwr	$X[$j],$j*4+$LSB($inp)
	xor	$t0,$d
	addu	$e,$X[$i]
	rotr	$b,$b,2
	addu	$e,$t0
#else
	lwl	$X[$j],$j*4+$MSB($inp)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	lwr	$X[$j],$j*4+$LSB($inp)
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$t0,$c,$d
	addu	$e,$t1
	sll	$t2,$b,30
	and	$t0,$b
	srl	$b,$b,2
	xor	$t0,$d
	addu	$e,$X[$i]
	or	$b,$t2
	addu	$e,$t0
#endif
___
    return;
}

# BODY_15_19($i,$a,$b,$c,$d,$e) - append round $i (15..19) to $code.
#
# Same round function as BODY_00_14, but instead of loading input the
# round updates the schedule in place:
#   X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16)
# with j = $i+1. Only round 15 still has a freshly loaded word to
# byte-swap on little-endian targets.
sub BODY_15_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    $code.=<<___ if (!$big_endian && $i==15);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	wsbh	$X[$i],$X[$i]	# byte swap($i)
	rotr	$X[$i],$X[$i],16
#else
	srl	$t0,$X[$i],24	# byte swap($i)
	srl	$t1,$X[$i],8
	andi	$t2,$X[$i],0xFF00
	sll	$X[$i],$X[$i],24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	$X[$i],$t0
	or	$X[$i],$t1
	or	$X[$i],$t2
#endif
___
    $code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	addu	$e,$K		# $i
	xor	$X[$j%16],$X[($j+2)%16]
	xor	$t0,$c,$d
	rotr	$t1,$a,27
	xor	$X[$j%16],$X[($j+8)%16]
	and	$t0,$b
	addu	$e,$t1
	xor	$X[$j%16],$X[($j+13)%16]
	xor	$t0,$d
	addu	$e,$X[$i%16]
	rotr	$X[$j%16],$X[$j%16],31
	rotr	$b,$b,2
	addu	$e,$t0
#else
	xor	$X[$j%16],$X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$X[$j%16],$X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	xor	$X[$j%16],$X[($j+13)%16]
	sll	$t2,$b,30
	and	$t0,$b
	srl	$t1,$X[$j%16],31
	addu	$X[$j%16],$X[$j%16]
	srl	$b,$b,2
	xor	$t0,$d
	or	$X[$j%16],$t1
	addu	$e,$X[$i%16]
	or	$b,$t2
	addu	$e,$t0
#endif
___
    return;
}

# BODY_20_39($i,$a,$b,$c,$d,$e) - append a "parity" round to $code.
#
# Used for rounds 20..39 and again for 60..79 (the caller switches K).
#   e += K + rol(a,5) + X[i] + (b^c^d);  b = rol(b,30)
# Rounds before 79 also perform the schedule update for X[$i+1]; the
# final round 79 has nothing left to schedule and instead loads the five
# context words into X[0..4] for the closing additions.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    $code.=<<___ if ($i<79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	xor	$X[$j%16],$X[($j+2)%16]
	addu	$e,$K		# $i
	rotr	$t1,$a,27
	xor	$X[$j%16],$X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	xor	$X[$j%16],$X[($j+13)%16]
	xor	$t0,$b
	addu	$e,$X[$i%16]
	rotr	$X[$j%16],$X[$j%16],31
	rotr	$b,$b,2
	addu	$e,$t0
#else
	xor	$X[$j%16],$X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$X[$j%16],$X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	xor	$X[$j%16],$X[($j+13)%16]
	sll	$t2,$b,30
	xor	$t0,$b
	srl	$t1,$X[$j%16],31
	addu	$X[$j%16],$X[$j%16]
	srl	$b,$b,2
	addu	$e,$X[$i%16]
	or	$X[$j%16],$t1
	or	$b,$t2
	addu	$e,$t0
#endif
___
    $code.=<<___ if ($i==79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	lw	$X[0],0($ctx)
	addu	$e,$K		# $i
	lw	$X[1],4($ctx)
	rotr	$t1,$a,27
	lw	$X[2],8($ctx)
	xor	$t0,$c,$d
	addu	$e,$t1
	lw	$X[3],12($ctx)
	xor	$t0,$b
	addu	$e,$X[$i%16]
	lw	$X[4],16($ctx)
	rotr	$b,$b,2
	addu	$e,$t0
#else
	lw	$X[0],0($ctx)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	lw	$X[1],4($ctx)
	srl	$t1,$a,27
	addu	$e,$t0
	lw	$X[2],8($ctx)
	xor	$t0,$c,$d
	addu	$e,$t1
	lw	$X[3],12($ctx)
	sll	$t2,$b,30
	xor	$t0,$b
	lw	$X[4],16($ctx)
	srl	$b,$b,2
	addu	$e,$X[$i%16]
	or	$b,$t2
	addu	$e,$t0
#endif
___
    return;
}

# BODY_40_59($i,$a,$b,$c,$d,$e) - append round $i (40..59) to $code.
#
#   e += K + rol(a,5) + X[i] + (c&d) + ((c^d)&b);  b = rol(b,30)
# i.e. the majority function is accumulated as two separate addends.
# The schedule update for X[$i+1] is interleaved as in BODY_15_19.
# The "$i<79" guard is always true for the rounds this is called with.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    $code.=<<___ if ($i<79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
	addu	$e,$K		# $i
	and	$t0,$c,$d
	xor	$X[$j%16],$X[($j+2)%16]
	rotr	$t1,$a,27
	addu	$e,$t0
	xor	$X[$j%16],$X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	xor	$X[$j%16],$X[($j+13)%16]
	and	$t0,$b
	addu	$e,$X[$i%16]
	rotr	$X[$j%16],$X[$j%16],31
	rotr	$b,$b,2
	addu	$e,$t0
#else
	xor	$X[$j%16],$X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$X[$j%16],$X[($j+8)%16]
	and	$t0,$c,$d
	addu	$e,$t1
	xor	$X[$j%16],$X[($j+13)%16]
	sll	$t2,$b,30
	addu	$e,$t0
	srl	$t1,$X[$j%16],31
	xor	$t0,$c,$d
	addu	$X[$j%16],$X[$j%16]
	and	$t0,$b
	srl	$b,$b,2
	or	$X[$j%16],$t1
	addu	$e,$X[$i%16]
	or	$b,$t2
	addu	$e,$t0
#endif
___
    return;
}

my $FRAMESIZE=16;       # large enough to accommodate NUBI saved registers
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;

# Prologue: allocate the frame and save the callee-saved registers that
# the round code clobbers.
$code=<<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif

#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
#define _MIPS_ARCH_MIPS32R2
#endif

.text

.set	noat
.set	noreorder
.align	5
.globl	sha1_block_data_order
.ent	sha1_block_data_order
sha1_block_data_order:
	.frame	$sp,$FRAMESIZE*$SZREG,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$ra,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$fp,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-9)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	$s3,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-13)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-14)*$SZREG($sp)
	$REG_S	$gp,($FRAMESIZE-15)*$SZREG($sp)
___

# Turn the block count into an end-of-input pointer (inp + num*64), park
# it on the stack because its register doubles as a temporary, then load
# the five state words and enter the loop.
$code.=<<___;
	$PTR_SLL $num,6
	$PTR_ADD $num,$inp
	$REG_S	$num,0($sp)
	lw	$A,0($ctx)
	lw	$B,4($ctx)
	lw	$C,8($ctx)
	lw	$D,12($ctx)
	b	.Loop
	lw	$E,16($ctx)
.align	4
.Loop:
	.set	reorder
	lwl	$X[0],$MSB($inp)
	lui	$K,0x5a82
	lwr	$X[0],$LSB($inp)
	ori	$K,0x7999	# K_00_19
___

# Emit the 80 rounds. After every round the working-variable list is
# rotated so that (a,b,c,d,e) becomes (e,a,b,c,d) for the next one.
for my $i (0..14) {
    BODY_00_14($i,@V);
    unshift(@V,pop(@V));
}
for my $i (15..19) {
    BODY_15_19($i,@V);
    unshift(@V,pop(@V));
}
$code.=<<___;
	lui	$K,0x6ed9
	ori	$K,0xeba1	# K_20_39
___
for my $i (20..39) {
    BODY_20_39($i,@V);
    unshift(@V,pop(@V));
}
$code.=<<___;
	lui	$K,0x8f1b
	ori	$K,0xbcdc	# K_40_59
___
for my $i (40..59) {
    BODY_40_59($i,@V);
    unshift(@V,pop(@V));
}
$code.=<<___;
	lui	$K,0xca62
	ori	$K,0xc1d6	# K_60_79
___
for my $i (60..79) {
    BODY_20_39($i,@V);
    unshift(@V,pop(@V));
}

# Loop tail: advance the input pointer, fold the working variables into
# the context (old values were loaded into X[0..4] by round 79), and
# repeat until the end pointer is reached. Then restore registers.
$code.=<<___;
	$PTR_ADD $inp,64
	$REG_L	$num,0($sp)

	addu	$A,$X[0]
	addu	$B,$X[1]
	sw	$A,0($ctx)
	addu	$C,$X[2]
	addu	$D,$X[3]
	sw	$B,4($ctx)
	addu	$E,$X[4]
	sw	$C,8($ctx)
	sw	$D,12($ctx)
	sw	$E,16($ctx)
	.set	noreorder
	bne	$inp,$num,.Loop
	nop

	.set	noreorder
	$REG_L	$ra,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$fp,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-9)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-13)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-14)*$SZREG($sp)
	$REG_L	$gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	sha1_block_data_order
.rdata
.asciiz	"SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
# Buffered write errors only surface at close; do not let a truncated
# assembly file pass as success.
close STDOUT or die "error closing STDOUT: $!";