#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank.
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

#			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

# First argument selects the ABI flavour, e.g. "linux32", "linux64",
# "linux64le", "aix64"; it drives pointer size and mnemonic choice.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;			# sizeof(pointer)
	$LRSAVE	=2*$SIZE_T;		# link-register save slot in caller frame
	$UCMP	="cmpld";		# unsigned compare, doubleword
	$STU	="stdu";		# store-with-update (frame allocation)
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianess based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate the perlasm translator either next to this script or in the
# shared perlasm/ directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe the generated code through the translator.  Low-precedence "or"
# is required here: with "|| die" the test would bind to the argument
# string (always true) and a failed open would go undetected.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T+64;	# stack frame: saved GPRs + local scratch area
$LOCALS=6*$SIZE_T;	# offset of 64-byte bounce buffer within the frame

# Register assignment.
$K  ="r0";	# current round constant
$sp ="r1";
$toc="r2";
$ctx="r3";	# SHA_CTX *ctx   (1st C argument)
$inp="r4";	# const void *p  (2nd C argument)
$num="r5";	# block count    (3rd C argument)
$t0 ="r15";	# scratch
$t1 ="r6";	# scratch

$A ="r7";	# working variables a..e plus rotating spare T
$B ="r8";
$C ="r9";
$D ="r10";
$E ="r11";
$T ="r12";

@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

# Emit a big-endian 32-bit load of $src into $dst.  SHA-1 consumes the
# message big-endian, so on little-endian flavours the word is loaded
# into $temp_reg and byte-swapped with a rotlwi/rlwimi sequence.
sub loadbe {
my ($dst, $src, $temp_reg) = @_;
$code.=<<___ if (!$LITTLE_ENDIAN);
	lwz	$dst,$src
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$temp_reg,$src
	rotlwi	$dst,$temp_reg,8
	rlwimi	$dst,$temp_reg,24,0,7
	rlwimi	$dst,$temp_reg,24,16,23
___
}

# Rounds 0..19: F(b,c,d) = (b&c)|(~b&d), K = 0x5a827999.  The round
# computation is interleaved with the message schedule: input loads
# for i<15, W expansion (three xors + rotl 1) for i>=15.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;

	# Since the last value of $f is discarded, we can use
	# it as a temp reg to swap byte-order when needed.
	loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
	loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
$code.=<<___ if ($i<15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

# Rounds 20..39 and 60..79: F(b,c,d) = b^c^d.  The $i==79 variant
# interleaves loads of the five context words (into r16..r20) that the
# caller folds into the working state right after the last round.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	xor	$t0,$b,$d
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$t0,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$t0
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$e
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	xor	$t0,$b,$d
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$t0,$c
	lwz	r17,4($ctx)
	add	$f,$f,$t0
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	lwz	r19,12($ctx)
	add	$f,$f,$e
	lwz	r20,16($ctx)
___
}

# Rounds 40..59: majority F(b,c,d) = (b&c)|(b&d)|(c&d), computed here
# as (b&c)|((b|c)&d) in $t0/$t1.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

# Public entry point: prologue (frame + GPR save), context load, and
# dispatch between the aligned fast path and the page-boundary-safe
# unaligned path that bounces offending blocks through the stack.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___

# This is private block function, which uses tailored calling
# interface, namely upon entry SHA_CTX is pre-loaded to given
# registers and counter register contains amount of chunks to
# digest...
# Emit the private block routine: load each of the four SHA-1 round
# constants in turn and fully unroll the 80 rounds.  Rotating @V with
# unshift(pop) renames the working variables instead of moving data.
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# r16..r20 were pre-loaded with the context words during round 79 (see
# BODY_20_39, $i==79); after the 80 @V rotations the current a..e live
# in $E,$T,$A,$B,$C.  Fold, store back, reload working regs, and loop
# on CTR (one iteration per 64-byte block).
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz	Lsha1_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	.sha1_block_data_order,.-.sha1_block_data_order
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

# Expand `...` expressions (frame offsets etc.) and emit everything.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the xlate process and output is buffered, so
# write/child failures surface at close; they must not be swallowed.
close STDOUT or die "error closing STDOUT: $!";