sha512-ia64.pl (160815) | sha512-ia64.pl (194206) |
---|---|
1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5# project. Rights for redistribution and usage in source and binary 6# forms are granted according to the OpenSSL license. | 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. |
7# ==================================================================== 8# 9# SHA256/512_Transform for Itanium. 10# 11# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50% 12# faster than gcc and >60%(!) faster than code generated by HP-UX 13# compiler (yes, HP-UX is generating slower code, because unlike gcc, 14# it failed to deploy "shift right pair," 'shrp' instruction, which --- 51 unchanged lines hidden (view full) --- 66if ($output =~ /512.*\.[s|asm]/) { 67 $SZ=8; 68 $BITS=8*$SZ; 69 $LDW="ld8"; 70 $STW="st8"; 71 $ADD="add"; 72 $SHRU="shr.u"; 73 $TABLE="K512"; | 8# ==================================================================== 9# 10# SHA256/512_Transform for Itanium. 11# 12# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50% 13# faster than gcc and >60%(!) faster than code generated by HP-UX 14# compiler (yes, HP-UX is generating slower code, because unlike gcc, 15# it failed to deploy "shift right pair," 'shrp' instruction, which --- 51 unchanged lines hidden (view full) --- 67if ($output =~ /512.*\.[s|asm]/) { 68 $SZ=8; 69 $BITS=8*$SZ; 70 $LDW="ld8"; 71 $STW="st8"; 72 $ADD="add"; 73 $SHRU="shr.u"; 74 $TABLE="K512"; |
74 $func="sha512_block"; | 75 $func="sha512_block_data_order"; |
75 @Sigma0=(28,34,39); 76 @Sigma1=(14,18,41); 77 @sigma0=(1, 8, 7); 78 @sigma1=(19,61, 6); 79 $rounds=80; 80} elsif ($output =~ /256.*\.[s|asm]/) { 81 $SZ=4; 82 $BITS=8*$SZ; 83 $LDW="ld4"; 84 $STW="st4"; 85 $ADD="padd4"; 86 $SHRU="pshr4.u"; 87 $TABLE="K256"; | 76 @Sigma0=(28,34,39); 77 @Sigma1=(14,18,41); 78 @sigma0=(1, 8, 7); 79 @sigma1=(19,61, 6); 80 $rounds=80; 81} elsif ($output =~ /256.*\.[s|asm]/) { 82 $SZ=4; 83 $BITS=8*$SZ; 84 $LDW="ld4"; 85 $STW="st4"; 86 $ADD="padd4"; 87 $SHRU="pshr4.u"; 88 $TABLE="K256"; |
88 $func="sha256_block"; | 89 $func="sha256_block_data_order"; |
89 @Sigma0=( 2,13,22); 90 @Sigma1=( 6,11,25); 91 @sigma0=( 7,18, 3); 92 @sigma1=(17,19,10); 93 $rounds=64; 94} else { die "nonsense $output"; } 95 96open STDOUT,">$output" || die "can't open $output: $!"; 97 98if ($^O eq "hpux") { 99 $ADDP="addp4"; 100 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } 101} else { $ADDP="add"; } 102for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); 103 $big_endian=0 if (/\-DL_ENDIAN/); } 104if (!defined($big_endian)) 105 { $big_endian=(unpack('L',pack('N',1))==1); } 106 107$code=<<___; | 90 @Sigma0=( 2,13,22); 91 @Sigma1=( 6,11,25); 92 @sigma0=( 7,18, 3); 93 @sigma1=(17,19,10); 94 $rounds=64; 95} else { die "nonsense $output"; } 96 97open STDOUT,">$output" || die "can't open $output: $!"; 98 99if ($^O eq "hpux") { 100 $ADDP="addp4"; 101 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } 102} else { $ADDP="add"; } 103for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); 104 $big_endian=0 if (/\-DL_ENDIAN/); } 105if (!defined($big_endian)) 106 { $big_endian=(unpack('L',pack('N',1))==1); } 107 108$code=<<___; |
108.ident \"$output, version 1.0\" | 109.ident \"$output, version 1.1\" |
109.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" 110.explicit 111.text 112 | 110.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" 111.explicit 112.text 113 |
114pfssave=r2; 115lcsave=r3; |
|
113prsave=r14; 114K=r15; 115A=r16; B=r17; C=r18; D=r19; 116E=r20; F=r21; G=r22; H=r23; 117T1=r24; T2=r25; 118s0=r26; s1=r27; t0=r28; t1=r29; 119Ktbl=r30; 120ctx=r31; // 1st arg 121input=r48; // 2nd arg 122num=r49; // 3rd arg 123sgm0=r50; sgm1=r51; // small constants | 116prsave=r14; 117K=r15; 118A=r16; B=r17; C=r18; D=r19; 119E=r20; F=r21; G=r22; H=r23; 120T1=r24; T2=r25; 121s0=r26; s1=r27; t0=r28; t1=r29; 122Ktbl=r30; 123ctx=r31; // 1st arg 124input=r48; // 2nd arg 125num=r49; // 3rd arg 126sgm0=r50; sgm1=r51; // small constants |
127A_=r54; B_=r55; C_=r56; D_=r57; 128E_=r58; F_=r59; G_=r60; H_=r61; |
|
124 125// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host]) 126.global $func# 127.proc $func# 128.align 32 129$func: 130 .prologue | 129 130// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host]) 131.global $func# 132.proc $func# 133.align 32 134$func: 135 .prologue |
131 .fframe 0 132 .save ar.pfs,r2 133 .save ar.lc,r3 134 .save pr,prsave 135{ .mmi; alloc r2=ar.pfs,3,17,0,16 | 136 .save ar.pfs,pfssave 137{ .mmi; alloc pfssave=ar.pfs,3,27,0,16 |
136 $ADDP ctx=0,r32 // 1st arg | 138 $ADDP ctx=0,r32 // 1st arg |
137 mov r3=ar.lc } | 139 .save ar.lc,lcsave 140 mov lcsave=ar.lc } |
138{ .mmi; $ADDP input=0,r33 // 2nd arg | 141{ .mmi; $ADDP input=0,r33 // 2nd arg |
139 addl Ktbl=\@ltoff($TABLE#),gp | 142 mov num=r34 // 3rd arg 143 .save pr,prsave |
140 mov prsave=pr };; 141 142 .body | 144 mov prsave=pr };; 145 146 .body |
143{ .mii; ld8 Ktbl=[Ktbl] 144 mov num=r34 };; // 3rd arg 145 | |
146{ .mib; add r8=0*$SZ,ctx 147 add r9=1*$SZ,ctx | 147{ .mib; add r8=0*$SZ,ctx 148 add r9=1*$SZ,ctx |
148 brp.loop.imp .L_first16,.L_first16_ctop 149 } | 149 brp.loop.imp .L_first16,.L_first16_end-16 } |
150{ .mib; add r10=2*$SZ,ctx 151 add r11=3*$SZ,ctx | 150{ .mib; add r10=2*$SZ,ctx 151 add r11=3*$SZ,ctx |
152 brp.loop.imp .L_rest,.L_rest_ctop 153 };; 154// load A-H 155{ .mmi; $LDW A=[r8],4*$SZ 156 $LDW B=[r9],4*$SZ 157 mov sgm0=$sigma0[2] } 158{ .mmi; $LDW C=[r10],4*$SZ 159 $LDW D=[r11],4*$SZ 160 mov sgm1=$sigma1[2] };; 161{ .mmi; $LDW E=[r8] 162 $LDW F=[r9] } 163{ .mmi; $LDW G=[r10] 164 $LDW H=[r11] 165 cmp.ne p15,p14=0,r35 };; // used in sha256_block | 152 brp.loop.imp .L_rest,.L_rest_end-16 };; |
166 | 153 |
154// load A-H 155.Lpic_point: 156{ .mmi; $LDW A_=[r8],4*$SZ 157 $LDW B_=[r9],4*$SZ 158 mov Ktbl=ip } 159{ .mmi; $LDW C_=[r10],4*$SZ 160 $LDW D_=[r11],4*$SZ 161 mov sgm0=$sigma0[2] };; 162{ .mmi; $LDW E_=[r8] 163 $LDW F_=[r9] 164 add Ktbl=($TABLE#-.Lpic_point),Ktbl } 165{ .mmi; $LDW G_=[r10] 166 $LDW H_=[r11] 167 cmp.ne p0,p16=0,r0 };; // used in sha256_block 168___ 169$code.=<<___ if ($BITS==64); 170{ .mii; and r8=7,input 171 and input=~7,input;; 172 cmp.eq p9,p0=1,r8 } 173{ .mmi; cmp.eq p10,p0=2,r8 174 cmp.eq p11,p0=3,r8 175 cmp.eq p12,p0=4,r8 } 176{ .mmi; cmp.eq p13,p0=5,r8 177 cmp.eq p14,p0=6,r8 178 cmp.eq p15,p0=7,r8 };; 179___ 180$code.=<<___; |
|
167.L_outer: | 181.L_outer: |
168{ .mii; mov ar.lc=15 169 mov ar.ec=1 };; 170.align 32 171.L_first16: | |
172.rotr X[16] | 182.rotr X[16] |
183{ .mmi; mov A=A_ 184 mov B=B_ 185 mov ar.lc=14 } 186{ .mmi; mov C=C_ 187 mov D=D_ 188 mov E=E_ } 189{ .mmi; mov F=F_ 190 mov G=G_ 191 mov ar.ec=2 } 192{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit 193 mov H=H_ 194 mov sgm1=$sigma1[2] };; 195 |
|
173___ 174$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); | 196___ 197$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); |
175{ .mib; (p14) add r9=1,input 176 (p14) add r10=2,input } 177{ .mib; (p14) add r11=3,input 178 (p15) br.dptk.few .L_host };; 179{ .mmi; (p14) ld1 r8=[input],$SZ 180 (p14) ld1 r9=[r9] } 181{ .mmi; (p14) ld1 r10=[r10] 182 (p14) ld1 r11=[r11] };; 183{ .mii; (p14) dep r9=r8,r9,8,8 184 (p14) dep r11=r10,r11,8,8 };; 185{ .mib; (p14) dep X[15]=r9,r11,16,16 };; 186.L_host: 187{ .mib; (p15) $LDW X[15]=[input],$SZ // X[i]=*input++ | 198.align 32 199.L_first16: 200{ .mmi; add r9=1-$SZ,input 201 add r10=2-$SZ,input 202 add r11=3-$SZ,input };; 203{ .mmi; ld1 r9=[r9] 204 ld1 r10=[r10] |
188 dep.z $t1=E,32,32 } | 205 dep.z $t1=E,32,32 } |
189{ .mib; $LDW K=[Ktbl],$SZ | 206{ .mmi; $LDW K=[Ktbl],$SZ 207 ld1 r11=[r11] |
190 zxt4 E=E };; | 208 zxt4 E=E };; |
191{ .mmi; or $t1=$t1,E 192 and T1=F,E 193 and T2=A,B } | 209{ .mii; or $t1=$t1,E 210 dep X[15]=X[15],r9,8,8 211 dep r11=r10,r11,8,8 };; 212{ .mmi; and T1=F,E 213 and T2=A,B 214 dep X[15]=X[15],r11,16,16 } |
194{ .mmi; andcm r8=G,E 195 and r9=A,C 196 mux2 $t0=A,0x44 };; // copy lower half to upper | 215{ .mmi; andcm r8=G,E 216 and r9=A,C 217 mux2 $t0=A,0x44 };; // copy lower half to upper |
197{ .mib; xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) | 218{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch 219 xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) |
198 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) 199{ .mib; and r10=B,C 200 xor T2=T2,r9 };; 201___ 202$t0="A", $t1="E", $code.=<<___ if ($BITS==64); | 220 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) 221{ .mib; and r10=B,C 222 xor T2=T2,r9 };; 223___ 224$t0="A", $t1="E", $code.=<<___ if ($BITS==64); |
203{ .mmi; $LDW X[15]=[input],$SZ // X[i]=*input++ | 225// in 64-bit mode I load whole X[16] at once and take care of alignment... 226{ .mmi; add r8=1*$SZ,input 227 add r9=2*$SZ,input 228 add r10=3*$SZ,input };; 229{ .mmb; $LDW X[15]=[input],4*$SZ 230 $LDW X[14]=[r8],4*$SZ 231(p9) br.cond.dpnt.many .L1byte };; 232{ .mmb; $LDW X[13]=[r9],4*$SZ 233 $LDW X[12]=[r10],4*$SZ 234(p10) br.cond.dpnt.many .L2byte };; 235{ .mmb; $LDW X[11]=[input],4*$SZ 236 $LDW X[10]=[r8],4*$SZ 237(p11) br.cond.dpnt.many .L3byte };; 238{ .mmb; $LDW X[ 9]=[r9],4*$SZ 239 $LDW X[ 8]=[r10],4*$SZ 240(p12) br.cond.dpnt.many .L4byte };; 241{ .mmb; $LDW X[ 7]=[input],4*$SZ 242 $LDW X[ 6]=[r8],4*$SZ 243(p13) br.cond.dpnt.many .L5byte };; 244{ .mmb; $LDW X[ 5]=[r9],4*$SZ 245 $LDW X[ 4]=[r10],4*$SZ 246(p14) br.cond.dpnt.many .L6byte };; 247{ .mmb; $LDW X[ 3]=[input],4*$SZ 248 $LDW X[ 2]=[r8],4*$SZ 249(p15) br.cond.dpnt.many .L7byte };; 250{ .mmb; $LDW X[ 1]=[r9],4*$SZ 251 $LDW X[ 0]=[r10],4*$SZ 252 br.many .L_first16 };; 253.L1byte: 254{ .mmi; $LDW X[13]=[r9],4*$SZ 255 $LDW X[12]=[r10],4*$SZ 256 shrp X[15]=X[15],X[14],56 };; 257{ .mmi; $LDW X[11]=[input],4*$SZ 258 $LDW X[10]=[r8],4*$SZ 259 shrp X[14]=X[14],X[13],56 } 260{ .mmi; $LDW X[ 9]=[r9],4*$SZ 261 $LDW X[ 8]=[r10],4*$SZ 262 shrp X[13]=X[13],X[12],56 };; 263{ .mmi; $LDW X[ 7]=[input],4*$SZ 264 $LDW X[ 6]=[r8],4*$SZ 265 shrp X[12]=X[12],X[11],56 } 266{ .mmi; $LDW X[ 5]=[r9],4*$SZ 267 $LDW X[ 4]=[r10],4*$SZ 268 shrp X[11]=X[11],X[10],56 };; 269{ .mmi; $LDW X[ 3]=[input],4*$SZ 270 $LDW X[ 2]=[r8],4*$SZ 271 shrp X[10]=X[10],X[ 9],56 } 272{ .mmi; $LDW X[ 1]=[r9],4*$SZ 273 $LDW X[ 0]=[r10],4*$SZ 274 shrp X[ 9]=X[ 9],X[ 8],56 };; 275{ .mii; $LDW T1=[input] 276 shrp X[ 8]=X[ 8],X[ 7],56 277 shrp X[ 7]=X[ 7],X[ 6],56 } 278{ .mii; shrp X[ 6]=X[ 6],X[ 5],56 279 shrp X[ 5]=X[ 5],X[ 4],56 };; 280{ .mii; shrp X[ 4]=X[ 4],X[ 3],56 281 shrp X[ 3]=X[ 3],X[ 2],56 } 282{ .mii; shrp X[ 2]=X[ 2],X[ 1],56 283 shrp X[ 1]=X[ 1],X[ 0],56 } 284{ .mib; shrp X[ 0]=X[ 0],T1,56 285 br.many .L_first16 };; 286.L2byte: 287{ .mmi; $LDW X[11]=[input],4*$SZ 288 $LDW X[10]=[r8],4*$SZ 289 shrp X[15]=X[15],X[14],48 } 290{ .mmi; $LDW X[ 9]=[r9],4*$SZ 291 $LDW X[ 8]=[r10],4*$SZ 292 shrp X[14]=X[14],X[13],48 };; 293{ .mmi; $LDW X[ 7]=[input],4*$SZ 294 $LDW X[ 6]=[r8],4*$SZ 295 shrp X[13]=X[13],X[12],48 } 296{ .mmi; $LDW X[ 5]=[r9],4*$SZ 297 $LDW X[ 4]=[r10],4*$SZ 298 shrp X[12]=X[12],X[11],48 };; 299{ .mmi; $LDW X[ 3]=[input],4*$SZ 300 $LDW X[ 2]=[r8],4*$SZ 301 shrp X[11]=X[11],X[10],48 } 302{ .mmi; $LDW X[ 1]=[r9],4*$SZ 303 $LDW X[ 0]=[r10],4*$SZ 304 shrp X[10]=X[10],X[ 9],48 };; 305{ .mii; $LDW T1=[input] 306 shrp X[ 9]=X[ 9],X[ 8],48 307 shrp X[ 8]=X[ 8],X[ 7],48 } 308{ .mii; shrp X[ 7]=X[ 7],X[ 6],48 309 shrp X[ 6]=X[ 6],X[ 5],48 };; 310{ .mii; shrp X[ 5]=X[ 5],X[ 4],48 311 shrp X[ 4]=X[ 4],X[ 3],48 } 312{ .mii; shrp X[ 3]=X[ 3],X[ 2],48 313 shrp X[ 2]=X[ 2],X[ 1],48 } 314{ .mii; shrp X[ 1]=X[ 1],X[ 0],48 315 shrp X[ 0]=X[ 0],T1,48 } 316{ .mfb; br.many .L_first16 };; 317.L3byte: 318{ .mmi; $LDW X[ 9]=[r9],4*$SZ 319 $LDW X[ 8]=[r10],4*$SZ 320 shrp X[15]=X[15],X[14],40 };; 321{ .mmi; $LDW X[ 7]=[input],4*$SZ 322 $LDW X[ 6]=[r8],4*$SZ 323 shrp X[14]=X[14],X[13],40 } 324{ .mmi; $LDW X[ 5]=[r9],4*$SZ 325 $LDW X[ 4]=[r10],4*$SZ 326 shrp X[13]=X[13],X[12],40 };; 327{ .mmi; $LDW X[ 3]=[input],4*$SZ 328 $LDW X[ 2]=[r8],4*$SZ 329 shrp X[12]=X[12],X[11],40 } 330{ .mmi; $LDW X[ 1]=[r9],4*$SZ 331 $LDW X[ 0]=[r10],4*$SZ 332 shrp X[11]=X[11],X[10],40 };; 333{ .mii; $LDW T1=[input] 334 shrp X[10]=X[10],X[ 9],40 335 shrp X[ 9]=X[ 9],X[ 8],40 } 336{ .mii; shrp X[ 8]=X[ 8],X[ 7],40 337 shrp X[ 7]=X[ 7],X[ 6],40 };; 338{ .mii; shrp X[ 6]=X[ 6],X[ 5],40 339 shrp X[ 5]=X[ 5],X[ 4],40 } 340{ .mii; shrp X[ 4]=X[ 4],X[ 3],40 341 shrp X[ 3]=X[ 3],X[ 2],40 } 342{ .mii; shrp X[ 2]=X[ 2],X[ 1],40 343 shrp X[ 1]=X[ 1],X[ 0],40 } 344{ .mib; shrp X[ 0]=X[ 0],T1,40 345 br.many .L_first16 };; 346.L4byte: 347{ .mmi; $LDW X[ 7]=[input],4*$SZ 348 $LDW X[ 6]=[r8],4*$SZ 349 shrp X[15]=X[15],X[14],32 } 350{ .mmi; $LDW X[ 5]=[r9],4*$SZ 351 $LDW X[ 4]=[r10],4*$SZ 352 shrp X[14]=X[14],X[13],32 };; 353{ .mmi; $LDW X[ 3]=[input],4*$SZ 354 $LDW X[ 2]=[r8],4*$SZ 355 shrp X[13]=X[13],X[12],32 } 356{ .mmi; $LDW X[ 1]=[r9],4*$SZ 357 $LDW X[ 0]=[r10],4*$SZ 358 shrp X[12]=X[12],X[11],32 };; 359{ .mii; $LDW T1=[input] 360 shrp X[11]=X[11],X[10],32 361 shrp X[10]=X[10],X[ 9],32 } 362{ .mii; shrp X[ 9]=X[ 9],X[ 8],32 363 shrp X[ 8]=X[ 8],X[ 7],32 };; 364{ .mii; shrp X[ 7]=X[ 7],X[ 6],32 365 shrp X[ 6]=X[ 6],X[ 5],32 } 366{ .mii; shrp X[ 5]=X[ 5],X[ 4],32 367 shrp X[ 4]=X[ 4],X[ 3],32 } 368{ .mii; shrp X[ 3]=X[ 3],X[ 2],32 369 shrp X[ 2]=X[ 2],X[ 1],32 } 370{ .mii; shrp X[ 1]=X[ 1],X[ 0],32 371 shrp X[ 0]=X[ 0],T1,32 } 372{ .mfb; br.many .L_first16 };; 373.L5byte: 374{ .mmi; $LDW X[ 5]=[r9],4*$SZ 375 $LDW X[ 4]=[r10],4*$SZ 376 shrp X[15]=X[15],X[14],24 };; 377{ .mmi; $LDW X[ 3]=[input],4*$SZ 378 $LDW X[ 2]=[r8],4*$SZ 379 shrp X[14]=X[14],X[13],24 } 380{ .mmi; $LDW X[ 1]=[r9],4*$SZ 381 $LDW X[ 0]=[r10],4*$SZ 382 shrp X[13]=X[13],X[12],24 };; 383{ .mii; $LDW T1=[input] 384 shrp X[12]=X[12],X[11],24 385 shrp X[11]=X[11],X[10],24 } 386{ .mii; shrp X[10]=X[10],X[ 9],24 387 shrp X[ 9]=X[ 9],X[ 8],24 };; 388{ .mii; shrp X[ 8]=X[ 8],X[ 7],24 389 shrp X[ 7]=X[ 7],X[ 6],24 } 390{ .mii; shrp X[ 6]=X[ 6],X[ 5],24 391 shrp X[ 5]=X[ 5],X[ 4],24 } 392{ .mii; shrp X[ 4]=X[ 4],X[ 3],24 393 shrp X[ 3]=X[ 3],X[ 2],24 } 394{ .mii; shrp X[ 2]=X[ 2],X[ 1],24 395 shrp X[ 1]=X[ 1],X[ 0],24 } 396{ .mib; shrp X[ 0]=X[ 0],T1,24 397 br.many .L_first16 };; 398.L6byte: 399{ .mmi; $LDW X[ 3]=[input],4*$SZ 400 $LDW X[ 2]=[r8],4*$SZ 401 shrp X[15]=X[15],X[14],16 } 402{ .mmi; $LDW X[ 1]=[r9],4*$SZ 403 $LDW X[ 0]=[r10],4*$SZ 404 shrp X[14]=X[14],X[13],16 };; 405{ .mii; $LDW T1=[input] 406 shrp X[13]=X[13],X[12],16 407 shrp X[12]=X[12],X[11],16 } 408{ .mii; shrp X[11]=X[11],X[10],16 409 shrp X[10]=X[10],X[ 9],16 };; 410{ .mii; shrp X[ 9]=X[ 9],X[ 8],16 411 shrp X[ 8]=X[ 8],X[ 7],16 } 412{ .mii; shrp X[ 7]=X[ 7],X[ 6],16 413 shrp X[ 6]=X[ 6],X[ 5],16 } 414{ .mii; shrp X[ 5]=X[ 5],X[ 4],16 415 shrp X[ 4]=X[ 4],X[ 3],16 } 416{ .mii; shrp X[ 3]=X[ 3],X[ 2],16 417 shrp X[ 2]=X[ 2],X[ 1],16 } 418{ .mii; shrp X[ 1]=X[ 1],X[ 0],16 419 shrp X[ 0]=X[ 0],T1,16 } 420{ .mfb; br.many .L_first16 };; 421.L7byte: 422{ .mmi; $LDW X[ 1]=[r9],4*$SZ 423 $LDW X[ 0]=[r10],4*$SZ 424 shrp X[15]=X[15],X[14],8 };; 425{ .mii; $LDW T1=[input] 426 shrp X[14]=X[14],X[13],8 427 shrp X[13]=X[13],X[12],8 } 428{ .mii; shrp X[12]=X[12],X[11],8 429 shrp X[11]=X[11],X[10],8 };; 430{ .mii; shrp X[10]=X[10],X[ 9],8 431 shrp X[ 9]=X[ 9],X[ 8],8 } 432{ .mii; shrp X[ 8]=X[ 8],X[ 7],8 433 shrp X[ 7]=X[ 7],X[ 6],8 } 434{ .mii; shrp X[ 6]=X[ 6],X[ 5],8 435 shrp X[ 5]=X[ 5],X[ 4],8 } 436{ .mii; shrp X[ 4]=X[ 4],X[ 3],8 437 shrp X[ 3]=X[ 3],X[ 2],8 } 438{ .mii; shrp X[ 2]=X[ 2],X[ 1],8 439 shrp X[ 1]=X[ 1],X[ 0],8 } 440{ .mib; shrp X[ 0]=X[ 0],T1,8 441 br.many .L_first16 };; 442 443.align 32 444.L_first16: 445{ .mmi; $LDW K=[Ktbl],$SZ |
204 and T1=F,E 205 and T2=A,B } | 446 and T1=F,E 447 and T2=A,B } |
206{ .mmi; $LDW K=[Ktbl],$SZ | 448{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++ |
207 andcm r8=G,E 208 and r9=A,C };; 209{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g)) 210 and r10=B,C 211 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) 212{ .mmi; xor T2=T2,r9 213 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian 214___ --- 16 unchanged lines hidden (view full) --- 231 mov C=B };; 232{ .mib; add T1=T1,X[15] // T1+=X[i] 233 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39) 234{ .mib; xor r10=r10,r11 235 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit 236{ .mmi; xor r10=r8,r10 // r10=Sigma0(a) 237 mov B=A 238 add A=T1,T2 };; | 449 andcm r8=G,E 450 and r9=A,C };; 451{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g)) 452 and r10=B,C 453 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) 454{ .mmi; xor T2=T2,r9 455 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian 456___ --- 16 unchanged lines hidden (view full) --- 473 mov C=B };; 474{ .mib; add T1=T1,X[15] // T1+=X[i] 475 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39) 476{ .mib; xor r10=r10,r11 477 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit 478{ .mmi; xor r10=r8,r10 // r10=Sigma0(a) 479 mov B=A 480 add A=T1,T2 };; |
239.L_first16_ctop: | |
240{ .mib; add E=E,T1 241 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) 242 br.ctop.sptk .L_first16 };; | 481{ .mib; add E=E,T1 482 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) 483 br.ctop.sptk .L_first16 };; |
484.L_first16_end: |
|
243 | 485 |
244{ .mib; mov ar.lc=$rounds-17 } 245{ .mib; mov ar.ec=1 };; | 486{ .mii; mov ar.lc=$rounds-17 487 mov ar.ec=1 };; 488 |
246.align 32 247.L_rest: 248.rotr X[16] 249{ .mib; $LDW K=[Ktbl],$SZ 250 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1) 251{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF] 252 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7 253{ .mib; and T1=F,E --- 52 unchanged lines hidden (view full) --- 306{ .mib; mov D=C 307 mov C=B };; 308{ .mmi; add T1=T1,X[15] // T1+=X[i] 309 xor r10=r10,r11 310 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39) 311{ .mmi; xor r10=r8,r10 // r10=Sigma0(a) 312 mov B=A 313 add A=T1,T2 };; | 489.align 32 490.L_rest: 491.rotr X[16] 492{ .mib; $LDW K=[Ktbl],$SZ 493 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1) 494{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF] 495 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7 496{ .mib; and T1=F,E --- 52 unchanged lines hidden (view full) --- 549{ .mib; mov D=C 550 mov C=B };; 551{ .mmi; add T1=T1,X[15] // T1+=X[i] 552 xor r10=r10,r11 553 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39) 554{ .mmi; xor r10=r8,r10 // r10=Sigma0(a) 555 mov B=A 556 add A=T1,T2 };; |
314.L_rest_ctop: | |
315{ .mib; add E=E,T1 316 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) 317 br.ctop.sptk .L_rest };; | 557{ .mib; add E=E,T1 558 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) 559 br.ctop.sptk .L_rest };; |
560.L_rest_end: |
|
318 | 561 |
562{ .mmi; add A_=A_,A 563 add B_=B_,B 564 add C_=C_,C } 565{ .mmi; add D_=D_,D 566 add E_=E_,E 567 cmp.ltu p16,p0=1,num };; 568{ .mmi; add F_=F_,F 569 add G_=G_,G 570 add H_=H_,H } 571{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl 572(p16) add num=-1,num 573(p16) br.dptk.many .L_outer };; 574 |
|
319{ .mib; add r8=0*$SZ,ctx 320 add r9=1*$SZ,ctx } 321{ .mib; add r10=2*$SZ,ctx 322 add r11=3*$SZ,ctx };; | 575{ .mib; add r8=0*$SZ,ctx 576 add r9=1*$SZ,ctx } 577{ .mib; add r10=2*$SZ,ctx 578 add r11=3*$SZ,ctx };; |
323{ .mmi; $LDW r32=[r8],4*$SZ 324 $LDW r33=[r9],4*$SZ } 325{ .mmi; $LDW r34=[r10],4*$SZ 326 $LDW r35=[r11],4*$SZ 327 cmp.ltu p6,p7=1,num };; 328{ .mmi; $LDW r36=[r8],-4*$SZ 329 $LDW r37=[r9],-4*$SZ 330(p6) add Ktbl=-$SZ*$rounds,Ktbl } 331{ .mmi; $LDW r38=[r10],-4*$SZ 332 $LDW r39=[r11],-4*$SZ 333(p7) mov ar.lc=r3 };; 334{ .mmi; add A=A,r32 335 add B=B,r33 336 add C=C,r34 } 337{ .mmi; add D=D,r35 338 add E=E,r36 339 add F=F,r37 };; 340{ .mmi; $STW [r8]=A,4*$SZ 341 $STW [r9]=B,4*$SZ 342 add G=G,r38 } 343{ .mmi; $STW [r10]=C,4*$SZ 344 $STW [r11]=D,4*$SZ 345 add H=H,r39 };; 346{ .mmi; $STW [r8]=E 347 $STW [r9]=F 348(p6) add num=-1,num } 349{ .mmb; $STW [r10]=G 350 $STW [r11]=H 351(p6) br.dptk.many .L_outer };; 352 353{ .mib; mov pr=prsave,0x1ffff | 579{ .mmi; $STW [r8]=A_,4*$SZ 580 $STW [r9]=B_,4*$SZ 581 mov ar.lc=lcsave } 582{ .mmi; $STW [r10]=C_,4*$SZ 583 $STW [r11]=D_,4*$SZ 584 mov pr=prsave,0x1ffff };; 585{ .mmb; $STW [r8]=E_ 586 $STW [r9]=F_ } 587{ .mmb; $STW [r10]=G_ 588 $STW [r11]=H_ |
354 br.ret.sptk.many b0 };; 355.endp $func# 356___ 357 358$code =~ s/\`([^\`]*)\`/eval $1/gem; 359$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm; 360if ($BITS==64) { 361 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm; | 589 br.ret.sptk.many b0 };; 590.endp $func# 591___ 592 593$code =~ s/\`([^\`]*)\`/eval $1/gem; 594$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm; 595if ($BITS==64) { 596 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm; |
362 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian); | 597 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian); 598 $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm 599 if (!$big_endian); 600 $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm; |
363} 364 365print $code; 366 367print<<___ if ($BITS==32); 368.align 64 369.type K256#,\@object 370K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 --- 8 unchanged lines hidden (view full) --- 379 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 380 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 381 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 382 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 383 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 384 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 385 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 386.size K256#,$SZ*$rounds | 601} 602 603print $code; 604 605print<<___ if ($BITS==32); 606.align 64 607.type K256#,\@object 608K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 --- 8 unchanged lines hidden (view full) --- 617 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 618 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 619 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 620 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 621 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 622 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 623 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 624.size K256#,$SZ*$rounds |
625stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" |
|
387___ 388print<<___ if ($BITS==64); 389.align 64 390.type K512#,\@object 391K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd 392 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 393 data8 0x3956c25bf348b538,0x59f111f1b605d019 394 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118 --- 29 unchanged lines hidden (view full) --- 424 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 425 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6 426 data8 0x113f9804bef90dae,0x1b710b35131c471b 427 data8 0x28db77f523047d84,0x32caab7b40c72493 428 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 429 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 430 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817 431.size K512#,$SZ*$rounds | 626___ 627print<<___ if ($BITS==64); 628.align 64 629.type K512#,\@object 630K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd 631 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 632 data8 0x3956c25bf348b538,0x59f111f1b605d019 633 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118 --- 29 unchanged lines hidden (view full) --- 663 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 664 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6 665 data8 0x113f9804bef90dae,0x1b710b35131c471b 666 data8 0x28db77f523047d84,0x32caab7b40c72493 667 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 668 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 669 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817 670.size K512#,$SZ*$rounds |
671stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" |
|
432___ | 672___ |