sha512-ia64.pl: diff of revision 160815 (deleted lines) against revision 194206 (added lines)
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
7# ====================================================================
8#
9# SHA256/512_Transform for Itanium.
10#
11# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
12# faster than gcc and >60%(!) faster than code generated by HP-UX
13# compiler (yes, HP-UX is generating slower code, because unlike gcc,
14# it failed to deploy "shift right pair," 'shrp' instruction, which

--- 51 unchanged lines hidden ---

66if ($output =~ /512.*\.[s|asm]/) {
67 $SZ=8;
68 $BITS=8*$SZ;
69 $LDW="ld8";
70 $STW="st8";
71 $ADD="add";
72 $SHRU="shr.u";
73 $TABLE="K512";
8# ====================================================================
9#
10# SHA256/512_Transform for Itanium.
11#
12# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
13# faster than gcc and >60%(!) faster than code generated by HP-UX
14# compiler (yes, HP-UX is generating slower code, because unlike gcc,
15# it failed to deploy "shift right pair," 'shrp' instruction, which

--- 51 unchanged lines hidden ---
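The header note above credits much of the speed advantage to the IA-64 "shift right pair" instruction: shrp r1=r2,r3,n takes the low 64 bits of the 128-bit concatenation r2:r3 shifted right by n places, so with both source operands equal it performs a 64-bit rotate right. The round code below never writes shrp directly; it emits a _rotr pseudo-op that a substitution near the bottom of the script rewrites into shrp. A minimal Perl sketch of that rewrite on a sample line (the register names and shift count are illustrative only):

#!/usr/bin/env perl
# Hypothetical input line of the kind the script generates while
# expanding the round function.
my $code = "\t_rotr r11=r29,14\t// ROTR(e,14)\n";
# Substitution as it appears in the script:
#   _rotr dst=src,n  becomes  shrp dst=src,src,n
$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
print $code;    # prints: shrp r11=r29,r29,14  // ROTR(e,14)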

67if ($output =~ /512.*\.[s|asm]/) {
68 $SZ=8;
69 $BITS=8*$SZ;
70 $LDW="ld8";
71 $STW="st8";
72 $ADD="add";
73 $SHRU="shr.u";
74 $TABLE="K512";
74 $func="sha512_block";
75 $func="sha512_block_data_order";
75 @Sigma0=(28,34,39);
76 @Sigma1=(14,18,41);
77 @sigma0=(1, 8, 7);
78 @sigma1=(19,61, 6);
79 $rounds=80;
80} elsif ($output =~ /256.*\.[s|asm]/) {
81 $SZ=4;
82 $BITS=8*$SZ;
83 $LDW="ld4";
84 $STW="st4";
85 $ADD="padd4";
86 $SHRU="pshr4.u";
87 $TABLE="K256";
76 @Sigma0=(28,34,39);
77 @Sigma1=(14,18,41);
78 @sigma0=(1, 8, 7);
79 @sigma1=(19,61, 6);
80 $rounds=80;
81} elsif ($output =~ /256.*\.[s|asm]/) {
82 $SZ=4;
83 $BITS=8*$SZ;
84 $LDW="ld4";
85 $STW="st4";
86 $ADD="padd4";
87 $SHRU="pshr4.u";
88 $TABLE="K256";
88 $func="sha256_block";
89 $func="sha256_block_data_order";
89 @Sigma0=( 2,13,22);
90 @Sigma1=( 6,11,25);
91 @sigma0=( 7,18, 3);
92 @sigma1=(17,19,10);
93 $rounds=64;
94} else { die "nonsense $output"; }
95
96open STDOUT,">$output" || die "can't open $output: $!";
97
98if ($^O eq "hpux") {
99 $ADDP="addp4";
100 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
101} else { $ADDP="add"; }
102for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
103 $big_endian=0 if (/\-DL_ENDIAN/); }
104if (!defined($big_endian))
105 { $big_endian=(unpack('L',pack('N',1))==1); }
106
107$code=<<___;
90 @Sigma0=( 2,13,22);
91 @Sigma1=( 6,11,25);
92 @sigma0=( 7,18, 3);
93 @sigma1=(17,19,10);
94 $rounds=64;
95} else { die "nonsense $output"; }
96
97open STDOUT,">$output" || die "can't open $output: $!";
98
99if ($^O eq "hpux") {
100 $ADDP="addp4";
101 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
102} else { $ADDP="add"; }
103for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
104 $big_endian=0 if (/\-DL_ENDIAN/); }
105if (!defined($big_endian))
106 { $big_endian=(unpack('L',pack('N',1))==1); }
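The $big_endian fallback above is a stock Perl probe: pack the integer 1 as a big-endian 32-bit value ('N') and reread it in the host's native 32-bit order ('L'); the two views agree only on a big-endian machine. A standalone sketch of the same test:

#!/usr/bin/env perl
# Same technique as the script's fallback detection: compare a
# big-endian encoding of 1 against the host's native 32-bit view.
my $big_endian = (unpack('L', pack('N', 1)) == 1);
print 'host is ', ($big_endian ? 'big' : 'little'), "-endian\n";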
107
108$code=<<___;
108.ident \"$output, version 1.0\"
109.ident \"$output, version 1.1\"
109.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
110.explicit
111.text
112
110.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
111.explicit
112.text
113
114pfssave=r2;
115lcsave=r3;
113prsave=r14;
114K=r15;
115A=r16; B=r17; C=r18; D=r19;
116E=r20; F=r21; G=r22; H=r23;
117T1=r24; T2=r25;
118s0=r26; s1=r27; t0=r28; t1=r29;
119Ktbl=r30;
120ctx=r31; // 1st arg
121input=r48; // 2nd arg
122num=r49; // 3rd arg
123sgm0=r50; sgm1=r51; // small constants
116prsave=r14;
117K=r15;
118A=r16; B=r17; C=r18; D=r19;
119E=r20; F=r21; G=r22; H=r23;
120T1=r24; T2=r25;
121s0=r26; s1=r27; t0=r28; t1=r29;
122Ktbl=r30;
123ctx=r31; // 1st arg
124input=r48; // 2nd arg
125num=r49; // 3rd arg
126sgm0=r50; sgm1=r51; // small constants
127A_=r54; B_=r55; C_=r56; D_=r57;
128E_=r58; F_=r59; G_=r60; H_=r61;
124
125// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
126.global $func#
127.proc $func#
128.align 32
129$func:
130 .prologue
129
130// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
131.global $func#
132.proc $func#
133.align 32
134$func:
135 .prologue
131 .fframe 0
132 .save ar.pfs,r2
133 .save ar.lc,r3
134 .save pr,prsave
135{ .mmi; alloc r2=ar.pfs,3,17,0,16
136 .save ar.pfs,pfssave
137{ .mmi; alloc pfssave=ar.pfs,3,27,0,16
136 $ADDP ctx=0,r32 // 1st arg
138 $ADDP ctx=0,r32 // 1st arg
137 mov r3=ar.lc }
139 .save ar.lc,lcsave
140 mov lcsave=ar.lc }
138{ .mmi; $ADDP input=0,r33 // 2nd arg
141{ .mmi; $ADDP input=0,r33 // 2nd arg
139 addl Ktbl=\@ltoff($TABLE#),gp
142 mov num=r34 // 3rd arg
143 .save pr,prsave
140 mov prsave=pr };;
141
142 .body
144 mov prsave=pr };;
145
146 .body
143{ .mii; ld8 Ktbl=[Ktbl]
144 mov num=r34 };; // 3rd arg
145
146{ .mib; add r8=0*$SZ,ctx
147 add r9=1*$SZ,ctx
147{ .mib; add r8=0*$SZ,ctx
148 add r9=1*$SZ,ctx
148 brp.loop.imp .L_first16,.L_first16_ctop
149 }
149 brp.loop.imp .L_first16,.L_first16_end-16 }
150{ .mib; add r10=2*$SZ,ctx
151 add r11=3*$SZ,ctx
150{ .mib; add r10=2*$SZ,ctx
151 add r11=3*$SZ,ctx
152 brp.loop.imp .L_rest,.L_rest_ctop
153 };;
154// load A-H
155{ .mmi; $LDW A=[r8],4*$SZ
156 $LDW B=[r9],4*$SZ
157 mov sgm0=$sigma0[2] }
158{ .mmi; $LDW C=[r10],4*$SZ
159 $LDW D=[r11],4*$SZ
160 mov sgm1=$sigma1[2] };;
161{ .mmi; $LDW E=[r8]
162 $LDW F=[r9] }
163{ .mmi; $LDW G=[r10]
164 $LDW H=[r11]
165 cmp.ne p15,p14=0,r35 };; // used in sha256_block
152 brp.loop.imp .L_rest,.L_rest_end-16 };;
166
153
154// load A-H
155.Lpic_point:
156{ .mmi; $LDW A_=[r8],4*$SZ
157 $LDW B_=[r9],4*$SZ
158 mov Ktbl=ip }
159{ .mmi; $LDW C_=[r10],4*$SZ
160 $LDW D_=[r11],4*$SZ
161 mov sgm0=$sigma0[2] };;
162{ .mmi; $LDW E_=[r8]
163 $LDW F_=[r9]
164 add Ktbl=($TABLE#-.Lpic_point),Ktbl }
165{ .mmi; $LDW G_=[r10]
166 $LDW H_=[r11]
167 cmp.ne p0,p16=0,r0 };; // used in sha256_block
168___
169$code.=<<___ if ($BITS==64);
170{ .mii; and r8=7,input
171 and input=~7,input;;
172 cmp.eq p9,p0=1,r8 }
173{ .mmi; cmp.eq p10,p0=2,r8
174 cmp.eq p11,p0=3,r8
175 cmp.eq p12,p0=4,r8 }
176{ .mmi; cmp.eq p13,p0=5,r8
177 cmp.eq p14,p0=6,r8
178 cmp.eq p15,p0=7,r8 };;
179___
180$code.=<<___;
167.L_outer:
181.L_outer:
168{ .mii; mov ar.lc=15
169 mov ar.ec=1 };;
170.align 32
171.L_first16:
172.rotr X[16]
182.rotr X[16]
183{ .mmi; mov A=A_
184 mov B=B_
185 mov ar.lc=14 }
186{ .mmi; mov C=C_
187 mov D=D_
188 mov E=E_ }
189{ .mmi; mov F=F_
190 mov G=G_
191 mov ar.ec=2 }
192{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit
193 mov H=H_
194 mov sgm1=$sigma1[2] };;
195
173___
174$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
196___
197$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
175{ .mib; (p14) add r9=1,input
176 (p14) add r10=2,input }
177{ .mib; (p14) add r11=3,input
178 (p15) br.dptk.few .L_host };;
179{ .mmi; (p14) ld1 r8=[input],$SZ
180 (p14) ld1 r9=[r9] }
181{ .mmi; (p14) ld1 r10=[r10]
182 (p14) ld1 r11=[r11] };;
183{ .mii; (p14) dep r9=r8,r9,8,8
184 (p14) dep r11=r10,r11,8,8 };;
185{ .mib; (p14) dep X[15]=r9,r11,16,16 };;
186.L_host:
187{ .mib; (p15) $LDW X[15]=[input],$SZ // X[i]=*input++
198.align 32
199.L_first16:
200{ .mmi; add r9=1-$SZ,input
201 add r10=2-$SZ,input
202 add r11=3-$SZ,input };;
203{ .mmi; ld1 r9=[r9]
204 ld1 r10=[r10]
188 dep.z $t1=E,32,32 }
205 dep.z $t1=E,32,32 }
189{ .mib; $LDW K=[Ktbl],$SZ
206{ .mmi; $LDW K=[Ktbl],$SZ
207 ld1 r11=[r11]
190 zxt4 E=E };;
208 zxt4 E=E };;
191{ .mmi; or $t1=$t1,E
192 and T1=F,E
193 and T2=A,B }
209{ .mii; or $t1=$t1,E
210 dep X[15]=X[15],r9,8,8
211 dep r11=r10,r11,8,8 };;
212{ .mmi; and T1=F,E
213 and T2=A,B
214 dep X[15]=X[15],r11,16,16 }
194{ .mmi; andcm r8=G,E
195 and r9=A,C
196 mux2 $t0=A,0x44 };; // copy lower half to upper
215{ .mmi; andcm r8=G,E
216 and r9=A,C
217 mux2 $t0=A,0x44 };; // copy lower half to upper
197{ .mib; xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
218{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
219 xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
198 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
199{ .mib; and r10=B,C
200 xor T2=T2,r9 };;
201___
202$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
220 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
221{ .mib; and r10=B,C
222 xor T2=T2,r9 };;
223___
224$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
203{ .mmi; $LDW X[15]=[input],$SZ // X[i]=*input++
225// in 64-bit mode I load whole X[16] at once and take care of alignment...
226{ .mmi; add r8=1*$SZ,input
227 add r9=2*$SZ,input
228 add r10=3*$SZ,input };;
229{ .mmb; $LDW X[15]=[input],4*$SZ
230 $LDW X[14]=[r8],4*$SZ
231(p9) br.cond.dpnt.many .L1byte };;
232{ .mmb; $LDW X[13]=[r9],4*$SZ
233 $LDW X[12]=[r10],4*$SZ
234(p10) br.cond.dpnt.many .L2byte };;
235{ .mmb; $LDW X[11]=[input],4*$SZ
236 $LDW X[10]=[r8],4*$SZ
237(p11) br.cond.dpnt.many .L3byte };;
238{ .mmb; $LDW X[ 9]=[r9],4*$SZ
239 $LDW X[ 8]=[r10],4*$SZ
240(p12) br.cond.dpnt.many .L4byte };;
241{ .mmb; $LDW X[ 7]=[input],4*$SZ
242 $LDW X[ 6]=[r8],4*$SZ
243(p13) br.cond.dpnt.many .L5byte };;
244{ .mmb; $LDW X[ 5]=[r9],4*$SZ
245 $LDW X[ 4]=[r10],4*$SZ
246(p14) br.cond.dpnt.many .L6byte };;
247{ .mmb; $LDW X[ 3]=[input],4*$SZ
248 $LDW X[ 2]=[r8],4*$SZ
249(p15) br.cond.dpnt.many .L7byte };;
250{ .mmb; $LDW X[ 1]=[r9],4*$SZ
251 $LDW X[ 0]=[r10],4*$SZ
252 br.many .L_first16 };;
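// The .L1byte-.L7byte tails below service inputs misaligned by 1-7
// bytes: each word is fetched from the aligned-down address and
// adjacent pairs are stitched together with shrp, using a shift of
// 64-8*k bits for a k-byte misalignment (for little-endian hosts the
// post-processing regex at the bottom of the script swaps the shrp
// operands and adjusts the shift accordingly).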
253.L1byte:
254{ .mmi; $LDW X[13]=[r9],4*$SZ
255 $LDW X[12]=[r10],4*$SZ
256 shrp X[15]=X[15],X[14],56 };;
257{ .mmi; $LDW X[11]=[input],4*$SZ
258 $LDW X[10]=[r8],4*$SZ
259 shrp X[14]=X[14],X[13],56 }
260{ .mmi; $LDW X[ 9]=[r9],4*$SZ
261 $LDW X[ 8]=[r10],4*$SZ
262 shrp X[13]=X[13],X[12],56 };;
263{ .mmi; $LDW X[ 7]=[input],4*$SZ
264 $LDW X[ 6]=[r8],4*$SZ
265 shrp X[12]=X[12],X[11],56 }
266{ .mmi; $LDW X[ 5]=[r9],4*$SZ
267 $LDW X[ 4]=[r10],4*$SZ
268 shrp X[11]=X[11],X[10],56 };;
269{ .mmi; $LDW X[ 3]=[input],4*$SZ
270 $LDW X[ 2]=[r8],4*$SZ
271 shrp X[10]=X[10],X[ 9],56 }
272{ .mmi; $LDW X[ 1]=[r9],4*$SZ
273 $LDW X[ 0]=[r10],4*$SZ
274 shrp X[ 9]=X[ 9],X[ 8],56 };;
275{ .mii; $LDW T1=[input]
276 shrp X[ 8]=X[ 8],X[ 7],56
277 shrp X[ 7]=X[ 7],X[ 6],56 }
278{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
279 shrp X[ 5]=X[ 5],X[ 4],56 };;
280{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
281 shrp X[ 3]=X[ 3],X[ 2],56 }
282{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
283 shrp X[ 1]=X[ 1],X[ 0],56 }
284{ .mib; shrp X[ 0]=X[ 0],T1,56
285 br.many .L_first16 };;
286.L2byte:
287{ .mmi; $LDW X[11]=[input],4*$SZ
288 $LDW X[10]=[r8],4*$SZ
289 shrp X[15]=X[15],X[14],48 }
290{ .mmi; $LDW X[ 9]=[r9],4*$SZ
291 $LDW X[ 8]=[r10],4*$SZ
292 shrp X[14]=X[14],X[13],48 };;
293{ .mmi; $LDW X[ 7]=[input],4*$SZ
294 $LDW X[ 6]=[r8],4*$SZ
295 shrp X[13]=X[13],X[12],48 }
296{ .mmi; $LDW X[ 5]=[r9],4*$SZ
297 $LDW X[ 4]=[r10],4*$SZ
298 shrp X[12]=X[12],X[11],48 };;
299{ .mmi; $LDW X[ 3]=[input],4*$SZ
300 $LDW X[ 2]=[r8],4*$SZ
301 shrp X[11]=X[11],X[10],48 }
302{ .mmi; $LDW X[ 1]=[r9],4*$SZ
303 $LDW X[ 0]=[r10],4*$SZ
304 shrp X[10]=X[10],X[ 9],48 };;
305{ .mii; $LDW T1=[input]
306 shrp X[ 9]=X[ 9],X[ 8],48
307 shrp X[ 8]=X[ 8],X[ 7],48 }
308{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
309 shrp X[ 6]=X[ 6],X[ 5],48 };;
310{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
311 shrp X[ 4]=X[ 4],X[ 3],48 }
312{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
313 shrp X[ 2]=X[ 2],X[ 1],48 }
314{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
315 shrp X[ 0]=X[ 0],T1,48 }
316{ .mfb; br.many .L_first16 };;
317.L3byte:
318{ .mmi; $LDW X[ 9]=[r9],4*$SZ
319 $LDW X[ 8]=[r10],4*$SZ
320 shrp X[15]=X[15],X[14],40 };;
321{ .mmi; $LDW X[ 7]=[input],4*$SZ
322 $LDW X[ 6]=[r8],4*$SZ
323 shrp X[14]=X[14],X[13],40 }
324{ .mmi; $LDW X[ 5]=[r9],4*$SZ
325 $LDW X[ 4]=[r10],4*$SZ
326 shrp X[13]=X[13],X[12],40 };;
327{ .mmi; $LDW X[ 3]=[input],4*$SZ
328 $LDW X[ 2]=[r8],4*$SZ
329 shrp X[12]=X[12],X[11],40 }
330{ .mmi; $LDW X[ 1]=[r9],4*$SZ
331 $LDW X[ 0]=[r10],4*$SZ
332 shrp X[11]=X[11],X[10],40 };;
333{ .mii; $LDW T1=[input]
334 shrp X[10]=X[10],X[ 9],40
335 shrp X[ 9]=X[ 9],X[ 8],40 }
336{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
337 shrp X[ 7]=X[ 7],X[ 6],40 };;
338{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
339 shrp X[ 5]=X[ 5],X[ 4],40 }
340{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
341 shrp X[ 3]=X[ 3],X[ 2],40 }
342{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
343 shrp X[ 1]=X[ 1],X[ 0],40 }
344{ .mib; shrp X[ 0]=X[ 0],T1,40
345 br.many .L_first16 };;
346.L4byte:
347{ .mmi; $LDW X[ 7]=[input],4*$SZ
348 $LDW X[ 6]=[r8],4*$SZ
349 shrp X[15]=X[15],X[14],32 }
350{ .mmi; $LDW X[ 5]=[r9],4*$SZ
351 $LDW X[ 4]=[r10],4*$SZ
352 shrp X[14]=X[14],X[13],32 };;
353{ .mmi; $LDW X[ 3]=[input],4*$SZ
354 $LDW X[ 2]=[r8],4*$SZ
355 shrp X[13]=X[13],X[12],32 }
356{ .mmi; $LDW X[ 1]=[r9],4*$SZ
357 $LDW X[ 0]=[r10],4*$SZ
358 shrp X[12]=X[12],X[11],32 };;
359{ .mii; $LDW T1=[input]
360 shrp X[11]=X[11],X[10],32
361 shrp X[10]=X[10],X[ 9],32 }
362{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
363 shrp X[ 8]=X[ 8],X[ 7],32 };;
364{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
365 shrp X[ 6]=X[ 6],X[ 5],32 }
366{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
367 shrp X[ 4]=X[ 4],X[ 3],32 }
368{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
369 shrp X[ 2]=X[ 2],X[ 1],32 }
370{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
371 shrp X[ 0]=X[ 0],T1,32 }
372{ .mfb; br.many .L_first16 };;
373.L5byte:
374{ .mmi; $LDW X[ 5]=[r9],4*$SZ
375 $LDW X[ 4]=[r10],4*$SZ
376 shrp X[15]=X[15],X[14],24 };;
377{ .mmi; $LDW X[ 3]=[input],4*$SZ
378 $LDW X[ 2]=[r8],4*$SZ
379 shrp X[14]=X[14],X[13],24 }
380{ .mmi; $LDW X[ 1]=[r9],4*$SZ
381 $LDW X[ 0]=[r10],4*$SZ
382 shrp X[13]=X[13],X[12],24 };;
383{ .mii; $LDW T1=[input]
384 shrp X[12]=X[12],X[11],24
385 shrp X[11]=X[11],X[10],24 }
386{ .mii; shrp X[10]=X[10],X[ 9],24
387 shrp X[ 9]=X[ 9],X[ 8],24 };;
388{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
389 shrp X[ 7]=X[ 7],X[ 6],24 }
390{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
391 shrp X[ 5]=X[ 5],X[ 4],24 }
392{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
393 shrp X[ 3]=X[ 3],X[ 2],24 }
394{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
395 shrp X[ 1]=X[ 1],X[ 0],24 }
396{ .mib; shrp X[ 0]=X[ 0],T1,24
397 br.many .L_first16 };;
398.L6byte:
399{ .mmi; $LDW X[ 3]=[input],4*$SZ
400 $LDW X[ 2]=[r8],4*$SZ
401 shrp X[15]=X[15],X[14],16 }
402{ .mmi; $LDW X[ 1]=[r9],4*$SZ
403 $LDW X[ 0]=[r10],4*$SZ
404 shrp X[14]=X[14],X[13],16 };;
405{ .mii; $LDW T1=[input]
406 shrp X[13]=X[13],X[12],16
407 shrp X[12]=X[12],X[11],16 }
408{ .mii; shrp X[11]=X[11],X[10],16
409 shrp X[10]=X[10],X[ 9],16 };;
410{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
411 shrp X[ 8]=X[ 8],X[ 7],16 }
412{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
413 shrp X[ 6]=X[ 6],X[ 5],16 }
414{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
415 shrp X[ 4]=X[ 4],X[ 3],16 }
416{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
417 shrp X[ 2]=X[ 2],X[ 1],16 }
418{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
419 shrp X[ 0]=X[ 0],T1,16 }
420{ .mfb; br.many .L_first16 };;
421.L7byte:
422{ .mmi; $LDW X[ 1]=[r9],4*$SZ
423 $LDW X[ 0]=[r10],4*$SZ
424 shrp X[15]=X[15],X[14],8 };;
425{ .mii; $LDW T1=[input]
426 shrp X[14]=X[14],X[13],8
427 shrp X[13]=X[13],X[12],8 }
428{ .mii; shrp X[12]=X[12],X[11],8
429 shrp X[11]=X[11],X[10],8 };;
430{ .mii; shrp X[10]=X[10],X[ 9],8
431 shrp X[ 9]=X[ 9],X[ 8],8 }
432{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
433 shrp X[ 7]=X[ 7],X[ 6],8 }
434{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
435 shrp X[ 5]=X[ 5],X[ 4],8 }
436{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
437 shrp X[ 3]=X[ 3],X[ 2],8 }
438{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
439 shrp X[ 1]=X[ 1],X[ 0],8 }
440{ .mib; shrp X[ 0]=X[ 0],T1,8
441 br.many .L_first16 };;
442
443.align 32
444.L_first16:
445{ .mmi; $LDW K=[Ktbl],$SZ
204 and T1=F,E
205 and T2=A,B }
446 and T1=F,E
447 and T2=A,B }
206{ .mmi; $LDW K=[Ktbl],$SZ
448{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++
207 andcm r8=G,E
208 and r9=A,C };;
209{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
210 and r10=B,C
211 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
212{ .mmi; xor T2=T2,r9
213 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
214___

--- 16 unchanged lines hidden ---

231 mov C=B };;
232{ .mib; add T1=T1,X[15] // T1+=X[i]
233 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
234{ .mib; xor r10=r10,r11
235 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
236{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
237 mov B=A
238 add A=T1,T2 };;
449 andcm r8=G,E
450 and r9=A,C };;
451{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
452 and r10=B,C
453 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
454{ .mmi; xor T2=T2,r9
455 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
456___

--- 16 unchanged lines hidden ---

473 mov C=B };;
474{ .mib; add T1=T1,X[15] // T1+=X[i]
475 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
476{ .mib; xor r10=r10,r11
477 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
478{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
479 mov B=A
480 add A=T1,T2 };;
239.L_first16_ctop:
240{ .mib; add E=E,T1
241 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
242 br.ctop.sptk .L_first16 };;
481{ .mib; add E=E,T1
482 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
483 br.ctop.sptk .L_first16 };;
484.L_first16_end:
243
485
244{ .mib; mov ar.lc=$rounds-17 }
245{ .mib; mov ar.ec=1 };;
486{ .mii; mov ar.lc=$rounds-17
487 mov ar.ec=1 };;
488
246.align 32
247.L_rest:
248.rotr X[16]
249{ .mib; $LDW K=[Ktbl],$SZ
250 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
251{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
252 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
253{ .mib; and T1=F,E

--- 52 unchanged lines hidden ---

306{ .mib; mov D=C
307 mov C=B };;
308{ .mmi; add T1=T1,X[15] // T1+=X[i]
309 xor r10=r10,r11
310 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
311{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
312 mov B=A
313 add A=T1,T2 };;
489.align 32
490.L_rest:
491.rotr X[16]
492{ .mib; $LDW K=[Ktbl],$SZ
493 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
494{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
495 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
496{ .mib; and T1=F,E

--- 52 unchanged lines hidden ---

549{ .mib; mov D=C
550 mov C=B };;
551{ .mmi; add T1=T1,X[15] // T1+=X[i]
552 xor r10=r10,r11
553 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
554{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
555 mov B=A
556 add A=T1,T2 };;
314.L_rest_ctop:
315{ .mib; add E=E,T1
316 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
317 br.ctop.sptk .L_rest };;
557{ .mib; add E=E,T1
558 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
559 br.ctop.sptk .L_rest };;
560.L_rest_end:
318
561
562{ .mmi; add A_=A_,A
563 add B_=B_,B
564 add C_=C_,C }
565{ .mmi; add D_=D_,D
566 add E_=E_,E
567 cmp.ltu p16,p0=1,num };;
568{ .mmi; add F_=F_,F
569 add G_=G_,G
570 add H_=H_,H }
571{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
572(p16) add num=-1,num
573(p16) br.dptk.many .L_outer };;
574
319{ .mib; add r8=0*$SZ,ctx
320 add r9=1*$SZ,ctx }
321{ .mib; add r10=2*$SZ,ctx
322 add r11=3*$SZ,ctx };;
575{ .mib; add r8=0*$SZ,ctx
576 add r9=1*$SZ,ctx }
577{ .mib; add r10=2*$SZ,ctx
578 add r11=3*$SZ,ctx };;
323{ .mmi; $LDW r32=[r8],4*$SZ
324 $LDW r33=[r9],4*$SZ }
325{ .mmi; $LDW r34=[r10],4*$SZ
326 $LDW r35=[r11],4*$SZ
327 cmp.ltu p6,p7=1,num };;
328{ .mmi; $LDW r36=[r8],-4*$SZ
329 $LDW r37=[r9],-4*$SZ
330(p6) add Ktbl=-$SZ*$rounds,Ktbl }
331{ .mmi; $LDW r38=[r10],-4*$SZ
332 $LDW r39=[r11],-4*$SZ
333(p7) mov ar.lc=r3 };;
334{ .mmi; add A=A,r32
335 add B=B,r33
336 add C=C,r34 }
337{ .mmi; add D=D,r35
338 add E=E,r36
339 add F=F,r37 };;
340{ .mmi; $STW [r8]=A,4*$SZ
341 $STW [r9]=B,4*$SZ
342 add G=G,r38 }
343{ .mmi; $STW [r10]=C,4*$SZ
344 $STW [r11]=D,4*$SZ
345 add H=H,r39 };;
346{ .mmi; $STW [r8]=E
347 $STW [r9]=F
348(p6) add num=-1,num }
349{ .mmb; $STW [r10]=G
350 $STW [r11]=H
351(p6) br.dptk.many .L_outer };;
352
353{ .mib; mov pr=prsave,0x1ffff
579{ .mmi; $STW [r8]=A_,4*$SZ
580 $STW [r9]=B_,4*$SZ
581 mov ar.lc=lcsave }
582{ .mmi; $STW [r10]=C_,4*$SZ
583 $STW [r11]=D_,4*$SZ
584 mov pr=prsave,0x1ffff };;
585{ .mmb; $STW [r8]=E_
586 $STW [r9]=F_ }
587{ .mmb; $STW [r10]=G_
588 $STW [r11]=H_
354 br.ret.sptk.many b0 };;
355.endp $func#
356___
357
358$code =~ s/\`([^\`]*)\`/eval $1/gem;
359$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
360if ($BITS==64) {
361 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
589 br.ret.sptk.many b0 };;
590.endp $func#
591___
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
595if ($BITS==64) {
596 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
362 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
597 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
598 $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
599 if (!$big_endian);
600 $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
363}
364
365print $code;
366
367print<<___ if ($BITS==32);
368.align 64
369.type K256#,\@object
370K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5

--- 8 unchanged lines hidden ---

379 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
380 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
381 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
382 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
383 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
384 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
385 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
386.size K256#,$SZ*$rounds
601}
602
603print $code;
604
605print<<___ if ($BITS==32);
606.align 64
607.type K256#,\@object
608K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5

--- 8 unchanged lines hidden (view full) ---

617 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
618 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
619 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
620 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
621 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
622 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
623 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
624.size K256#,$SZ*$rounds
625stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
387___
388print<<___ if ($BITS==64);
389.align 64
390.type K512#,\@object
391K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
392 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
393 data8 0x3956c25bf348b538,0x59f111f1b605d019
394 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118

--- 29 unchanged lines hidden ---

424 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
425 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
426 data8 0x113f9804bef90dae,0x1b710b35131c471b
427 data8 0x28db77f523047d84,0x32caab7b40c72493
428 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
429 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
430 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
431.size K512#,$SZ*$rounds
626___
627print<<___ if ($BITS==64);
628.align 64
629.type K512#,\@object
630K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
631 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
632 data8 0x3956c25bf348b538,0x59f111f1b605d019
633 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118

--- 29 unchanged lines hidden ---

663 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
664 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
665 data8 0x113f9804bef90dae,0x1b710b35131c471b
666 data8 0x28db77f523047d84,0x32caab7b40c72493
667 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
668 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
669 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
670.size K512#,$SZ*$rounds
671stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
432___
672___