Deleted Added
full compact
ia64.S (89837) ia64.S (111147)
1.explicit
2.text
1.explicit
2.text
3.ident "ia64.S, Version 1.1"
3.ident "ia64.S, Version 2.0"
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
16// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
17// different from Itanium to this module viewpoint. Most notably, is it
18// "wider" than Itanium? Can you experience loop scalability as
19// discussed in commentary sections? Not really:-( Itanium2 has 6
20// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22// ports is the same, i.e. 2, while I need 4. In other words, to this
23// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24// essentially different in respect to this module, and a re-tune was
25// required. Well, because some instruction latencies have changed. Most
26// noticeably those intensively used:
27//
28// Itanium Itanium2
29// ldf8 9 6 L2 hit
30// ld8 2 1 L1 hit
31// getf 2 5
32// xma[->getf] 7[+1] 4[+0]
33// add[->st8] 1[+1] 1[+0]
34//
35// What does it mean? You might ratiocinate that the original code
36// should run just faster... Because sum of latencies is smaller...
37// Wrong! Note that getf latency increased. This means that if a loop is
38// scheduled for lower latency (and they are), then it will suffer from
39// stall condition and the code will therefore turn anti-scalable, e.g.
40// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43// for worst latency for every instruction aiming for best *all-round*
44// performance.
16
17// Q. How much faster does it get?
18// A. Here is the output from 'openssl speed rsa dsa' for vanilla
19// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
20// Linux 7.1 2.96-81):
21//
22// sign verify sign/s verify/s
23// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2

--- 120 unchanged lines hidden (view full) ---

144(p6) br.ret.spnt.many b0 };;
145
146 .save ar.lc,r3
147{ .mib; sub r10=r35,r0,1
148 mov r3=ar.lc
149 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
150 }
151 .body
45
46// Q. How much faster does it get?
47// A. Here is the output from 'openssl speed rsa dsa' for vanilla
48// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
49// Linux 7.1 2.96-81):
50//
51// sign verify sign/s verify/s
52// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2

--- 120 unchanged lines hidden (view full) ---

173(p6) br.ret.spnt.many b0 };;
174
175 .save ar.lc,r3
176{ .mib; sub r10=r35,r0,1
177 mov r3=ar.lc
178 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
179 }
180 .body
152{ .mib; mov r14=r32 // rp
181{ .mib;
182#if defined(_HPUX_SOURCE) && defined(_ILP32)
183 addp4 r14=0,r32 // rp
184#else
185 mov r14=r32 // rp
186#endif
153 mov r9=pr };;
187 mov r9=pr };;
154{ .mii; mov r15=r33 // ap
188{ .mii;
189#if defined(_HPUX_SOURCE) && defined(_ILP32)
190 addp4 r15=0,r33 // ap
191#else
192 mov r15=r33 // ap
193#endif
155 mov ar.lc=r10
156 mov ar.ec=6 }
194 mov ar.lc=r10
195 mov ar.ec=6 }
157{ .mib; mov r16=r34 // bp
196{ .mib;
197#if defined(_HPUX_SOURCE) && defined(_ILP32)
198 addp4 r16=0,r34 // bp
199#else
200 mov r16=r34 // bp
201#endif
158 mov pr.rot=1<<16 };;
159
160.L_bn_add_words_ctop:
161{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
162 (p18) add r39=r37,r34
163 (p19) cmp.ltu.unc p56,p0=r40,r38 }
164{ .mfb; (p0) nop.m 0x0
165 (p0) nop.f 0x0
166 (p0) nop.b 0x0 }
167{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
168 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
169 (p58) add r41=1,r41 } // (p20)
170{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
171 (p0) nop.f 0x0
172 br.ctop.sptk .L_bn_add_words_ctop };;
173.L_bn_add_words_cend:
174
175{ .mii;
176(p59) add r8=1,r8 // return value
202 mov pr.rot=1<<16 };;
203
204.L_bn_add_words_ctop:
205{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
206 (p18) add r39=r37,r34
207 (p19) cmp.ltu.unc p56,p0=r40,r38 }
208{ .mfb; (p0) nop.m 0x0
209 (p0) nop.f 0x0
210 (p0) nop.b 0x0 }
211{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
212 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
213 (p58) add r41=1,r41 } // (p20)
214{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
215 (p0) nop.f 0x0
216 br.ctop.sptk .L_bn_add_words_ctop };;
217.L_bn_add_words_cend:
218
219{ .mii;
220(p59) add r8=1,r8 // return value
177 mov pr=r9,-1
221 mov pr=r9,0x1ffff
178 mov ar.lc=r3 }
179{ .mbb; nop.b 0x0
180 br.ret.sptk.many b0 };;
181.endp bn_add_words#
182
183//
184// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
185//

--- 11 unchanged lines hidden (view full) ---

197(p6) br.ret.spnt.many b0 };;
198
199 .save ar.lc,r3
200{ .mib; sub r10=r35,r0,1
201 mov r3=ar.lc
202 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
203 }
204 .body
222 mov ar.lc=r3 }
223{ .mbb; nop.b 0x0
224 br.ret.sptk.many b0 };;
225.endp bn_add_words#
226
227//
228// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
229//

--- 11 unchanged lines hidden (view full) ---

241(p6) br.ret.spnt.many b0 };;
242
243 .save ar.lc,r3
244{ .mib; sub r10=r35,r0,1
245 mov r3=ar.lc
246 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
247 }
248 .body
205{ .mib; mov r14=r32 // rp
249{ .mib;
250#if defined(_HPUX_SOURCE) && defined(_ILP32)
251 addp4 r14=0,r32 // rp
252#else
253 mov r14=r32 // rp
254#endif
206 mov r9=pr };;
255 mov r9=pr };;
207{ .mii; mov r15=r33 // ap
256{ .mii;
257#if defined(_HPUX_SOURCE) && defined(_ILP32)
258 addp4 r15=0,r33 // ap
259#else
260 mov r15=r33 // ap
261#endif
208 mov ar.lc=r10
209 mov ar.ec=6 }
262 mov ar.lc=r10
263 mov ar.ec=6 }
210{ .mib; mov r16=r34 // bp
264{ .mib;
265#if defined(_HPUX_SOURCE) && defined(_ILP32)
266 addp4 r16=0,r34 // bp
267#else
268 mov r16=r34 // bp
269#endif
211 mov pr.rot=1<<16 };;
212
213.L_bn_sub_words_ctop:
214{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
215 (p18) sub r39=r37,r34
216 (p19) cmp.gtu.unc p56,p0=r40,r38 }
217{ .mfb; (p0) nop.m 0x0
218 (p0) nop.f 0x0
219 (p0) nop.b 0x0 }
220{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
221 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
222 (p58) add r41=-1,r41 } // (p20)
223{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
224 (p0) nop.b 0x0
225 br.ctop.sptk .L_bn_sub_words_ctop };;
226.L_bn_sub_words_cend:
227
228{ .mii;
229(p59) add r8=1,r8 // return value
270 mov pr.rot=1<<16 };;
271
272.L_bn_sub_words_ctop:
273{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
274 (p18) sub r39=r37,r34
275 (p19) cmp.gtu.unc p56,p0=r40,r38 }
276{ .mfb; (p0) nop.m 0x0
277 (p0) nop.f 0x0
278 (p0) nop.b 0x0 }
279{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
280 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
281 (p58) add r41=-1,r41 } // (p20)
282{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
283 (p0) nop.b 0x0
284 br.ctop.sptk .L_bn_sub_words_ctop };;
285.L_bn_sub_words_cend:
286
287{ .mii;
288(p59) add r8=1,r8 // return value
230 mov pr=r9,-1
289 mov pr=r9,0x1ffff
231 mov ar.lc=r3 }
232{ .mbb; nop.b 0x0
233 br.ret.sptk.many b0 };;
234.endp bn_sub_words#
235#endif
236
237#if 0
238#define XMA_TEMPTATION

--- 9 unchanged lines hidden (view full) ---

248.skip 32 // makes the loop body aligned at 64-byte boundary
249bn_mul_words:
250 .prologue
251 .fframe 0
252 .save ar.pfs,r2
253#ifdef XMA_TEMPTATION
254{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
255#else
290 mov ar.lc=r3 }
291{ .mbb; nop.b 0x0
292 br.ret.sptk.many b0 };;
293.endp bn_sub_words#
294#endif
295
296#if 0
297#define XMA_TEMPTATION

--- 9 unchanged lines hidden (view full) ---

307.skip 32 // makes the loop body aligned at 64-byte boundary
308bn_mul_words:
309 .prologue
310 .fframe 0
311 .save ar.pfs,r2
312#ifdef XMA_TEMPTATION
313{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
314#else
256{ .mfi; alloc r2=ar.pfs,4,4,0,8 };;
315{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
257#endif
258{ .mib; mov r8=r0 // return value
259 cmp4.le p6,p0=r34,r0
260(p6) br.ret.spnt.many b0 };;
261
262 .save ar.lc,r3
263{ .mii; sub r10=r34,r0,1
264 mov r3=ar.lc
265 mov r9=pr };;
266
267 .body
268{ .mib; setf.sig f8=r35 // w
316#endif
317{ .mib; mov r8=r0 // return value
318 cmp4.le p6,p0=r34,r0
319(p6) br.ret.spnt.many b0 };;
320
321 .save ar.lc,r3
322{ .mii; sub r10=r34,r0,1
323 mov r3=ar.lc
324 mov r9=pr };;
325
326 .body
327{ .mib; setf.sig f8=r35 // w
269 mov pr.rot=0x400001<<16
270 // ------^----- serves as (p48) at first (p26)
328 mov pr.rot=0x800001<<16
329 // ------^----- serves as (p50) at first (p27)
271 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
272 }
273
274#ifndef XMA_TEMPTATION
275
330 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
331 }
332
333#ifndef XMA_TEMPTATION
334
276{ .mii; mov r14=r32 // rp
277 mov r15=r33 // ap
335{ .mii;
336#if defined(_HPUX_SOURCE) && defined(_ILP32)
337 addp4 r14=0,r32 // rp
338 addp4 r15=0,r33 // ap
339#else
340 mov r14=r32 // rp
341 mov r15=r33 // ap
342#endif
278 mov ar.lc=r10 }
343 mov ar.lc=r10 }
279{ .mii; mov r39=0 // serves as r33 at first (p26)
280 mov ar.ec=12 };;
344{ .mii; mov r40=0 // serves as r35 at first (p27)
345 mov ar.ec=13 };;
281
346
282// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2
283// cache (i.e. 9 ticks away) as floating point load/store instructions
347// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
348// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
284// bypass L1 cache and L2 latency is actually best-case scenario for
349// bypass L1 cache and L2 latency is actually best-case scenario for
285// ldf8. The loop is not scalable and shall run in 2*(n+11) even on
286// "wider" IA-64 implementations. It's a trade-off here. n+22 loop
350// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
351// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
287// would give us ~5% in *overall* performance improvement on "wider"
288// IA-64, but would hurt Itanium for about same because of longer
289// epilogue. As it's a matter of few percents in either case I've
290// chosen to trade the scalability for development time (you can see
291// this very instruction sequence in bn_mul_add_words loop which in
292// turn is scalable).
293.L_bn_mul_words_ctop:
352// would give us ~5% in *overall* performance improvement on "wider"
353// IA-64, but would hurt Itanium for about same because of longer
354// epilogue. As it's a matter of few percents in either case I've
355// chosen to trade the scalability for development time (you can see
356// this very instruction sequence in bn_mul_add_words loop which in
357// turn is scalable).
358.L_bn_mul_words_ctop:
294{ .mfi; (p25) getf.sig r36=f49 // low
295 (p21) xmpy.lu f45=f37,f8
296 (p27) cmp.ltu p52,p48=r39,r38 }
359{ .mfi; (p25) getf.sig r36=f52 // low
360 (p21) xmpy.lu f48=f37,f8
361 (p28) cmp.ltu p54,p50=r41,r39 }
297{ .mfi; (p16) ldf8 f32=[r15],8
362{ .mfi; (p16) ldf8 f32=[r15],8
298 (p21) xmpy.hu f38=f37,f8
363 (p21) xmpy.hu f40=f37,f8
299 (p0) nop.i 0x0 };;
364 (p0) nop.i 0x0 };;
300{ .mii; (p26) getf.sig r32=f43 // high
301 .pred.rel "mutex",p48,p52
302 (p48) add r38=r37,r33 // (p26)
303 (p52) add r38=r37,r33,1 } // (p26)
304{ .mfb; (p27) st8 [r14]=r39,8
365{ .mii; (p25) getf.sig r32=f44 // high
366 .pred.rel "mutex",p50,p54
367 (p50) add r40=r38,r35 // (p27)
368 (p54) add r40=r38,r35,1 } // (p27)
369{ .mfb; (p28) st8 [r14]=r41,8
305 (p0) nop.f 0x0
306 br.ctop.sptk .L_bn_mul_words_ctop };;
307.L_bn_mul_words_cend:
308
309{ .mii; nop.m 0x0
370 (p0) nop.f 0x0
371 br.ctop.sptk .L_bn_mul_words_ctop };;
372.L_bn_mul_words_cend:
373
374{ .mii; nop.m 0x0
310.pred.rel "mutex",p49,p53
311(p49) add r8=r34,r0
312(p53) add r8=r34,r0,1 }
375.pred.rel "mutex",p51,p55
376(p51) add r8=r36,r0
377(p55) add r8=r36,r0,1 }
313{ .mfb; nop.m 0x0
314 nop.f 0x0
315 nop.b 0x0 }
316
317#else // XMA_TEMPTATION
318
319 setf.sig f37=r0 // serves as carry at (p18) tick
320 mov ar.lc=r10

--- 18 unchanged lines hidden (view full) ---

339 br.ctop.sptk .L_bn_mul_words_ctop };;
340.L_bn_mul_words_cend:
341
342 getf.sig r8=f41 // the return value
343
344#endif // XMA_TEMPTATION
345
346{ .mii; nop.m 0x0
378{ .mfb; nop.m 0x0
379 nop.f 0x0
380 nop.b 0x0 }
381
382#else // XMA_TEMPTATION
383
384 setf.sig f37=r0 // serves as carry at (p18) tick
385 mov ar.lc=r10

--- 18 unchanged lines hidden (view full) ---

404 br.ctop.sptk .L_bn_mul_words_ctop };;
405.L_bn_mul_words_cend:
406
407 getf.sig r8=f41 // the return value
408
409#endif // XMA_TEMPTATION
410
411{ .mii; nop.m 0x0
347 mov pr=r9,-1
412 mov pr=r9,0x1ffff
348 mov ar.lc=r3 }
349{ .mfb; rum 1<<5 // clear um.mfh
350 nop.f 0x0
351 br.ret.sptk.many b0 };;
352.endp bn_mul_words#
353#endif
354
355#if 1

--- 15 unchanged lines hidden (view full) ---

371
372 .save ar.lc,r3
373{ .mii; sub r10=r34,r0,1
374 mov r3=ar.lc
375 mov r9=pr };;
376
377 .body
378{ .mib; setf.sig f8=r35 // w
413 mov ar.lc=r3 }
414{ .mfb; rum 1<<5 // clear um.mfh
415 nop.f 0x0
416 br.ret.sptk.many b0 };;
417.endp bn_mul_words#
418#endif
419
420#if 1

--- 15 unchanged lines hidden (view full) ---

436
437 .save ar.lc,r3
438{ .mii; sub r10=r34,r0,1
439 mov r3=ar.lc
440 mov r9=pr };;
441
442 .body
443{ .mib; setf.sig f8=r35 // w
379 mov pr.rot=0x400001<<16
380 // ------^----- serves as (p48) at first (p26)
444 mov pr.rot=0x800001<<16
445 // ------^----- serves as (p50) at first (p27)
381 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
382 }
446 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
447 }
383{ .mii; mov r14=r32 // rp
384 mov r15=r33 // ap
448{ .mii;
449#if defined(_HPUX_SOURCE) && defined(_ILP32)
450 addp4 r14=0,r32 // rp
451 addp4 r15=0,r33 // ap
452#else
453 mov r14=r32 // rp
454 mov r15=r33 // ap
455#endif
385 mov ar.lc=r10 }
456 mov ar.lc=r10 }
386{ .mii; mov r39=0 // serves as r33 at first (p26)
387 mov r18=r32 // rp copy
388 mov ar.ec=14 };;
457{ .mii; mov r40=0 // serves as r35 at first (p27)
458#if defined(_HPUX_SOURCE) && defined(_ILP32)
459 addp4 r18=0,r32 // rp copy
460#else
461 mov r18=r32 // rp copy
462#endif
463 mov ar.ec=15 };;
389
464
390// This loop spins in 3*(n+13) ticks on Itanium and should spin in
391// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
465// This loop spins in 3*(n+14) ticks on Itanium and should spin in
466// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
392// micro-architecture manuals as they become available). As usual it's
393// possible to compress the epilogue, down to 10 in this case, at the
394// cost of scalability. Compressed (and therefore non-scalable) loop
467// micro-architecture manuals as they become available). As usual it's
468// possible to compress the epilogue, down to 10 in this case, at the
469// cost of scalability. Compressed (and therefore non-scalable) loop
395// running at 3*(n+10) would buy you ~10% on Itanium but take ~35%
470// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
396// from "wider" IA-64 so let it be scalable! Special attention was
397// paid for having the loop body split at 64-byte boundary. ld8 is
398// scheduled for L1 cache as the data is more than likely there.
399// Indeed, bn_mul_words has put it there a moment ago:-)
400.L_bn_mul_add_words_ctop:
471// from "wider" IA-64 so let it be scalable! Special attention was
472// paid for having the loop body split at 64-byte boundary. ld8 is
473// scheduled for L1 cache as the data is more than likely there.
474// Indeed, bn_mul_words has put it there a moment ago:-)
475.L_bn_mul_add_words_ctop:
401{ .mfi; (p25) getf.sig r36=f49 // low
402 (p21) xmpy.lu f45=f37,f8
403 (p27) cmp.ltu p52,p48=r39,r38 }
476{ .mfi; (p25) getf.sig r36=f52 // low
477 (p21) xmpy.lu f48=f37,f8
478 (p28) cmp.ltu p54,p50=r41,r39 }
404{ .mfi; (p16) ldf8 f32=[r15],8
479{ .mfi; (p16) ldf8 f32=[r15],8
405 (p21) xmpy.hu f38=f37,f8
406 (p27) add r43=r43,r39 };;
407{ .mii; (p26) getf.sig r32=f43 // high
408 .pred.rel "mutex",p48,p52
409 (p48) add r38=r37,r33 // (p26)
410 (p52) add r38=r37,r33,1 } // (p26)
411{ .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39
480 (p21) xmpy.hu f40=f37,f8
481 (p28) add r45=r45,r41 };;
482{ .mii; (p25) getf.sig r32=f44 // high
483 .pred.rel "mutex",p50,p54
484 (p50) add r40=r38,r35 // (p27)
485 (p54) add r40=r38,r35,1 } // (p27)
486{ .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41
412 (p0) nop.f 0x0
413 (p0) nop.b 0x0 }
487 (p0) nop.f 0x0
488 (p0) nop.b 0x0 }
414{ .mii; (p26) ld8 r42=[r18],8
415 (p58) cmp.eq.or p57,p0=-1,r44
416 (p58) add r44=1,r44 }
417{ .mfb; (p29) st8 [r14]=r45,8
489{ .mii; (p27) ld8 r44=[r18],8
490 (p62) cmp.eq.or p61,p0=-1,r46
491 (p62) add r46=1,r46 }
492{ .mfb; (p30) st8 [r14]=r47,8
418 (p0) nop.f 0x0
419 br.ctop.sptk .L_bn_mul_add_words_ctop};;
420.L_bn_mul_add_words_cend:
421
422{ .mii; nop.m 0x0
493 (p0) nop.f 0x0
494 br.ctop.sptk .L_bn_mul_add_words_ctop};;
495.L_bn_mul_add_words_cend:
496
497{ .mii; nop.m 0x0
423.pred.rel "mutex",p51,p55
424(p51) add r8=r36,r0
425(p55) add r8=r36,r0,1 }
498.pred.rel "mutex",p53,p57
499(p53) add r8=r38,r0
500(p57) add r8=r38,r0,1 }
426{ .mfb; nop.m 0x0
427 nop.f 0x0
428 nop.b 0x0 };;
429{ .mii;
501{ .mfb; nop.m 0x0
502 nop.f 0x0
503 nop.b 0x0 };;
504{ .mii;
430(p59) add r8=1,r8
431 mov pr=r9,-1
505(p63) add r8=1,r8
506 mov pr=r9,0x1ffff
432 mov ar.lc=r3 }
433{ .mfb; rum 1<<5 // clear um.mfh
434 nop.f 0x0
435 br.ret.sptk.many b0 };;
436.endp bn_mul_add_words#
437#endif
438
439#if 1

--- 16 unchanged lines hidden (view full) ---

456(p6) br.ret.spnt.many b0 };;
457
458 .save ar.lc,r3
459{ .mii; sub r10=r34,r0,1
460 mov r3=ar.lc
461 mov r9=pr };;
462
463 .body
507 mov ar.lc=r3 }
508{ .mfb; rum 1<<5 // clear um.mfh
509 nop.f 0x0
510 br.ret.sptk.many b0 };;
511.endp bn_mul_add_words#
512#endif
513
514#if 1

--- 16 unchanged lines hidden (view full) ---

531(p6) br.ret.spnt.many b0 };;
532
533 .save ar.lc,r3
534{ .mii; sub r10=r34,r0,1
535 mov r3=ar.lc
536 mov r9=pr };;
537
538 .body
539#if defined(_HPUX_SOURCE) && defined(_ILP32)
540{ .mii; addp4 r32=0,r32
541 addp4 r33=0,r33 };;
542#endif
464{ .mib;
465 mov pr.rot=1<<16
466 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
467 }
468{ .mii; add r34=8,r32
469 mov ar.lc=r10
470 mov ar.ec=18 };;
471

--- 15 unchanged lines hidden (view full) ---

487 (p25) xmpy.hu f52=f41,f41
488 (p0) nop.i 0x0 }
489{ .mib; (p33) stf8 [r34]=f60,16
490 (p0) nop.i 0x0
491 br.ctop.sptk .L_bn_sqr_words_ctop };;
492.L_bn_sqr_words_cend:
493
494{ .mii; nop.m 0x0
543{ .mib;
544 mov pr.rot=1<<16
545 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
546 }
547{ .mii; add r34=8,r32
548 mov ar.lc=r10
549 mov ar.ec=18 };;
550

--- 15 unchanged lines hidden (view full) ---

566 (p25) xmpy.hu f52=f41,f41
567 (p0) nop.i 0x0 }
568{ .mib; (p33) stf8 [r34]=f60,16
569 (p0) nop.i 0x0
570 br.ctop.sptk .L_bn_sqr_words_ctop };;
571.L_bn_sqr_words_cend:
572
573{ .mii; nop.m 0x0
495 mov pr=r9,-1
574 mov pr=r9,0x1ffff
496 mov ar.lc=r3 }
497{ .mfb; rum 1<<5 // clear um.mfh
498 nop.f 0x0
499 br.ret.sptk.many b0 };;
500.endp bn_sqr_words#
501#endif
502
503#if 1

--- 17 unchanged lines hidden (view full) ---

521//
522.global bn_sqr_comba8#
523.proc bn_sqr_comba8#
524.align 64
525bn_sqr_comba8:
526 .prologue
527 .fframe 0
528 .save ar.pfs,r2
575 mov ar.lc=r3 }
576{ .mfb; rum 1<<5 // clear um.mfh
577 nop.f 0x0
578 br.ret.sptk.many b0 };;
579.endp bn_sqr_words#
580#endif
581
582#if 1

--- 17 unchanged lines hidden (view full) ---

600//
601.global bn_sqr_comba8#
602.proc bn_sqr_comba8#
603.align 64
604bn_sqr_comba8:
605 .prologue
606 .fframe 0
607 .save ar.pfs,r2
608#if defined(_HPUX_SOURCE) && defined(_ILP32)
529{ .mii; alloc r2=ar.pfs,2,1,0,0
609{ .mii; alloc r2=ar.pfs,2,1,0,0
610 addp4 r33=0,r33
611 addp4 r32=0,r32 };;
612{ .mii;
613#else
614{ .mii; alloc r2=ar.pfs,2,1,0,0
615#endif
530 mov r34=r33
531 add r14=8,r33 };;
532 .body
533{ .mii; add r17=8,r34
534 add r15=16,r33
535 add r18=16,r34 }
536{ .mfb; add r16=24,r33
537 br .L_cheat_entry_point8 };;

--- 44 unchanged lines hidden (view full) ---

582#define carry3 r34
583.global bn_mul_comba8#
584.proc bn_mul_comba8#
585.align 64
586bn_mul_comba8:
587 .prologue
588 .fframe 0
589 .save ar.pfs,r2
616 mov r34=r33
617 add r14=8,r33 };;
618 .body
619{ .mii; add r17=8,r34
620 add r15=16,r33
621 add r18=16,r34 }
622{ .mfb; add r16=24,r33
623 br .L_cheat_entry_point8 };;

--- 44 unchanged lines hidden (view full) ---

668#define carry3 r34
669.global bn_mul_comba8#
670.proc bn_mul_comba8#
671.align 64
672bn_mul_comba8:
673 .prologue
674 .fframe 0
675 .save ar.pfs,r2
676#if defined(_HPUX_SOURCE) && defined(_ILP32)
590{ .mii; alloc r2=ar.pfs,3,0,0,0
677{ .mii; alloc r2=ar.pfs,3,0,0,0
678 addp4 r33=0,r33
679 addp4 r34=0,r34 };;
680{ .mii; addp4 r32=0,r32
681#else
682{ .mii; alloc r2=ar.pfs,3,0,0,0
683#endif
591 add r14=8,r33
592 add r17=8,r34 }
593 .body
594{ .mii; add r15=16,r33
595 add r18=16,r34
596 add r16=24,r33 }
597.L_cheat_entry_point8:
598{ .mmi; add r19=24,r34

--- 534 unchanged lines hidden (view full) ---

1133//
1134.global bn_sqr_comba4#
1135.proc bn_sqr_comba4#
1136.align 64
1137bn_sqr_comba4:
1138 .prologue
1139 .fframe 0
1140 .save ar.pfs,r2
684 add r14=8,r33
685 add r17=8,r34 }
686 .body
687{ .mii; add r15=16,r33
688 add r18=16,r34
689 add r16=24,r33 }
690.L_cheat_entry_point8:
691{ .mmi; add r19=24,r34

--- 534 unchanged lines hidden (view full) ---

1226//
1227.global bn_sqr_comba4#
1228.proc bn_sqr_comba4#
1229.align 64
1230bn_sqr_comba4:
1231 .prologue
1232 .fframe 0
1233 .save ar.pfs,r2
1234#if defined(_HPUX_SOURCE) && defined(_ILP32)
1235{ .mii; alloc r2=ar.pfs,2,1,0,0
1236 addp4 r32=0,r32
1237 addp4 r33=0,r33 };;
1238{ .mii;
1239#else
1141{ .mii; alloc r2=ar.pfs,2,1,0,0
1240{ .mii; alloc r2=ar.pfs,2,1,0,0
1241#endif
1142 mov r34=r33
1143 add r14=8,r33 };;
1144 .body
1145{ .mii; add r17=8,r34
1146 add r15=16,r33
1147 add r18=16,r34 }
1148{ .mfb; add r16=24,r33
1149 br .L_cheat_entry_point4 };;

--- 9 unchanged lines hidden (view full) ---

1159#define carry2 r15
1160.global bn_mul_comba4#
1161.proc bn_mul_comba4#
1162.align 64
1163bn_mul_comba4:
1164 .prologue
1165 .fframe 0
1166 .save ar.pfs,r2
1242 mov r34=r33
1243 add r14=8,r33 };;
1244 .body
1245{ .mii; add r17=8,r34
1246 add r15=16,r33
1247 add r18=16,r34 }
1248{ .mfb; add r16=24,r33
1249 br .L_cheat_entry_point4 };;

--- 9 unchanged lines hidden (view full) ---

1259#define carry2 r15
1260.global bn_mul_comba4#
1261.proc bn_mul_comba4#
1262.align 64
1263bn_mul_comba4:
1264 .prologue
1265 .fframe 0
1266 .save ar.pfs,r2
1267#if defined(_HPUX_SOURCE) && defined(_ILP32)
1268{ .mii; alloc r2=ar.pfs,3,0,0,0
1269 addp4 r33=0,r33
1270 addp4 r34=0,r34 };;
1271{ .mii; addp4 r32=0,r32
1272#else
1167{ .mii; alloc r2=ar.pfs,3,0,0,0
1273{ .mii; alloc r2=ar.pfs,3,0,0,0
1274#endif
1168 add r14=8,r33
1169 add r17=8,r34 }
1170 .body
1171{ .mii; add r15=16,r33
1172 add r18=16,r34
1173 add r16=24,r33 };;
1174.L_cheat_entry_point4:
1175{ .mmi; add r19=24,r34

--- 283 unchanged lines hidden (view full) ---

1459{ .mib; (cont) cmp.leu cont,break=HH,r31
1460 (p8) add r31=-1,r31
1461(cont) br.wtop.spnt .L_divw_2nd_iter };;
1462///////////////////////////////////////////////////////////
1463{ .mii; sub H=H,r35
1464 or r8=r8,r33
1465 mov ar.pfs=r2 };;
1466{ .mii; shr.u r9=H,I // remainder if anybody wants it
1275 add r14=8,r33
1276 add r17=8,r34 }
1277 .body
1278{ .mii; add r15=16,r33
1279 add r18=16,r34
1280 add r16=24,r33 };;
1281.L_cheat_entry_point4:
1282{ .mmi; add r19=24,r34

--- 283 unchanged lines hidden (view full) ---

1566{ .mib; (cont) cmp.leu cont,break=HH,r31
1567 (p8) add r31=-1,r31
1568(cont) br.wtop.spnt .L_divw_2nd_iter };;
1569///////////////////////////////////////////////////////////
1570{ .mii; sub H=H,r35
1571 or r8=r8,r33
1572 mov ar.pfs=r2 };;
1573{ .mii; shr.u r9=H,I // remainder if anybody wants it
1467 mov pr=r10,-1 }
1574 mov pr=r10,0x1ffff }
1468{ .mfb; br.ret.sptk.many b0 };;
1469
1470// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
1471// procedure.
1472//
1473// inputs: f6 = (double)a, f7 = (double)b
1474// output: f8 = (int)(a/b)
1475// clobbered: f8,f9,f10,f11,pred

--- 23 unchanged lines hidden ---
1575{ .mfb; br.ret.sptk.many b0 };;
1576
1577// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
1578// procedure.
1579//
1580// inputs: f6 = (double)a, f7 = (double)b
1581// output: f8 = (int)(a/b)
1582// clobbered: f8,f9,f10,f11,pred

--- 23 unchanged lines hidden ---