ia64.S (89837) | ia64.S (111147) |
---|---|
1.explicit 2.text | 1.explicit 2.text |
3.ident "ia64.S, Version 1.1" | 3.ident "ia64.S, Version 2.0" |
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" 5 6// 7// ==================================================================== 8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 9// project. 10// 11// Rights for redistribution and usage in source and binary forms are 12// granted according to the OpenSSL license. Warranty of any kind is 13// disclaimed. 14// ==================================================================== 15// | 4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" 5 6// 7// ==================================================================== 8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 9// project. 10// 11// Rights for redistribution and usage in source and binary forms are 12// granted according to the OpenSSL license. Warranty of any kind is 13// disclaimed. 14// ==================================================================== 15// |
16// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is 17// different from Itanium to this module viewpoint. Most notably, is it 18// "wider" than Itanium? Can you experience loop scalability as 19// discussed in commentary sections? Not really:-( Itanium2 has 6 20// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to 21// spin twice as fast, as I need 8 IALU ports. Amount of floating point 22// ports is the same, i.e. 2, while I need 4. In other words, to this 23// module Itanium2 remains effectively as "wide" as Itanium. Yet it's 24// essentially different in respect to this module, and a re-tune was 25// required. Well, because some instruction latencies have changed. Most 26// noticeably those intensively used: 27// 28// Itanium Itanium2 29// ldf8 9 6 L2 hit 30// ld8 2 1 L1 hit 31// getf 2 5 32// xma[->getf] 7[+1] 4[+0] 33// add[->st8] 1[+1] 1[+0] 34// 35// What does it mean? You might ratiocinate that the original code 36// should run just faster... Because sum of latencies is smaller... 37// Wrong! Note that getf latency increased. This means that if a loop is 38// scheduled for lower latency (and they are), then it will suffer from 39// stall condition and the code will therefore turn anti-scalable, e.g. 40// original bn_mul_words spun at 5*n or 2.5 times slower than expected 41// on Itanium2! What to do? Reschedule loops for Itanium2? But then 42// Itanium would exhibit anti-scalability. So I've chosen to reschedule 43// for worst latency for every instruction aiming for best *all-round* 44// performance. |
|
16 17// Q. How much faster does it get? 18// A. Here is the output from 'openssl speed rsa dsa' for vanilla 19// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat 20// Linux 7.1 2.96-81): 21// 22// sign verify sign/s verify/s 23// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2 --- 120 unchanged lines hidden (view full) --- 144(p6) br.ret.spnt.many b0 };; 145 146 .save ar.lc,r3 147{ .mib; sub r10=r35,r0,1 148 mov r3=ar.lc 149 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 150 } 151 .body | 45 46// Q. How much faster does it get? 47// A. Here is the output from 'openssl speed rsa dsa' for vanilla 48// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat 49// Linux 7.1 2.96-81): 50// 51// sign verify sign/s verify/s 52// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2 --- 120 unchanged lines hidden (view full) --- 173(p6) br.ret.spnt.many b0 };; 174 175 .save ar.lc,r3 176{ .mib; sub r10=r35,r0,1 177 mov r3=ar.lc 178 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 179 } 180 .body |
152{ .mib; mov r14=r32 // rp | 181{ .mib; 182#if defined(_HPUX_SOURCE) && defined(_ILP32) 183 addp4 r14=0,r32 // rp 184#else 185 mov r14=r32 // rp 186#endif |
153 mov r9=pr };; | 187 mov r9=pr };; |
154{ .mii; mov r15=r33 // ap | 188{ .mii; 189#if defined(_HPUX_SOURCE) && defined(_ILP32) 190 addp4 r15=0,r33 // ap 191#else 192 mov r15=r33 // ap 193#endif |
155 mov ar.lc=r10 156 mov ar.ec=6 } | 194 mov ar.lc=r10 195 mov ar.ec=6 } |
157{ .mib; mov r16=r34 // bp | 196{ .mib; 197#if defined(_HPUX_SOURCE) && defined(_ILP32) 198 addp4 r16=0,r34 // bp 199#else 200 mov r16=r34 // bp 201#endif |
158 mov pr.rot=1<<16 };; 159 160.L_bn_add_words_ctop: 161{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++) 162 (p18) add r39=r37,r34 163 (p19) cmp.ltu.unc p56,p0=r40,r38 } 164{ .mfb; (p0) nop.m 0x0 165 (p0) nop.f 0x0 166 (p0) nop.b 0x0 } 167{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++) 168 (p58) cmp.eq.or p57,p0=-1,r41 // (p20) 169 (p58) add r41=1,r41 } // (p20) 170{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r 171 (p0) nop.f 0x0 172 br.ctop.sptk .L_bn_add_words_ctop };; 173.L_bn_add_words_cend: 174 175{ .mii; 176(p59) add r8=1,r8 // return value | 202 mov pr.rot=1<<16 };; 203 204.L_bn_add_words_ctop: 205{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++) 206 (p18) add r39=r37,r34 207 (p19) cmp.ltu.unc p56,p0=r40,r38 } 208{ .mfb; (p0) nop.m 0x0 209 (p0) nop.f 0x0 210 (p0) nop.b 0x0 } 211{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++) 212 (p58) cmp.eq.or p57,p0=-1,r41 // (p20) 213 (p58) add r41=1,r41 } // (p20) 214{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r 215 (p0) nop.f 0x0 216 br.ctop.sptk .L_bn_add_words_ctop };; 217.L_bn_add_words_cend: 218 219{ .mii; 220(p59) add r8=1,r8 // return value |
177 mov pr=r9,-1 | 221 mov pr=r9,0x1ffff |
178 mov ar.lc=r3 } 179{ .mbb; nop.b 0x0 180 br.ret.sptk.many b0 };; 181.endp bn_add_words# 182 183// 184// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num) 185// --- 11 unchanged lines hidden (view full) --- 197(p6) br.ret.spnt.many b0 };; 198 199 .save ar.lc,r3 200{ .mib; sub r10=r35,r0,1 201 mov r3=ar.lc 202 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 203 } 204 .body | 222 mov ar.lc=r3 } 223{ .mbb; nop.b 0x0 224 br.ret.sptk.many b0 };; 225.endp bn_add_words# 226 227// 228// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num) 229// --- 11 unchanged lines hidden (view full) --- 241(p6) br.ret.spnt.many b0 };; 242 243 .save ar.lc,r3 244{ .mib; sub r10=r35,r0,1 245 mov r3=ar.lc 246 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 247 } 248 .body |
205{ .mib; mov r14=r32 // rp | 249{ .mib; 250#if defined(_HPUX_SOURCE) && defined(_ILP32) 251 addp4 r14=0,r32 // rp 252#else 253 mov r14=r32 // rp 254#endif |
206 mov r9=pr };; | 255 mov r9=pr };; |
207{ .mii; mov r15=r33 // ap | 256{ .mii; 257#if defined(_HPUX_SOURCE) && defined(_ILP32) 258 addp4 r15=0,r33 // ap 259#else 260 mov r15=r33 // ap 261#endif |
208 mov ar.lc=r10 209 mov ar.ec=6 } | 262 mov ar.lc=r10 263 mov ar.ec=6 } |
210{ .mib; mov r16=r34 // bp | 264{ .mib; 265#if defined(_HPUX_SOURCE) && defined(_ILP32) 266 addp4 r16=0,r34 // bp 267#else 268 mov r16=r34 // bp 269#endif |
211 mov pr.rot=1<<16 };; 212 213.L_bn_sub_words_ctop: 214{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++) 215 (p18) sub r39=r37,r34 216 (p19) cmp.gtu.unc p56,p0=r40,r38 } 217{ .mfb; (p0) nop.m 0x0 218 (p0) nop.f 0x0 219 (p0) nop.b 0x0 } 220{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++) 221 (p58) cmp.eq.or p57,p0=0,r41 // (p20) 222 (p58) add r41=-1,r41 } // (p20) 223{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r 224 (p0) nop.b 0x0 225 br.ctop.sptk .L_bn_sub_words_ctop };; 226.L_bn_sub_words_cend: 227 228{ .mii; 229(p59) add r8=1,r8 // return value | 270 mov pr.rot=1<<16 };; 271 272.L_bn_sub_words_ctop: 273{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++) 274 (p18) sub r39=r37,r34 275 (p19) cmp.gtu.unc p56,p0=r40,r38 } 276{ .mfb; (p0) nop.m 0x0 277 (p0) nop.f 0x0 278 (p0) nop.b 0x0 } 279{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++) 280 (p58) cmp.eq.or p57,p0=0,r41 // (p20) 281 (p58) add r41=-1,r41 } // (p20) 282{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r 283 (p0) nop.b 0x0 284 br.ctop.sptk .L_bn_sub_words_ctop };; 285.L_bn_sub_words_cend: 286 287{ .mii; 288(p59) add r8=1,r8 // return value |
230 mov pr=r9,-1 | 289 mov pr=r9,0x1ffff |
231 mov ar.lc=r3 } 232{ .mbb; nop.b 0x0 233 br.ret.sptk.many b0 };; 234.endp bn_sub_words# 235#endif 236 237#if 0 238#define XMA_TEMPTATION --- 9 unchanged lines hidden (view full) --- 248.skip 32 // makes the loop body aligned at 64-byte boundary 249bn_mul_words: 250 .prologue 251 .fframe 0 252 .save ar.pfs,r2 253#ifdef XMA_TEMPTATION 254{ .mfi; alloc r2=ar.pfs,4,0,0,0 };; 255#else | 290 mov ar.lc=r3 } 291{ .mbb; nop.b 0x0 292 br.ret.sptk.many b0 };; 293.endp bn_sub_words# 294#endif 295 296#if 0 297#define XMA_TEMPTATION --- 9 unchanged lines hidden (view full) --- 307.skip 32 // makes the loop body aligned at 64-byte boundary 308bn_mul_words: 309 .prologue 310 .fframe 0 311 .save ar.pfs,r2 312#ifdef XMA_TEMPTATION 313{ .mfi; alloc r2=ar.pfs,4,0,0,0 };; 314#else |
256{ .mfi; alloc r2=ar.pfs,4,4,0,8 };; | 315{ .mfi; alloc r2=ar.pfs,4,12,0,16 };; |
257#endif 258{ .mib; mov r8=r0 // return value 259 cmp4.le p6,p0=r34,r0 260(p6) br.ret.spnt.many b0 };; 261 262 .save ar.lc,r3 263{ .mii; sub r10=r34,r0,1 264 mov r3=ar.lc 265 mov r9=pr };; 266 267 .body 268{ .mib; setf.sig f8=r35 // w | 316#endif 317{ .mib; mov r8=r0 // return value 318 cmp4.le p6,p0=r34,r0 319(p6) br.ret.spnt.many b0 };; 320 321 .save ar.lc,r3 322{ .mii; sub r10=r34,r0,1 323 mov r3=ar.lc 324 mov r9=pr };; 325 326 .body 327{ .mib; setf.sig f8=r35 // w |
269 mov pr.rot=0x400001<<16 270 // ------^----- serves as (p48) at first (p26) | 328 mov pr.rot=0x800001<<16 329 // ------^----- serves as (p50) at first (p27) |
271 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 272 } 273 274#ifndef XMA_TEMPTATION 275 | 330 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 331 } 332 333#ifndef XMA_TEMPTATION 334 |
276{ .mii; mov r14=r32 // rp 277 mov r15=r33 // ap | 335{ .mii; 336#if defined(_HPUX_SOURCE) && defined(_ILP32) 337 addp4 r14=0,r32 // rp 338 addp4 r15=0,r33 // ap 339#else 340 mov r14=r32 // rp 341 mov r15=r33 // ap 342#endif |
278 mov ar.lc=r10 } | 343 mov ar.lc=r10 } |
279{ .mii; mov r39=0 // serves as r33 at first (p26) 280 mov ar.ec=12 };; | 344{ .mii; mov r40=0 // serves as r35 at first (p27) 345 mov ar.ec=13 };; |
281 | 346 |
282// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2 283// cache (i.e. 9 ticks away) as floating point load/store instructions | 347// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium 348// L2 cache (i.e. 9 ticks away) as floating point load/store instructions |
284// bypass L1 cache and L2 latency is actually best-case scenario for | 349// bypass L1 cache and L2 latency is actually best-case scenario for |
285// ldf8. The loop is not scalable and shall run in 2*(n+11) even on 286// "wider" IA-64 implementations. It's a trade-off here. n+22 loop | 350// ldf8. The loop is not scalable and shall run in 2*(n+12) even on 351// "wider" IA-64 implementations. It's a trade-off here. n+24 loop |
287// would give us ~5% in *overall* performance improvement on "wider" 288// IA-64, but would hurt Itanium for about same because of longer 289// epilogue. As it's a matter of few percents in either case I've 290// chosen to trade the scalability for development time (you can see 291// this very instruction sequence in bn_mul_add_words loop which in 292// turn is scalable). 293.L_bn_mul_words_ctop: | 352// would give us ~5% in *overall* performance improvement on "wider" 353// IA-64, but would hurt Itanium for about same because of longer 354// epilogue. As it's a matter of few percents in either case I've 355// chosen to trade the scalability for development time (you can see 356// this very instruction sequence in bn_mul_add_words loop which in 357// turn is scalable). 358.L_bn_mul_words_ctop: |
294{ .mfi; (p25) getf.sig r36=f49 // low 295 (p21) xmpy.lu f45=f37,f8 296 (p27) cmp.ltu p52,p48=r39,r38 } | 359{ .mfi; (p25) getf.sig r36=f52 // low 360 (p21) xmpy.lu f48=f37,f8 361 (p28) cmp.ltu p54,p50=r41,r39 } |
297{ .mfi; (p16) ldf8 f32=[r15],8 | 362{ .mfi; (p16) ldf8 f32=[r15],8 |
298 (p21) xmpy.hu f38=f37,f8 | 363 (p21) xmpy.hu f40=f37,f8 |
299 (p0) nop.i 0x0 };; | 364 (p0) nop.i 0x0 };; |
300{ .mii; (p26) getf.sig r32=f43 // high 301 .pred.rel "mutex",p48,p52 302 (p48) add r38=r37,r33 // (p26) 303 (p52) add r38=r37,r33,1 } // (p26) 304{ .mfb; (p27) st8 [r14]=r39,8 | 365{ .mii; (p25) getf.sig r32=f44 // high 366 .pred.rel "mutex",p50,p54 367 (p50) add r40=r38,r35 // (p27) 368 (p54) add r40=r38,r35,1 } // (p27) 369{ .mfb; (p28) st8 [r14]=r41,8 |
305 (p0) nop.f 0x0 306 br.ctop.sptk .L_bn_mul_words_ctop };; 307.L_bn_mul_words_cend: 308 309{ .mii; nop.m 0x0 | 370 (p0) nop.f 0x0 371 br.ctop.sptk .L_bn_mul_words_ctop };; 372.L_bn_mul_words_cend: 373 374{ .mii; nop.m 0x0 |
310.pred.rel "mutex",p49,p53 311(p49) add r8=r34,r0 312(p53) add r8=r34,r0,1 } | 375.pred.rel "mutex",p51,p55 376(p51) add r8=r36,r0 377(p55) add r8=r36,r0,1 } |
313{ .mfb; nop.m 0x0 314 nop.f 0x0 315 nop.b 0x0 } 316 317#else // XMA_TEMPTATION 318 319 setf.sig f37=r0 // serves as carry at (p18) tick 320 mov ar.lc=r10 --- 18 unchanged lines hidden (view full) --- 339 br.ctop.sptk .L_bn_mul_words_ctop };; 340.L_bn_mul_words_cend: 341 342 getf.sig r8=f41 // the return value 343 344#endif // XMA_TEMPTATION 345 346{ .mii; nop.m 0x0 | 378{ .mfb; nop.m 0x0 379 nop.f 0x0 380 nop.b 0x0 } 381 382#else // XMA_TEMPTATION 383 384 setf.sig f37=r0 // serves as carry at (p18) tick 385 mov ar.lc=r10 --- 18 unchanged lines hidden (view full) --- 404 br.ctop.sptk .L_bn_mul_words_ctop };; 405.L_bn_mul_words_cend: 406 407 getf.sig r8=f41 // the return value 408 409#endif // XMA_TEMPTATION 410 411{ .mii; nop.m 0x0 |
347 mov pr=r9,-1 | 412 mov pr=r9,0x1ffff |
348 mov ar.lc=r3 } 349{ .mfb; rum 1<<5 // clear um.mfh 350 nop.f 0x0 351 br.ret.sptk.many b0 };; 352.endp bn_mul_words# 353#endif 354 355#if 1 --- 15 unchanged lines hidden (view full) --- 371 372 .save ar.lc,r3 373{ .mii; sub r10=r34,r0,1 374 mov r3=ar.lc 375 mov r9=pr };; 376 377 .body 378{ .mib; setf.sig f8=r35 // w | 413 mov ar.lc=r3 } 414{ .mfb; rum 1<<5 // clear um.mfh 415 nop.f 0x0 416 br.ret.sptk.many b0 };; 417.endp bn_mul_words# 418#endif 419 420#if 1 --- 15 unchanged lines hidden (view full) --- 436 437 .save ar.lc,r3 438{ .mii; sub r10=r34,r0,1 439 mov r3=ar.lc 440 mov r9=pr };; 441 442 .body 443{ .mib; setf.sig f8=r35 // w |
379 mov pr.rot=0x400001<<16 380 // ------^----- serves as (p48) at first (p26) | 444 mov pr.rot=0x800001<<16 445 // ------^----- serves as (p50) at first (p27) |
381 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 382 } | 446 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 447 } |
383{ .mii; mov r14=r32 // rp 384 mov r15=r33 // ap | 448{ .mii; 449#if defined(_HPUX_SOURCE) && defined(_ILP32) 450 addp4 r14=0,r32 // rp 451 addp4 r15=0,r33 // ap 452#else 453 mov r14=r32 // rp 454 mov r15=r33 // ap 455#endif |
385 mov ar.lc=r10 } | 456 mov ar.lc=r10 } |
386{ .mii; mov r39=0 // serves as r33 at first (p26) 387 mov r18=r32 // rp copy 388 mov ar.ec=14 };; | 457{ .mii; mov r40=0 // serves as r35 at first (p27) 458#if defined(_HPUX_SOURCE) && defined(_ILP32) 459 addp4 r18=0,r32 // rp copy 460#else 461 mov r18=r32 // rp copy 462#endif 463 mov ar.ec=15 };; |
389 | 464 |
390// This loop spins in 3*(n+13) ticks on Itanium and should spin in 391// 2*(n+13) on "wider" IA-64 implementations (to be verified with new | 465// This loop spins in 3*(n+14) ticks on Itanium and should spin in 466// 2*(n+14) on "wider" IA-64 implementations (to be verified with new |
392// µ-architecture manuals as they become available). As usual it's 393// possible to compress the epilogue, down to 10 in this case, at the 394// cost of scalability. Compressed (and therefore non-scalable) loop | 467// µ-architecture manuals as they become available). As usual it's 468// possible to compress the epilogue, down to 10 in this case, at the 469// cost of scalability. Compressed (and therefore non-scalable) loop |
395// running at 3*(n+10) would buy you ~10% on Itanium but take ~35% | 470// running at 3*(n+11) would buy you ~10% on Itanium but take ~35% |
396// from "wider" IA-64 so let it be scalable! Special attention was 397// paid for having the loop body split at 64-byte boundary. ld8 is 398// scheduled for L1 cache as the data is more than likely there. 399// Indeed, bn_mul_words has put it there a moment ago:-) 400.L_bn_mul_add_words_ctop: | 471// from "wider" IA-64 so let it be scalable! Special attention was 472// paid for having the loop body split at 64-byte boundary. ld8 is 473// scheduled for L1 cache as the data is more than likely there. 474// Indeed, bn_mul_words has put it there a moment ago:-) 475.L_bn_mul_add_words_ctop: |
401{ .mfi; (p25) getf.sig r36=f49 // low 402 (p21) xmpy.lu f45=f37,f8 403 (p27) cmp.ltu p52,p48=r39,r38 } | 476{ .mfi; (p25) getf.sig r36=f52 // low 477 (p21) xmpy.lu f48=f37,f8 478 (p28) cmp.ltu p54,p50=r41,r39 } |
404{ .mfi; (p16) ldf8 f32=[r15],8 | 479{ .mfi; (p16) ldf8 f32=[r15],8 |
405 (p21) xmpy.hu f38=f37,f8 406 (p27) add r43=r43,r39 };; 407{ .mii; (p26) getf.sig r32=f43 // high 408 .pred.rel "mutex",p48,p52 409 (p48) add r38=r37,r33 // (p26) 410 (p52) add r38=r37,r33,1 } // (p26) 411{ .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39 | 480 (p21) xmpy.hu f40=f37,f8 481 (p28) add r45=r45,r41 };; 482{ .mii; (p25) getf.sig r32=f44 // high 483 .pred.rel "mutex",p50,p54 484 (p50) add r40=r38,r35 // (p27) 485 (p54) add r40=r38,r35,1 } // (p27) 486{ .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 |
412 (p0) nop.f 0x0 413 (p0) nop.b 0x0 } | 487 (p0) nop.f 0x0 488 (p0) nop.b 0x0 } |
414{ .mii; (p26) ld8 r42=[r18],8 415 (p58) cmp.eq.or p57,p0=-1,r44 416 (p58) add r44=1,r44 } 417{ .mfb; (p29) st8 [r14]=r45,8 | 489{ .mii; (p27) ld8 r44=[r18],8 490 (p62) cmp.eq.or p61,p0=-1,r46 491 (p62) add r46=1,r46 } 492{ .mfb; (p30) st8 [r14]=r47,8 |
418 (p0) nop.f 0x0 419 br.ctop.sptk .L_bn_mul_add_words_ctop};; 420.L_bn_mul_add_words_cend: 421 422{ .mii; nop.m 0x0 | 493 (p0) nop.f 0x0 494 br.ctop.sptk .L_bn_mul_add_words_ctop};; 495.L_bn_mul_add_words_cend: 496 497{ .mii; nop.m 0x0 |
423.pred.rel "mutex",p51,p55 424(p51) add r8=r36,r0 425(p55) add r8=r36,r0,1 } | 498.pred.rel "mutex",p53,p57 499(p53) add r8=r38,r0 500(p57) add r8=r38,r0,1 } |
426{ .mfb; nop.m 0x0 427 nop.f 0x0 428 nop.b 0x0 };; 429{ .mii; | 501{ .mfb; nop.m 0x0 502 nop.f 0x0 503 nop.b 0x0 };; 504{ .mii; |
430(p59) add r8=1,r8 431 mov pr=r9,-1 | 505(p63) add r8=1,r8 506 mov pr=r9,0x1ffff |
432 mov ar.lc=r3 } 433{ .mfb; rum 1<<5 // clear um.mfh 434 nop.f 0x0 435 br.ret.sptk.many b0 };; 436.endp bn_mul_add_words# 437#endif 438 439#if 1 --- 16 unchanged lines hidden (view full) --- 456(p6) br.ret.spnt.many b0 };; 457 458 .save ar.lc,r3 459{ .mii; sub r10=r34,r0,1 460 mov r3=ar.lc 461 mov r9=pr };; 462 463 .body | 507 mov ar.lc=r3 } 508{ .mfb; rum 1<<5 // clear um.mfh 509 nop.f 0x0 510 br.ret.sptk.many b0 };; 511.endp bn_mul_add_words# 512#endif 513 514#if 1 --- 16 unchanged lines hidden (view full) --- 531(p6) br.ret.spnt.many b0 };; 532 533 .save ar.lc,r3 534{ .mii; sub r10=r34,r0,1 535 mov r3=ar.lc 536 mov r9=pr };; 537 538 .body |
539#if defined(_HPUX_SOURCE) && defined(_ILP32) 540{ .mii; addp4 r32=0,r32 541 addp4 r33=0,r33 };; 542#endif |
|
464{ .mib; 465 mov pr.rot=1<<16 466 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 467 } 468{ .mii; add r34=8,r32 469 mov ar.lc=r10 470 mov ar.ec=18 };; 471 --- 15 unchanged lines hidden (view full) --- 487 (p25) xmpy.hu f52=f41,f41 488 (p0) nop.i 0x0 } 489{ .mib; (p33) stf8 [r34]=f60,16 490 (p0) nop.i 0x0 491 br.ctop.sptk .L_bn_sqr_words_ctop };; 492.L_bn_sqr_words_cend: 493 494{ .mii; nop.m 0x0 | 543{ .mib; 544 mov pr.rot=1<<16 545 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 546 } 547{ .mii; add r34=8,r32 548 mov ar.lc=r10 549 mov ar.ec=18 };; 550 --- 15 unchanged lines hidden (view full) --- 566 (p25) xmpy.hu f52=f41,f41 567 (p0) nop.i 0x0 } 568{ .mib; (p33) stf8 [r34]=f60,16 569 (p0) nop.i 0x0 570 br.ctop.sptk .L_bn_sqr_words_ctop };; 571.L_bn_sqr_words_cend: 572 573{ .mii; nop.m 0x0 |
495 mov pr=r9,-1 | 574 mov pr=r9,0x1ffff |
496 mov ar.lc=r3 } 497{ .mfb; rum 1<<5 // clear um.mfh 498 nop.f 0x0 499 br.ret.sptk.many b0 };; 500.endp bn_sqr_words# 501#endif 502 503#if 1 --- 17 unchanged lines hidden (view full) --- 521// 522.global bn_sqr_comba8# 523.proc bn_sqr_comba8# 524.align 64 525bn_sqr_comba8: 526 .prologue 527 .fframe 0 528 .save ar.pfs,r2 | 575 mov ar.lc=r3 } 576{ .mfb; rum 1<<5 // clear um.mfh 577 nop.f 0x0 578 br.ret.sptk.many b0 };; 579.endp bn_sqr_words# 580#endif 581 582#if 1 --- 17 unchanged lines hidden (view full) --- 600// 601.global bn_sqr_comba8# 602.proc bn_sqr_comba8# 603.align 64 604bn_sqr_comba8: 605 .prologue 606 .fframe 0 607 .save ar.pfs,r2 |
608#if defined(_HPUX_SOURCE) && defined(_ILP32) |
|
529{ .mii; alloc r2=ar.pfs,2,1,0,0 | 609{ .mii; alloc r2=ar.pfs,2,1,0,0 |
610 addp4 r33=0,r33 611 addp4 r32=0,r32 };; 612{ .mii; 613#else 614{ .mii; alloc r2=ar.pfs,2,1,0,0 615#endif |
|
530 mov r34=r33 531 add r14=8,r33 };; 532 .body 533{ .mii; add r17=8,r34 534 add r15=16,r33 535 add r18=16,r34 } 536{ .mfb; add r16=24,r33 537 br .L_cheat_entry_point8 };; --- 44 unchanged lines hidden (view full) --- 582#define carry3 r34 583.global bn_mul_comba8# 584.proc bn_mul_comba8# 585.align 64 586bn_mul_comba8: 587 .prologue 588 .fframe 0 589 .save ar.pfs,r2 | 616 mov r34=r33 617 add r14=8,r33 };; 618 .body 619{ .mii; add r17=8,r34 620 add r15=16,r33 621 add r18=16,r34 } 622{ .mfb; add r16=24,r33 623 br .L_cheat_entry_point8 };; --- 44 unchanged lines hidden (view full) --- 668#define carry3 r34 669.global bn_mul_comba8# 670.proc bn_mul_comba8# 671.align 64 672bn_mul_comba8: 673 .prologue 674 .fframe 0 675 .save ar.pfs,r2 |
676#if defined(_HPUX_SOURCE) && defined(_ILP32) |
|
590{ .mii; alloc r2=ar.pfs,3,0,0,0 | 677{ .mii; alloc r2=ar.pfs,3,0,0,0 |
678 addp4 r33=0,r33 679 addp4 r34=0,r34 };; 680{ .mii; addp4 r32=0,r32 681#else 682{ .mii; alloc r2=ar.pfs,3,0,0,0 683#endif |
|
591 add r14=8,r33 592 add r17=8,r34 } 593 .body 594{ .mii; add r15=16,r33 595 add r18=16,r34 596 add r16=24,r33 } 597.L_cheat_entry_point8: 598{ .mmi; add r19=24,r34 --- 534 unchanged lines hidden (view full) --- 1133// 1134.global bn_sqr_comba4# 1135.proc bn_sqr_comba4# 1136.align 64 1137bn_sqr_comba4: 1138 .prologue 1139 .fframe 0 1140 .save ar.pfs,r2 | 684 add r14=8,r33 685 add r17=8,r34 } 686 .body 687{ .mii; add r15=16,r33 688 add r18=16,r34 689 add r16=24,r33 } 690.L_cheat_entry_point8: 691{ .mmi; add r19=24,r34 --- 534 unchanged lines hidden (view full) --- 1226// 1227.global bn_sqr_comba4# 1228.proc bn_sqr_comba4# 1229.align 64 1230bn_sqr_comba4: 1231 .prologue 1232 .fframe 0 1233 .save ar.pfs,r2 |
1234#if defined(_HPUX_SOURCE) && defined(_ILP32) 1235{ .mii; alloc r2=ar.pfs,2,1,0,0 1236 addp4 r32=0,r32 1237 addp4 r33=0,r33 };; 1238{ .mii; 1239#else |
|
1141{ .mii; alloc r2=ar.pfs,2,1,0,0 | 1240{ .mii; alloc r2=ar.pfs,2,1,0,0 |
1241#endif |
|
1142 mov r34=r33 1143 add r14=8,r33 };; 1144 .body 1145{ .mii; add r17=8,r34 1146 add r15=16,r33 1147 add r18=16,r34 } 1148{ .mfb; add r16=24,r33 1149 br .L_cheat_entry_point4 };; --- 9 unchanged lines hidden (view full) --- 1159#define carry2 r15 1160.global bn_mul_comba4# 1161.proc bn_mul_comba4# 1162.align 64 1163bn_mul_comba4: 1164 .prologue 1165 .fframe 0 1166 .save ar.pfs,r2 | 1242 mov r34=r33 1243 add r14=8,r33 };; 1244 .body 1245{ .mii; add r17=8,r34 1246 add r15=16,r33 1247 add r18=16,r34 } 1248{ .mfb; add r16=24,r33 1249 br .L_cheat_entry_point4 };; --- 9 unchanged lines hidden (view full) --- 1259#define carry2 r15 1260.global bn_mul_comba4# 1261.proc bn_mul_comba4# 1262.align 64 1263bn_mul_comba4: 1264 .prologue 1265 .fframe 0 1266 .save ar.pfs,r2 |
1267#if defined(_HPUX_SOURCE) && defined(_ILP32) 1268{ .mii; alloc r2=ar.pfs,3,0,0,0 1269 addp4 r33=0,r33 1270 addp4 r34=0,r34 };; 1271{ .mii; addp4 r32=0,r32 1272#else |
|
1167{ .mii; alloc r2=ar.pfs,3,0,0,0 | 1273{ .mii; alloc r2=ar.pfs,3,0,0,0 |
1274#endif |
|
1168 add r14=8,r33 1169 add r17=8,r34 } 1170 .body 1171{ .mii; add r15=16,r33 1172 add r18=16,r34 1173 add r16=24,r33 };; 1174.L_cheat_entry_point4: 1175{ .mmi; add r19=24,r34 --- 283 unchanged lines hidden (view full) --- 1459{ .mib; (cont) cmp.leu cont,break=HH,r31 1460 (p8) add r31=-1,r31 1461(cont) br.wtop.spnt .L_divw_2nd_iter };; 1462/////////////////////////////////////////////////////////// 1463{ .mii; sub H=H,r35 1464 or r8=r8,r33 1465 mov ar.pfs=r2 };; 1466{ .mii; shr.u r9=H,I // remainder if anybody wants it | 1275 add r14=8,r33 1276 add r17=8,r34 } 1277 .body 1278{ .mii; add r15=16,r33 1279 add r18=16,r34 1280 add r16=24,r33 };; 1281.L_cheat_entry_point4: 1282{ .mmi; add r19=24,r34 --- 283 unchanged lines hidden (view full) --- 1566{ .mib; (cont) cmp.leu cont,break=HH,r31 1567 (p8) add r31=-1,r31 1568(cont) br.wtop.spnt .L_divw_2nd_iter };; 1569/////////////////////////////////////////////////////////// 1570{ .mii; sub H=H,r35 1571 or r8=r8,r33 1572 mov ar.pfs=r2 };; 1573{ .mii; shr.u r9=H,I // remainder if anybody wants it |
1467 mov pr=r10,-1 } | 1574 mov pr=r10,0x1ffff } |
1468{ .mfb; br.ret.sptk.many b0 };; 1469 1470// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division 1471// procedure. 1472// 1473// inputs: f6 = (double)a, f7 = (double)b 1474// output: f8 = (int)(a/b) 1475// clobbered: f8,f9,f10,f11,pred --- 23 unchanged lines hidden --- | 1575{ .mfb; br.ret.sptk.many b0 };; 1576 1577// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division 1578// procedure. 1579// 1580// inputs: f6 = (double)a, f7 = (double)b 1581// output: f8 = (int)(a/b) 1582// clobbered: f8,f9,f10,f11,pred --- 23 unchanged lines hidden --- |