dnl  sqr_diag_addlsh1.asm revision 1.1.1.1

dnl  IA-64 mpn_sqr_diag_addlsh1

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    2	Unrolling could bring it to 1.5 + epsilon

C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
C few cycles better since we can mitigate the many I0 instructions.
C
C 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
C -  20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
C -  13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43

C We should keep in mind that this code takes linear time in a O(n^2) context
C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
C around 60.  Keeping overhead down for smallish operands (< 10) is more
C important than optimal cycle counts.

C TODO
C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
C  * p-registers.
C  * Optimise by doing first two loop iterations in function header.
C INPUT PARAMETERS
C The arguments arrive in r32-r35.  Each pointer also gets a working name in
C a register outside r32-r55, because the function header copies the pointers
C there before it starts overwriting r32 and r33.
define(`rp_param', `r32')  define(`rp', `r14')		C size: 2n
define(`tp_param', `r33')  define(`tp', `r15')		C size: 2n - 2
define(`up_param', `r34')  define(`up', `r31')		C size: n
define(`n',  `r35')

C mpn_sqr_diag_addlsh1 (rp, tp, up, n)
C
C Operand sizes in limbs, as declared above: rp 2n, tp 2n-2, up n.
C
C Data flow visible in the code:
C  * every limb loaded from up is squared with an xma.l/xma.hu pair,
C  * neighbouring tp limbs are combined with shrp using count 63, which
C    yields a limb shifted left one bit with the top bit of the lower
C    neighbour shifted in,
C  * one shifted limb enters the xma as its addend (through r16 and setfsig),
C    the other is added to the high product word with an integer add,
C  * carries are passed along in predicates p40, p41 and p50,
C  * the sums are stored to consecutive rp limbs, two per loop pass, with
C    three more stores after the loop.
C
C NOTE(review): going by the name and the sizes this presumably computes
C {rp,2n} = sum of up[i]^2 at limb position 2i, plus 2*{tp,2n-2} at limb
C position 1.  Confirm against the callers or the C reference.
C NOTE(review): ar.lc is loaded with n-2, so n is presumably required to be
C at least 2.  Confirm against the callers.
C
C Saved and restored around the loop: ar.pfs (r2), ar.lc (r3), ar.ec (r9) and
C pr (r10).


ASM_START()
PROLOGUE(mpn_sqr_diag_addlsh1)

	.prologue
	.save	ar.pfs, r2
	.save	ar.lc, r3
	.body

C Function header.  The alloc operands request 4 input registers, 24 locals,
C no outputs and 24 rotating registers.  The pointer arguments are copied to
C r14, r15 and r31 first; r32 and r33 are overwritten a few bundles later.
.mmi;		alloc	r2 = ar.pfs, 4,24,0,24	C M
		nop	4711
		mov	r3 = ar.lc		C I0
.mmi;		mov	tp = tp_param		C M I
		mov	up = up_param		C M I
		mov	rp = rp_param		C M I
	;;
C Load the first two tp limbs ahead of the loop, set the loop count to n-2
C and the epilogue count to 7, keeping the old ar.ec in r9.
.mmi;		ld8	r36 = [tp], 8		C M
		add	r20 = -2, n		C M I
		mov	r9 = ar.ec		C I0
	;;
.mmi;		ld8	r32 = [tp], 8		C M
		mov	r16 = 0			C M I
		mov	ar.ec = 7		C I0
	;;
C r16, r44 and r33 are zeroed so that pipeline stages which run before real
C data has reached them read zero; the setfsig in the loop, for example, is
C not predicated and reads r16 on the very first pass.
.mmi;		nop	4711
		mov	r44 = 0			C M I
		mov	ar.lc = r20		C I0
	;;
C Keep the caller's predicates in r10, then set p16 and p17 (bits 16 and 17
C of the immediate) among the rotating predicates.
.mii;		mov	r33 = 0
		mov	r10 = pr		C I0
		mov	pr.rot = 0x30000	C I0
	;;
C NOTE(review): with ar.ec = 7 this br.cexit is expected to fall through into
C the loop after one rotation step rather than branch.  Confirm against the
C architecture manual.
		br.cexit.spnt.few.clr L(end)

dnl *** MAIN LOOP START ***
C Software pipelined loop closed by br.ctop.  Work per stage predicate:
C   p16  load the next two tp limbs
C   p17  shrp into r16, the addend for a later xma (marked critical)
C   p18  load the next up limb into an f register
C   p19  shrp into r45, later added to the high product word
C   p20  xma.l and xma.hu: f35 * f35 + f42
C   p21  move both product words to integer registers
C   p23  add the shifted limb to the high word, record the carry out in p40,
C        store the low word
C   p24  store the high word
C   p41, p50  add an incoming carry to the low word and pass it on when that
C        word overflows
C cmpequc, cmpltu and cmpeqor are macros defined outside this file.
	ALIGN(32)
L(top):
.mfi;	(p18)	ldf8	f33 = [up], 8		C M
	(p20)	xma.l	f36 = f35, f35, f42	C F
	(p41)	cmpequc	p50, p0 = -1, r44	C M I
.mfi;		setfsig	f40 = r16		C M23
	(p20)	xma.hu	f38 = f35, f35, f42	C F
	(p23)	add	r50 = r41, r49		C M I
	;;
.mmi;	(p16)	ld8	r36 = [tp], 8		C M
	(p23)	cmpltu	p40, p0 = r50, r41	C cyout hi M I
	(p19)	shrp	r45 = r38, r35, 63	C non-critical I0
.mmi;	(p21)	getfsig	r39 = f39		C hi M2
	(p24)	st8	[rp] = r51, 8		C hi M23
	(p41)	add	r44 = 1, r44		C M I
	;;
.mmi;	(p16)	ld8	r32 = [tp], 8		C M
	(p50)	cmpeqor	p40, p0 = -1, r50	C cyout hi M I
	(p17)	shrp	r16 = r33, r37, 63	C critical I0
.mmi;	(p21)	getfsig	r42 = f37		C lo M2
	(p23)	st8	[rp] = r44, 8		C lo M23
	(p50)	add	r50 = 1, r50		C M I
	;;
		br.ctop.sptk.few.clr L(top)	C B
dnl *** MAIN LOOP END ***
	;;
C Wind down: apply the pending carry, add the top bit of r39 (shr.u by 63) to
C r41, store the last three result limbs, then restore ar.lc, ar.ec, pr and
C ar.pfs and return.
C NOTE(review): r39 here presumably holds the last tp limb after rotation, so
C r48 would be the bit shifted out at the top.  Confirm.
L(end):
.mmi;		nop	4711
	(p41)	add	r44 = 1, r44		C M I
		shr.u	r48 = r39, 63		C I0
	;;
.mmi;		st8	[rp] = r51, 8		C M23
	(p41)	cmpequc	p6, p0 = 0, r44		C M I
		add	r50 = r41, r48		C M I
	;;
.mmi;		st8	[rp] = r44, 8		C M23
	(p6)	add	r50 = 1, r50		C M I
		mov	ar.lc = r3		C I0
	;;
.mii;		st8	[rp] = r50		C M23
		mov	ar.ec = r9		C I0
		mov	pr = r10		C I0
	;;
.mib;		nop	4711
		mov	ar.pfs = r2		C I0
		br.ret.sptk.many b0		C B
EPILOGUE()