sqr_diag_addlsh1.asm revision 1.1.1.2
dnl  IA-64 mpn_sqr_diag_addlsh1

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    2       Unrolling could bring it to 1.5 + epsilon

C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
C few cycles better since we can mitigate the many I0 instructions.
C
C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43

C We should keep in mind that this code takes linear time in a O(n^2) context
C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
C around 60.  Keeping overhead down for smallish operands (< 10) is more
C important than optimal cycle counts.

C TODO
C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
C  * p-registers.
C  * Optimise by doing first two loop iterations in function header.

C INPUT PARAMETERS
C The *_param names are the incoming argument registers; rp, tp and up are the
C working copies made in the function header.  n is used directly from r35.
C NOTE(review): the copies are presumably needed because r32-r35 lie inside the
C rotating region requested by the alloc below and are reused as pipeline
C registers by the loop -- confirm.
define(`rp_param', `r32')  define(`rp', `r14')          C size: 2n
define(`tp_param', `r33')  define(`tp', `r15')          C size: 2n - 2
define(`up_param', `r34')  define(`up', `r31')          C size: n
define(`n',  `r35')

C ABI64(x) expands to x when HAVE_ABI_32 is not defined and to nothing
C otherwise; ABI32(x) is the reverse.  They select the per-ABI setup
C instructions in the function header.
ifdef(`HAVE_ABI_32',`
        define(`ABI64', `')
        define(`ABI32', `$1')
',`
        define(`ABI64', `$1')
        define(`ABI32', `')
')

C mpn_sqr_diag_addlsh1 (rp, tp, up, n)
C
C Reads n limbs through up and 2n-2 limbs through tp, and writes limbs through
C rp (sizes as given in the INPUT PARAMETERS comments).  Each loop iteration
C issues one ldf8 from up, two ld8 from tp and two st8 to rp; the code after
C L(end) stores the last three limbs.
C NOTE(review): judging from the function name and the shrp-by-63 / xma
C pattern, the result is the squares of the up limbs with twice {tp,2n-2}
C added in -- confirm the exact limb alignment against the C reference.
C
C Saved and restored around the loop: ar.pfs (r2), ar.lc (r3), ar.ec (r9) and
C the predicate registers (r10).
ASM_START()
PROLOGUE(mpn_sqr_diag_addlsh1)

        .prologue
        .save   ar.pfs, r2
        .save   ar.lc, r3
        .body

C Function header: allocate a frame with 4 inputs, 24 locals, 0 outputs and 24
C rotating registers; save ar.lc; copy the pointer arguments.  Under ABI32 the
C size is zero-extended with zxt4 and the pointers are converted with addp4.
 {.mii; alloc   r2 = ar.pfs, 4,24,0,24  C M
        mov     r3 = ar.lc              C I0
        ABI64(` nop     4711            ')
        ABI32(` zxt4    n = n           ')
}{.mmi; ABI64(` mov     tp = tp_param   ')      C M I
        ABI32(` addp4   tp = 0, tp_param')      C M I
        ABI64(` mov     up = up_param   ')      C M I
        ABI32(` addp4   up = 0, up_param')      C M I
        ABI64(` mov     rp = rp_param   ')      C M I
        ABI32(` addp4   rp = 0, rp_param')      C M I
        ;;
C Preload the first two tp limbs into r36 and r32, the same registers the
C (p16) loads inside the loop write.  r20 = n - 2 becomes the loop count, and
C the epilogue count is set to 7.
}{.mmi; ld8     r36 = [tp], 8           C M
        add     r20 = -2, n             C M I
        mov     r9 = ar.ec              C I0
        ;;
}{.mmi; ld8     r32 = [tp], 8           C M
        mov     r16 = 0                 C M I
        mov     ar.ec = 7               C I0
        ;;
}{.mmi; nop     4711
        mov     r44 = 0                 C M I
        mov     ar.lc = r20             C I0
        ;;
C Save all predicates, then load the rotating predicates with 0x30000, i.e.
C only bits 16 and 17 (p16 and p17) set.
}{.mii; mov     r33 = 0
        mov     r10 = pr                C I0
        mov     pr.rot = 0x30000        C I0
        ;;
C One br.cexit step is executed before the first pass through L(top).
C NOTE(review): with ar.ec = 7 this presumably only updates ar.lc and rotates
C the registers before falling through into the loop -- confirm against the
C architecture manual.
}               br.cexit.spnt.few.clr   L(end)

dnl *** MAIN LOOP START ***
C Software-pipelined loop driven by br.ctop.  Stage predicates used below:
C   p16  load two tp limbs
C   p17  shrp (critical), result moved to the FP side by setfsig
C   p18  load one up limb into an FP register
C   p19  shrp (non-critical)
C   p20  xma.l / xma.hu: low and high halves of f35 * f35 + f42
C   p21  move the two product halves back to general registers
C   p23  add into the high limb, detect carry-out, store the low limb
C   p24  store the high limb
C p40/p41 and p50 also sit in the rotating predicate range and carry the
C carry flags from one iteration to the next: a predicate written as p40 is
C read as p41 after the next rotation.
        ALIGN(32)
L(top):
 {.mfi; (p18)   ldf8    f33 = [up], 8           C M
        (p20)   xma.l   f36 = f35, f35, f42     C F
        (p41)   cmpequc p50, p0 = -1, r44       C M I
}{.mfi;         setfsig f40 = r16               C M23
        (p20)   xma.hu  f38 = f35, f35, f42     C F
        (p23)   add     r50 = r41, r49          C M I
        ;;
}{.mmi; (p16)   ld8     r36 = [tp], 8           C M
        (p23)   cmpltu  p40, p0 = r50, r41      C cyout hi      M I
        (p19)   shrp    r45 = r38, r35, 63      C non-critical  I0
}{.mmi; (p21)   getfsig r39 = f39               C hi            M2
        (p24)   st8     [rp] = r51, 8           C hi            M23
        (p41)   add     r44 = 1, r44            C M I
        ;;
}{.mmi; (p16)   ld8     r32 = [tp], 8           C M
        (p50)   cmpeqor p40, p0 = -1, r50       C cyout hi      M I
        (p17)   shrp    r16 = r33, r37, 63      C critical      I0
}{.mmi; (p21)   getfsig r42 = f37               C lo            M2
        (p23)   st8     [rp] = r44, 8           C lo            M23
        (p50)   add     r50 = 1, r50            C M I
        ;;
}               br.ctop.sptk.few.clr L(top)     C B
dnl *** MAIN LOOP END ***
        ;;
C Wind-down: apply a pending carry (p41) to r44, form r48 from bit 63 of r39
C and add it to r41, propagate a carry into r50 if the increment of r44
C wrapped to 0 (p6), and store the last three limbs r51, r44, r50.  The saved
C ar.lc, ar.ec, predicates and ar.pfs are restored before returning through b0.
L(end):
 {.mmi; nop     4711
        (p41)   add     r44 = 1, r44            C M I
                shr.u   r48 = r39, 63           C I0
        ;;
}{.mmi;         st8     [rp] = r51, 8           C M23
        (p41)   cmpequc p6, p0 = 0, r44         C M I
                add     r50 = r41, r48          C M I
        ;;
}{.mmi;         st8     [rp] = r44, 8           C M23
        (p6)    add     r50 = 1, r50            C M I
                mov     ar.lc = r3              C I0
        ;;
}{.mii;         st8     [rp] = r50              C M23
                mov     ar.ec = r9              C I0
                mov     pr = r10                C I0
        ;;
}{.mib;         nop     4711
                mov     ar.pfs = r2             C I0
                br.ret.sptk.many b0             C B
}
EPILOGUE()