1dnl IA-64 mpn_divexact_1 -- mpn by limb exact division. 2 3dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of the GNU Lesser General Public License as published 9dnl by the Free Software Foundation; either version 3 of the License, or (at 10dnl your option) any later version. 11 12dnl The GNU MP Library is distributed in the hope that it will be useful, but 13dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15dnl License for more details. 16 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22C cycles/limb 23C Itanium: 16 24C Itanium 2: 8 25 26C INPUT PARAMETERS 27define(`rp', `r32') 28define(`up', `r33') 29define(`n', `r34') 30define(`divisor', `r35') 31 32define(`lshift', `r24') 33define(`rshift', `r25') 34 35C This code is a bit messy, and not as similar to mode1o.asm as desired. 36 37C The critical path during initialization is for computing the inverse of the 38C divisor. Since odd divisors are probably common, we conditionally execute 39C the initial count_traling_zeros code and the downshift. 40 41C Possible improvement: Merge more of the feed-in code into the inverse 42C computation. 43 44ASM_START() 45 .text 46 .align 32 47.Ltab: 48data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF 49data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF 50data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF 51data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF 52data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF 53data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F 54data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F 55data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F 56data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F 57data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F 58data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F 59data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F 60data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F 61data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F 62data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F 63data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF 64 65 66PROLOGUE(mpn_divexact_1) 67 .prologue 68 .save ar.lc, r2 69 .body 70 71 {.mmi; add r8 = -1, divisor C M0 72 nop 0 C M1 73 tbit.z p8, p9 = divisor, 0 C I0 74} 75ifdef(`HAVE_ABI_32', 76` addp4 rp = 0, rp C M2 rp extend 77 addp4 up = 0, up C M3 up extend 78 sxt4 n = n') C I1 size extend 79 ;; 80.Lhere: 81 {.mmi; ld8 r20 = [up], 8 C M0 up[0] 82 (p8) andcm r8 = r8, divisor C M1 83 mov r15 = ip C I0 .Lhere 84 ;; 85}{.mii 86 .pred.rel "mutex", p8, p9 87 (p9) mov rshift = 0 C M0 88 (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor) 89 cmp.eq p6, p10 = 1, n C I1 90 ;; 91}{.mii; add r9 = .Ltab-.Lhere, r15 C M0 92 (p8) shr.u divisor = divisor, rshift C I0 93 nop 0 C I1 94 ;; 95}{.mmi; add n = -4, n C M0 size-1 96 (p10) ld8 r21 = [up], 8 C M1 up[1] 97 mov r14 = 2 C M1 2 98}{.mfi; setf.sig f6 = divisor C M2 divisor 99 mov f9 = f0 C M3 carry FIXME 100 zxt1 r3 = divisor C I1 divisor low byte 101 ;; 102}{.mmi; add r3 = r9, r3 C M0 table offset ip and index 103 sub r16 = 0, divisor C M1 -divisor 104 mov r2 = ar.lc C I0 105}{.mmi; sub lshift = 64, rshift C M2 106 setf.sig f13 = r14 C M3 2 in significand 107 mov r17 = -1 C I1 -1 108 ;; 109}{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits 110 nop 0 C M1 111 mov ar.lc = n C I0 size-1 loop count 112}{.mmi; setf.sig f12 = r16 C M2 -divisor 113 setf.sig f8 = r17 C M3 -1 114 cmp.eq p7, p0 = -2, n C I1 115 ;; 116}{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits 117 cmp.eq p8, p0 = -1, n C M0 118 shr.u r23 = r20, rshift C I0 119 ;; 120} 121 122 C f6 divisor 123 C f7 inverse, being calculated 124 C f8 -1, will be -inverse 125 C f9 carry 126 C f12 -divisor 127 C f13 2 128 C f14 scratch 129 130 xmpy.l f14 = f13, f7 C Newton 2*i 131 xmpy.l f7 = f7, f7 C Newton i*i 132 ;; 133 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits 134 ;; 135 setf.sig f10 = r23 C speculative, used iff n = 1 136 xmpy.l f14 = f13, f7 C Newton 2*i 137 shl r22 = r21, lshift C speculative, used iff n > 1 138 xmpy.l f7 = f7, f7 C Newton i*i 139 ;; 140 or r31 = r22, r23 C speculative, used iff n > 1 141 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits 142 shr.u r23 = r21, rshift C speculative, used iff n > 1 143 ;; 144 setf.sig f11 = r31 C speculative, used iff n > 1 145 xmpy.l f14 = f13, f7 C Newton 2*i 146 xmpy.l f7 = f7, f7 C Newton i*i 147 ;; 148 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits 149 150 (p7) br.cond.dptk .Ln2 151 (p10) br.cond.dptk .grt3 152 ;; 153 154.Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse 155 br .Lx1 156 157.Ln2: 158 xmpy.l f8 = f7, f8 C -inverse = inverse * -1 159 xmpy.l f12 = f11, f7 C q = ulimb * inverse 160 setf.sig f11 = r23 161 br .Lx2 162 163.grt3: 164 ld8 r21 = [up], 8 C up[2] 165 xmpy.l f8 = f7, f8 C -inverse = inverse * -1 166 ;; 167 shl r22 = r21, lshift 168 ;; 169 xmpy.l f12 = f11, f7 C q = ulimb * inverse 170 ;; 171 or r31 = r22, r23 172 shr.u r23 = r21, rshift 173 ;; 174 setf.sig f11 = r31 175 (p8) br.cond.dptk .Lx3 C branch for n = 3 176 ;; 177 ld8 r21 = [up], 8 178 br .Lent 179 180.Loop: ld8 r21 = [up], 8 181 xma.l f12 = f9, f8, f10 C q = c * -inverse + si 182 ;; 183.Lent: add r16 = 160, up 184 shl r22 = r21, lshift 185 ;; 186 stf8 [rp] = f12, 8 187 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) 188 xmpy.l f10 = f11, f7 C si = ulimb * inverse 189 ;; 190 or r31 = r22, r23 191 shr.u r23 = r21, rshift 192 ;; 193 lfetch [r16] 194 setf.sig f11 = r31 195 br.cloop.sptk.few.clr .Loop 196 197 198 xma.l f12 = f9, f8, f10 C q = c * -inverse + si 199 ;; 200.Lx3: stf8 [rp] = f12, 8 201 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) 202 xmpy.l f10 = f11, f7 C si = ulimb * inverse 203 ;; 204 setf.sig f11 = r23 205 ;; 206 xma.l f12 = f9, f8, f10 C q = c * -inverse + si 207 ;; 208.Lx2: stf8 [rp] = f12, 8 209 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) 210 xmpy.l f10 = f11, f7 C si = ulimb * inverse 211 ;; 212 xma.l f12 = f9, f8, f10 C q = c * -inverse + si 213 ;; 214.Lx1: stf8 [rp] = f12, 8 215 mov ar.lc = r2 C I0 216 br.ret.sptk.many b0 217EPILOGUE() 218