dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.

dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C UltraSPARC 1&2:       4
C UltraSPARC 3:         4.5

C Compute carry-out from the most significant bits of u, v, and r, where
C r=u+v+carry_in, using logic operations (a C sketch of this computation
C follows the main loop below).

C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...

C INPUT PARAMETERS
define(`rp',`%i0')
define(`up',`%i1')
define(`vp',`%i2')
define(`n',`%i3')

define(`u0',`%l0')
define(`u1',`%l2')
define(`u2',`%l4')
define(`u3',`%l6')
define(`v0',`%l1')
define(`v1',`%l3')
define(`v2',`%l5')
define(`v3',`%l7')

define(`cy',`%i4')

define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(mpn_add_n)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%icc,.Loop0
	mov	0,cy

	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n
	add	u0,v0,%g1	C main add
	add	%g1,cy,%g4	C carry add
	or	u0,v0,%g2
	bl,pn	%icc,.Lend4567
	fanop
	b,a	.Loop

	.align	16
C START MAIN LOOP
.Loop:	andn	%g2,%g4,%g2
	and	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy
	add	u1,v1,%g1
	stx	%g4,[rp+0]
	fanop
C --
	add	%g1,cy,%g4
	or	u1,v1,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy
	add	u2,v2,%g1
	stx	%g4,[rp+8]
	fanop
C --
	add	%g1,cy,%g4
	or	u2,v2,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp-16],v2
	add	rp,32,rp
	fanop
C --
	srlx	%g2,63,cy
	add	u3,v3,%g1
	stx	%g4,[rp-16]
	fanop
C --
	add	%g1,cy,%g4
	or	u3,v3,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	or	%g3,%g2,%g2
	subcc	n,4,n
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy
	add	u0,v0,%g1
	stx	%g4,[rp-8]
	fanop
C --
	add	%g1,cy,%g4
	or	u0,v0,%g2
	bge,pt	%icc,.Loop
	fanop
C END MAIN LOOP
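
C The carry-out computation used above, spelled out.  With 64-bit limbs
C and a carry-in cy of 0 or 1, the carry-out of r = u + v + cy is the
C most significant bit of (u AND v) OR ((u OR v) AND NOT r): if both
C operand MSBs are set a carry must leave, and if exactly one is set a
C carry leaves exactly when the result MSB is clear.  A minimal C sketch
C of one limb step (variable names are illustrative only):
C
C	uint64_t r  = u + v + cy;	/* main add + carry add */
C	uint64_t g2 = (u | v) & ~r;	/* or + andn            */
C	uint64_t g3 = u & v;		/* and                  */
C	cy = (g3 | g2) >> 63;		/* or + srlx            */
C
C Computing the carry with plain logic operations keeps the recurrence
C in integer registers and avoids any dependency on the condition codes.
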
.Lend4567:
	andn	%g2,%g4,%g2
	and	u0,v0,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	add	u1,v1,%g1
	stx	%g4,[rp+0]
	add	%g1,cy,%g4
	or	u1,v1,%g2
	andn	%g2,%g4,%g2
	and	u1,v1,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	add	u2,v2,%g1
	stx	%g4,[rp+8]
	add	%g1,cy,%g4
	or	u2,v2,%g2
	andn	%g2,%g4,%g2
	and	u2,v2,%g3
	or	%g3,%g2,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	add	u3,v3,%g1
	stx	%g4,[rp-16]
	add	%g1,cy,%g4
	or	u3,v3,%g2
	andn	%g2,%g4,%g2
	and	u3,v3,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	stx	%g4,[rp-8]

	addcc	n,4,n
	bz,pn	%icc,.Lret
	fanop

.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n
	add	u0,v0,%g1
	or	u0,v0,%g2
	add	%g1,cy,%g4
	and	u0,v0,%g3
	andn	%g2,%g4,%g2
	stx	%g4,[rp-8]
	or	%g3,%g2,%g2
	bnz,pt	%icc,.Loop0
	srlx	%g2,63,cy

.Lret:	mov	cy,%i0
	ret
	restore
EPILOGUE(mpn_add_n)
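
dnl  For reference, the routine computes the equivalent of the following
dnl  portable C loop, using the carry-out identity sketched after the
dnl  main loop.  This is a sketch only: it assumes 64-bit limbs, and the
dnl  prototype is illustrative rather than GMP's exact declaration.
dnl
dnl	mp_limb_t
dnl	mpn_add_n (mp_limb_t *rp, const mp_limb_t *up,
dnl		   const mp_limb_t *vp, long n)
dnl	{
dnl	  mp_limb_t cy = 0;
dnl	  long i;
dnl	  for (i = 0; i < n; i++)
dnl	    {
dnl	      mp_limb_t u = up[i], v = vp[i], r = u + v + cy;
dnl	      rp[i] = r;
dnl	      cy = ((u & v) | ((u | v) & ~r)) >> 63;
dnl	    }
dnl	  return cy;
dnl	}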