dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl  store difference in a third limb vector.

dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     4
C UltraSPARC 3:	      4.5

C Compute carry-out from the most significant bits of u,v, and r, where
C r=u-v-carry_in, using logic operations.

C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...
C INPUT PARAMETERS
define(`rp',`%i0')
define(`up',`%i1')
define(`vp',`%i2')
define(`n',`%i3')

C Limbs loaded ahead of use, four from each source vector.
define(`u0',`%l0')
define(`u1',`%l2')
define(`u2',`%l4')
define(`u3',`%l6')
define(`v0',`%l1')
define(`v1',`%l3')
define(`v2',`%l5')
define(`v3',`%l7')

C Borrow passed from one limb to the next; always 0 or 1.
define(`cy',`%i4')

define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe

C ---------------------------------------------------------------------------
C mpn_sub_n
C
C Subtracts the n-limb vector at vp from the n-limb vector at up, stores the
C n-limb difference at rp, and returns the borrow out of the top limb.
C
C In:   rp (%i0)  destination limb pointer
C       up (%i1)  minuend limb pointer
C       vp (%i2)  subtrahend limb pointer
C       n  (%i3)  limb count, must be > 0
C Out:  %i0 before the restore, i.e. %o0 in the caller: final borrow, 0 or 1
C Temp: %g1 = u - v, %g4 = u - v - cy (the limb that gets stored),
C       %g2 and %g3 = borrow logic, cy (%i4), %l0-%l7, %f0, %f2, %f4
C
C No subtract-with-carry is used.  The borrow out of each limb is rebuilt from
C the top bits of u, v and the result r:
C       borrow = msb of  (r | (~u & v)) & ~(u & ~v)
C which is evaluated with the orn/orn/andn/andn/srlx sequence below.
C
C Structure: a 4-way unrolled, software pipelined main loop for n >= 8, a
C straight-line wind-down for the last four preloaded limbs, and a one limb
C per iteration loop for n < 4 and for the 1-3 leftover limbs.
C
C NOTE(review): n is treated as a full 64-bit count, matching the 64-bit
C ldx/stx limb accesses, so every loop-control branch tests xcc.  Testing icc
C would look only at the low 32 bits of n and would stop early for counts of
C 2^32 limbs or more.  This assumes the 64-bit ABI, where the caller passes n
C in the whole register -- confirm against the build configuration.
C ---------------------------------------------------------------------------

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(mpn_sub_n)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0		C fewer than 4 limbs?
	bl,pn	%xcc,.Loop0	C yes, use the one limb at a time loop
	mov	0,cy		C delay slot: no borrow into the first limb

C Preload the first four limbs of each operand and start limb 0.
	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n		C n >= 0 from here on means 4 more limbs can be loaded
	sub	u0,v0,%g1	C main sub
	sub	%g1,cy,%g4	C carry sub
	orn	u0,v0,%g2
	bl,pn	%xcc,.Lend4567	C fewer than 8 limbs: only wind down the preloaded 4
	fanop
	b,a	.Loop		C annulled branch, jumps over the alignment padding

	.align	16
C START MAIN LOOP
C Each pass finishes and stores four limbs while loading the next four.
C On entry %g4 holds the pending result limb and %g2 holds u0 | ~v0.
.Loop:	orn	%g4,%g2,%g2
	andn	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g4,[rp+0]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u1,v1,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g4,[rp+8]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u2,v2,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp-16],v2
	add	rp,32,rp
	fanop
C --
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g4,[rp-16]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u3,v3,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	andn	%g2,%g3,%g2
	subcc	n,4,n		C the only cc-setting insn in the loop; read by bge below
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy
	sub	u0,v0,%g1
	stx	%g4,[rp-8]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u0,v0,%g2
	bge,pt	%xcc,.Loop
	fanop
C END MAIN LOOP

C Wind-down: finish the four limbs that are already loaded, with no more loads.
.Lend4567:
	orn	%g4,%g2,%g2
	andn	u0,v0,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g4,[rp+0]
	sub	%g1,cy,%g4
	orn	u1,v1,%g2
	orn	%g4,%g2,%g2
	andn	u1,v1,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g4,[rp+8]
	sub	%g1,cy,%g4
	orn	u2,v2,%g2
	orn	%g4,%g2,%g2
	andn	u2,v2,%g3
	andn	%g2,%g3,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g4,[rp-16]
	sub	%g1,cy,%g4
	orn	u3,v3,%g2
	orn	%g4,%g2,%g2
	andn	u3,v3,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	stx	%g4,[rp-8]

	addcc	n,4,n		C undo the bias: n = limbs still to do, 0 to 3
	bz,pn	%xcc,.Lret
	fanop

C One limb per iteration.  Entered directly when n < 4, or for the leftovers.
C Requires n > 0 on entry; the count is decremented before it is tested.
.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n		C sets the flags read by bnz; nothing in between sets cc
	sub	u0,v0,%g1
	orn	u0,v0,%g2
	sub	%g1,cy,%g4
	andn	u0,v0,%g3
	orn	%g4,%g2,%g2
	stx	%g4,[rp-8]
	andn	%g2,%g3,%g2
	bnz,pt	%xcc,.Loop0
	srlx	%g2,63,cy	C delay slot: borrow out of this limb

.Lret:	mov	cy,%i0		C return value: final borrow
	ret
	restore
EPILOGUE(mpn_sub_n)