dnl  SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb    cycles/limb
C                    mul_2          addmul_2
C UltraSPARC T3:      22.5            23.5
C UltraSPARC T4:       3.25            3.75


C The code is reasonably scheduled but also relies on OoO.  There was hope that
C this could run at around 3.0 and 3.5 c/l respectively, on T4.  Two cycles per
C iteration needs to be removed.
C
C We could almost use 2-way unrolling, but currently the wN registers live too
C long.  By changing add x,w1,w1 to add x,w1,w0, i.e. migrate the values down-
C wards, 2-way unrolling should become possible.  With n-indexed addressing it
C should run no slower.
C
C The rp loads to g1/g3 are very much over-scheduled.  Presumably, they could
C be postponed a full way, and then just one register could be used.
C INPUT PARAMETERS
define(`rp', `%i0')
define(`up', `%i1')
define(`n',  `%i2')
define(`vp', `%i3')

C The two multiplier limbs, loaded once from vp and live for the whole run.
define(`v0', `%o0')
define(`v1', `%o1')

C w0..w3 form a rolling window of partial-sum limbs; their roles rotate once
C per limb so that one is completed and stored each step of the unrolled loop.
define(`w0', `%o2')
define(`w1', `%o3')
define(`w2', `%o4')
define(`w3', `%o5')

C Operation dispatch: AM2() emits its argument only for addmul_2 (the rp-limb
C loads/adds), and ADDX selects plain addcc for mul_2 but carry-propagating
C addxccc for addmul_2, where an extra carry chain is live.
ifdef(`OPERATION_mul_2',`
      define(`AM2',  `')
      define(`ADDX', `addcc`'$1')
      define(`func', `mpn_mul_2')
')
ifdef(`OPERATION_addmul_2',`
      define(`AM2',  `$1')
      define(`ADDX', `addxccc($1,$2,$3)')
      define(`func', `mpn_addmul_2')
')


MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C mpn_mul_2(rp, up, n, vp):    {rp,n+1} :=  {up,n} * {vp,2}, return high limb
C mpn_addmul_2(rp, up, n, vp): {rp,n+1} += {up,n} * {vp,2}, return high limb
C (%i0 doubles as rp on entry and as the returned high limb at exit.)
C
C The main loop is 4-way unrolled; entry points L(b0)..L(b3) peel n mod 4
C limbs of setup and then fall into the matching quarter of the loop body.
C Note: on SPARC the instruction after a branch sits in its delay slot and
C executes regardless (none of the branches here are annulled).
PROLOGUE(func)
	save	%sp, -176, %sp

	ldx	[vp+0], v0	C load v0
	and	n, 3, %g5	C %g5 = n mod 4, selects entry point
	ldx	[vp+8], v1	C load v1
	add	n, -6, n	C bias n so the loop can test sign and step by 4
	ldx	[up+0], %g4	C first up limb, consumed by the entry code
	brz	%g5, L(b0)
	 cmp	%g5, 2		C delay slot: set cc for the bcs/be below
	bcs	L(b1)
	 nop
	be	L(b2)
	 nop

C Each entry point computes the first limb's two products, primes the wN
C window (one wN zeroed in the branch delay slot), pre-adjusts up/rp so the
C loop's fixed offsets line up, and jumps into the matching loop quarter.
L(b3):
AM2(`	ldx	[rp+0], %g1')
	mulx	%g4, v0, w2
	umulxhi(%g4, v0, w3)
	ldx	[up+8], %i5
	mulx	%g4, v1, %l3
	umulxhi(%g4, v1, %l7)
AM2(`	ldx	[rp+8], %g3')
	add	up, -8, up
	add	rp, -8, rp
	b	L(lo3)
	 mov	0, w0		C delay slot: clear next window limb

L(b2):
AM2(`	ldx	[rp+0], %g3')
	mulx	%g4, v0, w3
	umulxhi(%g4, v0, w0)
	ldx	[up+8], %i4
	mulx	%g4, v1, %l1
	umulxhi(%g4, v1, %l5)
AM2(`	ldx	[rp+8], %g1')
	add	rp, 16, rp
	brlz	n, L(end)	C n < 0 here means fewer than one full round
	 mov	0, w1		C delay slot: clear next window limb
	ba	L(top)
	 add	up, 16, up	C delay slot: finish pointer adjustment

L(b1):
AM2(`	ldx	[rp+0], %g1')
	mulx	%g4, v0, w0
	umulxhi(%g4, v0, w1)
	ldx	[up+8], %i5
	mulx	%g4, v1, %l3
	umulxhi(%g4, v1, %l7)
AM2(`	ldx	[rp+8], %g3')
	add	up, 8, up
	add	rp, 8, rp
	b	L(lo1)
	 mov	0, w2		C delay slot: clear next window limb

L(b0):
AM2(`	ldx	[rp+0], %g3')
	mulx	%g4, v0, w1
	umulxhi(%g4, v0, w2)
	ldx	[up+8], %i4
	mulx	%g4, v1, %l1
	umulxhi(%g4, v1, %l5)
AM2(`	ldx	[rp+8], %g1')
	b	L(lo0)
	 mov	0, w3		C delay slot: clear next window limb

C Main loop, 4 limbs per iteration.  Each quarter: form the 128-bit products
C u*v0 (mulx low / umulxhi high) and u*v1, fold them into the wN window with
C addcc/addxccc/addxc carry chains (VIS3 carry-into instructions), store one
C finished limb, and for addmul_2 also add in the existing rp limb (AM2/ADDX).
C The trailing "C n" comments are the original issue-cycle annotations on T4.
C up/rp alternate between negative and positive fixed offsets because the
C pointer updates happen only once per iteration, at the bottom.
	ALIGN(16)			C cycle
L(top):	mulx	%i4, v0, %l2		C 0->5
	umulxhi(%i4, v0, %l6)		C 0->5
	ldx	[up+0], %i5		C 1->6
AM2(`	addcc	w3, %g3, w3')		C 1
	stx	w3, [rp-16]		C 2
	ADDX(`	%l1, w0, w0')		C 2
	addxccc(%l5, w1, w1)		C 3
	mulx	%i4, v1, %l3		C 3->9
	umulxhi(%i4, v1, %l7)		C 4->9
AM2(`	ldx	[rp+0], %g3')		C 4
	addcc	%l2, w0, w0		C 5
	addxccc(%l6, w1, w1)		C 5
	addxc(	%g0, %g0, w2)		C 6	capture final carry into fresh wN
L(lo1):	mulx	%i5, v0, %l0		C 6
	umulxhi(%i5, v0, %l4)		C 7
	ldx	[up+8], %i4		C 7
AM2(`	addcc	w0, %g1, w0')		C 8
	stx	w0, [rp-8]		C 8
	ADDX(`	%l3, w1, w1')		C 9
	addxccc(%l7, w2, w2)		C 9
	mulx	%i5, v1, %l1		C 10
	umulxhi(%i5, v1, %l5)		C 10
AM2(`	ldx	[rp+8], %g1')		C 11
	addcc	%l0, w1, w1		C 11
	addxccc(%l4, w2, w2)		C 12
	addxc(	%g0, %g0, w3)		C 12
L(lo0):	mulx	%i4, v0, %l2		C 13
	umulxhi(%i4, v0, %l6)		C 13
	ldx	[up+16], %i5		C 14
AM2(`	addcc	w1, %g3, w1')		C 14
	stx	w1, [rp+0]		C 15
	ADDX(`	%l1, w2, w2')		C 15
	addxccc(%l5, w3, w3)		C 16
	mulx	%i4, v1, %l3		C 16
	umulxhi(%i4, v1, %l7)		C 17
AM2(`	ldx	[rp+16], %g3')		C 17
	addcc	%l2, w2, w2		C 18
	addxccc(%l6, w3, w3)		C 18
	addxc(	%g0, %g0, w0)		C 19
L(lo3):	mulx	%i5, v0, %l0		C 19
	umulxhi(%i5, v0, %l4)		C 20
	ldx	[up+24], %i4		C 20
AM2(`	addcc	w2, %g1, w2')		C 21
	stx	w2, [rp+8]		C 21
	ADDX(`	%l3, w3, w3')		C 22
	addxccc(%l7, w0, w0)		C 22
	mulx	%i5, v1, %l1		C 23
	umulxhi(%i5, v1, %l5)		C 23
AM2(`	ldx	[rp+24], %g1')		C 24
	addcc	%l0, w3, w3		C 24
	addxccc(%l4, w0, w0)		C 25
	addxc(	%g0, %g0, w1)		C 25
	add	up, 32, up
	add	rp, 32, rp
	brgz	n, L(top)
	 add	n, -4, n	C delay slot: consumed 4 limbs

C Wind-down: fold in the last up limb's products, flush the remaining two
C window limbs to memory, and leave the final high limb in %i0 as the
C return value (restore swaps register windows, so %i0 here is caller %o0).
L(end):	mulx	%i4, v0, %l2
	umulxhi(%i4, v0, %l6)
AM2(`	addcc	w3, %g3, w3')
	stx	w3, [rp-16]
	ADDX(`	%l1, w0, w0')
	addxccc(%l5, w1, w1)
	mulx	%i4, v1, %l3
	umulxhi(%i4, v1, %l7)
	addcc	%l2, w0, w0
	addxccc(%l6, w1, w1)
	addxc(	%g0, %g0, w2)
AM2(`	addcc	w0, %g1, w0')
	stx	w0, [rp-8]
	ADDX(`	%l3, w1, w1')
	stx	w1, [rp+0]
	addxc(%l7, w2, %i0)	C return value: high limb

	ret
	 restore
EPILOGUE()