dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.
C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3
C
C NOTE(review): despite the i-register names above, this routine never executes
C a `save`, so the arguments are still live in the caller's out registers:
C rp = %o0, up = %o1, n = %o2, v = %o3.  The return value (carry limb) goes
C back in %o0.

define(`FSIZE',224)		C stack frame: scratch for int<->fp transfers

ASM_START()
PROLOGUE(mpn_mul_1)
	add	%sp, -FSIZE, %sp	C allocate frame (undone in retl delay slot)
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = high 16 bits of v
	or	%g1, %lo(0xffff), %g1	C g1 = 0xffff mask
	and	%o3, %g1, %g1		C g1 = low 16 bits of v
	stx	%g1, [%sp+104]		C move v pieces to the FPU through memory...
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10 (top word of the stx'ed g1 is 0)

	mov	0, %g3			C cy = 0

C f10:f11 form one 64-bit fp register pair: f10 stays zero and each up[i] is
C loaded into f11, so fxtod %f10,%f2 yields (double) up[i].  Both 16x32-bit
C partial products fit a double's 53-bit mantissa exactly.
define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
C The two 16-byte halves of the scratch area are ping-ponged with
C `xor %o5, 16, %o5` so stores of one iteration's products overlap the loads
C of the previous iteration's.

C The preambles below peel 1..4 iterations to fill the software pipeline;
C each falls into the matching drain label (.L1...

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[0]

C n == 1: compute the single product pair, then drain at .L1.
	fmuld	%f2, %f8, %f16		C p16 = u * vhi
	fmuld	%f2, %f6, %f4		C p0  = u * vlo
	fdtox	%f16, %f14		C convert products back to integers...
	fdtox	%f4, %f12
	std	%f14, [%o5+16]		C ...and transfer them through memory
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L1
	add	%o0, -16, %o0		C (delay slot) bias rp for .L1's [%o0+16]

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

C n == 2: finish both products, drain at .L2 (rp biased for [%o0+12]).
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

C n == 3: drain at .L3 (rp biased for [%o0+8]).
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

C n == 4: drain at .L4 (rp biased for [%o0+4]).
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5			C exactly 5 limbs: skip straight to drain

C BEGIN MAIN LOOP
C Scheduled in 6 instruction groups per limb (the `C -- k` markers); the nops
C and fanop pad groups so integer and fp pipes stay in step.
	.align 16
C -- 0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	srlx	%g4, 32, %g3		C new cy
	add	%o1, 4, %o1		C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	stw	%g4, [%o0-4]		C rp[i] = low 32 bits of p
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

C Drain the pipeline: .L5 and .L4 still have fp products in flight; .L3/.L2/.L1
C only combine already-computed p16/p0 pairs.  Store offsets step down as the
C remaining limbs are retired.
.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0		C return the final carry limb
	retl
	sub	%sp, -FSIZE, %sp	C (delay slot) deallocate frame
EPILOGUE(mpn_mul_1)