// dfmul.S revision 341825
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

// Double Precision Multiply
//
//   double __hexagon_muldf3(double a, double b)
//
// Hexagon ABI: operand a arrives in r1:0, operand b in r3:2; the product is
// returned in r1:0, rounded according to the rounding mode held in USR.
// Also exported under the __qdsp_/__hexagon_fast_/__hexagon_fast2_ aliases
// (see Q6_ALIAS/FAST_ALIAS/FAST2_ALIAS below).
//
// Register-pair aliases.  Note Hexagon register pairs are written "odd:even";
// e.g. A = r1:0 with AH (high word, sign/exp/top mantissa) = r1.
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2

#define BTMP r5:4
#define BTMPH r5
#define BTMPL r4

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6

#define ONE r9:8
#define S_ONE r8
#define S_ZERO r9

#define PP_HH r11:10
#define PP_HH_H r11
#define PP_HH_L r10

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define PP_LL r15:14
#define PP_LL_H r15
#define PP_LL_L r14

#define TMP r28

// IEEE-754 binary64 layout constants.
// NOTE(review): BIAS is 1024 here, not the IEEE 1023 — the various
// "#-BIAS-58" style adjustments below compensate; confirm against the
// fixed-point position chosen for the partial product before changing.
#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1024
#define MANTISSA_TO_INT_BIAS 52

/* Some constant to adjust normalization amount in error code */
/* Amount to right shift the partial product to get to a denorm */
#define FUDGE 5

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Bit offset of the 2-bit rounding-mode field within USR.
#define SR_ROUND_OFF 22
	.text
	.global __hexagon_muldf3
	.type __hexagon_muldf3,@function
	Q6_ALIAS(muldf3)
	FAST_ALIAS(muldf3)
	FAST2_ALIAS(muldf3)
	.p2align 5
__hexagon_muldf3:
	// Two writes to p0 in one packet AND together: p0 <=> both A and B
	// are "normal" (dfclass mask #2); otherwise we take .Lmul_abnormal
	// below.  ATMP is preloaded with 0x4000_0000_0000_0000 so that the
	// implicit mantissa 1 sits at bit 62 after the insert.
	{
		p0 = dfclass(A,#2)
		p0 = dfclass(B,#2)
		ATMP = combine(##0x40000000,#0)
	}
	// Build 64-bit fixed-point significands with the MSB (bit 63) clear:
	//   ATMP = 0.1mmm... : A's 52 mantissa bits inserted at bit 10,
	//                      implicit 1 already at bit 62.
	//   BTMP = B << 10, then bits 63:62 forced to 0b01 (sign cleared,
	//                      implicit 1 set) by the insert in the next packet.
	{
		ATMP = insert(A,#MANTBITS,#EXPBITS-1)
		BTMP = asl(B,#EXPBITS-1)
		TMP = #-BIAS
		ONE = #1
	}
	// Start the 64x64 -> 128 bit multiply as four 32x32 partial products.
	{
		PP_ODD = mpyu(BTMPL,ATMPH)
		BTMP = insert(ONE,#2,#62)
	}
	/* since we know that the MSB of the H registers is zero, we should never carry */
	/* H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1 */
	/* Adding 2 HLs, we get 2^64-3*2^32+2 maximum.  */
	/* Therefore, we can add 3 2^32-1 values safely without carry.  We only need one. */
	{
		PP_LL = mpyu(ATMPL,BTMPL)
		PP_ODD += mpyu(ATMPL,BTMPH)
	}
	{
		PP_ODD += lsr(PP_LL,#32)
		PP_HH = mpyu(ATMPH,BTMPH)
		// BTMP now holds the exponent range limits for the fast path:
		// BTMPH = 2*BIAS-4, BTMPL = 0 (used in the range check below).
		BTMP = combine(##BIAS+BIAS-4,#0)
	}
	{
		PP_HH += lsr(PP_ODD,#32)
		if (!p0) jump .Lmul_abnormal
		p1 = cmp.eq(PP_LL_L,#0)		// 64 lsb's 0?
		p1 = cmp.eq(PP_ODD_L,#0)	// 64 lsb's 0?  (ANDs with above)
	}
	/*
	 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	 * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
	 */
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#define EXP10 r7:6
#define EXP1 r7
#define EXP0 r6
	// If any of the 64 discarded low bits were nonzero, fold a sticky bit
	// into PP_HH's LSB so convert_d2df rounds correctly.
	{
		if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
		EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
		EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
	}
	// EXP0 = exp(A) + exp(B) - BIAS (TMP still holds #-BIAS).
	// TMP becomes the XOR of the sign words: bit 31 = sign of the product.
	{
		PP_LL = neg(PP_HH)
		EXP0 += add(TMP,EXP1)
		TMP = xor(AH,BH)
	}
	// p2 = "product is non-negative" (TMP >= 0); .new lets the consumer
	// precede the producer textually within the packet.  Negate the
	// partial product if the result is negative.  The two p0 writes AND:
	// p0 <=> 0 < EXP0 <= 2*BIAS-4, i.e. no overflow/underflow possible.
	{
		if (!p2.new) PP_HH = PP_LL
		p2 = cmp.gt(TMP,#-1)
		p0 = !cmp.gt(EXP0,BTMPH)
		p0 = cmp.gt(EXP0,BTMPL)
		if (!p0.new) jump:nt .Lmul_ovf_unf
	}
	// Fast path: convert the signed 64-bit fixed-point product to double
	// (rounds per USR), then patch the exponent field.  The #-BIAS-58
	// rebias accounts for convert_d2df's own bias and the fixed-point
	// binary-point position of PP_HH.
	{
		A = convert_d2df(PP_HH)
		EXP0 = add(EXP0,#-BIAS-58)
	}
	{
		AH += asl(EXP0,#HI_MANTBITS)
		jumpr r31
	}

	.falign
.Lpossible_unf:
	/* We end up with a positive exponent */
	/* But we may have rounded up to an exponent of 1. */
	/* If the exponent is 1, if we rounded up to it
	 * we need to also raise underflow
	 * Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
	 * And the PP should also have more than one bit set
	 */
	/* Note: ATMP should have abs(PP_HH) */
	/* Note: BTMPL should have 0x7FEFFFFF */
	// bitsclr(AH,0x7FEFFFFF) with AL==0 <=> |result| == 0x0010_0000_0000_0000
	// (the smallest normal); anything else returns immediately.
	{
		p0 = cmp.eq(AL,#0)
		p0 = bitsclr(AH,BTMPL)
		if (!p0.new) jumpr:t r31
		BTMPH = #0x7fff
	}
	// If the partial product had bits beyond the leading one (bitsset
	// against 0x7fff in its high word), the value was rounded UP to the
	// smallest normal: raise underflow+inexact (0x030 in USR — verify
	// bit positions against the Hexagon PRM).
	{
		p0 = bitsset(ATMPH,BTMPH)
		BTMPL = USR
		BTMPH = #0x030
	}
	{
		if (p0) BTMPL = or(BTMPL,BTMPH)
	}
	{
		USR = BTMPL
	}
	// dfcmp.eq on a value we know is valid: executed only to make any
	// newly-set exception bits take effect (trap if enabled).
	{
		p0 = dfcmp.eq(A,A)
		jumpr r31
	}
	.falign
.Lmul_ovf_unf:
	{
		A = convert_d2df(PP_HH)
		ATMP = abs(PP_HH)			// take absolute value
		EXP1 = add(EXP0,#-BIAS-58)
	}
	{
		AH += asl(EXP1,#HI_MANTBITS)
		EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
		BTMPL = ##0x7FEFFFFF
	}
	// EXP1 = true biased exponent of the final result (post-rounding
	// exponent from AH plus the unapplied rebiased excess).
	{
		EXP1 += add(EXP0,##-BIAS-58)
		//BTMPH = add(clb(ATMP),#-2)
		BTMPH = #0
	}
	{
		p0 = cmp.gt(EXP1,##BIAS+BIAS-2)	// overflow
		if (p0.new) jump:nt .Lmul_ovf
	}
	{
		p0 = cmp.gt(EXP1,#0)
		if (p0.new) jump:nt .Lpossible_unf
		BTMPH = sub(EXP0,BTMPH)
		TMP = #63			// max amount to shift
	}
	/* Underflow */
	/*
	 * PP_HH has the partial product with sticky LSB.
	 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	 * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
	 * The exponent of PP_HH is in  EXP1, which is non-positive (0 or negative)
	 * That's the exponent that happens after the normalization
	 *
	 * EXP0 has the exponent that, when added to the normalized value, is out of range.
	 *
	 * Strategy:
	 *
	 * * Shift down bits, with sticky bit, such that the bits are aligned according
	 *   to the LZ count and appropriate exponent, but not all the way to mantissa
	 *   field, keep around the last few bits.
	 * * Put a 1 near the MSB
	 * * Check the LSBs for inexact; if inexact also set underflow
	 * * Convert [u]d2df -- will correctly round according to rounding mode
	 * * Replace exponent field with zero
	 *
	 *
	 */


	{
		BTMPL = #0			// offset for extract
		BTMPH = sub(#FUDGE,BTMPH)	// amount to right shift
	}
	{
		p3 = cmp.gt(PP_HH_H,#-1)	// is it positive?
		BTMPH = min(BTMPH,TMP)		// Don't shift more than 63
		PP_HH = ATMP			// work on |product|; sign re-applied via p3
	}
	{
		TMP = USR
		// extractu(Rss,Rtt) form: captures the bits that the asr below
		// will shift out, so they can become the sticky bit.
		PP_LL = extractu(PP_HH,BTMP)
	}
	{
		PP_HH = asr(PP_HH,BTMPH)
		BTMPL = #0x0030			// underflow flag
		AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
	}
	{
		p0 = cmp.gtu(ONE,PP_LL)		// Did we extract all zeros?
		if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)	// add sticky bit
		PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)	// Add back in a bit so we can use convert instruction
	}
	{
		PP_LL = neg(PP_HH)
		p1 = bitsclr(PP_HH_L,#0x7)	// Are the LSB's clear?
		if (!p1.new) TMP = or(BTMPL,TMP)	// If not, Inexact+Underflow
	}
	{
		if (!p3) PP_HH = PP_LL		// restore the product's sign
		USR = TMP
	}
	{
		A = convert_d2df(PP_HH)		// Do rounding
		p0 = dfcmp.eq(A,A)		// realize exception
	}
	// Clear the exponent field (leaving the sign bit and the mantissa):
	// the result is a denormal (or zero).
	{
		AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)	// Insert correct exponent
		jumpr r31
	}
	.falign
.Lmul_ovf:
	// We get either max finite value or infinity.  Either way, overflow+inexact
	{
		TMP = USR
		ATMP = combine(##0x7fefffff,#-1)	// positive max finite
		A = PP_HH
	}
	{
		PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)	// rounding bits
		TMP = or(TMP,#0x28)		// inexact + overflow
		BTMP = combine(##0x7ff00000,#0)	// positive infinity
	}
	{
		USR = TMP
		PP_LL_L ^= lsr(AH,#31)		// Does sign match rounding?
		TMP = PP_LL_L			// unmodified rounding mode
	}
	// Round-toward-zero, or round toward the infinity OPPOSITE the
	// result's sign, clamps to max finite; everything else overflows
	// to infinity.  (Mode encodings per USR.PktRnd — confirm in PRM.)
	{
		p0 = !cmp.eq(TMP,#1)		// If not round-to-zero and
		p0 = !cmp.eq(PP_LL_L,#2)	// Not rounding the other way,
		if (p0.new) ATMP = BTMP		// we should get infinity
		p0 = dfcmp.eq(A,A)		// Realize FP exception if enabled
	}
	{
		A = insert(ATMP,#63,#0)		// insert inf/maxfinite, leave sign
		jumpr r31
	}

.Lmul_abnormal:
	{
		ATMP = extractu(A,#63,#0)	// strip off sign
		BTMP = extractu(B,#63,#0)	// strip off sign
	}
	// Sort so that A holds the larger magnitude.  Packet semantics read
	// all sources before any writes, so the two conditional moves swap.
	{
		p3 = cmp.gtu(ATMP,BTMP)
		if (!p3.new) A = B		// sort values
		if (!p3.new) B = A		// sort values
	}
	{
		// Any NaN --> NaN, possibly raise invalid if sNaN
		p0 = dfclass(A,#0x0f)		// A not NaN?
		if (!p0.new) jump:nt .Linvalid_nan
		if (!p3) ATMP = BTMP		// keep ATMP/BTMP sorted too (swap)
		if (!p3) BTMP = ATMP
	}
	{
		// Infinity * nonzero number is infinity
		p1 = dfclass(A,#0x08)		// A is infinity
		p1 = dfclass(B,#0x0e)		// B is nonzero
	}
	{
		// Infinity * zero --> NaN, raise invalid
		// Other zeros return zero
		p0 = dfclass(A,#0x08)		// A is infinity
		p0 = dfclass(B,#0x01)		// B is zero
	}
	{
		if (p1) jump .Ltrue_inf
		p2 = dfclass(B,#0x01)
	}
	{
		if (p0) jump .Linvalid_zeroinf
		if (p2) jump .Ltrue_zero	// so return zero
		TMP = ##0x7c000000		// mask: top 5 exponent bits of AH
	}
	// We are left with a normal or subnormal times a subnormal.  A > B
	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
	// we go to a single sticky bit, which we can round easily.
	// If A and B might multiply to something bigger, decrease A exponent and increase
	// B exponent and try again
	{
		p0 = bitsclr(AH,TMP)		// exp(A) small enough that the product underflows totally?
		if (p0.new) jump:nt .Lmul_tiny
	}
	{
		TMP = cl0(BTMP)			// leading zeros of |B|
	}
	{
		TMP = add(TMP,#-EXPBITS)	// normalization shift for B's mantissa
	}
	{
		BTMP = asl(BTMP,TMP)
	}
	// Normalize B upward and lower A's exponent by the same amount, then
	// retry the whole multiply (product value is unchanged).
	{
		B = insert(BTMP,#63,#0)
		AH -= asl(TMP,#HI_MANTBITS)
	}
	jump __hexagon_muldf3
.Lmul_tiny:
	// Product magnitude is below the smallest denormal: result is +/-0 or
	// the +/- smallest denormal, chosen by the rounding mode.
	{
		TMP = USR
		A = xor(A,B)			// get sign bit
	}
	{
		TMP = or(TMP,#0x30)		// Inexact + Underflow
		A = insert(ONE,#63,#0)		// put in rounded up value (+/- smallest denorm)
		BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)	// get rounding mode
	}
	{
		USR = TMP
		p0 = cmp.gt(BTMPH,#1)		// Round towards pos/neg inf?
		if (!p0.new) AL = #0		// If not, zero
		BTMPH ^= lsr(AH,#31)		// rounding my way --> set LSB
	}
	{
		p0 = cmp.eq(BTMPH,#3)		// if rounding towards right inf
		if (!p0.new) AL = #0		// don't go to zero
		jumpr r31
	}
.Linvalid_zeroinf:
	// inf * 0: raise invalid (USR bit 1 — confirm against PRM) and
	// return an all-ones NaN.
	{
		TMP = USR
	}
	{
		A = #-1
		TMP = or(TMP,#2)
	}
	{
		USR = TMP
	}
	{
		p0 = dfcmp.uo(A,A)		// force exception if enabled
		jumpr r31
	}
.Linvalid_nan:
	// At least one NaN operand: signal invalid for any sNaN (via the
	// df2sf conversions) and return an all-ones quiet NaN.
	{
		p0 = dfclass(B,#0x0f)		// if B is not NaN
		TMP = convert_df2sf(A)		// will generate invalid if sNaN
		if (p0.new) B = A		// make it whatever A is
	}
	{
		BL = convert_df2sf(B)		// will generate invalid if sNaN
		A = #-1
		jumpr r31
	}
	.falign
.Ltrue_zero:
	// Swap (packet reads before writes) so A holds the zero; fall into
	// .Ltrue_inf to apply the product's sign.
	{
		A = B
		B = A
	}
.Ltrue_inf:
	// Return A (inf or zero) with sign = sign(A) XOR sign(B).
	{
		BH = extract(BH,#1,#31)
	}
	{
		AH ^= asl(BH,#31)
		jumpr r31
	}
END(__hexagon_muldf3)

#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
