1336817Sdim//===----------------------Hexagon builtin routine ------------------------===// 2336817Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6336817Sdim// 7336817Sdim//===----------------------------------------------------------------------===// 8336817Sdim 9353358Sdim// Double Precision square root 10336817Sdim 11336817Sdim#define EXP r28 12336817Sdim 13336817Sdim#define A r1:0 14336817Sdim#define AH r1 15336817Sdim#define AL r0 16336817Sdim 17336817Sdim#define SFSH r3:2 18336817Sdim#define SF_S r3 19336817Sdim#define SF_H r2 20336817Sdim 21336817Sdim#define SFHALF_SONE r5:4 22336817Sdim#define S_ONE r4 23336817Sdim#define SFHALF r5 24336817Sdim#define SF_D r6 25336817Sdim#define SF_E r7 26336817Sdim#define RECIPEST r8 27336817Sdim#define SFRAD r9 28336817Sdim 29336817Sdim#define FRACRAD r11:10 30336817Sdim#define FRACRADH r11 31336817Sdim#define FRACRADL r10 32336817Sdim 33336817Sdim#define ROOT r13:12 34336817Sdim#define ROOTHI r13 35336817Sdim#define ROOTLO r12 36336817Sdim 37336817Sdim#define PROD r15:14 38336817Sdim#define PRODHI r15 39336817Sdim#define PRODLO r14 40336817Sdim 41336817Sdim#define P_TMP p0 42336817Sdim#define P_EXP1 p1 43336817Sdim#define NORMAL p2 44336817Sdim 45336817Sdim#define SF_EXPBITS 8 46336817Sdim#define SF_MANTBITS 23 47336817Sdim 48336817Sdim#define DF_EXPBITS 11 49336817Sdim#define DF_MANTBITS 52 50336817Sdim 51336817Sdim#define DF_BIAS 0x3ff 52336817Sdim 53336817Sdim#define DFCLASS_ZERO 0x01 54336817Sdim#define DFCLASS_NORMAL 0x02 55336817Sdim#define DFCLASS_DENORMAL 0x02 56336817Sdim#define DFCLASS_INFINITE 0x08 57336817Sdim#define DFCLASS_NAN 0x10 58336817Sdim 59336817Sdim#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function 60336817Sdim#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function 61336817Sdim#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function 62336817Sdim#define END(TAG) .size TAG,.-TAG 63336817Sdim 64336817Sdim .text 65336817Sdim .global __hexagon_sqrtdf2 66336817Sdim .type __hexagon_sqrtdf2,@function 67336817Sdim .global __hexagon_sqrt 68336817Sdim .type __hexagon_sqrt,@function 69336817Sdim Q6_ALIAS(sqrtdf2) 70336817Sdim Q6_ALIAS(sqrt) 71336817Sdim FAST_ALIAS(sqrtdf2) 72336817Sdim FAST_ALIAS(sqrt) 73336817Sdim FAST2_ALIAS(sqrtdf2) 74336817Sdim FAST2_ALIAS(sqrt) 75336817Sdim .type sqrt,@function 76336817Sdim .p2align 5 77336817Sdim__hexagon_sqrtdf2: 78336817Sdim__hexagon_sqrt: 79336817Sdim { 80336817Sdim PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) 81336817Sdim EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) 82336817Sdim SFHALF_SONE = combine(##0x3f000004,#1) 83336817Sdim } 84336817Sdim { 85336817Sdim NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal 86336817Sdim NORMAL = cmp.gt(AH,#-1) // and positive? 87336817Sdim if (!NORMAL.new) jump:nt .Lsqrt_abnormal 88336817Sdim SFRAD = or(SFHALF,PRODLO) 89336817Sdim } 90336817Sdim#undef NORMAL 91336817Sdim.Ldenormal_restart: 92336817Sdim { 93336817Sdim FRACRAD = A 94336817Sdim SF_E,P_TMP = sfinvsqrta(SFRAD) 95336817Sdim SFHALF = and(SFHALF,#-16) 96336817Sdim SFSH = #0 97336817Sdim } 98336817Sdim#undef A 99336817Sdim#undef AH 100336817Sdim#undef AL 101336817Sdim#define ERROR r1:0 102336817Sdim#define ERRORHI r1 103336817Sdim#define ERRORLO r0 104336817Sdim // SF_E : reciprocal square root 105336817Sdim // SF_H : half rsqrt 106336817Sdim // sf_S : square root 107336817Sdim // SF_D : error term 108336817Sdim // SFHALF: 0.5 109336817Sdim { 110336817Sdim SF_S += sfmpy(SF_E,SFRAD):lib // s0: root 111336817Sdim SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... 112336817Sdim SF_D = SFHALF 113336817Sdim#undef SFRAD 114336817Sdim#define SHIFTAMT r9 115336817Sdim SHIFTAMT = and(EXP,#1) 116336817Sdim } 117336817Sdim { 118336817Sdim SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 119336817Sdim FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden 120336817Sdim P_EXP1 = cmp.gtu(SHIFTAMT,#0) 121336817Sdim } 122336817Sdim { 123336817Sdim SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt 124336817Sdim SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip 125336817Sdim SF_D = SFHALF 126336817Sdim SHIFTAMT = mux(P_EXP1,#8,#9) 127336817Sdim } 128336817Sdim { 129336817Sdim SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term 130336817Sdim FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place 131336817Sdim SHIFTAMT = mux(P_EXP1,#3,#2) 132336817Sdim } 133336817Sdim { 134336817Sdim SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt 135336817Sdim // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). 136336817Sdim PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) 137336817Sdim } 138336817Sdim { 139336817Sdim SF_H = and(SF_H,##0x007fffff) 140336817Sdim } 141336817Sdim { 142336817Sdim SF_H = add(SF_H,##0x00800000 - 3) 143336817Sdim SHIFTAMT = mux(P_EXP1,#7,#8) 144336817Sdim } 145336817Sdim { 146336817Sdim RECIPEST = asl(SF_H,SHIFTAMT) 147336817Sdim SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) 148336817Sdim } 149336817Sdim { 150336817Sdim ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) 151336817Sdim } 152336817Sdim 153336817Sdim#undef SFSH // r3:2 154336817Sdim#undef SF_H // r2 155336817Sdim#undef SF_S // r3 156336817Sdim#undef S_ONE // r4 157336817Sdim#undef SFHALF // r5 158336817Sdim#undef SFHALF_SONE // r5:4 159336817Sdim#undef SF_D // r6 160336817Sdim#undef SF_E // r7 161336817Sdim 162336817Sdim#define HL r3:2 163336817Sdim#define LL r5:4 164336817Sdim#define HH r7:6 165336817Sdim 166336817Sdim#undef P_EXP1 167336817Sdim#define P_CARRY0 p1 168336817Sdim#define P_CARRY1 p2 169336817Sdim#define P_CARRY2 p3 170336817Sdim 171353358Sdim // Iteration 0 172353358Sdim // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply 173353358Sdim // We can shift and subtract instead of shift and add? 174336817Sdim { 175336817Sdim ERROR = asl(FRACRAD,#15) 176336817Sdim PROD = mpyu(ROOTHI,ROOTHI) 177336817Sdim P_CARRY0 = cmp.eq(r0,r0) 178336817Sdim } 179336817Sdim { 180336817Sdim ERROR -= asl(PROD,#15) 181336817Sdim PROD = mpyu(ROOTHI,ROOTLO) 182336817Sdim P_CARRY1 = cmp.eq(r0,r0) 183336817Sdim } 184336817Sdim { 185336817Sdim ERROR -= lsr(PROD,#16) 186336817Sdim P_CARRY2 = cmp.eq(r0,r0) 187336817Sdim } 188336817Sdim { 189336817Sdim ERROR = mpyu(ERRORHI,RECIPEST) 190336817Sdim } 191336817Sdim { 192336817Sdim ROOT += lsr(ERROR,SHIFTAMT) 193336817Sdim SHIFTAMT = add(SHIFTAMT,#16) 194336817Sdim ERROR = asl(FRACRAD,#31) // for next iter 195336817Sdim } 196353358Sdim // Iteration 1 197336817Sdim { 198336817Sdim PROD = mpyu(ROOTHI,ROOTHI) 199336817Sdim ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed 200336817Sdim } 201336817Sdim { 202336817Sdim ERROR -= asl(PROD,#31) 203336817Sdim PROD = mpyu(ROOTLO,ROOTLO) 204336817Sdim } 205336817Sdim { 206336817Sdim ERROR -= lsr(PROD,#33) 207336817Sdim } 208336817Sdim { 209336817Sdim ERROR = mpyu(ERRORHI,RECIPEST) 210336817Sdim } 211336817Sdim { 212336817Sdim ROOT += lsr(ERROR,SHIFTAMT) 213336817Sdim SHIFTAMT = add(SHIFTAMT,#16) 214336817Sdim ERROR = asl(FRACRAD,#47) // for next iter 215336817Sdim } 216353358Sdim // Iteration 2 217336817Sdim { 218336817Sdim PROD = mpyu(ROOTHI,ROOTHI) 219336817Sdim } 220336817Sdim { 221336817Sdim ERROR -= asl(PROD,#47) 222336817Sdim PROD = mpyu(ROOTHI,ROOTLO) 223336817Sdim } 224336817Sdim { 225336817Sdim ERROR -= asl(PROD,#16) // bidir shr 31-47 226336817Sdim PROD = mpyu(ROOTLO,ROOTLO) 227336817Sdim } 228336817Sdim { 229336817Sdim ERROR -= lsr(PROD,#17) // 64-47 230336817Sdim } 231336817Sdim { 232336817Sdim ERROR = mpyu(ERRORHI,RECIPEST) 233336817Sdim } 234336817Sdim { 235336817Sdim ROOT += lsr(ERROR,SHIFTAMT) 236336817Sdim } 237336817Sdim#undef ERROR 238336817Sdim#undef PROD 239336817Sdim#undef PRODHI 240336817Sdim#undef PRODLO 241336817Sdim#define REM_HI r15:14 242336817Sdim#define REM_HI_HI r15 243336817Sdim#define REM_LO r1:0 244336817Sdim#undef RECIPEST 245336817Sdim#undef SHIFTAMT 246336817Sdim#define TWOROOT_LO r9:8 247353358Sdim // Adjust Root 248336817Sdim { 249336817Sdim HL = mpyu(ROOTHI,ROOTLO) 250336817Sdim LL = mpyu(ROOTLO,ROOTLO) 251336817Sdim REM_HI = #0 252336817Sdim REM_LO = #0 253336817Sdim } 254336817Sdim { 255336817Sdim HL += lsr(LL,#33) 256336817Sdim LL += asl(HL,#33) 257336817Sdim P_CARRY0 = cmp.eq(r0,r0) 258336817Sdim } 259336817Sdim { 260336817Sdim HH = mpyu(ROOTHI,ROOTHI) 261336817Sdim REM_LO = sub(REM_LO,LL,P_CARRY0):carry 262336817Sdim TWOROOT_LO = #1 263336817Sdim } 264336817Sdim { 265336817Sdim HH += lsr(HL,#31) 266336817Sdim TWOROOT_LO += asl(ROOT,#1) 267336817Sdim } 268336817Sdim#undef HL 269336817Sdim#undef LL 270336817Sdim#define REM_HI_TMP r3:2 271336817Sdim#define REM_HI_TMP_HI r3 272336817Sdim#define REM_LO_TMP r5:4 273336817Sdim { 274336817Sdim REM_HI = sub(FRACRAD,HH,P_CARRY0):carry 275336817Sdim REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry 276336817Sdim#undef FRACRAD 277336817Sdim#undef HH 278336817Sdim#define ZERO r11:10 279336817Sdim#define ONE r7:6 280336817Sdim ONE = #1 281336817Sdim ZERO = #0 282336817Sdim } 283336817Sdim { 284336817Sdim REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry 285336817Sdim ONE = add(ROOT,ONE) 286336817Sdim EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp 287336817Sdim } 288336817Sdim { 289336817Sdim // If carry set, no borrow: result was still positive 290336817Sdim if (P_CARRY1) ROOT = ONE 291336817Sdim if (P_CARRY1) REM_LO = REM_LO_TMP 292336817Sdim if (P_CARRY1) REM_HI = REM_HI_TMP 293336817Sdim } 294336817Sdim { 295336817Sdim REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry 296336817Sdim ONE = #1 297336817Sdim EXP = asr(EXP,#1) // divide signed exp by 2 298336817Sdim } 299336817Sdim { 300336817Sdim REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry 301336817Sdim ONE = add(ROOT,ONE) 302336817Sdim } 303336817Sdim { 304336817Sdim if (P_CARRY2) ROOT = ONE 305336817Sdim if (P_CARRY2) REM_LO = REM_LO_TMP 306336817Sdim // since tworoot <= 2^32, remhi must be zero 307336817Sdim#undef REM_HI_TMP 308336817Sdim#undef REM_HI_TMP_HI 309336817Sdim#define S_ONE r2 310336817Sdim#define ADJ r3 311336817Sdim S_ONE = #1 312336817Sdim } 313336817Sdim { 314336817Sdim P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero 315336817Sdim if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully 316336817Sdim ADJ = cl0(ROOT) 317336817Sdim EXP = add(EXP,#-63) 318336817Sdim } 319336817Sdim#undef REM_LO 320336817Sdim#define RET r1:0 321336817Sdim#define RETHI r1 322336817Sdim { 323336817Sdim RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag 324336817Sdim EXP = add(EXP,ADJ) // add back bias 325336817Sdim } 326336817Sdim { 327336817Sdim RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust 328336817Sdim jumpr r31 329336817Sdim } 330336817Sdim#undef REM_LO_TMP 331336817Sdim#undef REM_HI_TMP 332336817Sdim#undef REM_HI_TMP_HI 333336817Sdim#undef REM_LO 334336817Sdim#undef REM_HI 335336817Sdim#undef TWOROOT_LO 336336817Sdim 337336817Sdim#undef RET 338336817Sdim#define A r1:0 339336817Sdim#define AH r1 340336817Sdim#define AL r1 341336817Sdim#undef S_ONE 342336817Sdim#define TMP r3:2 343336817Sdim#define TMPHI r3 344336817Sdim#define TMPLO r2 345336817Sdim#undef P_CARRY0 346336817Sdim#define P_NEG p1 347336817Sdim 348336817Sdim 349336817Sdim#define SFHALF r5 350336817Sdim#define SFRAD r9 351336817Sdim.Lsqrt_abnormal: 352336817Sdim { 353336817Sdim P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? 354336817Sdim if (P_TMP.new) jumpr:t r31 355336817Sdim } 356336817Sdim { 357336817Sdim P_TMP = dfclass(A,#DFCLASS_NAN) 358336817Sdim if (P_TMP.new) jump:nt .Lsqrt_nan 359336817Sdim } 360336817Sdim { 361336817Sdim P_TMP = cmp.gt(AH,#-1) 362336817Sdim if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg 363336817Sdim if (!P_TMP.new) EXP = ##0x7F800001 // sNaN 364336817Sdim } 365336817Sdim { 366336817Sdim P_TMP = dfclass(A,#DFCLASS_INFINITE) 367336817Sdim if (P_TMP.new) jumpr:nt r31 368336817Sdim } 369336817Sdim // If we got here, we're denormal 370336817Sdim // prepare to restart 371336817Sdim { 372336817Sdim A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa 373336817Sdim } 374336817Sdim { 375336817Sdim EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? 376336817Sdim } 377336817Sdim { 378336817Sdim A = asl(A,EXP) // Shift mantissa 379336817Sdim EXP = sub(#1,EXP) // Form exponent 380336817Sdim } 381336817Sdim { 382336817Sdim AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent 383336817Sdim } 384336817Sdim { 385336817Sdim TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) 386336817Sdim SFHALF = ##0x3f000004 // form half constant 387336817Sdim } 388336817Sdim { 389336817Sdim SFRAD = or(SFHALF,TMPLO) // form sf value 390336817Sdim SFHALF = and(SFHALF,#-16) 391336817Sdim jump .Ldenormal_restart // restart 392336817Sdim } 393336817Sdim.Lsqrt_nan: 394336817Sdim { 395336817Sdim EXP = convert_df2sf(A) // if sNaN, get invalid 396336817Sdim A = #-1 // qNaN 397336817Sdim jumpr r31 398336817Sdim } 399336817Sdim.Lsqrt_invalid_neg: 400336817Sdim { 401336817Sdim A = convert_sf2df(EXP) // Invalid,NaNval 402336817Sdim jumpr r31 403336817Sdim } 404336817SdimEND(__hexagon_sqrt) 405336817SdimEND(__hexagon_sqrtdf2) 406