1336817Sdim//===----------------------Hexagon builtin routine ------------------------===//
2336817Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6336817Sdim//
7336817Sdim//===----------------------------------------------------------------------===//
8336817Sdim
9353358Sdim// Double Precision square root
10336817Sdim
11336817Sdim#define EXP r28
12336817Sdim
13336817Sdim#define A r1:0
14336817Sdim#define AH r1
15336817Sdim#define AL r0
16336817Sdim
17336817Sdim#define SFSH r3:2
18336817Sdim#define SF_S r3
19336817Sdim#define SF_H r2
20336817Sdim
21336817Sdim#define SFHALF_SONE r5:4
22336817Sdim#define S_ONE r4
23336817Sdim#define SFHALF r5
24336817Sdim#define SF_D r6
25336817Sdim#define SF_E r7
26336817Sdim#define RECIPEST r8
27336817Sdim#define SFRAD r9
28336817Sdim
29336817Sdim#define FRACRAD r11:10
30336817Sdim#define FRACRADH r11
31336817Sdim#define FRACRADL r10
32336817Sdim
33336817Sdim#define ROOT r13:12
34336817Sdim#define ROOTHI r13
35336817Sdim#define ROOTLO r12
36336817Sdim
37336817Sdim#define PROD r15:14
38336817Sdim#define PRODHI r15
39336817Sdim#define PRODLO r14
40336817Sdim
41336817Sdim#define P_TMP p0
42336817Sdim#define P_EXP1 p1
43336817Sdim#define NORMAL p2
44336817Sdim
45336817Sdim#define SF_EXPBITS 8
46336817Sdim#define SF_MANTBITS 23
47336817Sdim
48336817Sdim#define DF_EXPBITS 11
49336817Sdim#define DF_MANTBITS 52
50336817Sdim
51336817Sdim#define DF_BIAS 0x3ff
52336817Sdim
53336817Sdim#define DFCLASS_ZERO     0x01
54336817Sdim#define DFCLASS_NORMAL   0x02
55336817Sdim#define DFCLASS_DENORMAL 0x02
56336817Sdim#define DFCLASS_INFINITE 0x08
57336817Sdim#define DFCLASS_NAN      0x10
58336817Sdim
59336817Sdim#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
60336817Sdim#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
61336817Sdim#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
62336817Sdim#define END(TAG) .size TAG,.-TAG
63336817Sdim
64336817Sdim	.text
65336817Sdim	.global __hexagon_sqrtdf2
66336817Sdim	.type __hexagon_sqrtdf2,@function
67336817Sdim	.global __hexagon_sqrt
68336817Sdim	.type __hexagon_sqrt,@function
69336817Sdim	Q6_ALIAS(sqrtdf2)
70336817Sdim	Q6_ALIAS(sqrt)
71336817Sdim	FAST_ALIAS(sqrtdf2)
72336817Sdim	FAST_ALIAS(sqrt)
73336817Sdim	FAST2_ALIAS(sqrtdf2)
74336817Sdim	FAST2_ALIAS(sqrt)
75336817Sdim	.type sqrt,@function
76336817Sdim	.p2align 5
77336817Sdim__hexagon_sqrtdf2:
78336817Sdim__hexagon_sqrt:
79336817Sdim	{
80336817Sdim		PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
81336817Sdim		EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
82336817Sdim		SFHALF_SONE = combine(##0x3f000004,#1)
83336817Sdim	}
84336817Sdim	{
85336817Sdim		NORMAL = dfclass(A,#DFCLASS_NORMAL)		// Is it normal
86336817Sdim		NORMAL = cmp.gt(AH,#-1)				// and positive?
87336817Sdim		if (!NORMAL.new) jump:nt .Lsqrt_abnormal
88336817Sdim		SFRAD = or(SFHALF,PRODLO)
89336817Sdim	}
90336817Sdim#undef NORMAL
91336817Sdim.Ldenormal_restart:
92336817Sdim	{
93336817Sdim		FRACRAD = A
94336817Sdim		SF_E,P_TMP = sfinvsqrta(SFRAD)
95336817Sdim		SFHALF = and(SFHALF,#-16)
96336817Sdim		SFSH = #0
97336817Sdim	}
98336817Sdim#undef A
99336817Sdim#undef AH
100336817Sdim#undef AL
101336817Sdim#define ERROR r1:0
102336817Sdim#define ERRORHI r1
103336817Sdim#define ERRORLO r0
104336817Sdim	// SF_E : reciprocal square root
105336817Sdim	// SF_H : half rsqrt
106336817Sdim	// sf_S : square root
107336817Sdim	// SF_D : error term
108336817Sdim	// SFHALF: 0.5
109336817Sdim	{
110336817Sdim		SF_S += sfmpy(SF_E,SFRAD):lib		// s0: root
111336817Sdim		SF_H += sfmpy(SF_E,SFHALF):lib		// h0: 0.5*y0. Could also decrement exponent...
112336817Sdim		SF_D = SFHALF
113336817Sdim#undef SFRAD
114336817Sdim#define SHIFTAMT r9
115336817Sdim		SHIFTAMT = and(EXP,#1)
116336817Sdim	}
117336817Sdim	{
118336817Sdim		SF_D -= sfmpy(SF_S,SF_H):lib		// d0: 0.5-H*S = 0.5-0.5*~1
119336817Sdim		FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)	// replace upper bits with hidden
120336817Sdim		P_EXP1 = cmp.gtu(SHIFTAMT,#0)
121336817Sdim	}
122336817Sdim	{
123336817Sdim		SF_S += sfmpy(SF_S,SF_D):lib		// s1: refine sqrt
124336817Sdim		SF_H += sfmpy(SF_H,SF_D):lib		// h1: refine half-recip
125336817Sdim		SF_D = SFHALF
126336817Sdim		SHIFTAMT = mux(P_EXP1,#8,#9)
127336817Sdim	}
128336817Sdim	{
129336817Sdim		SF_D -= sfmpy(SF_S,SF_H):lib		// d1: error term
130336817Sdim		FRACRAD = asl(FRACRAD,SHIFTAMT)		// Move fracrad bits to right place
131336817Sdim		SHIFTAMT = mux(P_EXP1,#3,#2)
132336817Sdim	}
133336817Sdim	{
134336817Sdim		SF_H += sfmpy(SF_H,SF_D):lib		// d2: rsqrt
135336817Sdim		// cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
136336817Sdim		PROD = asl(FRACRAD,SHIFTAMT)		// fracrad<<(2+exp1)
137336817Sdim	}
138336817Sdim	{
139336817Sdim		SF_H = and(SF_H,##0x007fffff)
140336817Sdim	}
141336817Sdim	{
142336817Sdim		SF_H = add(SF_H,##0x00800000 - 3)
143336817Sdim		SHIFTAMT = mux(P_EXP1,#7,#8)
144336817Sdim	}
145336817Sdim	{
146336817Sdim		RECIPEST = asl(SF_H,SHIFTAMT)
147336817Sdim		SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
148336817Sdim	}
149336817Sdim	{
150336817Sdim		ROOT = mpyu(RECIPEST,PRODHI)		// root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
151336817Sdim	}
152336817Sdim
153336817Sdim#undef SFSH	// r3:2
154336817Sdim#undef SF_H	// r2
155336817Sdim#undef SF_S	// r3
156336817Sdim#undef S_ONE	// r4
157336817Sdim#undef SFHALF	// r5
158336817Sdim#undef SFHALF_SONE	// r5:4
159336817Sdim#undef SF_D	// r6
160336817Sdim#undef SF_E	// r7
161336817Sdim
162336817Sdim#define HL r3:2
163336817Sdim#define LL r5:4
164336817Sdim#define HH r7:6
165336817Sdim
166336817Sdim#undef P_EXP1
167336817Sdim#define P_CARRY0 p1
168336817Sdim#define P_CARRY1 p2
169336817Sdim#define P_CARRY2 p3
170336817Sdim
171353358Sdim	// Iteration 0
172353358Sdim	// Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply
173353358Sdim	// We can shift and subtract instead of shift and add?
174336817Sdim	{
175336817Sdim		ERROR = asl(FRACRAD,#15)
176336817Sdim		PROD = mpyu(ROOTHI,ROOTHI)
177336817Sdim		P_CARRY0 = cmp.eq(r0,r0)
178336817Sdim	}
179336817Sdim	{
180336817Sdim		ERROR -= asl(PROD,#15)
181336817Sdim		PROD = mpyu(ROOTHI,ROOTLO)
182336817Sdim		P_CARRY1 = cmp.eq(r0,r0)
183336817Sdim	}
184336817Sdim	{
185336817Sdim		ERROR -= lsr(PROD,#16)
186336817Sdim		P_CARRY2 = cmp.eq(r0,r0)
187336817Sdim	}
188336817Sdim	{
189336817Sdim		ERROR = mpyu(ERRORHI,RECIPEST)
190336817Sdim	}
191336817Sdim	{
192336817Sdim		ROOT += lsr(ERROR,SHIFTAMT)
193336817Sdim		SHIFTAMT = add(SHIFTAMT,#16)
194336817Sdim		ERROR = asl(FRACRAD,#31)		// for next iter
195336817Sdim	}
196353358Sdim	// Iteration 1
197336817Sdim	{
198336817Sdim		PROD = mpyu(ROOTHI,ROOTHI)
199336817Sdim		ERROR -= mpyu(ROOTHI,ROOTLO)	// amount is 31, no shift needed
200336817Sdim	}
201336817Sdim	{
202336817Sdim		ERROR -= asl(PROD,#31)
203336817Sdim		PROD = mpyu(ROOTLO,ROOTLO)
204336817Sdim	}
205336817Sdim	{
206336817Sdim		ERROR -= lsr(PROD,#33)
207336817Sdim	}
208336817Sdim	{
209336817Sdim		ERROR = mpyu(ERRORHI,RECIPEST)
210336817Sdim	}
211336817Sdim	{
212336817Sdim		ROOT += lsr(ERROR,SHIFTAMT)
213336817Sdim		SHIFTAMT = add(SHIFTAMT,#16)
214336817Sdim		ERROR = asl(FRACRAD,#47)	// for next iter
215336817Sdim	}
216353358Sdim	// Iteration 2
217336817Sdim	{
218336817Sdim		PROD = mpyu(ROOTHI,ROOTHI)
219336817Sdim	}
220336817Sdim	{
221336817Sdim		ERROR -= asl(PROD,#47)
222336817Sdim		PROD = mpyu(ROOTHI,ROOTLO)
223336817Sdim	}
224336817Sdim	{
225336817Sdim		ERROR -= asl(PROD,#16)		// bidir shr 31-47
226336817Sdim		PROD = mpyu(ROOTLO,ROOTLO)
227336817Sdim	}
228336817Sdim	{
229336817Sdim		ERROR -= lsr(PROD,#17)		// 64-47
230336817Sdim	}
231336817Sdim	{
232336817Sdim		ERROR = mpyu(ERRORHI,RECIPEST)
233336817Sdim	}
234336817Sdim	{
235336817Sdim		ROOT += lsr(ERROR,SHIFTAMT)
236336817Sdim	}
237336817Sdim#undef ERROR
238336817Sdim#undef PROD
239336817Sdim#undef PRODHI
240336817Sdim#undef PRODLO
241336817Sdim#define REM_HI r15:14
242336817Sdim#define REM_HI_HI r15
243336817Sdim#define REM_LO r1:0
244336817Sdim#undef RECIPEST
245336817Sdim#undef SHIFTAMT
246336817Sdim#define TWOROOT_LO r9:8
247353358Sdim	// Adjust Root
248336817Sdim	{
249336817Sdim		HL = mpyu(ROOTHI,ROOTLO)
250336817Sdim		LL = mpyu(ROOTLO,ROOTLO)
251336817Sdim		REM_HI = #0
252336817Sdim		REM_LO = #0
253336817Sdim	}
254336817Sdim	{
255336817Sdim		HL += lsr(LL,#33)
256336817Sdim		LL += asl(HL,#33)
257336817Sdim		P_CARRY0 = cmp.eq(r0,r0)
258336817Sdim	}
259336817Sdim	{
260336817Sdim		HH = mpyu(ROOTHI,ROOTHI)
261336817Sdim		REM_LO = sub(REM_LO,LL,P_CARRY0):carry
262336817Sdim		TWOROOT_LO = #1
263336817Sdim	}
264336817Sdim	{
265336817Sdim		HH += lsr(HL,#31)
266336817Sdim		TWOROOT_LO += asl(ROOT,#1)
267336817Sdim	}
268336817Sdim#undef HL
269336817Sdim#undef LL
270336817Sdim#define REM_HI_TMP r3:2
271336817Sdim#define REM_HI_TMP_HI r3
272336817Sdim#define REM_LO_TMP r5:4
273336817Sdim	{
274336817Sdim		REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
275336817Sdim		REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
276336817Sdim#undef FRACRAD
277336817Sdim#undef HH
278336817Sdim#define ZERO r11:10
279336817Sdim#define ONE r7:6
280336817Sdim		ONE = #1
281336817Sdim		ZERO = #0
282336817Sdim	}
283336817Sdim	{
284336817Sdim		REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
285336817Sdim		ONE = add(ROOT,ONE)
286336817Sdim		EXP = add(EXP,#-DF_BIAS)			// subtract bias --> signed exp
287336817Sdim	}
288336817Sdim	{
289336817Sdim				// If carry set, no borrow: result was still positive
290336817Sdim		if (P_CARRY1) ROOT = ONE
291336817Sdim		if (P_CARRY1) REM_LO = REM_LO_TMP
292336817Sdim		if (P_CARRY1) REM_HI = REM_HI_TMP
293336817Sdim	}
294336817Sdim	{
295336817Sdim		REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
296336817Sdim		ONE = #1
297336817Sdim		EXP = asr(EXP,#1)				// divide signed exp by 2
298336817Sdim	}
299336817Sdim	{
300336817Sdim		REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
301336817Sdim		ONE = add(ROOT,ONE)
302336817Sdim	}
303336817Sdim	{
304336817Sdim		if (P_CARRY2) ROOT = ONE
305336817Sdim		if (P_CARRY2) REM_LO = REM_LO_TMP
306336817Sdim								// since tworoot <= 2^32, remhi must be zero
307336817Sdim#undef REM_HI_TMP
308336817Sdim#undef REM_HI_TMP_HI
309336817Sdim#define S_ONE r2
310336817Sdim#define ADJ r3
311336817Sdim		S_ONE = #1
312336817Sdim	}
313336817Sdim	{
314336817Sdim		P_TMP = cmp.eq(REM_LO,ZERO)			// is the low part zero
315336817Sdim		if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE)	// if so, it's exact... hopefully
316336817Sdim		ADJ = cl0(ROOT)
317336817Sdim		EXP = add(EXP,#-63)
318336817Sdim	}
319336817Sdim#undef REM_LO
320336817Sdim#define RET r1:0
321336817Sdim#define RETHI r1
322336817Sdim	{
323336817Sdim		RET = convert_ud2df(ROOT)			// set up mantissa, maybe set inexact flag
324336817Sdim		EXP = add(EXP,ADJ)				// add back bias
325336817Sdim	}
326336817Sdim	{
327336817Sdim		RETHI += asl(EXP,#DF_MANTBITS-32)		// add exponent adjust
328336817Sdim		jumpr r31
329336817Sdim	}
330336817Sdim#undef REM_LO_TMP
331336817Sdim#undef REM_HI_TMP
332336817Sdim#undef REM_HI_TMP_HI
333336817Sdim#undef REM_LO
334336817Sdim#undef REM_HI
335336817Sdim#undef TWOROOT_LO
336336817Sdim
337336817Sdim#undef RET
338336817Sdim#define A r1:0
339336817Sdim#define AH r1
340336817Sdim#define AL r1
341336817Sdim#undef S_ONE
342336817Sdim#define TMP r3:2
343336817Sdim#define TMPHI r3
344336817Sdim#define TMPLO r2
345336817Sdim#undef P_CARRY0
346336817Sdim#define P_NEG p1
347336817Sdim
348336817Sdim
349336817Sdim#define SFHALF r5
350336817Sdim#define SFRAD r9
351336817Sdim.Lsqrt_abnormal:
352336817Sdim	{
353336817Sdim		P_TMP = dfclass(A,#DFCLASS_ZERO)			// zero?
354336817Sdim		if (P_TMP.new) jumpr:t r31
355336817Sdim	}
356336817Sdim	{
357336817Sdim		P_TMP = dfclass(A,#DFCLASS_NAN)
358336817Sdim		if (P_TMP.new) jump:nt .Lsqrt_nan
359336817Sdim	}
360336817Sdim	{
361336817Sdim		P_TMP = cmp.gt(AH,#-1)
362336817Sdim		if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
363336817Sdim		if (!P_TMP.new) EXP = ##0x7F800001			// sNaN
364336817Sdim	}
365336817Sdim	{
366336817Sdim		P_TMP = dfclass(A,#DFCLASS_INFINITE)
367336817Sdim		if (P_TMP.new) jumpr:nt r31
368336817Sdim	}
369336817Sdim	// If we got here, we're denormal
370336817Sdim	// prepare to restart
371336817Sdim	{
372336817Sdim		A = extractu(A,#DF_MANTBITS,#0)		// Extract mantissa
373336817Sdim	}
374336817Sdim	{
375336817Sdim		EXP = add(clb(A),#-DF_EXPBITS)		// how much to normalize?
376336817Sdim	}
377336817Sdim	{
378336817Sdim		A = asl(A,EXP)				// Shift mantissa
379336817Sdim		EXP = sub(#1,EXP)			// Form exponent
380336817Sdim	}
381336817Sdim	{
382336817Sdim		AH = insert(EXP,#1,#DF_MANTBITS-32)		// insert lsb of exponent
383336817Sdim	}
384336817Sdim	{
385336817Sdim		TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)	// get sf value (mant+exp1)
386336817Sdim		SFHALF = ##0x3f000004						// form half constant
387336817Sdim	}
388336817Sdim	{
389336817Sdim		SFRAD = or(SFHALF,TMPLO)			// form sf value
390336817Sdim		SFHALF = and(SFHALF,#-16)
391336817Sdim		jump .Ldenormal_restart				// restart
392336817Sdim	}
393336817Sdim.Lsqrt_nan:
394336817Sdim	{
395336817Sdim		EXP = convert_df2sf(A)				// if sNaN, get invalid
396336817Sdim		A = #-1						// qNaN
397336817Sdim		jumpr r31
398336817Sdim	}
399336817Sdim.Lsqrt_invalid_neg:
400336817Sdim	{
401336817Sdim		A = convert_sf2df(EXP)				// Invalid,NaNval
402336817Sdim		jumpr r31
403336817Sdim	}
404336817SdimEND(__hexagon_sqrt)
405336817SdimEND(__hexagon_sqrtdf2)
406