dfsqrt.S revision 336817
1//===----------------------Hexagon builtin routine ------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is dual licensed under the MIT and the University of Illinois Open
6// Source Licenses. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10/* Double Precision square root */
11
12#define EXP r28
13
14#define A r1:0
15#define AH r1
16#define AL r0
17
18#define SFSH r3:2
19#define SF_S r3
20#define SF_H r2
21
22#define SFHALF_SONE r5:4
23#define S_ONE r4
24#define SFHALF r5
25#define SF_D r6
26#define SF_E r7
27#define RECIPEST r8
28#define SFRAD r9
29
30#define FRACRAD r11:10
31#define FRACRADH r11
32#define FRACRADL r10
33
34#define ROOT r13:12
35#define ROOTHI r13
36#define ROOTLO r12
37
38#define PROD r15:14
39#define PRODHI r15
40#define PRODLO r14
41
42#define P_TMP p0
43#define P_EXP1 p1
44#define NORMAL p2
45
46#define SF_EXPBITS 8
47#define SF_MANTBITS 23
48
49#define DF_EXPBITS 11
50#define DF_MANTBITS 52
51
52#define DF_BIAS 0x3ff
53
54#define DFCLASS_ZERO     0x01
55#define DFCLASS_NORMAL   0x02
56#define DFCLASS_DENORMAL 0x02
57#define DFCLASS_INFINITE 0x08
58#define DFCLASS_NAN      0x10
59
60#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
61#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
62#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
63#define END(TAG) .size TAG,.-TAG
64
65	.text
66	.global __hexagon_sqrtdf2
67	.type __hexagon_sqrtdf2,@function
68	.global __hexagon_sqrt
69	.type __hexagon_sqrt,@function
70	Q6_ALIAS(sqrtdf2)
71	Q6_ALIAS(sqrt)
72	FAST_ALIAS(sqrtdf2)
73	FAST_ALIAS(sqrt)
74	FAST2_ALIAS(sqrtdf2)
75	FAST2_ALIAS(sqrt)
76	.type sqrt,@function
77	.p2align 5
78__hexagon_sqrtdf2:
79__hexagon_sqrt:
80	{
81		PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
82		EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
83		SFHALF_SONE = combine(##0x3f000004,#1)
84	}
85	{
86		NORMAL = dfclass(A,#DFCLASS_NORMAL)		// Is it normal
87		NORMAL = cmp.gt(AH,#-1)				// and positive?
88		if (!NORMAL.new) jump:nt .Lsqrt_abnormal
89		SFRAD = or(SFHALF,PRODLO)
90	}
91#undef NORMAL
92.Ldenormal_restart:
93	{
94		FRACRAD = A
95		SF_E,P_TMP = sfinvsqrta(SFRAD)
96		SFHALF = and(SFHALF,#-16)
97		SFSH = #0
98	}
99#undef A
100#undef AH
101#undef AL
102#define ERROR r1:0
103#define ERRORHI r1
104#define ERRORLO r0
105	// SF_E : reciprocal square root
106	// SF_H : half rsqrt
107	// sf_S : square root
108	// SF_D : error term
109	// SFHALF: 0.5
110	{
111		SF_S += sfmpy(SF_E,SFRAD):lib		// s0: root
112		SF_H += sfmpy(SF_E,SFHALF):lib		// h0: 0.5*y0. Could also decrement exponent...
113		SF_D = SFHALF
114#undef SFRAD
115#define SHIFTAMT r9
116		SHIFTAMT = and(EXP,#1)
117	}
118	{
119		SF_D -= sfmpy(SF_S,SF_H):lib		// d0: 0.5-H*S = 0.5-0.5*~1
120		FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)	// replace upper bits with hidden
121		P_EXP1 = cmp.gtu(SHIFTAMT,#0)
122	}
123	{
124		SF_S += sfmpy(SF_S,SF_D):lib		// s1: refine sqrt
125		SF_H += sfmpy(SF_H,SF_D):lib		// h1: refine half-recip
126		SF_D = SFHALF
127		SHIFTAMT = mux(P_EXP1,#8,#9)
128	}
129	{
130		SF_D -= sfmpy(SF_S,SF_H):lib		// d1: error term
131		FRACRAD = asl(FRACRAD,SHIFTAMT)		// Move fracrad bits to right place
132		SHIFTAMT = mux(P_EXP1,#3,#2)
133	}
134	{
135		SF_H += sfmpy(SF_H,SF_D):lib		// d2: rsqrt
136		// cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
137		PROD = asl(FRACRAD,SHIFTAMT)		// fracrad<<(2+exp1)
138	}
139	{
140		SF_H = and(SF_H,##0x007fffff)
141	}
142	{
143		SF_H = add(SF_H,##0x00800000 - 3)
144		SHIFTAMT = mux(P_EXP1,#7,#8)
145	}
146	{
147		RECIPEST = asl(SF_H,SHIFTAMT)
148		SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
149	}
150	{
151		ROOT = mpyu(RECIPEST,PRODHI)		// root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
152	}
153
154#undef SFSH	// r3:2
155#undef SF_H	// r2
156#undef SF_S	// r3
157#undef S_ONE	// r4
158#undef SFHALF	// r5
159#undef SFHALF_SONE	// r5:4
160#undef SF_D	// r6
161#undef SF_E	// r7
162
163#define HL r3:2
164#define LL r5:4
165#define HH r7:6
166
167#undef P_EXP1
168#define P_CARRY0 p1
169#define P_CARRY1 p2
170#define P_CARRY2 p3
171
172	/* Iteration 0 */
173	/* Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply */
174	/* We can shift and subtract instead of shift and add? */
175	{
176		ERROR = asl(FRACRAD,#15)
177		PROD = mpyu(ROOTHI,ROOTHI)
178		P_CARRY0 = cmp.eq(r0,r0)
179	}
180	{
181		ERROR -= asl(PROD,#15)
182		PROD = mpyu(ROOTHI,ROOTLO)
183		P_CARRY1 = cmp.eq(r0,r0)
184	}
185	{
186		ERROR -= lsr(PROD,#16)
187		P_CARRY2 = cmp.eq(r0,r0)
188	}
189	{
190		ERROR = mpyu(ERRORHI,RECIPEST)
191	}
192	{
193		ROOT += lsr(ERROR,SHIFTAMT)
194		SHIFTAMT = add(SHIFTAMT,#16)
195		ERROR = asl(FRACRAD,#31)		// for next iter
196	}
197	/* Iteration 1 */
198	{
199		PROD = mpyu(ROOTHI,ROOTHI)
200		ERROR -= mpyu(ROOTHI,ROOTLO)	// amount is 31, no shift needed
201	}
202	{
203		ERROR -= asl(PROD,#31)
204		PROD = mpyu(ROOTLO,ROOTLO)
205	}
206	{
207		ERROR -= lsr(PROD,#33)
208	}
209	{
210		ERROR = mpyu(ERRORHI,RECIPEST)
211	}
212	{
213		ROOT += lsr(ERROR,SHIFTAMT)
214		SHIFTAMT = add(SHIFTAMT,#16)
215		ERROR = asl(FRACRAD,#47)	// for next iter
216	}
217	/* Iteration 2 */
218	{
219		PROD = mpyu(ROOTHI,ROOTHI)
220	}
221	{
222		ERROR -= asl(PROD,#47)
223		PROD = mpyu(ROOTHI,ROOTLO)
224	}
225	{
226		ERROR -= asl(PROD,#16)		// bidir shr 31-47
227		PROD = mpyu(ROOTLO,ROOTLO)
228	}
229	{
230		ERROR -= lsr(PROD,#17)		// 64-47
231	}
232	{
233		ERROR = mpyu(ERRORHI,RECIPEST)
234	}
235	{
236		ROOT += lsr(ERROR,SHIFTAMT)
237	}
238#undef ERROR
239#undef PROD
240#undef PRODHI
241#undef PRODLO
242#define REM_HI r15:14
243#define REM_HI_HI r15
244#define REM_LO r1:0
245#undef RECIPEST
246#undef SHIFTAMT
247#define TWOROOT_LO r9:8
248	/* Adjust Root */
249	{
250		HL = mpyu(ROOTHI,ROOTLO)
251		LL = mpyu(ROOTLO,ROOTLO)
252		REM_HI = #0
253		REM_LO = #0
254	}
255	{
256		HL += lsr(LL,#33)
257		LL += asl(HL,#33)
258		P_CARRY0 = cmp.eq(r0,r0)
259	}
260	{
261		HH = mpyu(ROOTHI,ROOTHI)
262		REM_LO = sub(REM_LO,LL,P_CARRY0):carry
263		TWOROOT_LO = #1
264	}
265	{
266		HH += lsr(HL,#31)
267		TWOROOT_LO += asl(ROOT,#1)
268	}
269#undef HL
270#undef LL
271#define REM_HI_TMP r3:2
272#define REM_HI_TMP_HI r3
273#define REM_LO_TMP r5:4
274	{
275		REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
276		REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
277#undef FRACRAD
278#undef HH
279#define ZERO r11:10
280#define ONE r7:6
281		ONE = #1
282		ZERO = #0
283	}
284	{
285		REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
286		ONE = add(ROOT,ONE)
287		EXP = add(EXP,#-DF_BIAS)			// subtract bias --> signed exp
288	}
289	{
290				// If carry set, no borrow: result was still positive
291		if (P_CARRY1) ROOT = ONE
292		if (P_CARRY1) REM_LO = REM_LO_TMP
293		if (P_CARRY1) REM_HI = REM_HI_TMP
294	}
295	{
296		REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
297		ONE = #1
298		EXP = asr(EXP,#1)				// divide signed exp by 2
299	}
300	{
301		REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
302		ONE = add(ROOT,ONE)
303	}
304	{
305		if (P_CARRY2) ROOT = ONE
306		if (P_CARRY2) REM_LO = REM_LO_TMP
307								// since tworoot <= 2^32, remhi must be zero
308#undef REM_HI_TMP
309#undef REM_HI_TMP_HI
310#define S_ONE r2
311#define ADJ r3
312		S_ONE = #1
313	}
314	{
315		P_TMP = cmp.eq(REM_LO,ZERO)			// is the low part zero
316		if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE)	// if so, it's exact... hopefully
317		ADJ = cl0(ROOT)
318		EXP = add(EXP,#-63)
319	}
320#undef REM_LO
321#define RET r1:0
322#define RETHI r1
323	{
324		RET = convert_ud2df(ROOT)			// set up mantissa, maybe set inexact flag
325		EXP = add(EXP,ADJ)				// add back bias
326	}
327	{
328		RETHI += asl(EXP,#DF_MANTBITS-32)		// add exponent adjust
329		jumpr r31
330	}
331#undef REM_LO_TMP
332#undef REM_HI_TMP
333#undef REM_HI_TMP_HI
334#undef REM_LO
335#undef REM_HI
336#undef TWOROOT_LO
337
338#undef RET
339#define A r1:0
340#define AH r1
341#define AL r1
342#undef S_ONE
343#define TMP r3:2
344#define TMPHI r3
345#define TMPLO r2
346#undef P_CARRY0
347#define P_NEG p1
348
349
350#define SFHALF r5
351#define SFRAD r9
352.Lsqrt_abnormal:
353	{
354		P_TMP = dfclass(A,#DFCLASS_ZERO)			// zero?
355		if (P_TMP.new) jumpr:t r31
356	}
357	{
358		P_TMP = dfclass(A,#DFCLASS_NAN)
359		if (P_TMP.new) jump:nt .Lsqrt_nan
360	}
361	{
362		P_TMP = cmp.gt(AH,#-1)
363		if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
364		if (!P_TMP.new) EXP = ##0x7F800001			// sNaN
365	}
366	{
367		P_TMP = dfclass(A,#DFCLASS_INFINITE)
368		if (P_TMP.new) jumpr:nt r31
369	}
370	// If we got here, we're denormal
371	// prepare to restart
372	{
373		A = extractu(A,#DF_MANTBITS,#0)		// Extract mantissa
374	}
375	{
376		EXP = add(clb(A),#-DF_EXPBITS)		// how much to normalize?
377	}
378	{
379		A = asl(A,EXP)				// Shift mantissa
380		EXP = sub(#1,EXP)			// Form exponent
381	}
382	{
383		AH = insert(EXP,#1,#DF_MANTBITS-32)		// insert lsb of exponent
384	}
385	{
386		TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)	// get sf value (mant+exp1)
387		SFHALF = ##0x3f000004						// form half constant
388	}
389	{
390		SFRAD = or(SFHALF,TMPLO)			// form sf value
391		SFHALF = and(SFHALF,#-16)
392		jump .Ldenormal_restart				// restart
393	}
394.Lsqrt_nan:
395	{
396		EXP = convert_df2sf(A)				// if sNaN, get invalid
397		A = #-1						// qNaN
398		jumpr r31
399	}
400.Lsqrt_invalid_neg:
401	{
402		A = convert_sf2df(EXP)				// Invalid,NaNval
403		jumpr r31
404	}
405END(__hexagon_sqrt)
406END(__hexagon_sqrtdf2)
407