// dfmul.S revision 341825
1//===----------------------Hexagon builtin routine ------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is dual licensed under the MIT and the University of Illinois Open
6// Source Licenses. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10/* Double Precision Multiply */
11#define A r1:0
12#define AH r1
13#define AL r0
14#define B r3:2
15#define BH r3
16#define BL r2
17
18#define BTMP r5:4
19#define BTMPH r5
20#define BTMPL r4
21
22#define PP_ODD r7:6
23#define PP_ODD_H r7
24#define PP_ODD_L r6
25
26#define ONE r9:8
27#define S_ONE r8
28#define S_ZERO r9
29
30#define PP_HH r11:10
31#define PP_HH_H r11
32#define PP_HH_L r10
33
34#define ATMP r13:12
35#define ATMPH r13
36#define ATMPL r12
37
38#define PP_LL r15:14
39#define PP_LL_H r15
40#define PP_LL_L r14
41
42#define TMP r28
43
44#define MANTBITS 52
45#define HI_MANTBITS 20
46#define EXPBITS 11
47#define BIAS 1024
48#define MANTISSA_TO_INT_BIAS 52
49
50/* Some constant to adjust normalization amount in error code */
51/* Amount to right shift the partial product to get to a denorm */
52#define FUDGE 5
53
54#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
55#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
56#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
57#define END(TAG) .size TAG,.-TAG
58
59#define SR_ROUND_OFF 22
60	.text
61	.global __hexagon_muldf3
62	.type __hexagon_muldf3,@function
63	Q6_ALIAS(muldf3)
64  FAST_ALIAS(muldf3)
65  FAST2_ALIAS(muldf3)
66	.p2align 5
67__hexagon_muldf3:
68	{
69		p0 = dfclass(A,#2)
70		p0 = dfclass(B,#2)
71		ATMP = combine(##0x40000000,#0)
72	}
73	{
74		ATMP = insert(A,#MANTBITS,#EXPBITS-1)
75		BTMP = asl(B,#EXPBITS-1)
76		TMP = #-BIAS
77		ONE = #1
78	}
79	{
80		PP_ODD = mpyu(BTMPL,ATMPH)
81		BTMP = insert(ONE,#2,#62)
82	}
83	/* since we know that the MSB of the H registers is zero, we should never carry */
84	/* H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1 */
85	/* Adding 2 HLs, we get 2^64-3*2^32+2 maximum.  */
86	/* Therefore, we can add 3 2^32-1 values safely without carry.  We only need one. */
87	{
88		PP_LL = mpyu(ATMPL,BTMPL)
89		PP_ODD += mpyu(ATMPL,BTMPH)
90	}
91	{
92		PP_ODD += lsr(PP_LL,#32)
93		PP_HH = mpyu(ATMPH,BTMPH)
94		BTMP = combine(##BIAS+BIAS-4,#0)
95	}
96	{
97		PP_HH += lsr(PP_ODD,#32)
98		if (!p0) jump .Lmul_abnormal
99		p1 = cmp.eq(PP_LL_L,#0)		// 64 lsb's 0?
100		p1 = cmp.eq(PP_ODD_L,#0)	// 64 lsb's 0?
101	}
102	/*
103	 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
104	 * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
105	 */
106#undef PP_ODD
107#undef PP_ODD_H
108#undef PP_ODD_L
109#define EXP10 r7:6
110#define EXP1 r7
111#define EXP0 r6
112	{
113		if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
114		EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
115		EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
116	}
117	{
118		PP_LL = neg(PP_HH)
119		EXP0 += add(TMP,EXP1)
120		TMP = xor(AH,BH)
121	}
122	{
123		if (!p2.new) PP_HH = PP_LL
124		p2 = cmp.gt(TMP,#-1)
125		p0 = !cmp.gt(EXP0,BTMPH)
126		p0 = cmp.gt(EXP0,BTMPL)
127		if (!p0.new) jump:nt .Lmul_ovf_unf
128	}
129	{
130		A = convert_d2df(PP_HH)
131		EXP0 = add(EXP0,#-BIAS-58)
132	}
133	{
134		AH += asl(EXP0,#HI_MANTBITS)
135		jumpr r31
136	}
137
138	.falign
139.Lpossible_unf:
140	/* We end up with a positive exponent */
141	/* But we may have rounded up to an exponent of 1. */
142	/* If the exponent is 1, if we rounded up to it
143	 * we need to also raise underflow
144	 * Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
145	 * And the PP should also have more than one bit set
146	 */
147	/* Note: ATMP should have abs(PP_HH) */
148	/* Note: BTMPL should have 0x7FEFFFFF */
149	{
150		p0 = cmp.eq(AL,#0)
151		p0 = bitsclr(AH,BTMPL)
152		if (!p0.new) jumpr:t r31
153		BTMPH = #0x7fff
154	}
155	{
156		p0 = bitsset(ATMPH,BTMPH)
157		BTMPL = USR
158		BTMPH = #0x030
159	}
160	{
161		if (p0) BTMPL = or(BTMPL,BTMPH)
162	}
163	{
164		USR = BTMPL
165	}
166	{
167		p0 = dfcmp.eq(A,A)
168		jumpr r31
169	}
170	.falign
171.Lmul_ovf_unf:
172	{
173		A = convert_d2df(PP_HH)
174		ATMP = abs(PP_HH)			// take absolute value
175		EXP1 = add(EXP0,#-BIAS-58)
176	}
177	{
178		AH += asl(EXP1,#HI_MANTBITS)
179		EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
180		BTMPL = ##0x7FEFFFFF
181	}
182	{
183		EXP1 += add(EXP0,##-BIAS-58)
184		//BTMPH = add(clb(ATMP),#-2)
185		BTMPH = #0
186	}
187	{
188		p0 = cmp.gt(EXP1,##BIAS+BIAS-2)	// overflow
189		if (p0.new) jump:nt .Lmul_ovf
190	}
191	{
192		p0 = cmp.gt(EXP1,#0)
193		if (p0.new) jump:nt .Lpossible_unf
194		BTMPH = sub(EXP0,BTMPH)
195		TMP = #63				// max amount to shift
196	}
197	/* Underflow */
198	/*
199	 * PP_HH has the partial product with sticky LSB.
200	 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
201	 * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
202	 * The exponent of PP_HH is in  EXP1, which is non-positive (0 or negative)
203	 * That's the exponent that happens after the normalization
204	 *
205	 * EXP0 has the exponent that, when added to the normalized value, is out of range.
206	 *
207	 * Strategy:
208	 *
209	 * * Shift down bits, with sticky bit, such that the bits are aligned according
210	 *   to the LZ count and appropriate exponent, but not all the way to mantissa
211	 *   field, keep around the last few bits.
212	 * * Put a 1 near the MSB
213	 * * Check the LSBs for inexact; if inexact also set underflow
214	 * * Convert [u]d2df -- will correctly round according to rounding mode
215	 * * Replace exponent field with zero
216	 *
217	 *
218	 */
219
220
221	{
222		BTMPL = #0	 			// offset for extract
223		BTMPH = sub(#FUDGE,BTMPH)		// amount to right shift
224	}
225	{
226		p3 = cmp.gt(PP_HH_H,#-1)		// is it positive?
227		BTMPH = min(BTMPH,TMP)			// Don't shift more than 63
228		PP_HH = ATMP
229	}
230	{
231		TMP = USR
232		PP_LL = extractu(PP_HH,BTMP)
233	}
234	{
235		PP_HH = asr(PP_HH,BTMPH)
236		BTMPL = #0x0030					// underflow flag
237		AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
238	}
239	{
240		p0 = cmp.gtu(ONE,PP_LL)				// Did we extract all zeros?
241		if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)	// add sticky bit
242		PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)	// Add back in a bit so we can use convert instruction
243	}
244	{
245		PP_LL = neg(PP_HH)
246		p1 = bitsclr(PP_HH_L,#0x7)		// Are the LSB's clear?
247		if (!p1.new) TMP = or(BTMPL,TMP)	// If not, Inexact+Underflow
248	}
249	{
250		if (!p3) PP_HH = PP_LL
251		USR = TMP
252	}
253	{
254		A = convert_d2df(PP_HH)			// Do rounding
255		p0 = dfcmp.eq(A,A)			// realize exception
256	}
257	{
258		AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)		// Insert correct exponent
259		jumpr r31
260	}
261	.falign
262.Lmul_ovf:
263	// We get either max finite value or infinity.  Either way, overflow+inexact
264	{
265		TMP = USR
266		ATMP = combine(##0x7fefffff,#-1)	// positive max finite
267		A = PP_HH
268	}
269	{
270		PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)	// rounding bits
271		TMP = or(TMP,#0x28)			// inexact + overflow
272		BTMP = combine(##0x7ff00000,#0)		// positive infinity
273	}
274	{
275		USR = TMP
276		PP_LL_L ^= lsr(AH,#31)			// Does sign match rounding?
277		TMP = PP_LL_L				// unmodified rounding mode
278	}
279	{
280		p0 = !cmp.eq(TMP,#1)			// If not round-to-zero and
281		p0 = !cmp.eq(PP_LL_L,#2)		// Not rounding the other way,
282		if (p0.new) ATMP = BTMP			// we should get infinity
283		p0 = dfcmp.eq(A,A)			// Realize FP exception if enabled
284	}
285	{
286		A = insert(ATMP,#63,#0)			// insert inf/maxfinite, leave sign
287		jumpr r31
288	}
289
290.Lmul_abnormal:
291	{
292		ATMP = extractu(A,#63,#0)		// strip off sign
293		BTMP = extractu(B,#63,#0)		// strip off sign
294	}
295	{
296		p3 = cmp.gtu(ATMP,BTMP)
297		if (!p3.new) A = B			// sort values
298		if (!p3.new) B = A			// sort values
299	}
300	{
301		// Any NaN --> NaN, possibly raise invalid if sNaN
302		p0 = dfclass(A,#0x0f)		// A not NaN?
303		if (!p0.new) jump:nt .Linvalid_nan
304		if (!p3) ATMP = BTMP
305		if (!p3) BTMP = ATMP
306	}
307	{
308		// Infinity * nonzero number is infinity
309		p1 = dfclass(A,#0x08)		// A is infinity
310		p1 = dfclass(B,#0x0e)		// B is nonzero
311	}
312	{
313		// Infinity * zero --> NaN, raise invalid
314		// Other zeros return zero
315		p0 = dfclass(A,#0x08)		// A is infinity
316		p0 = dfclass(B,#0x01)		// B is zero
317	}
318	{
319		if (p1) jump .Ltrue_inf
320		p2 = dfclass(B,#0x01)
321	}
322	{
323		if (p0) jump .Linvalid_zeroinf
324		if (p2) jump .Ltrue_zero		// so return zero
325		TMP = ##0x7c000000
326	}
327	// We are left with a normal or subnormal times a subnormal. A > B
328	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
329	// we go to a single sticky bit, which we can round easily.
330	// If A and B might multiply to something bigger, decrease A exponent and increase
331	// B exponent and try again
332	{
333		p0 = bitsclr(AH,TMP)
334		if (p0.new) jump:nt .Lmul_tiny
335	}
336	{
337		TMP = cl0(BTMP)
338	}
339	{
340		TMP = add(TMP,#-EXPBITS)
341	}
342	{
343		BTMP = asl(BTMP,TMP)
344	}
345	{
346		B = insert(BTMP,#63,#0)
347		AH -= asl(TMP,#HI_MANTBITS)
348	}
349	jump __hexagon_muldf3
350.Lmul_tiny:
351	{
352		TMP = USR
353		A = xor(A,B)				// get sign bit
354	}
355	{
356		TMP = or(TMP,#0x30)			// Inexact + Underflow
357		A = insert(ONE,#63,#0)			// put in rounded up value
358		BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)	// get rounding mode
359	}
360	{
361		USR = TMP
362		p0 = cmp.gt(BTMPH,#1)			// Round towards pos/neg inf?
363		if (!p0.new) AL = #0			// If not, zero
364		BTMPH ^= lsr(AH,#31)			// rounding my way --> set LSB
365	}
366	{
367		p0 = cmp.eq(BTMPH,#3)			// if rounding towards right inf
368		if (!p0.new) AL = #0			// don't go to zero
369		jumpr r31
370	}
371.Linvalid_zeroinf:
372	{
373		TMP = USR
374	}
375	{
376		A = #-1
377		TMP = or(TMP,#2)
378	}
379	{
380		USR = TMP
381	}
382	{
383		p0 = dfcmp.uo(A,A)			// force exception if enabled
384		jumpr r31
385	}
386.Linvalid_nan:
387	{
388		p0 = dfclass(B,#0x0f)			// if B is not NaN
389		TMP = convert_df2sf(A)			// will generate invalid if sNaN
390		if (p0.new) B = A 			// make it whatever A is
391	}
392	{
393		BL = convert_df2sf(B)			// will generate invalid if sNaN
394		A = #-1
395		jumpr r31
396	}
397	.falign
398.Ltrue_zero:
399	{
400		A = B
401		B = A
402	}
403.Ltrue_inf:
404	{
405		BH = extract(BH,#1,#31)
406	}
407	{
408		AH ^= asl(BH,#31)
409		jumpr r31
410	}
411END(__hexagon_muldf3)
412
413#undef ATMP
414#undef ATMPL
415#undef ATMPH
416#undef BTMP
417#undef BTMPL
418#undef BTMPH
419