1/* Copyright (C) 2008-2015 Free Software Foundation, Inc.
2   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3		on behalf of Synopsys Inc.
4
5This file is part of GCC.
6
7GCC is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free
9Software Foundation; either version 3, or (at your option) any later
10version.
11
12GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26/*
27   to calculate a := b/x as b*y, with y := 1/x:
28   - x is in the range [1..2)
29   - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30     Precision is higher for polynoms used to evaluate input with larger
31     value.
32   - Do one newton-raphson iteration step to double the precision,
33     then multiply this with the divisor
34	-> more time to decide if dividend is subnormal
35     - the worst error propagation is on the side of the value range
36       with the least initial defect, thus giving us about 30 bits precision.
37      The truncation error for the either is less than 1 + x/2 ulp.
38      A 31 bit inverse can be simply calculated by using x with implicit 1
39      and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
40      with the bare fraction part of x, then add in y0^2 for the implicit
41      1 of x.
42    - If calculating a 31 bit inverse, the systematic error is less than
43      -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44    - If we calculate our seed with a 32 bit fraction, we can archive a
45      tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46      only need to take the step to calculate the 2nd stage rest and
47      rounding adjust 1/32th of the time.  However, if we use a 20 bit
48      fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49      thus for a simple add / tst check, we need to do the 2nd stage
50      rest calculation/ rounding adjust 1/16th of the time.
51      (1): The inexactness of the 32 bit inverse contributes an error in the
52      range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
53      rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
54      x/2 + 1/x <= 1.5 .
55      (2): Unless proven otherwise.  I have not actually looked for an
56      example where -2 ulp/128 is exceeded, and my calculations indicate
57      that the excess, if existent, is less than -1/512 ulp.
58    ??? The algorithm is still based on the ARC700 optimized code.
59    Maybe we could make better use of 64 bit multiply results and/or mmed .
60 */
61#include "../arc-ieee-754.h"
62
63/* N.B. fp-bit.c does double rounding on denormal numbers.  */
64#if 0 /* DEBUG */
65	.global __divdf3
66	FUNC(__divdf3)
67	.balign 4
68__divdf3:
69	push_s blink
70	push_s r2
71	push_s r3
72	push_s r0
73	bl.d __divdf3_c
74	push_s r1
75	ld_s r2,[sp,12]
76	ld_s r3,[sp,8]
77	st_s r0,[sp,12]
78	st_s r1,[sp,8]
79	pop_s r1
80	bl.d __divdf3_asm
81	pop_s r0
82	pop_s r3
83	pop_s r2
84	pop_s blink
85	cmp r0,r2
86	cmp.eq r1,r3
87	jeq_s [blink]
88	and r12,DBL0H,DBL1H
89	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
90	jeq_s [blink]
91	bl abort
92	ENDFUNC(__divdf3)
93#define __divdf3 __divdf3_asm
94#endif /* DEBUG */
95
96	FUNC(__divdf3)
97	.balign 4
98.L7ff00000:
99	.long 0x7ff00000
100.Ldivtab:
101	.long 0xfc0fffe1
102	.long 0xf46ffdfb
103	.long 0xed1ffa54
104	.long 0xe61ff515
105	.long 0xdf7fee75
106	.long 0xd91fe680
107	.long 0xd2ffdd52
108	.long 0xcd1fd30c
109	.long 0xc77fc7cd
110	.long 0xc21fbbb6
111	.long 0xbcefaec0
112	.long 0xb7efa100
113	.long 0xb32f92bf
114	.long 0xae8f83b7
115	.long 0xaa2f7467
116	.long 0xa5ef6479
117	.long 0xa1cf53fa
118	.long 0x9ddf433e
119	.long 0x9a0f3216
120	.long 0x965f2091
121	.long 0x92df0f11
122	.long 0x8f6efd05
123	.long 0x8c1eeacc
124	.long 0x88eed876
125	.long 0x85dec615
126	.long 0x82eeb3b9
127	.long 0x800ea10b
128	.long 0x7d3e8e0f
129	.long 0x7a8e7b3f
130	.long 0x77ee6836
131	.long 0x756e5576
132	.long 0x72fe4293
133	.long 0x709e2f93
134	.long 0x6e4e1c7f
135	.long 0x6c0e095e
136	.long 0x69edf6c5
137	.long 0x67cde3a5
138	.long 0x65cdd125
139	.long 0x63cdbe25
140	.long 0x61ddab3f
141	.long 0x600d991f
142	.long 0x5e3d868c
143	.long 0x5c6d7384
144	.long 0x5abd615f
145	.long 0x590d4ecd
146	.long 0x576d3c83
147	.long 0x55dd2a89
148	.long 0x545d18e9
149	.long 0x52dd06e9
150	.long 0x516cf54e
151	.long 0x4ffce356
152	.long 0x4e9cd1ce
153	.long 0x4d3cbfec
154	.long 0x4becae86
155	.long 0x4aac9da4
156	.long 0x496c8c73
157	.long 0x483c7bd3
158	.long 0x470c6ae8
159	.long 0x45dc59af
160	.long 0x44bc4915
161	.long 0x43ac3924
162	.long 0x428c27fb
163	.long 0x418c187a
164	.long 0x407c07bd
165
166__divdf3_support: /* This label makes debugger output saner.  */
167	.balign 4
168.Ldenorm_dbl1:
169	brge r6, \
170		0x43500000,.Linf_NaN ; large number / denorm -> Inf
171	bmsk.f r12,DBL1H,19
172	mov.eq r12,DBL1L
173	mov.eq DBL1L,0
174	sub.eq r7,r7,32
175	norm.f r11,r12 ; flag for x/0 -> Inf check
176	beq_s .Linf_NaN
177	mov.mi r11,0
178	add.pl r11,r11,1
179	add_s r12,r12,r12
180	asl r8,r12,r11
181	rsub r12,r11,31
182	lsr r12,DBL1L,r12
183	tst_s DBL1H,DBL1H
184	or r8,r8,r12
185	lsr r4,r8,26
186	lsr DBL1H,r8,12
187	ld.as r4,[r10,r4]
188	bxor.mi DBL1H,DBL1H,31
189	sub r11,r11,11
190	asl DBL1L,DBL1L,r11
191	sub r11,r11,1
192	mulu64 r4,r8
193	sub r7,r7,r11
194	b.d .Lpast_denorm_dbl1
195	asl r7,r7,20
196
197	.balign 4
198.Ldenorm_dbl0:
199	bmsk.f r12,DBL0H,19
200	; wb stall
201	mov.eq r12,DBL0L
202	sub.eq r6,r6,32
203	norm.f r11,r12 ; flag for 0/x -> 0 check
204	brge r7, \
205		0x43500000, .Lret0_2 ; denorm/large number -> 0
206	beq_s .Lret0_2
207	mov.mi r11,0
208	add.pl r11,r11,1
209	asl r12,r12,r11
210	sub r6,r6,r11
211	add.f 0,r6,31
212	lsr r10,DBL0L,r6
213	mov.mi r10,0
214	add r6,r6,11+32
215	neg.f r11,r6
216	asl DBL0L,DBL0L,r11
217	mov.pl DBL0L,0
218	sub r6,r6,32-1
219	b.d .Lpast_denorm_dbl0
220	asl r6,r6,20
221
222.Linf_NaN:
223	tst_s DBL0L,DBL0L ; 0/0 -> NaN
224	xor_s DBL1H,DBL1H,DBL0H
225	bclr.eq.f DBL0H,DBL0H,31
226	bmsk DBL0H,DBL1H,30
227	xor_s DBL0H,DBL0H,DBL1H
228	sub.eq DBL0H,DBL0H,1
229	mov_s DBL0L,0
230	j_s.d [blink]
231	or DBL0H,DBL0H,r9
232	.balign 4
233.Lret0_2:
234	xor_s DBL1H,DBL1H,DBL0H
235	mov_s DBL0L,0
236	bmsk DBL0H,DBL1H,30
237	j_s.d [blink]
238	xor_s DBL0H,DBL0H,DBL1H
239	.balign 4
240	.global __divdf3
241/* N.B. the spacing between divtab and the sub3 to get its address must
242   be a multiple of 8.  */
243__divdf3:
244	asl r8,DBL1H,12
245	lsr r4,r8,26
246	sub3 r10,pcl,61; (.-.Ldivtab) >> 3
247	ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
248	ld.as r4,[r10,r4]
249	lsr r12,DBL1L,20
250	and.f r7,DBL1H,r9
251	or r8,r8,r12
252	mulu64 r4,r8
253	beq.d .Ldenorm_dbl1
254.Lpast_denorm_dbl1:
255	and.f r6,DBL0H,r9
256	breq.d r7,r9,.Linf_nan_dbl1
257	asl r4,r4,12
258	sub r4,r4,mhi
259	mulu64 r4,r4
260	beq.d .Ldenorm_dbl0
261	lsr r8,r8,1
262	breq.d r6,r9,.Linf_nan_dbl0
263	asl r12,DBL0H,11
264	lsr r10,DBL0L,21
265.Lpast_denorm_dbl0:
266	bset r8,r8,31
267	mulu64 mhi,r8
268	add_s r12,r12,r10
269	bset r5,r12,31
270	cmp r5,r8
271	cmp.eq DBL0L,DBL1L
272	lsr.cc r5,r5,1
273	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
274	mulu64 r5,r4 ; result fraction highpart
275	lsr r8,r8,2 ; u3.29
276	add r5,r6, /* wait for immediate */ \
277		0x3fe00000
278	mov r11,mhi ; result fraction highpart
279	mulu64 r11,r8 ; u-28.31
280	asl_s DBL1L,DBL1L,9 ; u-29.23:9
281	sbc r6,r5,r7
282	mov r12,mlo ; u-28.31
283	mulu64 r11,DBL1L ; mhi: u-28.23:9
284	add.cs DBL0L,DBL0L,DBL0L
285	asl_s DBL0L,DBL0L,6 ; u-26.25:7
286	asl r10,r11,23
287	sub_l DBL0L,DBL0L,r12
288	lsr r7,r11,9
289	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
290	mul64 r5,r4 ; mhi: result fraction lowpart
291	xor.f 0,DBL0H,DBL1H
292	and DBL0H,r6,r9
293	add_s DBL0H,DBL0H,r7
294	bclr r12,r9,20 ; 0x7fe00000
295	brhs.d r6,r12,.Linf_denorm
296	bxor.mi DBL0H,DBL0H,31
297	add.f r12,mhi,0x11
298	asr r9,r12,5
299	sub.mi DBL0H,DBL0H,1
300	add.f DBL0L,r9,r10
301	tst r12,0x1c
302	jne.d [blink]
303	add.cs DBL0H,DBL0H,1
304        /* work out exact rounding if we fall through here.  */
305        /* We know that the exact result cannot be represented in double
306           precision.  Find the mid-point between the two nearest
307           representable values, multiply with the divisor, and check if
308           the result is larger than the dividend.  Since we want to know
309	   only the sign bit, it is sufficient to calculate only the
310	   highpart of the lower 64 bits.  */
311	mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
312	sub.f DBL0L,DBL0L,1
313	asl r12,r9,2 ; u-22.30:2
314	sub.cs DBL0H,DBL0H,1
315	sub.f r12,r12,2
316	mov r10,mlo ; rest before considering r12 in r5 : -r10
317	mulu64 r12,DBL1L ; mhi: u-51.32
318	asl r5,r5,25 ; s-51.7:25
319	lsr r10,r10,7 ; u-51.30:2
320	mov r7,mhi ; u-51.32
321	mulu64 r12,r8 ; mlo: u-51.31:1
322	sub r5,r5,r10
323	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
324	bset r7,r7,0 ; make sure that the result is not zero, and that
325	sub r5,r5,r7 ; a highpart zero appears negative
326	sub.f r5,r5,mlo ; rest msw
327	add.pl.f DBL0L,DBL0L,1
328	j_s.d [blink]
329	add.eq DBL0H,DBL0H,1
330
331.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
332	or.f 0,r6,DBL0L
333	cmp.ne r6,r9
334	not_s DBL0L,DBL1H
335	sub_s.ne DBL0L,DBL0L,DBL0L
336	tst_s DBL0H,DBL0H
337	add_s DBL0H,DBL1H,DBL0L
338	j_s.d [blink]
339	bxor.mi DBL0H,DBL0H,31
340.Linf_nan_dbl0:
341	tst_s DBL1H,DBL1H
342	j_s.d [blink]
343	bxor.mi DBL0H,DBL0H,31
344	.balign 4
345.Linf_denorm:
346	lsr r12,r6,28
347	brlo.d r12,0xc,.Linf
348.Ldenorm:
349	asr r6,r6,20
350	neg r9,r6
351	mov_s DBL0H,0
352	brhs.d r9,54,.Lret0
353	bxor.mi DBL0H,DBL0H,31
354	add r12,mhi,1
355	and r12,r12,-4
356	rsub r7,r6,5
357	asr r10,r12,28
358	bmsk r4,r12,27
359	min r7,r7,31
360	asr DBL0L,r4,r7
361	add DBL1H,r11,r10
362	abs.f r10,r4
363	sub.mi r10,r10,1
364	add.f r7,r6,32-5
365	asl r4,r4,r7
366	mov.mi r4,r10
367	add.f r10,r6,23
368	rsub r7,r6,9
369	lsr r7,DBL1H,r7
370	asl r10,DBL1H,r10
371	or.pnz DBL0H,DBL0H,r7
372	or.mi r4,r4,r10
373	mov.mi r10,r7
374	add.f DBL0L,r10,DBL0L
375	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
376	bxor.f 0,r4,31
377	add.pnz.f DBL0L,DBL0L,1
378	add.cs.f DBL0H,DBL0H,1
379	jne_s [blink]
380	/* Calculation so far was not conclusive; calculate further rest.  */
381	mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
382	asr.f r12,r12,3
383	asl r5,r5,25 ; s-51.7:25
384	mov r11,mlo ; rest before considering r12 in r5 : -r11
385	mulu64 r12,r8 ; u-51.31:1
386	and r9,DBL0L,1 ; tie-breaker: round to even
387	lsr r11,r11,7 ; u-51.30:2
388	mov DBL1H,mlo ; u-51.31:1
389	mulu64 r12,DBL1L ; u-51.62:2
390	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
391	add_s DBL1H,DBL1H,r11
392	sub DBL1H,DBL1H,r5 ; -rest msw
393	add_s DBL1H,DBL1H,mhi ; -rest msw
394	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
395	tst_s DBL1H,DBL1H
396	cmp.eq mlo,r9
397	add.cs.f DBL0L,DBL0L,1
398	j_s.d [blink]
399	add.cs DBL0H,DBL0H,1
400
401.Lret0:
402	/* return +- 0 */
403	j_s.d [blink]
404	mov_s DBL0L,0
405.Linf:
406	mov_s DBL0H,r9
407	mov_s DBL0L,0
408	j_s.d [blink]
409	bxor.mi DBL0H,DBL0H,31
410	ENDFUNC(__divdf3)
411