divdf3.S revision 1.1.1.5
1/* Copyright (C) 2008-2017 Free Software Foundation, Inc.
2   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3		on behalf of Synopsys Inc.
4
5This file is part of GCC.
6
7GCC is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free
9Software Foundation; either version 3, or (at your option) any later
10version.
11
12GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26/*
27   to calculate a := b/x as b*y, with y := 1/x:
28   - x is in the range [1..2)
29   - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30     Precision is higher for polynoms used to evaluate input with larger
31     value.
32   - Do one newton-raphson iteration step to double the precision,
33     then multiply this with the divisor
34	-> more time to decide if dividend is subnormal
35     - the worst error propagation is on the side of the value range
36       with the least initial defect, thus giving us about 30 bits precision.
37      The truncation error for the either is less than 1 + x/2 ulp.
38      A 31 bit inverse can be simply calculated by using x with implicit 1
39      and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
40      with the bare fraction part of x, then add in y0^2 for the implicit
41      1 of x.
42    - If calculating a 31 bit inverse, the systematic error is less than
43      -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44    - If we calculate our seed with a 32 bit fraction, we can archive a
45      tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46      only need to take the step to calculate the 2nd stage rest and
47      rounding adjust 1/32th of the time.  However, if we use a 20 bit
48      fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49      thus for a simple add / tst check, we need to do the 2nd stage
50      rest calculation/ rounding adjust 1/16th of the time.
51      (1): The inexactness of the 32 bit inverse contributes an error in the
52      range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
53      rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
54      x/2 + 1/x <= 1.5 .
55      (2): Unless proven otherwise.  I have not actually looked for an
56      example where -2 ulp/128 is exceeded, and my calculations indicate
57      that the excess, if existent, is less than -1/512 ulp.
58    ??? The algorithm is still based on the ARC700 optimized code.
59    Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
60    results.
61 */
62#include "../arc-ieee-754.h"
63#define mlo acc2
64#define mhi acc1
65#define mul64(b,c) mullw 0,b,c` machlw 0,b,c
66#define mulu64(b,c) mululw 0,b,c` machulw 0,b,c
67
68/* N.B. fp-bit.c does double rounding on denormal numbers.  */
69#if 0 /* DEBUG */
70	.global __divdf3
71	FUNC(__divdf3)
72	.balign 4
73__divdf3:
74	push_s blink
75	push_s r2
76	push_s r3
77	push_s r0
78	bl.d __divdf3_c
79	push_s r1
80	ld_s r2,[sp,12]
81	ld_s r3,[sp,8]
82	st_s r0,[sp,12]
83	st_s r1,[sp,8]
84	pop_s r1
85	bl.d __divdf3_asm
86	pop_s r0
87	pop_s r3
88	pop_s r2
89	pop_s blink
90	cmp r0,r2
91	cmp.eq r1,r3
92	jeq_s [blink]
93	and r12,DBL0H,DBL1H
94	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
95	jeq_s [blink]
96	bl abort
97	ENDFUNC(__divdf3)
98#define __divdf3 __divdf3_asm
99#endif /* DEBUG */
100
101	FUNC(__divdf3)
102	.balign 4
103.L7ff00000:
104	.long 0x7ff00000
105.Ldivtab:
106	.long 0xfc0fffe1
107	.long 0xf46ffdfb
108	.long 0xed1ffa54
109	.long 0xe61ff515
110	.long 0xdf7fee75
111	.long 0xd91fe680
112	.long 0xd2ffdd52
113	.long 0xcd1fd30c
114	.long 0xc77fc7cd
115	.long 0xc21fbbb6
116	.long 0xbcefaec0
117	.long 0xb7efa100
118	.long 0xb32f92bf
119	.long 0xae8f83b7
120	.long 0xaa2f7467
121	.long 0xa5ef6479
122	.long 0xa1cf53fa
123	.long 0x9ddf433e
124	.long 0x9a0f3216
125	.long 0x965f2091
126	.long 0x92df0f11
127	.long 0x8f6efd05
128	.long 0x8c1eeacc
129	.long 0x88eed876
130	.long 0x85dec615
131	.long 0x82eeb3b9
132	.long 0x800ea10b
133	.long 0x7d3e8e0f
134	.long 0x7a8e7b3f
135	.long 0x77ee6836
136	.long 0x756e5576
137	.long 0x72fe4293
138	.long 0x709e2f93
139	.long 0x6e4e1c7f
140	.long 0x6c0e095e
141	.long 0x69edf6c5
142	.long 0x67cde3a5
143	.long 0x65cdd125
144	.long 0x63cdbe25
145	.long 0x61ddab3f
146	.long 0x600d991f
147	.long 0x5e3d868c
148	.long 0x5c6d7384
149	.long 0x5abd615f
150	.long 0x590d4ecd
151	.long 0x576d3c83
152	.long 0x55dd2a89
153	.long 0x545d18e9
154	.long 0x52dd06e9
155	.long 0x516cf54e
156	.long 0x4ffce356
157	.long 0x4e9cd1ce
158	.long 0x4d3cbfec
159	.long 0x4becae86
160	.long 0x4aac9da4
161	.long 0x496c8c73
162	.long 0x483c7bd3
163	.long 0x470c6ae8
164	.long 0x45dc59af
165	.long 0x44bc4915
166	.long 0x43ac3924
167	.long 0x428c27fb
168	.long 0x418c187a
169	.long 0x407c07bd
170
171__divdf3_support: /* This label makes debugger output saner.  */
172	.balign 4
173.Ldenorm_dbl1:
174	brge r6, \
175		0x43500000,.Linf_NaN ; large number / denorm -> Inf
176	bmsk.f r12,DBL1H,19
177	mov.eq r12,DBL1L
178	mov.eq DBL1L,0
179	sub.eq r7,r7,32
180	norm.f r11,r12 ; flag for x/0 -> Inf check
181	beq_s .Linf_NaN
182	mov.mi r11,0
183	add.pl r11,r11,1
184	add_s r12,r12,r12
185	asl r8,r12,r11
186	rsub r12,r11,31
187	lsr r12,DBL1L,r12
188	tst_s DBL1H,DBL1H
189	or r8,r8,r12
190	lsr r4,r8,26
191	lsr DBL1H,r8,12
192	ld.as r4,[r10,r4]
193	bxor.mi DBL1H,DBL1H,31
194	sub r11,r11,11
195	asl DBL1L,DBL1L,r11
196	sub r11,r11,1
197	mulu64 (r4,r8)
198	sub r7,r7,r11
199	b.d .Lpast_denorm_dbl1
200	asl r7,r7,20
201
202.Linf_NaN:
203	tst_s DBL0L,DBL0L ; 0/0 -> NaN
204	xor_s DBL1H,DBL1H,DBL0H
205	bclr.eq.f DBL0H,DBL0H,31
206	bmsk DBL0H,DBL1H,30
207	xor_s DBL0H,DBL0H,DBL1H
208	sub.eq DBL0H,DBL0H,1
209	mov_s DBL0L,0
210	j_s.d [blink]
211	or DBL0H,DBL0H,r9
212	.balign 4
213.Lret0_2:
214	xor_s DBL1H,DBL1H,DBL0H
215	mov_s DBL0L,0
216	bmsk DBL0H,DBL1H,30
217	j_s.d [blink]
218	xor_s DBL0H,DBL0H,DBL1H
219	.balign 4
220	.global __divdf3
221/* N.B. the spacing between divtab and the sub3 to get its address must
222   be a multiple of 8.  */
223__divdf3:
224	asl r8,DBL1H,12
225	lsr r4,r8,26
226	sub3 r10,pcl,51;(.-.Ldivtab) >> 3
227	ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
228	ld.as r4,[r10,r4]
229	lsr r12,DBL1L,20
230	and.f r7,DBL1H,r9
231	or r8,r8,r12
232	mulu64 (r4,r8)
233	beq.d .Ldenorm_dbl1
234.Lpast_denorm_dbl1:
235	and.f r6,DBL0H,r9
236	breq.d r7,r9,.Linf_nan_dbl1
237	asl r4,r4,12
238	sub r4,r4,mhi
239	mululw 0,r4,r4
240	machulw r5,r4,r4
241	bne.d .Lnormal_dbl0
242	lsr r8,r8,1
243
244	.balign 4
245.Ldenorm_dbl0:
246	bmsk.f r12,DBL0H,19
247	; wb stall
248	mov.eq r12,DBL0L
249	sub.eq r6,r6,32
250	norm.f r11,r12 ; flag for 0/x -> 0 check
251	brge r7, \
252		0x43500000, .Lret0_2 ; denorm/large number -> 0
253	beq_s .Lret0_2
254	mov.mi r11,0
255	add.pl r11,r11,1
256	asl r12,r12,r11
257	sub r6,r6,r11
258	add.f 0,r6,31
259	lsr r10,DBL0L,r6
260	mov.mi r10,0
261	add r6,r6,11+32
262	neg.f r11,r6
263	asl DBL0L,DBL0L,r11
264	mov.pl DBL0L,0
265	sub r6,r6,32-1
266	b.d .Lpast_denorm_dbl0
267	asl r6,r6,20
268
269	.balign 4
270.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
271	or.f 0,r6,DBL0L
272	cmp.ne r6,r9
273	not_s DBL0L,DBL1H
274	sub_s.ne DBL0L,DBL0L,DBL0L
275	tst_s DBL0H,DBL0H
276	add_s DBL0H,DBL1H,DBL0L
277	j_s.d [blink]
278	bxor.mi DBL0H,DBL0H,31
279
280	.balign 4
281.Lnormal_dbl0:
282	breq.d r6,r9,.Linf_nan_dbl0
283	asl r12,DBL0H,11
284	lsr r10,DBL0L,21
285.Lpast_denorm_dbl0:
286	bset r8,r8,31
287	mulu64 (r5,r8)
288	add_s r12,r12,r10
289	bset r5,r12,31
290	cmp r5,r8
291	cmp.eq DBL0L,DBL1L
292	lsr.cc r5,r5,1
293	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
294	mululw 0,r5,r4
295	machulw r11,r5,r4 ; result fraction highpart
296	lsr r8,r8,2 ; u3.29
297	add r5,r6, /* wait for immediate */ \
298		0x3fe00000
299	mulu64 (r11,r8) ; u-28.31
300	asl_s DBL1L,DBL1L,9 ; u-29.23:9
301	sbc r6,r5,r7
302	mov r12,mlo ; u-28.31
303	mulu64 (r11,DBL1L) ; mhi: u-28.23:9
304	add.cs DBL0L,DBL0L,DBL0L
305	asl_s DBL0L,DBL0L,6 ; u-26.25:7
306	asl r10,r11,23
307	sub_l DBL0L,DBL0L,r12
308	lsr r7,r11,9
309	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
310	mul64 (r5,r4) ; mhi: result fraction lowpart
311	xor.f 0,DBL0H,DBL1H
312	and DBL0H,r6,r9
313	add_s DBL0H,DBL0H,r7
314	bclr r12,r9,20 ; 0x7fe00000
315	brhs.d r6,r12,.Linf_denorm
316	bxor.mi DBL0H,DBL0H,31
317	add.f r12,mhi,0x11
318	asr r9,r12,5
319	sub.mi DBL0H,DBL0H,1
320	add.f DBL0L,r9,r10
321	tst r12,0x1c
322	jne.d [blink]
323	add.cs DBL0H,DBL0H,1
324        /* work out exact rounding if we fall through here.  */
325        /* We know that the exact result cannot be represented in double
326           precision.  Find the mid-point between the two nearest
327           representable values, multiply with the divisor, and check if
328           the result is larger than the dividend.  Since we want to know
329	   only the sign bit, it is sufficient to calculate only the
330	   highpart of the lower 64 bits.  */
331	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
332	sub.f DBL0L,DBL0L,1
333	asl r12,r9,2 ; u-22.30:2
334	sub.cs DBL0H,DBL0H,1
335	sub.f r12,r12,2
336	mov r10,mlo ; rest before considering r12 in r5 : -r10
337	mululw 0,r12,DBL1L
338	machulw r7,r12,DBL1L ; mhi: u-51.32
339	asl r5,r5,25 ; s-51.7:25
340	lsr r10,r10,7 ; u-51.30:2
341	mulu64 (r12,r8) ; mlo: u-51.31:1
342	sub r5,r5,r10
343	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
344	bset r7,r7,0 ; make sure that the result is not zero, and that
345	sub r5,r5,r7 ; a highpart zero appears negative
346	sub.f r5,r5,mlo ; rest msw
347	add.pl.f DBL0L,DBL0L,1
348	j_s.d [blink]
349	add.eq DBL0H,DBL0H,1
350
351.Linf_nan_dbl0:
352	tst_s DBL1H,DBL1H
353	j_s.d [blink]
354	bxor.mi DBL0H,DBL0H,31
355	.balign 4
356.Linf_denorm:
357	lsr r12,r6,28
358	brlo.d r12,0xc,.Linf
359.Ldenorm:
360	asr r6,r6,20
361	neg r9,r6
362	mov_s DBL0H,0
363	brhs.d r9,54,.Lret0
364	bxor.mi DBL0H,DBL0H,31
365	add r12,mhi,1
366	and r12,r12,-4
367	rsub r7,r6,5
368	asr r10,r12,28
369	bmsk r4,r12,27
370	min r7,r7,31
371	asr DBL0L,r4,r7
372	add DBL1H,r11,r10
373	abs.f r10,r4
374	sub.mi r10,r10,1
375	add.f r7,r6,32-5
376	asl r4,r4,r7
377	mov.mi r4,r10
378	add.f r10,r6,23
379	rsub r7,r6,9
380	lsr r7,DBL1H,r7
381	asl r10,DBL1H,r10
382	or.pnz DBL0H,DBL0H,r7
383	or.mi r4,r4,r10
384	mov.mi r10,r7
385	add.f DBL0L,r10,DBL0L
386	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
387	bxor.f 0,r4,31
388	add.pnz.f DBL0L,DBL0L,1
389	add.cs.f DBL0H,DBL0H,1
390	jne_s [blink]
391	/* Calculation so far was not conclusive; calculate further rest.  */
392	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
393	asr.f r12,r12,3
394	asl r5,r5,25 ; s-51.7:25
395	mov r11,mlo ; rest before considering r12 in r5 : -r11
396	mulu64 (r12,r8) ; u-51.31:1
397	and r9,DBL0L,1 ; tie-breaker: round to even
398	lsr r11,r11,7 ; u-51.30:2
399	mov DBL1H,mlo ; u-51.31:1
400	mulu64 (r12,DBL1L) ; u-51.62:2
401	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
402	add_s DBL1H,DBL1H,r11
403	sub DBL1H,DBL1H,r5 ; -rest msw
404	add_s DBL1H,DBL1H,mhi ; -rest msw
405	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
406	tst_s DBL1H,DBL1H
407	cmp.eq mlo,r9
408	add.cs.f DBL0L,DBL0L,1
409	j_s.d [blink]
410	add.cs DBL0H,DBL0H,1
411
412.Lret0:
413	/* return +- 0 */
414	j_s.d [blink]
415	mov_s DBL0L,0
416.Linf:
417	mov_s DBL0H,r9
418	mov_s DBL0L,0
419	j_s.d [blink]
420	bxor.mi DBL0H,DBL0H,31
421	ENDFUNC(__divdf3)
422