1/* Copyright (C) 2008-2015 Free Software Foundation, Inc.
2   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3		on behalf of Synopsys Inc.
4
5This file is part of GCC.
6
7GCC is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free
9Software Foundation; either version 3, or (at your option) any later
10version.
11
12GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26#include "arc-ieee-754.h"
27#if 0 /* DEBUG */
28	.global __adddf3
29	.balign 4
30__adddf3:
31	push_s blink
32	push_s r2
33	push_s r3
34	push_s r0
35	bl.d __adddf3_c
36	push_s r1
37	ld_s r2,[sp,12]
38	ld_s r3,[sp,8]
39	st_s r0,[sp,12]
40	st_s r1,[sp,8]
41	pop_s r1
42	bl.d __adddf3_asm
43	pop_s r0
44	pop_s r3
45	pop_s r2
46	pop_s blink
47	cmp r0,r2
48	cmp.eq r1,r3
49	jeq_s [blink]
50	bl abort
51	.global __subdf3
52	.balign 4
53__subdf3:
54	push_s blink
55	push_s r2
56	push_s r3
57	push_s r0
58	bl.d __subdf3_c
59	push_s r1
60	ld_s r2,[sp,12]
61	ld_s r3,[sp,8]
62	st_s r0,[sp,12]
63	st_s r1,[sp,8]
64	pop_s r1
65	bl.d __subdf3_asm
66	pop_s r0
67	pop_s r3
68	pop_s r2
69	pop_s blink
70	cmp r0,r2
71	cmp.eq r1,r3
72	jeq_s [blink]
73	bl abort
74#define __adddf3 __adddf3_asm
75#define __subdf3 __subdf3_asm
76#endif /* DEBUG */
77/* N.B. This is optimized for ARC700.
78  ARC600 has very different scheduling / instruction selection criteria.  */
79
80/* inputs: DBL0, DBL1 (r0-r3)
81   output: DBL0 (r0, r1)
82   clobber: r2-r10, r12, flags
83   All NaN highword bits must be 1.  NaN low word is random.  */
84
85	.balign 4
86	.global __adddf3
87	.global __subdf3
88	.long 0x7ff00000 ; exponent mask
89	FUNC(__adddf3)
90	FUNC(__subdf3)
91__subdf3:
92	bxor_l DBL1H,DBL1H,31
93__adddf3:
94	ld r9,[pcl,-8]
95	bmsk r4,DBL0H,30
96	xor r10,DBL0H,DBL1H
97	and r6,DBL1H,r9
98	sub.f r12,r4,r6
99	asr_s r12,r12,20
100	blo .Ldbl1_gt
101	brhs r4,r9,.Linf_nan
102	brhs r12,32,.Large_shift
103	brne r12,0,.Lsmall_shift
104	brge r10,0,.Ladd_same_exp ; r12 == 0
105
106/* After subtracting, we need to normalize; when shifting to place the
107  leading 1 into position for the implicit 1 and adding that to DBL0H,
108  we increment the exponent.  Thus, we have to subtract one more than
109  the shift count from the exponent beforehand.  Iff the exponent drops thus
110  below zero (before adding in the fraction with the leading one), we have
111  generated a denormal number.  Denormal handling is basicallly reducing the
112  shift count so that we produce a zero exponent instead; however, this way
113  the shift count can become zero (if we started out with exponent 1).
114  Therefore, a simple min operation is not good enough, since we don't
115  want to handle a zero normalizing shift in the main path.
116  On the plus side, we don't need to check for denorm input, the result
117  of subtracing these looks just the same as denormals generated during
118  subtraction.  */
119	bmsk r7,DBL1H,30
120	cmp r4,r7
121	cmp.eq DBL0L,DBL1L
122	blo .L_rsub_same_exp
123	sub.f DBL0L,DBL0L,DBL1L
124	bmsk r12,DBL0H,19
125	bic DBL1H,DBL0H,r12
126	sbc.f r4,r4,r7
127	beq_l .Large_cancel
128	norm DBL1L,r4
129	b.d .Lsub_done_same_exp
130	sub r12,DBL1L,9
131
132	.balign 4
133.Linf_nan:
134	; If both inputs are inf, but with different signs, the result is NaN.
135	asr r12,r10,31
136	or_s DBL1H,DBL1H,r12
137	j_s.d [blink]
138	or.eq DBL0H,DBL0H,DBL1H
139
140	.balign 4
141.L_rsub_same_exp:
142	rsub.f DBL0L,DBL0L,DBL1L
143	bmsk r12,DBL1H,19
144	bic_s DBL1H,DBL1H,r12
145	sbc.f r4,r7,r4
146	beq_l .Large_cancel
147	norm DBL1L,r4
148
149	sub r12,DBL1L,9
150.Lsub_done_same_exp:
151	asl_s r12,r12,20
152	sub_s DBL1L,DBL1L,10
153	sub DBL0H,DBL1H,r12
154	xor.f 0,DBL0H,DBL1H
155	bmi .Ldenorm
156.Lpast_denorm:
157	neg_s r12,DBL1L
158	lsr r7,DBL0L,r12
159	asl r12,r4,DBL1L
160	asl_s DBL0L,DBL0L,DBL1L
161	add_s r12,r12,r7
162	j_s.d [blink]
163	add_l DBL0H,DBL0H,r12
164	.balign 4
165.Ladd_same_exp:
166	/* This is a special case because we can't test for need to shift
167	   down by checking if bit 20 of DBL0H changes.  OTOH, here we know
168	   that we always need to shift down.  */
169	; The implicit 1 of DBL0 is not shifted together with the
170	;  fraction, thus effectively doubled, compensating for not setting
171	;  implicit1 for DBL1
172	add_s r12,DBL0L,DBL1L
173	lsr.f 0,r12,2 ; round to even
174	breq r6,0,.Ldenorm_add
175	adc.f DBL0L,DBL0L,DBL1L
176	sub r7,DBL1H,DBL0H
177	sub1 r7,r7,r9 ; boost exponent by 2/2
178	rrc DBL0L,DBL0L
179	asr.f r7,r7 ; DBL1.fraction/2 - DBL0.fraction/2 ; exp++
180	add.cs.f DBL0L,DBL0L,0x80000000
181	add_l DBL0H,DBL0H,r7 ; DBL0.implicit1 not shifted for DBL1.implicit1
182	add.cs DBL0H,DBL0H,1
183	bic.f 0,r9,DBL0H ; check for overflow -> infinity.
184	jne_l [blink]
185	and DBL0H,DBL0H,0xfff00000
186	j_s.d [blink]
187	mov_s DBL0L,0
188	.balign 4
189.Large_shift:
190	brhs r12,55,.Lret_dbl0
191	bmsk_s DBL1H,DBL1H,19
192	brne r6,0,.Lno_denorm_large_shift
193	brhi.d r12,33,.Lfixed_denorm_large_shift
194	sub_s r12,r12,1
195	breq r12,31, .Lfixed_denorm_small_shift
196.Lshift32:
197	mov_s r12,DBL1L
198	mov_s DBL1L,DBL1H
199	brlt.d r10,0,.Lsub
200	mov_s DBL1H,0
201	b_s .Ladd
202.Ldenorm_add:
203	cmp_s r12,DBL1L
204	mov_s DBL0L,r12
205	j_s.d [blink]
206	adc DBL0H,r4,DBL1H
207
208.Lret_dbl0:
209	j_s [blink]
210	.balign 4
211.Lsmall_shift:
212	breq.d r6,0,.Ldenorm_small_shift
213	bmsk_s DBL1H,DBL1H,19
214	bset_s DBL1H,DBL1H,20
215.Lfixed_denorm_small_shift:
216	neg r8,r12
217	asl r4,DBL1H,r8
218	lsr_l DBL1H,DBL1H,r12
219	lsr r5,DBL1L,r12
220	asl r12,DBL1L,r8
221	brge.d r10,0,.Ladd
222	or DBL1L,r4,r5
223/* subtract, abs(DBL0) > abs(DBL1) */
224/* DBL0H, DBL0L: original values
225   DBL1H, DBL1L: fraction with explicit leading 1, shifted into place
226   r4:  orig. DBL0H & 0x7fffffff
227   r6:  orig. DBL1H & 0x7ff00000
228   r9:  0x7ff00000
229   r10: orig. DBL0H ^ DBL1H
230   r12: guard bits */
231	.balign 4
232.Lsub:
233	neg.f r12,r12
234	mov_s r7,DBL1H
235	bmsk r5,DBL0H,19
236	sbc.f DBL0L,DBL0L,DBL1L
237	bic DBL1H,DBL0H,r5
238	bset r5,r5,20
239	sbc.f r4,r5,r7
240	beq_l .Large_cancel_sub
241	norm DBL1L,r4
242	bmsk r6,DBL1H,30
243.Lsub_done:
244	sub_s DBL1L,DBL1L,9
245	breq DBL1L,1,.Lsub_done_noshift
246	asl r5,DBL1L,20
247	sub_s DBL1L,DBL1L,1
248	brlo r6,r5,.Ldenorm_sub
249	sub DBL0H,DBL1H,r5
250.Lpast_denorm_sub:
251	neg_s DBL1H,DBL1L
252	lsr r6,r12,DBL1H
253	asl_s r12,r12,DBL1L
254	and r8,r6,1
255	add1.f 0,r8,r12
256	add.ne.f r12,r12,r12
257	asl r8,DBL0L,DBL1L
258	lsr r12,DBL0L,DBL1H
259	adc.f DBL0L,r8,r6
260	asl r5,r4,DBL1L
261	add_s DBL0H,DBL0H,r12
262	j_s.d [blink]
263	adc DBL0H,DBL0H,r5
264
265	.balign 4
266.Lno_denorm_large_shift:
267	breq.d r12,32,.Lshift32
268	bset_l DBL1H,DBL1H,20
269.Lfixed_denorm_large_shift:
270	neg r8,r12
271	asl r4,DBL1H,r8
272	lsr r5,DBL1L,r12
273	asl.f 0,DBL1L,r8
274	lsr DBL1L,DBL1H,r12
275	or r12,r4,r5
276	tst.eq r12,1
277	or.ne r12,r12,2
278	brlt.d r10,0,.Lsub
279	mov_s DBL1H,0
280	b_l .Ladd
281
282	; If a denorm is produced without shifting, we have an exact result -
283	; no need for rounding.
284	.balign 4
285.Ldenorm_sub:
286	lsr DBL1L,r6,20
287	xor DBL0H,r6,DBL1H
288	brne.d DBL1L,1,.Lpast_denorm_sub
289	sub_s DBL1L,DBL1L,1
290.Lsub_done_noshift:
291	add.f 0,r12,r12
292	btst.eq DBL0L,0
293	cmp.eq r12,r12
294	add.cs.f DBL0L,DBL0L,1
295	bclr r4,r4,20
296	j_s.d [blink]
297	adc DBL0H,DBL1H,r4
298
299	.balign 4
300.Ldenorm_small_shift:
301	brne.d r12,1,.Lfixed_denorm_small_shift
302	sub_l r12,r12,1
303	brlt r10,0,.Lsub
304.Ladd: ; bit 20 of DBL1H is clear and bit 0 of r12 does not matter
305	add.f DBL0L,DBL0L,DBL1L
306	add_s DBL1H,DBL1H,DBL0H
307	add.cs DBL1H,DBL1H,1
308	xor_l DBL0H,DBL0H,DBL1H
309	bbit0 DBL0H,20,.Lno_shiftdown
310	lsr.f DBL0H,DBL1H
311	and r4,DBL0L,2
312	bmsk DBL0H,DBL0H,18
313	sbc DBL0H,DBL1H,DBL0H
314	rrc.f DBL0L,DBL0L
315	or.f r12,r12,r4
316	cmp.eq r12,r12
317	add.cs.f DBL0L,DBL0L,1
318	bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
319	jne.d [blink]    ; ... non-zero fraction
320	add.cs DBL0H,DBL0H,1
321	mov_s DBL0L,0
322	bmsk DBL1H,DBL0H,19
323	j_s.d [blink]
324	bic_s DBL0H,DBL0H,DBL1H
325.Lno_shiftdown:
326	mov_s DBL0H,DBL1H
327	add.f 0,r12,r12
328	btst.eq DBL0L,0
329	cmp.eq r12,r12
330	add.cs.f DBL0L,DBL0L,1
331	j_s.d [blink]
332	add.cs DBL0H,DBL0H,1
333	.balign 4
334.Ldenorm:
335	bmsk DBL0H,DBL1H,30
336	lsr r12,DBL0H,20
337	xor_s DBL0H,DBL0H,DBL1H
338	sub_l DBL1L,r12,1
339	bgt .Lpast_denorm
340	j_s.d [blink]
341	add_l DBL0H,DBL0H,r4
342
343	.balign 4
344.Large_cancel:
345	;DBL0L: mantissa DBL1H: sign & exponent
346	norm.f DBL1L,DBL0L
347	bmsk DBL0H,DBL1H,30
348	add_s DBL1L,DBL1L,22
349	mov.mi DBL1L,21
350	add_s r12,DBL1L,1
351	asl_s r12,r12,20
352	beq_s .Lret0
353	brhs.d DBL0H,r12,.Lpast_denorm_large_cancel
354	sub DBL0H,DBL1H,r12
355	bmsk DBL0H,DBL1H,30
356	lsr r12,DBL0H,20
357	xor_s DBL0H,DBL0H,DBL1H
358	sub.f DBL1L,r12,1
359	jle [blink]
360.Lpast_denorm_large_cancel:
361	rsub.f r7,DBL1L,32
362	lsr r7,DBL0L,r7
363	asl_s DBL0L,DBL0L,DBL1L
364	mov.ls r7,DBL0L
365	add_s DBL0H,DBL0H,r7
366	j_s.d [blink]
367	mov.ls DBL0L,0
368.Lret0:
369	j_s.d	[blink]
370	mov_l	DBL0H,0
371
372/* r4:DBL0L:r12 : unnormalized result fraction
373   DBL1H: result sign and exponent         */
374/* When seeing large cancellation, only the topmost guard bit might be set.  */
375	.balign 4
376.Large_cancel_sub:
377	norm.f DBL1L,DBL0L
378	bpnz.d 0f
379	bmsk DBL0H,DBL1H,30
380	mov r5,22<<20
381	bne.d 1f
382	mov_s DBL1L,21
383	bset r5,r5,5+20
384	add_s DBL1L,DBL1L,32
385	brne r12,0,1f
386	j_s.d	[blink]
387	mov_l	DBL0H,0
388	.balign 4
3890:	add r5,DBL1L,23
390	asl r5,r5,20
391	add_s DBL1L,DBL1L,22
3921:	brlo DBL0H,r5,.Ldenorm_large_cancel_sub
393	sub DBL0H,DBL1H,r5
394.Lpast_denorm_large_cancel_sub:
395	rsub.f r7,DBL1L,32
396	lsr r12,r12,r7
397	lsr r7,DBL0L,r7
398	asl_s DBL0L,DBL0L,DBL1L
399	add.ge DBL0H,DBL0H,r7
400	add_s DBL0L,DBL0L,r12
401	add.lt DBL0H,DBL0H,DBL0L
402	mov.eq DBL0L,r12
403	j_s.d [blink]
404	mov.lt DBL0L,0
405	.balign 4
406.Ldenorm_large_cancel_sub:
407	lsr r5,DBL0H,20
408	xor_s DBL0H,DBL0H,DBL1H
409	brgt.d r5,1,.Lpast_denorm_large_cancel_sub
410	sub DBL1L,r5,1
411	j_l [blink] ; denorm, no shift -> no rounding needed.
412
413/* r4: DBL0H & 0x7fffffff
414   r6: DBL1H & 0x7ff00000
415   r9: 0x7ff00000
416   r10: sign difference
417   r12: shift count (negative) */
418	.balign 4
419.Ldbl1_gt:
420	brhs r6,r9,.Lret_dbl1 ; inf or NaN
421	neg r8,r12
422	brhs r8,32,.Large_shift_dbl0
423.Lsmall_shift_dbl0:
424	breq.d r6,0,.Ldenorm_small_shift_dbl0
425	bmsk_s DBL0H,DBL0H,19
426	bset_s DBL0H,DBL0H,20
427.Lfixed_denorm_small_shift_dbl0:
428	asl r4,DBL0H,r12
429	lsr DBL0H,DBL0H,r8
430	lsr r5,DBL0L,r8
431	asl r12,DBL0L,r12
432	brge.d r10,0,.Ladd_dbl1_gt
433	or DBL0L,r4,r5
434/* subtract, abs(DBL0) < abs(DBL1) */
435/* DBL0H, DBL0L: fraction with explicit leading 1, shifted into place
436   DBL1H, DBL1L: original values
437   r6:  orig. DBL1H & 0x7ff00000
438   r9:  0x7ff00000
439   r12: guard bits */
440	.balign 4
441.Lrsub:
442	neg.f r12,r12
443	bmsk r7,DBL1H,19
444	mov_s r5,DBL0H
445	sbc.f DBL0L,DBL1L,DBL0L
446	bic DBL1H,DBL1H,r7
447	bset r7,r7,20
448	sbc.f r4,r7,r5
449	beq_l .Large_cancel_sub
450	norm DBL1L,r4
451	b_l .Lsub_done ; note: r6 is already set up.
452
453.Lret_dbl1:
454	mov_s DBL0H,DBL1H
455	j_s.d [blink]
456	mov_l DBL0L,DBL1L
457	.balign 4
458.Ldenorm_small_shift_dbl0:
459	sub.f r8,r8,1
460	bne.d .Lfixed_denorm_small_shift_dbl0
461	add_s r12,r12,1
462	brlt r10,0,.Lrsub
463.Ladd_dbl1_gt: ; bit 20 of DBL0H is clear and bit 0 of r12 does not matter
464	add.f DBL0L,DBL0L,DBL1L
465	add_s DBL0H,DBL0H,DBL1H
466	add.cs DBL0H,DBL0H,1
467	xor DBL1H,DBL0H,DBL1H
468	bbit0 DBL1H,20,.Lno_shiftdown_dbl1_gt
469	lsr.f DBL1H,DBL0H
470	and r4,DBL0L,2
471	bmsk DBL1H,DBL1H,18
472	sbc DBL0H,DBL0H,DBL1H
473	rrc.f DBL0L,DBL0L
474	or.f r12,r12,r4
475	cmp.eq r12,r12
476	add.cs.f DBL0L,DBL0L,1
477	bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
478	jne.d [blink]    ; ... non-zero fraction
479	add.cs DBL0H,DBL0H,1
480	mov_s DBL0L,0
481	bmsk DBL1H,DBL0H,19
482	j_s.d [blink]
483	bic_s DBL0H,DBL0H,DBL1H
484.Lno_shiftdown_dbl1_gt:
485	add.f 0,r12,r12
486	btst.eq DBL0L,0
487	cmp.eq r12,r12
488	add.cs.f DBL0L,DBL0L,1
489	j_s.d [blink]
490	add.cs DBL0H,DBL0H,1
491
492	.balign 4
493.Large_shift_dbl0:
494	brhs r8,55,.Lret_dbl1
495	bmsk_s DBL0H,DBL0H,19
496	brne r6,0,.Lno_denorm_large_shift_dbl0
497	add_s r12,r12,1
498	brne.d r8,33,.Lfixed_denorm_large_shift_dbl0
499	sub r8,r8,1
500	bset_s DBL0H,DBL0H,20
501.Lshift32_dbl0:
502	mov_s r12,DBL0L
503	mov_s DBL0L,DBL0H
504	brlt.d r10,0,.Lrsub
505	mov_s DBL0H,0
506	b_s .Ladd_dbl1_gt
507
508	.balign 4
509.Lno_denorm_large_shift_dbl0:
510	breq.d r8,32,.Lshift32_dbl0
511	bset_l DBL0H,DBL0H,20
512.Lfixed_denorm_large_shift_dbl0:
513	asl r4,DBL0H,r12
514	lsr r5,DBL0L,r8
515	asl.f 0,DBL0L,r12
516	lsr DBL0L,DBL0H,r8
517	or r12,r4,r5
518	tst.eq r12,1
519	or.ne r12,r12,2
520	brlt.d r10,0,.Lrsub
521	mov_s DBL0H,0
522	b_l .Ladd_dbl1_gt
523	ENDFUNC(__adddf3)
524	ENDFUNC(__subdf3)
525