lib1funcs-Os-4-200.S revision 1.1.1.3
1/* Copyright (C) 2006-2016 Free Software Foundation, Inc.
2
3This file is free software; you can redistribute it and/or modify it
4under the terms of the GNU General Public License as published by the
5Free Software Foundation; either version 3, or (at your option) any
6later version.
7
8This file is distributed in the hope that it will be useful, but
9WITHOUT ANY WARRANTY; without even the implied warranty of
10MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11General Public License for more details.
12
13Under Section 7 of GPL version 3, you are granted additional
14permissions described in the GCC Runtime Library Exception, version
153.1, as published by the Free Software Foundation.
16
17You should have received a copy of the GNU General Public License and
18a copy of the GCC Runtime Library Exception along with this program;
19see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
20<http://www.gnu.org/licenses/>.  */
21
22/* Moderately Space-optimized libgcc routines for the Renesas SH /
23   STMicroelectronics ST40 CPUs.
24   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
25
26#include "lib1funcs.h"
27
28#if !__SHMEDIA__
29#ifdef L_udivsi3_i4i
30
31/* 88 bytes; sh4-200 cycle counts:
32   divisor  >= 2G: 11 cycles
33   dividend <  2G: 48 cycles
34   dividend >= 2G: divisor != 1: 54 cycles
35   dividend >= 2G, divisor == 1: 22 cycles */
36#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
37!! args in r4 and r5, result in r0, clobber r1
38
/* udivsi3_i4i (FPU variant): unsigned 32-bit division carried out in
   double-precision floating point.
   In:  r4 = dividend, r5 = divisor.
   Out: r0 = quotient.
   r1 is used as scratch.  FPSCR, FPUL, dr0 and dr2 are saved on entry and
   restored before returning.  */
39	.global GLOBAL(udivsi3_i4i)
40	FUNC(GLOBAL(udivsi3_i4i))
41GLOBAL(udivsi3_i4i):
/* r0 = address of the constant block at L1 (FPSCR word, then 2^32 as a
   double).  */
42	mova L1,r0
/* T = 1 when the divisor is non-negative as a signed value, i.e. < 2G.  */
43	cmp/pz r5
/* Keep the caller's FPSCR in r1, then load the working FPSCR value from the
   first word at L1.  The post-increment leaves r0 pointing at the .double.  */
44	sts fpscr,r1
45	lds.l @r0+,fpscr
/* Push the caller's FPUL.  */
46	sts.l fpul,@-r15
/* Divisor has bit 31 set: the quotient can only be 0 or 1, handled
   without any floating-point work.  */
47	bf LOCAL(huge_divisor)
/* Push the caller's FPSCR (still held in r1).  */
48	mov.l r1,@-r15
/* FPUL = dividend; T = 1 when the dividend is < 2G.  This T value is
   consumed by the bt / bt/s further down.  */
49	lds r4,fpul
50	cmp/pz r4
51#ifdef FMOVD_WORKS
/* Save dr0 and dr2 with 64-bit moves, converting the dividend in between.
   float treats FPUL as a signed integer, so a dividend >= 2G comes out
   2^32 too small and is corrected below.  */
52	fmov.d dr0,@-r15
53	float fpul,dr0
54	fmov.d dr2,@-r15
/* Dividend < 2G: the signed conversion is already exact.  */
55	bt LOCAL(dividend_adjusted)
/* Dividend >= 2G.  Fetch 2^32 into dr2 and test for divisor == 1.  */
56	mov #1,r1
57	fmov.d @r0,dr2
58	cmp/eq r1,r5
/* Divisor == 1: skip both the correction and the division.  Converting the
   uncorrected (negative) value back with ftrc yields the original bit
   pattern, whereas the corrected quotient (>= 2G) would not fit a signed
   ftrc result.  */
59	bt LOCAL(div_by_1)
/* dr0 += 2^32, giving the true unsigned dividend value.  */
60	fadd dr2,dr0
61LOCAL(dividend_adjusted):
/* dr2 = (double) divisor (known to be < 2G here), then dr0 = dr0 / dr2.  */
62	lds r5,fpul
63	float fpul,dr2
64	fdiv dr2,dr0
65LOCAL(div_by_1):
/* Restore dr2, truncate the quotient into FPUL, restore dr0.  */
66	fmov.d @r15+,dr2
67	ftrc dr0,fpul
68	fmov.d @r15+,dr0
69#else /* !FMOVD_WORKS */
/* Same algorithm using 32-bit fmov.s transfers.  DR00/DR01/DR20/DR21 are
   defined outside this file (lib1funcs.h); presumably they name the two
   single-precision halves of dr0 and dr2 - confirm there.  */
70	fmov.s DR01,@-r15
71	mov #1,r1
72	fmov.s DR00,@-r15
73	float fpul,dr0
74	fmov.s DR21,@-r15
/* Delayed branch on "dividend < 2G"; the final register push sits in the
   delay slot, so it is executed on both paths.  */
75	bt/s LOCAL(dividend_adjusted)
76	fmov.s DR20,@-r15
/* Dividend >= 2G: divisor == 1 bypasses correction and division (see the
   FMOVD_WORKS arm for the reasoning).  */
77	cmp/eq r1,r5
78	bt LOCAL(div_by_1)
/* Load 2^32 from the .double at L1 in two halves and add it to dr0.  */
79	fmov.s @r0+,DR20
80	fmov.s @r0,DR21
81	fadd dr2,dr0
82LOCAL(dividend_adjusted):
/* dr2 = (double) divisor, then dr0 = dr0 / dr2.  */
83	lds r5,fpul
84	float fpul,dr2
85	fdiv dr2,dr0
86LOCAL(div_by_1):
/* Pop in the reverse order of the pushes above; the truncation to FPUL is
   interleaved between the dr2 and dr0 restores.  */
87	fmov.s @r15+,DR20
88	fmov.s @r15+,DR21
89	ftrc dr0,fpul
90	fmov.s @r15+,DR00
91	fmov.s @r15+,DR01
92#endif /* !FMOVD_WORKS */
/* Restore the caller's FPSCR, move the quotient to r0, and restore the
   caller's FPUL in the rts delay slot.  */
93	lds.l @r15+,fpscr
94	sts fpul,r0
95	rts
96	lds.l @r15+,fpul
97
98#ifdef FMOVD_WORKS
99	.p2align 3        ! make double below 8 byte aligned.
100#endif
/* Divisor >= 2G.  Only FPUL has been pushed at this point and FPUL itself
   has not been modified, so the stack slot is simply dropped.  FPSCR is
   restored from r1.  The result is the unsigned comparison
   dividend >= divisor, materialised as 0 or 1 in r0.  */
101LOCAL(huge_divisor):
102	lds r1,fpscr
103	add #4,r15
104	cmp/hs r5,r4
105	rts
106	movt r0
107
/* Constants addressed through r0: the FPSCR value used while dividing,
   followed by 2^32 as a double.
   NOTE(review): 0x80000 presumably sets FPSCR.PR (double precision) and
   0x180000 additionally FPSCR.SZ (64-bit fmov) - confirm against the SH-4
   manual.  */
108	.p2align 2
109L1:
110#ifndef FMOVD_WORKS
111	.long 0x80000
112#else
113	.long 0x180000
114#endif
115	.double 4294967296
116
117	ENDFUNC(GLOBAL(udivsi3_i4i))
118#elif !defined (__sh1__)  /* !__SH_FPU_DOUBLE__ */
119
120#if 0
121/* With 36 bytes, the following would probably be the most compact
122   implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
/* Disabled (#if 0) compact bit-serial unsigned divide.
   In: r4 = dividend, r5 = divisor.  Out: r0 = quotient.
   r2 and r3 are saved on the stack and restored; r1 is used as the div1
   working register.  */
123GLOBAL(udivsi3_i4i):
/* Save r2; clear r1, r2 and r3; initialise the division flags.  */
124	mov.l r2,@-r15
125	mov #0,r1
126	div0u
127	mov r1,r2
128	mov.l r3,@-r15
129	mov r1,r3
/* T = 1 seeds a marker bit that the first rotcr inserts at the top of r2;
   r0 starts out as the dividend.  */
130	sett
131	mov r4,r0
132LOCAL(loop):
/* Rotate T into bit 31 of r2 and bit 0 of r2 out into T.  The loop ends
   once the marker bit inserted by the first rotcr reaches T.  */
133	rotcr r2
134	;
135	bt/s LOCAL(end)
/* Delay slot: T = (r3 > r2) signed.  With r3 == 0 this is bit 31 of r2,
   i.e. the T value that the rotcr above just rotated in.  */
136	cmp/gt r2,r3
/* Shift that bit into r0 and the next dividend bit out into T, then
   perform one division step in the bra delay slot.  */
137	rotcl r0
138	bra LOCAL(loop)
139	div1 r5,r1
140LOCAL(end):
/* Shift in the final bit, restore r3 and r2 (the latter in the rts delay
   slot).  */
141	rotcl r0
142	mov.l @r15+,r3
143	rts
144	mov.l @r15+,r2
145#endif /* 0 */
146
147/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
148   sh4-200 run times:
149   udiv small divisor: 55 cycles
150   udiv large divisor: 52 cycles
151   sdiv small divisor, positive result: 59 cycles
152   sdiv large divisor, positive result: 56 cycles
153   sdiv small divisor, negative result: 65 cycles (*)
154   sdiv large divisor, negative result: 62 cycles (*)
155   (*): r2 is restored in the rts delay slot and has a lingering latency
156        of two more cycles.  */
/* udivsi3_i4i (non-FPU variant): unsigned 32-bit division built from
   single-step div1 instructions.
   In:  r4 = dividend, r5 = divisor.
   Out: r0 = quotient.
   r4 and r5 are saved on the stack and restored before returning.  The
   return address is copied from PR to r1 because the bsr calls below
   overwrite PR; the routine returns with jmp @r1.
   sdivsi3_i4i enters this code at LOCAL(sdiv_small_divisor) and
   LOCAL(sdiv_large_divisor) with r1 already set up.  */
157	.balign 4
158	.global	GLOBAL(udivsi3_i4i)
159	FUNC(GLOBAL(udivsi3_i4i))
160	FUNC(GLOBAL(sdivsi3_i4i))
161GLOBAL(udivsi3_i4i):
/* r1 = return address; save r4.  */
162	sts pr,r1
163	mov.l r4,@-r15
/* T = 1 when the divisor equals its own zero-extended low 16 bits, i.e. it
   fits in 16 bits.  */
164	extu.w r5,r0
165	cmp/eq r5,r0
/* r0 = dividend with its 16-bit halves swapped; r4 = dividend >> 16.  */
166	swap.w r4,r0
167	shlr16 r4
/* Divisors wider than 16 bits take the other path; div0u in the delay
   slot initialises the division flags for both paths.  */
168	bf/s LOCAL(large_divisor)
169	div0u
/* Small divisor: save r5 and move the divisor into the upper half.  */
170	mov.l r5,@-r15
171	shll16 r5
172LOCAL(sdiv_small_divisor):
/* First 16 division steps: twice (1 + 1 in the bsr delay slot + 6 in
   div6).  */
173	div1 r5,r4
174	bsr LOCAL(div6)
175	div1 r5,r4
176	div1 r5,r4
177	bsr LOCAL(div6)
178	div1 r5,r4
/* NOTE(review): these xtrct/swap.w steps appear to park the quotient bits
   gathered so far in r0 and bring the low dividend half into position for
   the next 16 steps - confirm against the SH div1 documentation.  */
179	xtrct r4,r0
180	xtrct r0,r4
/* Second 16 division steps: 7 in div7, then 1 + 1 in the delay slot + 7.  */
181	bsr LOCAL(div7)
182	swap.w r4,r4
183	div1 r5,r4
184	bsr LOCAL(div7)
185	div1 r5,r4
/* Assemble the quotient in r0, restore r5 and r4, and return through r1.
   The rotcl in the delay slot shifts the last T bit into r0.  */
186	xtrct r4,r0
187	mov.l @r15+,r5
188	swap.w r0,r0
189	mov.l @r15+,r4
190	jmp @r1
191	rotcl r0
/* Helpers: div7 performs seven div1 steps, div6 six (the last one sits in
   the rts delay slot).  */
192LOCAL(div7):
193	div1 r5,r4
194LOCAL(div6):
195	            div1 r5,r4; div1 r5,r4; div1 r5,r4
196	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4
197
/* Helper for the large-divisor path: three rotcl/div1 pairs, the last div1
   in the rts delay slot.  */
198LOCAL(divx3):
199	rotcl r0
200	div1 r5,r4
201	rotcl r0
202	div1 r5,r4
203	rotcl r0
204	rts
205	div1 r5,r4
206
207LOCAL(large_divisor):
/* Save r5; the small-divisor path did this before shifting it.  */
208	mov.l r5,@-r15
209LOCAL(sdiv_large_divisor):
/* r4 holds dividend >> 16 and r0 the half-swapped dividend, so the xor
   clears the low half of r0 and leaves the low dividend half on top.  */
210	xor r4,r0
/* Four repetitions of (rotcl + div1 in the delay slot + divx3) give 16
   rotcl/div1 pairs in total.  */
211	.rept 4
212	rotcl r0
213	bsr LOCAL(divx3)
214	div1 r5,r4
215	.endr
/* Restore r5 and r4 and return through r1; the delay-slot rotcl shifts the
   last T bit into r0.  */
216	mov.l @r15+,r5
217	mov.l @r15+,r4
218	jmp @r1
219	rotcl r0
220	ENDFUNC(GLOBAL(udivsi3_i4i))
221
/* sdivsi3_i4i (non-FPU variant): signed 32-bit division.
   In:  r4 = dividend, r5 = divisor.
   Out: r0 = quotient.
   Both operands are replaced by their magnitudes and the unsigned divide
   core of udivsi3_i4i is entered at LOCAL(sdiv_small_divisor) or
   LOCAL(sdiv_large_divisor).  That core restores r4/r5 from the stack and
   leaves through jmp @r1.  For a non-negative result r1 is the caller's
   return address.  For a negative result r1 points at
   LOCAL(negate_result), the real return address is kept in r2, and the
   caller's r2 is parked in MACL meanwhile.  */
222	.global	GLOBAL(sdivsi3_i4i)
223GLOBAL(sdivsi3_i4i):
/* Save r4 and r5; T = (divisor >= 0).  */
224	mov.l r4,@-r15
225	cmp/pz r5
226	mov.l r5,@-r15
/* Branch on the divisor's sign; the delay slot sets T = (dividend >= 0)
   for the next branch on either path.  */
227	bt/s LOCAL(pos_divisor)
228	cmp/pz r4
/* Negative divisor: r5 = -divisor, r0 = low 16 bits of it.  */
229	neg r5,r5
230	extu.w r5,r0
/* Dividend >= 0 with divisor < 0 gives a negative result.  The delay slot
   sets T = "divisor fits in 16 bits", which stays live until
   LOCAL(sdiv_check_divisor).  */
231	bt/s LOCAL(neg_result)
232	cmp/eq r5,r0
/* Both operands negative: r4 = -dividend, result is non-negative.  */
233	neg r4,r4
234LOCAL(pos_result):
/* r0 = half-swapped dividend; r1 = caller's return address (set in the
   bra delay slot).  */
235	swap.w r4,r0
236	bra LOCAL(sdiv_check_divisor)
237	sts pr,r1
238LOCAL(pos_divisor):
/* Non-negative divisor: same 16-bit test as above, with the branch
   deciding on the dividend's sign.  */
239	extu.w r5,r0
240	bt/s LOCAL(pos_result)
241	cmp/eq r5,r0
/* Dividend negative, divisor non-negative: r4 = -dividend.  */
242	neg r4,r4
243LOCAL(neg_result):
/* r1 = address of LOCAL(negate_result), so the divide core's final
   jmp @r1 lands there instead of returning to the caller.  */
244	mova LOCAL(negate_result),r0
245	;
246	mov r0,r1
247	swap.w r4,r0
/* Park the caller's r2 in MACL and keep the return address in r2.  */
248	lds r2,macl
249	sts pr,r2
250LOCAL(sdiv_check_divisor):
/* r4 = dividend >> 16.  T still holds "divisor fits in 16 bits"; none of
   the instructions since the cmp/eq modify it.  */
251	shlr16 r4
252	bf/s LOCAL(sdiv_large_divisor)
253	div0u
/* Small divisor: shift it into the upper half in the bra delay slot.  */
254	bra LOCAL(sdiv_small_divisor)
255	shll16 r5
/* The target of the mova above is aligned to 4 bytes.  */
256	.balign 4
257LOCAL(negate_result):
/* Negate the unsigned quotient, return through r2, and restore the
   caller's r2 from MACL in the delay slot.  */
258	neg r0,r0
259	jmp @r2
260	sts macl,r2
261	ENDFUNC(GLOBAL(sdivsi3_i4i))
262#endif /* !__SH_FPU_DOUBLE__ */
263#endif /* L_udivsi3_i4i */
264
265#ifdef L_sdivsi3_i4i
266#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
267/* 48 bytes, 45 cycles on sh4-200  */
268!! args in r4 and r5, result in r0, clobber r1
269
/* sdivsi3_i4i (FPU variant): signed 32-bit division carried out in
   double-precision floating point.
   In:  r4 = dividend, r5 = divisor.
   Out: r0 = quotient.
   r1 holds the caller's FPUL for the duration; FPSCR, dr0 and dr2 are
   saved on the stack and restored.  */
270	.global GLOBAL(sdivsi3_i4i)
271	FUNC(GLOBAL(sdivsi3_i4i))
272GLOBAL(sdivsi3_i4i):
/* Push the caller's FPSCR and keep the caller's FPUL in r1.  */
273	sts.l fpscr,@-r15
274	sts fpul,r1
/* Load the working FPSCR value from the word at L1.  */
275	mova L1,r0
276	lds.l @r0+,fpscr
/* FPUL = dividend.  */
277	lds r4,fpul
278#ifdef FMOVD_WORKS
/* Save dr0, convert the dividend into it, stage the divisor in FPUL, and
   save dr2.  */
279	fmov.d dr0,@-r15
280	float fpul,dr0
281	lds r5,fpul
282	fmov.d dr2,@-r15
283#else
/* Same sequence with 32-bit fmov.s transfers.  DR00/DR01/DR20/DR21 are
   defined outside this file (lib1funcs.h); presumably they name the two
   single-precision halves of dr0 and dr2 - confirm there.  */
284	fmov.s DR01,@-r15
285	fmov.s DR00,@-r15
286	float fpul,dr0
287	lds r5,fpul
288	fmov.s DR21,@-r15
289	fmov.s DR20,@-r15
290#endif
/* dr2 = (double) divisor; dr0 = dividend / divisor.  */
291	float fpul,dr2
292	fdiv dr2,dr0
/* Restore dr2.  */
293#ifdef FMOVD_WORKS
294	fmov.d @r15+,dr2
295#else
296	fmov.s @r15+,DR20
297	fmov.s @r15+,DR21
298#endif
/* FPUL = quotient truncated to an integer.  */
299	ftrc dr0,fpul
/* Restore dr0.  */
300#ifdef FMOVD_WORKS
301	fmov.d @r15+,dr0
302#else
303	fmov.s @r15+,DR00
304	fmov.s @r15+,DR01
305#endif
/* Restore the caller's FPSCR, move the quotient to r0, and restore the
   caller's FPUL from r1 in the rts delay slot.  */
306	lds.l @r15+,fpscr
307	sts fpul,r0
308	rts
309	lds r1,fpul
310
/* FPSCR value used while dividing.
   NOTE(review): 0x80000 presumably sets FPSCR.PR (double precision) and
   0x180000 additionally FPSCR.SZ (64-bit fmov) - confirm against the SH-4
   manual.  */
311	.p2align 2
312L1:
313#ifndef FMOVD_WORKS
314	.long 0x80000
315#else
316	.long 0x180000
317#endif
318
319	ENDFUNC(GLOBAL(sdivsi3_i4i))
320#endif /* __SH_FPU_DOUBLE__ */
321#endif /* L_sdivsi3_i4i */
322#endif /* !__SHMEDIA__ */
323