lib1funcs-Os-4-200.S revision 1.1.1.9
1/* Copyright (C) 2006-2020 Free Software Foundation, Inc.
2
3This file is free software; you can redistribute it and/or modify it
4under the terms of the GNU General Public License as published by the
5Free Software Foundation; either version 3, or (at your option) any
6later version.
7
8This file is distributed in the hope that it will be useful, but
9WITHOUT ANY WARRANTY; without even the implied warranty of
10MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11General Public License for more details.
12
13Under Section 7 of GPL version 3, you are granted additional
14permissions described in the GCC Runtime Library Exception, version
153.1, as published by the Free Software Foundation.
16
17You should have received a copy of the GNU General Public License and
18a copy of the GCC Runtime Library Exception along with this program;
19see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
20<http://www.gnu.org/licenses/>.  */
21
22/* Moderately Space-optimized libgcc routines for the Renesas SH /
23   STMicroelectronics ST40 CPUs.
24   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
25
26#include "lib1funcs.h"
27
28#ifdef L_udivsi3_i4i
29
30/* 88 bytes; sh4-200 cycle counts:
31   divisor  >= 2G: 11 cycles
32   dividend <  2G: 48 cycles
33   dividend >= 2G, divisor != 1: 54 cycles
34   dividend >= 2G, divisor == 1: 22 cycles */
35#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
36!! args in r4 and r5, result in r0, clobber r1
37
/* udivsi3_i4i, FPU variant: unsigned 32-bit division carried out as a
   double-precision floating-point division.
   In:  r4 = dividend, r5 = divisor.  Out: r0 = quotient.
   r1 is used as scratch.  fpscr, fpul, dr0 and dr2 are saved on entry and
   restored before returning.  */
38	.global GLOBAL(udivsi3_i4i)
39	FUNC(GLOBAL(udivsi3_i4i))
40GLOBAL(udivsi3_i4i):
41	mova L1,r0		/* r0 = address of the constants at L1 */
42	cmp/pz r5		/* T = 1 if the divisor is below 2^31 */
43	sts fpscr,r1		/* r1 = fpscr as it was on entry */
44	lds.l @r0+,fpscr	/* fpscr = word at L1; r0 now points at the .double */
45	sts.l fpul,@-r15	/* save fpul */
46	bf LOCAL(huge_divisor)	/* divisor >= 2^31 */
47	mov.l r1,@-r15		/* save the entry fpscr */
48	lds r4,fpul
49	cmp/pz r4		/* T = 1 if the dividend is below 2^31 */
50#ifdef FMOVD_WORKS
51	fmov.d dr0,@-r15	/* save dr0 */
52	float fpul,dr0		/* dr0 = dividend, converted as a signed integer */
53	fmov.d dr2,@-r15	/* save dr2 */
54	bt LOCAL(dividend_adjusted)
	/* The dividend has its top bit set, so the signed conversion above
	   came out 2^32 too low.  */
55	mov #1,r1
56	fmov.d @r0,dr2		/* dr2 = 4294967296.0 from the table at L1 */
57	cmp/eq r1,r5
	/* A divisor of 1 skips both fadd and fdiv: dr0 still holds the signed
	   conversion, which ftrc turns back into the original 32 bits.  */
58	bt LOCAL(div_by_1)
59	fadd dr2,dr0		/* dr0 += 2^32: now the unsigned value */
60LOCAL(dividend_adjusted):
61	lds r5,fpul
62	float fpul,dr2		/* dr2 = divisor */
63	fdiv dr2,dr0		/* dr0 = dividend / divisor */
64LOCAL(div_by_1):
65	fmov.d @r15+,dr2	/* restore dr2 */
66	ftrc dr0,fpul		/* fpul = dr0 truncated to an integer */
67	fmov.d @r15+,dr0	/* restore dr0 */
68#else /* !FMOVD_WORKS */
	/* Same algorithm, but dr0 and dr2 are saved, loaded and restored with
	   pairs of 32-bit fmov.s moves.  DR00/DR01/DR20/DR21 are not defined in
	   this file (presumably in lib1funcs.h) -- TODO confirm which half of
	   each register pair they name.  */
69	fmov.s DR01,@-r15
70	mov #1,r1
71	fmov.s DR00,@-r15
72	float fpul,dr0		/* dr0 = dividend, converted as a signed integer */
73	fmov.s DR21,@-r15
74	bt/s LOCAL(dividend_adjusted)
75	fmov.s DR20,@-r15	/* delay slot: runs whether or not the branch is taken */
76	cmp/eq r1,r5		/* divisor == 1? */
77	bt LOCAL(div_by_1)
78	fmov.s @r0+,DR20	/* load the 2^32 constant from L1, one word at a time */
79	fmov.s @r0,DR21
80	fadd dr2,dr0		/* dr0 += 2^32: now the unsigned value */
81LOCAL(dividend_adjusted):
82	lds r5,fpul
83	float fpul,dr2		/* dr2 = divisor */
84	fdiv dr2,dr0		/* dr0 = dividend / divisor */
85LOCAL(div_by_1):
86	fmov.s @r15+,DR20	/* restore dr2, in reverse push order */
87	fmov.s @r15+,DR21
88	ftrc dr0,fpul		/* fpul = dr0 truncated to an integer */
89	fmov.s @r15+,DR00	/* restore dr0, in reverse push order */
90	fmov.s @r15+,DR01
91#endif /* !FMOVD_WORKS */
92	lds.l @r15+,fpscr	/* restore fpscr */
93	sts fpul,r0		/* r0 = quotient */
94	rts
95	lds.l @r15+,fpul	/* delay slot: restore fpul */
96
97#ifdef FMOVD_WORKS
98	.p2align 3        ! make double below 8 byte aligned.
99#endif
	/* Divisor >= 2^31: the quotient is 1 if dividend >= divisor (unsigned),
	   otherwise 0.  */
100LOCAL(huge_divisor):
101	lds r1,fpscr		/* put back the entry fpscr */
102	add #4,r15		/* drop the saved fpul; fpul was not modified on this path */
103	cmp/hs r5,r4		/* T = (r4 >= r5), unsigned */
104	rts
105	movt r0			/* delay slot: r0 = T */
106
107	.p2align 2
	/* Constants: the fpscr word loaded at entry, followed by 2^32 as a double.
	   NOTE(review): per the SH-4 FPSCR layout 0x80000 should be the PR bit
	   and 0x100000 the SZ bit -- confirm against the hardware manual.  */
108L1:
109#ifndef FMOVD_WORKS
110	.long 0x80000
111#else
112	.long 0x180000
113#endif
114	.double 4294967296
115
116	ENDFUNC(GLOBAL(udivsi3_i4i))
117#elif !defined (__sh1__)  /* !__SH_FPU_DOUBLE__ */
118
119#if 0
120/* With 36 bytes, the following would probably be the most compact
121   implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
	/* Disabled bit-serial variant.  r0 starts as a copy of the dividend (r4)
	   and is rotated left once per step; r1 is the div1 working register;
	   r2 and r3 are saved on the stack and restored before returning.  */
122GLOBAL(udivsi3_i4i):
123	mov.l r2,@-r15		/* save r2 */
124	mov #0,r1
125	div0u
126	mov r1,r2		/* r2 = 0 */
127	mov.l r3,@-r15		/* save r3 */
128	mov r1,r3		/* r3 = 0 */
129	sett			/* T = 1: becomes the end-of-loop marker rotated into r2 */
130	mov r4,r0
131LOCAL(loop):
132	rotcr r2		/* T enters r2 at bit 31; bit 0 of r2 drops into T */
133	;
134	bt/s LOCAL(end)		/* leave once the marker bit has travelled through r2 */
135	cmp/gt r2,r3		/* delay slot: T = (0 > r2), i.e. bit 31 of r2 */
136	rotcl r0		/* shift T into r0; the top bit of r0 moves into T */
137	bra LOCAL(loop)
138	div1 r5,r1		/* delay slot: one division step */
139LOCAL(end):
140	rotcl r0
141	mov.l @r15+,r3		/* restore r3 */
142	rts
143	mov.l @r15+,r2		/* delay slot: restore r2 */
144#endif /* 0 */
145
146/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
147   sh4-200 run times:
148   udiv small divisor: 55 cycles
149   udiv large divisor: 52 cycles
150   sdiv small divisor, positive result: 59 cycles
151   sdiv large divisor, positive result: 56 cycles
152   sdiv small divisor, negative result: 65 cycles (*)
153   sdiv large divisor, negative result: 62 cycles (*)
154   (*): r2 is restored in the rts delay slot and has a lingering latency
155        of two more cycles.  */
/* udivsi3_i4i, integer variant: unsigned 32-bit divide built from div1 steps.
   In:  r4 = dividend, r5 = divisor.  Out: r0 = quotient.
   r1 holds the return address, because the bsr calls below overwrite pr.
   r4 and r5 are saved on the stack and restored before returning.
   The sdiv_small_divisor and sdiv_large_divisor labels are also branched to
   from sdivsi3_i4i.  */
156	.balign 4
157	.global	GLOBAL(udivsi3_i4i)
158	FUNC(GLOBAL(udivsi3_i4i))
159	FUNC(GLOBAL(sdivsi3_i4i))
160GLOBAL(udivsi3_i4i):
161	sts pr,r1		/* r1 = return address */
162	mov.l r4,@-r15		/* save r4 */
163	extu.w r5,r0
164	cmp/eq r5,r0		/* T = 1 if the divisor fits in 16 bits */
165	swap.w r4,r0		/* r0 = dividend with its 16-bit halves swapped */
166	shlr16 r4		/* r4 = upper half of the dividend */
167	bf/s LOCAL(large_divisor)
168	div0u			/* delay slot: runs on both paths */
169	mov.l r5,@-r15		/* save r5 */
170	shll16 r5		/* move the 16-bit divisor into the upper half of r5 */
	/* Divisor below 2^16: two rounds of 16 div1 steps each.  */
171LOCAL(sdiv_small_divisor):
	/* Round 1: (2 inline + 6 in div6) twice = 16 steps.  */
172	div1 r5,r4
173	bsr LOCAL(div6)
174	div1 r5,r4		/* delay slot */
175	div1 r5,r4
176	bsr LOCAL(div6)
177	div1 r5,r4		/* delay slot */
	/* NOTE(review): the xtrct / swap.w sequence moves bits between r0 and r4
	   for the second round; the exact bit layout is not re-derived here.  */
178	xtrct r4,r0
179	xtrct r0,r4
180	bsr LOCAL(div7)
181	swap.w r4,r4		/* delay slot */
	/* Round 2: 7 steps from the call above, then 2 inline + 7 = 16 steps.  */
182	div1 r5,r4
183	bsr LOCAL(div7)
184	div1 r5,r4		/* delay slot */
185	xtrct r4,r0
186	mov.l @r15+,r5		/* restore r5 */
187	swap.w r0,r0
188	mov.l @r15+,r4		/* restore r4 */
189	jmp @r1			/* return through the address kept in r1 */
190	rotcl r0		/* delay slot: shift the last T bit into r0 */
	/* div7: one div1, then falls through into div6 (seven steps in total).  */
191LOCAL(div7):
192	div1 r5,r4
	/* div6: six div1 steps; the last one sits in the rts delay slot.  */
193LOCAL(div6):
194	            div1 r5,r4; div1 r5,r4; div1 r5,r4
195	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4
196
	/* divx3: three rotcl/div1 pairs; the last div1 sits in the rts delay
	   slot.  */
197LOCAL(divx3):
198	rotcl r0
199	div1 r5,r4
200	rotcl r0
201	div1 r5,r4
202	rotcl r0
203	rts
204	div1 r5,r4		/* delay slot */
205
	/* Divisor of 2^16 or more: 16 div1 steps, with T rotated into r0 between
	   steps.  */
206LOCAL(large_divisor):
207	mov.l r5,@-r15		/* save r5 */
208LOCAL(sdiv_large_divisor):
	/* r0 holds the swapped dividend and r4 its upper half in the low 16 bits,
	   so the xor clears the low 16 bits of r0 and leaves the lower half of
	   the dividend in the upper 16 bits.  */
209	xor r4,r0
	/* Each repetition: one rotcl/div1 pair here plus three in divx3.  */
210	.rept 4
211	rotcl r0
212	bsr LOCAL(divx3)
213	div1 r5,r4
214	.endr
215	mov.l @r15+,r5		/* restore r5 */
216	mov.l @r15+,r4		/* restore r4 */
217	jmp @r1			/* return through the address kept in r1 */
218	rotcl r0		/* delay slot: shift the last T bit into r0 */
219	ENDFUNC(GLOBAL(udivsi3_i4i))
220
/* sdivsi3_i4i, integer variant: signed 32-bit divide.
   In:  r4 = dividend, r5 = divisor.  Out: r0 = quotient.
   Negative operands are negated, then control passes to the
   sdiv_small_divisor / sdiv_large_divisor code, which restores r4 and r5 from
   the stack and returns through r1.  When exactly one operand was negative,
   r1 points at negate_result, the real return address is kept in r2, and the
   previous r2 is parked in macl (so macl is overwritten on that path).  */
221	.global	GLOBAL(sdivsi3_i4i)
222GLOBAL(sdivsi3_i4i):
223	mov.l r4,@-r15		/* save r4 */
224	cmp/pz r5		/* T = 1 if the divisor is >= 0 */
225	mov.l r5,@-r15		/* save r5 */
226	bt/s LOCAL(pos_divisor)
227	cmp/pz r4		/* delay slot: T = 1 if the dividend is >= 0 */
	/* Divisor is negative.  */
228	neg r5,r5
229	extu.w r5,r0
230	bt/s LOCAL(neg_result)	/* dividend >= 0: the signs differ */
231	cmp/eq r5,r0		/* delay slot: T = 1 if the divisor fits in 16 bits */
232	neg r4,r4		/* both were negative */
233LOCAL(pos_result):
234	swap.w r4,r0		/* r0 = dividend with its 16-bit halves swapped */
235	bra LOCAL(sdiv_check_divisor)
236	sts pr,r1		/* delay slot: return straight to the caller */
237LOCAL(pos_divisor):
238	extu.w r5,r0
239	bt/s LOCAL(pos_result)	/* dividend >= 0 as well */
240	cmp/eq r5,r0		/* delay slot: T = 1 if the divisor fits in 16 bits */
241	neg r4,r4		/* only the dividend was negative */
242LOCAL(neg_result):
243	mova LOCAL(negate_result),r0
244	;
245	mov r0,r1		/* the divide code will return to negate_result */
246	swap.w r4,r0		/* r0 = dividend with its 16-bit halves swapped */
247	lds r2,macl		/* park the current r2 in macl */
248	sts pr,r2		/* r2 = real return address */
249LOCAL(sdiv_check_divisor):
	/* T still holds the result of the cmp/eq above: no instruction in
	   between modifies it.  */
250	shlr16 r4		/* r4 = upper half of the dividend */
251	bf/s LOCAL(sdiv_large_divisor)
252	div0u			/* delay slot: runs on both paths */
253	bra LOCAL(sdiv_small_divisor)
254	shll16 r5		/* delay slot: move the 16-bit divisor into the upper half of r5 */
255	.balign 4
	/* Reached through jmp @r1 when the quotient has to be negated.  */
256LOCAL(negate_result):
257	neg r0,r0
258	jmp @r2			/* return to the original caller */
259	sts macl,r2		/* delay slot: restore r2 */
260	ENDFUNC(GLOBAL(sdivsi3_i4i))
261#endif /* !__SH_FPU_DOUBLE__ */
262#endif /* L_udivsi3_i4i */
263
264#ifdef L_sdivsi3_i4i
265#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
266/* 48 bytes, 45 cycles on sh4-200  */
267!! args in r4 and r5, result in r0, clobber r1
268
/* sdivsi3_i4i, FPU variant: signed 32-bit division carried out as a
   double-precision floating-point division.
   In:  r4 = dividend, r5 = divisor.  Out: r0 = quotient.
   r1 keeps the entry value of fpul.  fpscr, dr0 and dr2 are saved on the
   stack; all four are restored before returning.  */
269	.global GLOBAL(sdivsi3_i4i)
270	FUNC(GLOBAL(sdivsi3_i4i))
271GLOBAL(sdivsi3_i4i):
272	sts.l fpscr,@-r15	/* save fpscr */
273	sts fpul,r1		/* r1 = fpul as it was on entry */
274	mova L1,r0		/* r0 = address of the fpscr constant at L1 */
275	lds.l @r0+,fpscr	/* fpscr = word at L1 */
276	lds r4,fpul
277#ifdef FMOVD_WORKS
278	fmov.d dr0,@-r15	/* save dr0 */
279	float fpul,dr0		/* dr0 = dividend */
280	lds r5,fpul
281	fmov.d dr2,@-r15	/* save dr2 */
282#else
	/* Same steps with pairs of 32-bit fmov.s moves.  DR00/DR01/DR20/DR21 are
	   not defined in this file (presumably in lib1funcs.h) -- TODO confirm
	   which half of each register pair they name.  */
283	fmov.s DR01,@-r15
284	fmov.s DR00,@-r15
285	float fpul,dr0		/* dr0 = dividend */
286	lds r5,fpul
287	fmov.s DR21,@-r15
288	fmov.s DR20,@-r15
289#endif
290	float fpul,dr2		/* dr2 = divisor */
291	fdiv dr2,dr0		/* dr0 = dividend / divisor */
292#ifdef FMOVD_WORKS
293	fmov.d @r15+,dr2	/* restore dr2 */
294#else
295	fmov.s @r15+,DR20	/* restore dr2, in reverse push order */
296	fmov.s @r15+,DR21
297#endif
298	ftrc dr0,fpul		/* fpul = dr0 truncated to an integer */
299#ifdef FMOVD_WORKS
300	fmov.d @r15+,dr0	/* restore dr0 */
301#else
302	fmov.s @r15+,DR00	/* restore dr0, in reverse push order */
303	fmov.s @r15+,DR01
304#endif
305	lds.l @r15+,fpscr	/* restore fpscr */
306	sts fpul,r0		/* r0 = quotient */
307	rts
308	lds r1,fpul		/* delay slot: restore fpul */
309
310	.p2align 2
	/* fpscr word loaded at entry.
	   NOTE(review): per the SH-4 FPSCR layout 0x80000 should be the PR bit
	   and 0x100000 the SZ bit -- confirm against the hardware manual.  */
311L1:
312#ifndef FMOVD_WORKS
313	.long 0x80000
314#else
315	.long 0x180000
316#endif
317
318	ENDFUNC(GLOBAL(sdivsi3_i4i))
319#endif /* __SH_FPU_DOUBLE__ */
320#endif /* L_sdivsi3_i4i */
321