1;; Copyright (C) 2001-2015 Free Software Foundation, Inc.
2;;
3;; This file is part of GCC.
4;;
5;; GCC is free software; you can redistribute it and/or modify it under
6;; the terms of the GNU General Public License as published by the Free
7;; Software Foundation; either version 3, or (at your option) any later
8;; version.
9;;
10;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
12;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13;; for more details.
14;;
15;; Under Section 7 of GPL version 3, you are granted additional
16;; permissions described in the GCC Runtime Library Exception, version
17;; 3.1, as published by the Free Software Foundation.
18;;
19;; You should have received a copy of the GNU General Public License and
20;; a copy of the GCC Runtime Library Exception along with this program;
21;; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22;; <http://www.gnu.org/licenses/>.
23;;
24;; This code is derived from mulsi3.S, observing that the mstep*16-based
25;; multiplications there, from which it is formed, are actually
26;; zero-extending; in gcc-speak "umulhisi3".  The difference to *this*
27;; function is just a missing top mstep*16 sequence and shifts and 64-bit
28;; additions for the high part.  Compared to an implementation based on
29;; calling __Mul four times (see default implementation of umul_ppmm in
30;; longlong.h), this will complete in a time between a fourth and a third
31;; of that, assuming the value-based optimizations don't strike.  If they
32;; all strike there (very often) but none here, we still win, though by a
33;; lesser margin, due to lesser total overhead.
34
;; Helper macros for spelling label and symbol names.
;; L(x) yields x with a leading dot; it is used for the labels inside the
;; function below.
35#define L(x) .x
;; CONCAT1 forwards to CONCAT2, which token-pastes its two arguments.  The
;; extra level lets the arguments be macro-expanded before they are pasted.
36#define CONCAT1(a, b) CONCAT2(a, b)
37#define CONCAT2(a, b) a ## b
38
;; SYM(x) yields x with the user label prefix pasted in front when the
;; compiler predefines one, and plain x otherwise.
39#ifdef __USER_LABEL_PREFIX__
40# define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
41#else
42# define SYM(x) x
43#endif
44
;; __umulsidi3: unsigned 32 x 32 -> 64 bit multiplication.
;;
;; In:   $r10, $r11 = the two 32-bit unsigned operands.
;; Out:  $r10 = low-order 32 bits of the product,
;;       $r11 = high-order 32 bits of the product.
;; Uses: $r9, $r12 and $r13 as scratch in the mstep-based variant; the
;;       mulu.d variant touches only $r10, $r11 and $mof.
;;
;; The instruction written directly after each branch and each ret sits
;; in the delay slot, so it is executed before control is transferred.
;; Several of the flag tests below depend on that ordering.
;;
;; The mstep-based variant views the operands as pairs of 16-bit halves,
;; ab and cd, and tells three cases apart:
;;  - both operands above 65535: four partial products (fall-through
;;    path, ab taken from $r10 and cd from $r11),
;;  - only the larger operand above 65535: two partial products, at L(L3),
;;  - both operands within 16 bits: one partial product, at L(L5).
;; Each partial product is formed by a run of 16 mstep instructions.
45	.global SYM(__umulsidi3)
46	.type	SYM(__umulsidi3),@function
47SYM(__umulsidi3):
48#if defined (__CRIS_arch_version) && __CRIS_arch_version >= 10
49;; Can't have the mulu.d last on a cache-line, due to a hardware bug.  See
50;; the documentation for -mmul-bug-workaround.
51;; Not worthwhile to conditionalize here.
52	.p2alignw 2,0x050f
53	mulu.d $r11,$r10
54	ret
55	move $mof,$r11	; (delay slot) fetch the high-order 32 bits from $mof.
56#else
57	move.d $r11,$r9
58	bound.d $r10,$r9	; $r9 = the smaller of the two operands.
59	cmpu.w 65535,$r9
60	bls L(L3)	; Smaller operand fits in 16 bits?
61	move.d $r10,$r12	; (delay slot) $r12 = ab, kept for b*c and a*c.
62
63	move.d $r10,$r13
64	movu.w $r11,$r9 ; ab*cd = (a*c)<<32 + (a*d + b*c)<<16 + b*d
65
66;; We're called for floating point numbers very often with the "low" 16
67;; bits zero, so it's worthwhile to optimize for that.
68
69	beq L(L6)	; d == 0?
70	lslq 16,$r13	; (delay slot) $r13 = b<<16; sets flags for the next beq.
71
72	beq L(L7)	; b == 0?
73	clear.w $r10	; (delay slot) $r10 = a<<16.
74
75	mstep $r9,$r13	; d*b
76	mstep $r9,$r13
77	mstep $r9,$r13
78	mstep $r9,$r13
79	mstep $r9,$r13
80	mstep $r9,$r13
81	mstep $r9,$r13
82	mstep $r9,$r13
83	mstep $r9,$r13
84	mstep $r9,$r13
85	mstep $r9,$r13
86	mstep $r9,$r13
87	mstep $r9,$r13
88	mstep $r9,$r13
89	mstep $r9,$r13
90	mstep $r9,$r13
91
92L(L7):
93	test.d $r10	; Refresh N from $r10 for the first mstep below.
94	mstep $r9,$r10	; d*a
95	mstep $r9,$r10
96	mstep $r9,$r10
97	mstep $r9,$r10
98	mstep $r9,$r10
99	mstep $r9,$r10
100	mstep $r9,$r10
101	mstep $r9,$r10
102	mstep $r9,$r10
103	mstep $r9,$r10
104	mstep $r9,$r10
105	mstep $r9,$r10
106	mstep $r9,$r10
107	mstep $r9,$r10
108	mstep $r9,$r10
109	mstep $r9,$r10
110
111;; d*a in $r10, d*b in $r13, ab in $r12 and cd in $r11
112;; $r9 = d, need to do b*c and a*c; we can drop d.
113;; so $r9 is up for use and we can shift down $r11 as the mstep
114;; source for the next mstep-part.
115
116L(L8):
117	lsrq 16,$r11	; $r11 = c.
118	move.d $r12,$r9
119	lslq 16,$r9	; $r9 = b<<16; sets flags for the beq below.
120	beq L(L9)	; b == 0?
121	mstep $r11,$r9	; (delay slot) first of the 16 steps; runs on both paths.
122
123	mstep $r11,$r9	; b*c
124	mstep $r11,$r9
125	mstep $r11,$r9
126	mstep $r11,$r9
127	mstep $r11,$r9
128	mstep $r11,$r9
129	mstep $r11,$r9
130	mstep $r11,$r9
131	mstep $r11,$r9
132	mstep $r11,$r9
133	mstep $r11,$r9
134	mstep $r11,$r9
135	mstep $r11,$r9
136	mstep $r11,$r9
137	mstep $r11,$r9
138L(L9):
139
140;; d*a in $r10, d*b in $r13, c*b in $r9, ab in $r12 and c in $r11,
141;; need to do a*c.  We want that to end up in $r11, so we shift up $r11 to
142;; now use as the destination operand.  We'd need a test insn to update N
143;; to do it the other way round.
144
145	lsrq 16,$r12	; $r12 = a.
146	lslq 16,$r11	; $r11 = c<<16; also leaves N set up for the mstep run.
147	mstep $r12,$r11
148	mstep $r12,$r11
149	mstep $r12,$r11
150	mstep $r12,$r11
151	mstep $r12,$r11
152	mstep $r12,$r11
153	mstep $r12,$r11
154	mstep $r12,$r11
155	mstep $r12,$r11
156	mstep $r12,$r11
157	mstep $r12,$r11
158	mstep $r12,$r11
159	mstep $r12,$r11
160	mstep $r12,$r11
161	mstep $r12,$r11
162	mstep $r12,$r11
163
164;; d*a in $r10, d*b in $r13, c*b in $r9, a*c in $r11 ($r12 free).
165;; Need (a*d + b*c)<<16 + b*d into $r10 and
166;; a*c + (a*d + b*c)>>16 plus carry from the additions into $r11.
167
168	add.d $r9,$r10	; (a*d + b*c) - may produce a carry.
169	scs $r12	; The carry corresponds to bit 16 of $r11.
170	lslq 16,$r12
171	add.d $r12,$r11	; $r11 = a*c + carry from (a*d + b*c).
172
173#if defined (__CRIS_arch_version) && __CRIS_arch_version >= 8
174	swapw $r10	; Exchange the 16-bit halves of the middle sum.
175	addu.w $r10,$r11 ; $r11 = a*c + (a*d + b*c) >> 16 including carry.
176	clear.w $r10	; $r10 = (a*d + b*c) << 16
177#else
178	move.d $r10,$r9
179	lsrq 16,$r9
180	add.d $r9,$r11	; $r11 = a*c + (a*d + b*c) >> 16 including carry.
181	lslq 16,$r10	; $r10 = (a*d + b*c) << 16
182#endif
183	add.d $r13,$r10	; $r10 = (a*d + b*c) << 16 + b*d - may produce a carry.
184	scs $r9	; $r9 = carry out of the low-order addition.
185	ret
186	add.d $r9,$r11	; Last carry added to the high-order 32 bits.
187
;; Reached when d == 0: both d*b and d*a are zero, so clear their
;; registers and continue with b*c and a*c.
188L(L6):
189	clear.d $r13
190	ba L(L8)
191	clear.d $r10
192
;; Reached from the L(L3) path when the smaller operand is zero: the
;; whole 64-bit product is zero.
193L(L11):
194	clear.d $r10
195	ret
196	clear.d $r11
197
198L(L3):
199;; Form the maximum in $r10, by knowing the minimum, $r9.
200;; (We don't know which one of $r10 or $r11 it is.)
201;; Check if the largest operand is still just 16 bits.
202
203	xor $r9,$r10	; $r10 ^ minimum ^ $r11 cancels the minimum and
204	xor $r11,$r10	; leaves the other operand, the maximum.
205	cmpu.w 65535,$r10
206	bls L(L5)
207	movu.w $r9,$r13	; (delay slot) $r13 = minimum; sets flags for the beq below.
208
209;; We have ab*cd = (a*c)<<32 + (a*d + b*c)<<16 + b*d, but c==0
210;; so we only need (a*d)<<16 + b*d with d = $r13, ab = $r10.
211;; Remember that the upper part of (a*d)<<16 goes into the lower part
212;; of $r11 and there may be a carry from adding the low 32 parts.
213	beq L(L11)	; d == 0?
214	move.d $r10,$r9	; (delay slot) $r9 = ab.
215
216	lslq 16,$r9	; $r9 = b<<16; sets flags for the beq below.
217	beq L(L10)	; b == 0?
218	clear.w $r10	; (delay slot) $r10 = a<<16.
219
220	mstep $r13,$r9	; b*d
221	mstep $r13,$r9
222	mstep $r13,$r9
223	mstep $r13,$r9
224	mstep $r13,$r9
225	mstep $r13,$r9
226	mstep $r13,$r9
227	mstep $r13,$r9
228	mstep $r13,$r9
229	mstep $r13,$r9
230	mstep $r13,$r9
231	mstep $r13,$r9
232	mstep $r13,$r9
233	mstep $r13,$r9
234	mstep $r13,$r9
235	mstep $r13,$r9
236L(L10):
237	test.d $r10	; Refresh N from $r10 for the first mstep below.
238	mstep $r13,$r10	; a*d
239	mstep $r13,$r10
240	mstep $r13,$r10
241	mstep $r13,$r10
242	mstep $r13,$r10
243	mstep $r13,$r10
244	mstep $r13,$r10
245	mstep $r13,$r10
246	mstep $r13,$r10
247	mstep $r13,$r10
248	mstep $r13,$r10
249	mstep $r13,$r10
250	mstep $r13,$r10
251	mstep $r13,$r10
252	mstep $r13,$r10
253	mstep $r13,$r10
254	move.d $r10,$r11
255	lsrq 16,$r11	; $r11 = (a*d) >> 16, the high-order part.
256	lslq 16,$r10	; $r10 = (a*d) << 16.
257	add.d $r9,$r10	; $r10 = (a*d) << 16 + b*d - may produce a carry.
258	scs $r12	; $r12 = carry out of the low-order addition.
259	ret
260	add.d $r12,$r11	; (delay slot) carry added to the high-order 32 bits.
261
262L(L5):
263;; We have ab*cd = (a*c)<<32 + (a*d + b*c)<<16 + b*d, but a and c==0
264;; so b*d (with min=b=$r13, max=d=$r10) it is.  As it won't overflow the
265;; 32-bit part, just set $r11 to 0.
266
267	lslq 16,$r10	; $r10 = max<<16, the mstep destination.
268	clear.d $r11
269
270	mstep $r13,$r10
271	mstep $r13,$r10
272	mstep $r13,$r10
273	mstep $r13,$r10
274	mstep $r13,$r10
275	mstep $r13,$r10
276	mstep $r13,$r10
277	mstep $r13,$r10
278	mstep $r13,$r10
279	mstep $r13,$r10
280	mstep $r13,$r10
281	mstep $r13,$r10
282	mstep $r13,$r10
283	mstep $r13,$r10
284	mstep $r13,$r10
285	ret
286	mstep $r13,$r10	; (delay slot) sixteenth and last step.
287#endif
288L(Lfe1):
289	.size	SYM(__umulsidi3),L(Lfe1)-SYM(__umulsidi3)
290