dnl  AMD64 mpn_mul_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.375
C K10:		 2.375
C P4:		 ?
C P6-15:	 4.45

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Use fewer registers.  (how??? I can't see it -- david)
C  * Avoid some "mov $0,r" and instead use "xor r,r".
C  * Can the top of each L(addmul_outer_n) prologue be folded into the
C    mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
C    case where vn = 1 or 2; is it worth it?

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')
define(`vp',      `%rcx')
define(`vn',      `%r8')

define(`v0', `%r12')
define(`v1', `%r9')

define(`w0', `%rbx')
define(`w1', `%r15')
define(`w2', `%rbp')
define(`w3', `%r10')

define(`n',  `%r11')
define(`outer_addr', `%r14')
define(`un',  `%r13')

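C mpn_mul_basecase (rp, up, un, vp, vn) computes the full un+vn limb product
C of {up,un} and {vp,vn}, assuming un >= vn >= 1 and that rp does not overlap
C the operands.  A minimal C sketch of the operation (ref_mul_basecase is a
C hypothetical name, for illustration only; this file implements the same
C thing but processes two limbs of vp per pass):
C
C	#include <gmp.h>
C
C	void
C	ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C			  mp_srcptr vp, mp_size_t vn)
C	{
C	  mp_size_t j;
C	  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);	/* first row */
C	  for (j = 1; j < vn; j++)			/* add remaining rows */
C	    rp[un + j] = mpn_addmul_1 (rp + j, up, un, vp[j]);
C	}
C
C In the code below the size argument arrives in un_param; the name un is
C reused for a negated index.  up and rp are advanced by un_param limbs so
C that negative indices address the data, and each unrolled loop terminates
C on the sign flag set by "add $4, n".
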
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	xor	R32(un), R32(un)
	mov	(up), %rax
	mov	(vp), v0

	sub	un_param, un		C rdx used by mul
	mov	un, n
	mov	R32(un_param), R32(w0)

	lea	(rp,un_param,8), rp
	lea	(up,un_param,8), up

	mul	v0

	test	$1, R8(vn)
	jz	L(mul_2)

C ===========================================================
C     mul_1 for vp[0] if vn is odd

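C This section computes the first row, rp[0 .. un_param] = {up,un_param} * vp[0],
C leaving an even number of vp limbs for the addmul_2 code further down.
C Roughly, in C, using the public single-limb routine:
C
C	rp[un_param] = mpn_mul_1 (rp, up, un_param, vp[0]);
C
C The four prologues dispatch on un_param mod 4 so the 4-way unrolled loop is
C entered at the matching point; outer_addr is set to the addmul_2 prologue
C with the same alignment, for the rows that follow.
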
L(mul_1):
	and	$3, R32(w0)
	jz	L(mul_1_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)
	jmp	L(mul_1_prologue_3)

L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note: already w0 == 0
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

L(mul_1_prologue_1):
	cmp	$-1, un
	jne	2f
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(ret)
2:	add	$1, n
	lea	L(addmul_outer_1)(%rip), outer_addr
	mov	%rax, w1
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	mov	(up,n,8), %rax
	jmp	L(mul_1_entry_1)

L(mul_1_prologue_2):
	add	$-2, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	xor	R32(w3), R32(w3)
	jmp	L(mul_1_entry_2)

L(mul_1_prologue_3):
	add	$-1, n
	lea	L(addmul_outer_3)(%rip), outer_addr
	mov	%rax, w3
	mov	%rdx, w0
	jmp	L(mul_1_entry_3)


	C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	xor	R32(w0), R32(w0)
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	xor	R32(w2), R32(w2)	C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)

	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	adc	%rdx, w2
	mov	w2, (rp)

	add	$-1, vn			C vn -= 1
	jz	L(ret)

	mov	8(vp), v0
	mov	16(vp), v1

	lea	8(vp), vp		C vp += 1
	lea	8(rp), rp		C rp += 1

	jmp	*outer_addr

C ===========================================================
C     mul_2 for vp[0], vp[1] if vn is even

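C This section computes the first two rows in one fused pass,
C rp[0 .. un_param+1] = {up,un_param} * {vp[0], vp[1]}, handling four up limbs
C and both v limbs per loop iteration.  As a sketch with the public
C single-limb routines:
C
C	rp[un_param]     = mpn_mul_1    (rp,     up, un_param, vp[0]);
C	rp[un_param + 1] = mpn_addmul_1 (rp + 1, up, un_param, vp[1]);
C
C As in mul_1, the prologues dispatch on un_param mod 4 and record in
C outer_addr which addmul_2 prologue continues with the remaining rows.
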
	ALIGN(16)
L(mul_2):
	mov	8(vp), v1

	and	$3, R32(w0)
	jz	L(mul_2_prologue_0)
	cmp	$2, R32(w0)
	jz	L(mul_2_prologue_2)
	jc	L(mul_2_prologue_1)

L(mul_2_prologue_3):
	lea	L(addmul_outer_3)(%rip), outer_addr
	add	$2, n
	mov	%rax, -16(rp,n,8)
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	mov	-16(up,n,8), %rax
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	add	$3, n
	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	-24(up,n,8), %rax
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	add	$1, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	$0, R32(w0)
	mov	$0, R32(w1)
	mov	%rax, w2
	mov	-8(up,n,8), %rax
	mov	%rdx, w3
	jmp	L(mul_2_entry_2)

	C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_2_top):
	mov	-32(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(mul_2_entry_0):
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,n,8)
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(rp,n,8)
L(mul_2_entry_3):
	mul	v1
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)	C adc $0, w0
L(mul_2_entry_2):
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,n,8)
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, n
	mov	w3, -32(rp,n,8)
	js	L(mul_2_top)

	mov	-32(up,n,8), %rax
	mul	v1
	add	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	mov	16(vp), v0
	mov	24(vp), v1

	lea	16(vp), vp		C vp += 2
	lea	16(rp), rp		C rp += 2

	jmp	*outer_addr


C ===========================================================
C     addmul_2 for remaining vp's

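C Each pass through this code adds two further rows into the partial product
C accumulated so far.  With rp and vp already advanced past the completed
C rows, one outer iteration is equivalent to this sketch using the public
C single-limb routines (the loop below fuses both calls):
C
C	rp[un_param]     = mpn_addmul_1 (rp,     up, un_param, vp[0]);
C	rp[un_param + 1] = mpn_addmul_1 (rp + 1, up, un_param, vp[1]);
C
C The "lea 0(%rip), outer_addr" in the prologues points outer_addr at the
C instruction that follows it, so later outer iterations skip the one-time
C adjustment of un.
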
	C in the following prologues, we reuse un to store the
	C adjusted value of n that is reloaded on each iteration

L(addmul_outer_0):
	add	$3, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-24(up,un,8), %rax
	mul	v0
	mov	%rax, w0
	mov	-24(up,un,8), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(addmul_entry_0)

L(addmul_outer_1):
	mov	un, n
	mov	(up,un,8), %rax
	mul	v0
	mov	%rax, w3
	mov	(up,un,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(addmul_entry_1)

L(addmul_outer_2):
	add	$1, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-8(up,un,8), %rax
	mul	v0
	xor	R32(w0), R32(w0)
	mov	%rax, w2
	xor	R32(w1), R32(w1)
	mov	%rdx, w3
	mov	-8(up,un,8), %rax
	jmp	L(addmul_entry_2)

L(addmul_outer_3):
	add	$2, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-16(up,un,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,un,8), %rax
	mov	%rdx, w2
	jmp	L(addmul_entry_3)

	C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,n,8)
	adc	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)	C adc $0, w2
L(addmul_entry_0):
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,n,8)
	adc	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,n,8)
	adc	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	-8(up,n,8), %rax
	adc	R32(w1), R32(w0)	C adc $0, w0
L(addmul_entry_2):
	mul	v1
	add	w2, -8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, n
	js	L(addmul_top)

	add	w3, -8(rp)
	adc	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	lea	16(rp), rp		C rp += 2
	lea	16(vp), vp		C vp += 2

	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

	ALIGN(16)
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

EPILOGUE()