dnl  AMD64 mpn_mul_basecase optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_1		mul_2		mul_3		addmul_2
C AMD K8,K9	n/a		n/a		 -		n/a
C AMD K10	n/a		n/a		 -		n/a
C AMD bull	n/a		n/a		 -		n/a
C AMD pile	n/a		n/a		 -		n/a
C AMD steam	 ?		 ?		 -		 ?
C AMD bobcat	n/a		n/a		 -		n/a
C AMD jaguar	 ?		 ?		 -		 ?
C Intel P4	n/a		n/a		 -		n/a
C Intel core	n/a		n/a		 -		n/a
C Intel NHM	n/a		n/a		 -		n/a
C Intel SBR	n/a		n/a		 -		n/a
C Intel IBR	n/a		n/a		 -		n/a
C Intel HWL	 1.77		 1.86		 -		 2.15
C Intel BWL	 ?		 ?		 -		 ?
C Intel atom	n/a		n/a		 -		n/a
C VIA nano	n/a		n/a		 -		n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Adjoin a mul_3.
C  * Further micro-optimise.

C Incoming arguments, in the order rp, up, un_param, vp, vn.  The registers
C are the first five integer argument registers of the System V AMD64
C calling convention.
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')
define(`vp',      `%rcx')
define(`vn',      `%r8')

C Working copy of the limb count.  The function moves un_param here and
C negates it, which frees rdx for use as the implicit multiplier operand.
define(`un',      `%rbx')

C w0-w3 hold partial-product and carry limbs, n is the inner-loop counter,
C and v0 holds the multiplier limb loaded from (vp).  Some of these names
C are redefined further down, and further names are added, as registers
C change role.
define(`w0',	`%r10')
define(`w1',	`%r11')
define(`w2',	`%r12')
define(`w3',	`%r13')
define(`n',	`%rbp')
define(`v0',	`%r9')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C mpn_mul_basecase
C
C Arguments arrive in the registers named rp, up, un_param, vp, vn by the
C defines earlier in this file.
C
C Structure, as visible in the code below:
C   1. If vn is odd, one pass multiplies the up limbs by the single limb at
C      (vp) and stores the result at rp (labels m1*).  If vn is even, one
C      pass multiplies by the two limbs at (vp) and 8(vp) (labels m2*).
C   2. While multiplier limbs remain, the outer loop consumes them two at a
C      time, adding each two-limb product row into the limbs already stored
C      at rp (labels bx*, lo*, top, end).
C
C All inner loops are unrolled four ways.  Each pass picks its feed-in point
C from the two low bits of un, which holds the NEGATED limb count.
C
C The mulx(...) macro comes from the included configuration.  It is assumed
C to emit the BMI2 MULX instruction, which multiplies by rdx implicitly and
C leaves the flags untouched; the carry handling below depends on that.
C
C Callee-saved registers rbx, rbp, r12, r13, r14 are always saved.  r15 is
C saved only when the outer loop is entered.
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
C Presumably emitted only for the DOS64 ABI: fetch vn from the stack.
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	un_param, un		C free up rdx
	neg	un

	mov	un_param, n		C FIXME: share
	sar	$2, n			C FIXME: share

C Odd vn starts with the one-limb pass, even vn with the two-limb pass.
	test	$1, R8(vn)
	jz	L(do_mul_2)

C Two more scratch limbs for the one-limb pass.  w4 reuses the register
C named v0, which this pass does not need because the multiplier limb
C lives in rdx.
define(`w4',	`%r9')
define(`w5',	`%r14')

	mov	(vp), %rdx

C ---------------------------------------------------------------------------
C One-limb pass.  n = un_param >> 2 is the count of full loop iterations.
C The test instructions used for dispatch also clear the carry flag, so each
C feed-in path enters the adc chain either with carry clear or with the carry
C produced by the add at gt3.
C ---------------------------------------------------------------------------
L(do_mul_1):
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):test	$2, R8(un)
	jnz	L(m110)

L(m100):
	mulx(	(up), w5, w2)
	mulx(	8,(up), w1, w3)
	lea	-24(rp), rp
	jmp	L(m1l0)

L(m110):
	mulx(	(up), w3, w4)
	mulx(	8,(up), w1, w5)
	lea	-8(rp), rp
	test	n, n
	jz	L(cj2)			C no full iteration: go to wind-down
	mulx(	16,(up), w0, w2)
	lea	16(up), up
	jmp	L(m1l2)

L(m1x1):test	$2, R8(un)
	jz	L(m111)

L(m101):
	mulx(	(up), w4, w5)
	lea	-16(rp), rp
	test	n, n
	jz	L(cj1)			C no full iteration: go to wind-down
	mulx(	8,(up), w0, w2)
	lea	8(up), up
	jmp	L(m1l1)

L(m111):
	mulx(	(up), w2, w3)
	mulx(	8,(up), w0, w4)
	mulx(	16,(up), w1, w5)
	lea	24(up), up
	test	n, n
	jnz	L(gt3)
	add	w0, w3
	jmp	L(cj3)
L(gt3):	add	w0, w3
	jmp	L(m1l3)

C Main loop of the one-limb pass: four limbs per iteration.  The running
C carry stays in the carry flag across iterations.  Only the adc
C instructions change it; mov, lea and dec leave it alone, and mulx is
C assumed to as well.
	ALIGN(32)
L(m1tp):lea	32(rp), rp
L(m1l3):mov	w2, (rp)
	mulx(	(up), w0, w2)
L(m1l2):mov	w3, 8(rp)
	adc	w1, w4
L(m1l1):adc	w0, w5
	mov	w4, 16(rp)
	mulx(	8,(up), w1, w3)
L(m1l0):mov	w5, 24(rp)
	mulx(	16,(up), w0, w4)
	adc	w1, w2
	mulx(	24,(up), w1, w5)
	adc	w0, w3
	lea	32(up), up
	dec	n
	jnz	L(m1tp)

C Wind-down: store the limbs still held in registers, folding in the last
C carry.
L(m1ed):lea	32(rp), rp
L(cj3):	mov	w2, (rp)
L(cj2):	mov	w3, 8(rp)
	adc	w1, w4
L(cj1):	mov	w4, 16(rp)
	adc	$0, w5
	mov	w5, 24(rp)

C One multiplier limb consumed.  Return if it was the only one.
	dec	R32(vn)
	jz	L(ret5)

	lea	8(vp), vp
	lea	32(rp), rp
C	push	%r12
C	push	%r13
C	push	%r14
	jmp	L(do_addmul)

C ---------------------------------------------------------------------------
C Two-limb pass.  v0 and v1 hold the limbs at (vp) and 8(vp).  n is the
C negated count shifted right by two, so it is negative and is counted up
C towards zero with inc.  Carries are propagated explicitly with adc $0.
C ---------------------------------------------------------------------------
L(do_mul_2):
define(`v1',	`%r14')
C	push	%r12
C	push	%r13
C	push	%r14

	mov	(vp), v0
	mov	8(vp), v1

	lea	(un), n
	sar	$2, n

	test	$1, R8(un)
	jnz	L(m2x1)

L(m2x0):xor	w0, w0
	test	$2, R8(un)
	mov	(up), %rdx
	mulx(	v0, w2, w1)
	jz	L(m2l0)

L(m210):lea	-16(rp), rp
	lea	-16(up), up
	jmp	L(m2l2)

L(m2x1):xor	w2, w2
	test	$2, R8(un)
	mov	(up), %rdx
	mulx(	v0, w0, w3)
	jz	L(m211)

L(m201):lea	-24(rp), rp
	lea	8(up), up
	jmp	L(m2l1)

L(m211):lea	-8(rp), rp
	lea	-8(up), up
	jmp	L(m2l3)

C Main loop of the two-limb pass.  On entry to each quarter rdx still holds
C the previous up limb.  It is first multiplied by v1, then rdx is reloaded
C with the next up limb and multiplied by v0.  Both low halves and the
C pending high half are summed into the limb that is stored.
	ALIGN(16)
L(m2tp):mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2l0):mov	w2, (rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	8(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
L(m2l3):mov	w0, 8(rp)
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	16(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2l2):mov	w2, 16(rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	24(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
	lea	32(up), up
L(m2l1):mov	w0, 24(rp)
	adc	$0, w3
	inc	n
	lea	32(rp), rp		C lea keeps the flags from inc
	jnz	L(m2tp)

C Wind-down: the last up limb, still in rdx, times v1 yields the two top
C limbs of this pass.
L(m2ed):mulx(	v1, %rdx, %rax)
	add	%rdx, w2
	adc	$0, %rax
	add	w3, w2
	mov	w2, (rp)
	adc	$0, %rax
	mov	%rax, 8(rp)

C Two multiplier limbs consumed.  Return if none remain.
	add	$-2, R32(vn)
	jz	L(ret5)
	lea	16(vp), vp
	lea	16(rp), rp


C ---------------------------------------------------------------------------
C Outer loop: each round consumes two multiplier limbs and accumulates their
C product row into the destination.  From here on the remaining multiplier
C count lives in a stack slot, the register it arrived in is reused as v1,
C and X0/X1 hold destination limbs being accumulated.
C ---------------------------------------------------------------------------
L(do_addmul):
	push	%r15
	push	vn			C save vn in new stack slot
define(`vn',	`(%rsp)')
define(`X0',	`%r14')
define(`X1',	`%r15')
define(`v1',	`%r8')

C un is negative, so these two step rp and up back by un_param limbs.
	lea	(rp,un,8), rp
	lea	(up,un,8), up

L(outer):
	mov	(vp), v0
	mov	8(vp), v1

	lea	2(un), n
	sar	$2, n

	mov	(up), %rdx
	test	$1, R8(un)
	jnz	L(bx1)

L(bx0):	mov	(rp), X0
	mov	8(rp), X1
	mulx(	v0, %rax, w1)
	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	mov	X0, (rp)
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	test	$2, R8(un)
	jnz	L(b10)

L(b00):	lea	16(up), up
	lea	16(rp), rp
	jmp	L(lo0)

L(b10):	mov	16(rp), X0
	lea	32(up), up
	mulx(	v0, %rax, w3)
	jmp	L(lo2)

L(bx1):	mov	(rp), X1
	mov	8(rp), X0
	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	mov	8(up), %rdx
	mov	X1, (rp)
	mulx(	v0, %rax, w1)
	test	$2, R8(un)
	jz	L(b11)

L(b01):	mov	16(rp), X1
	lea	24(rp), rp
	lea	24(up), up
	jmp	L(lo1)

L(b11):	lea	8(rp), rp
	lea	8(up), up
	jmp	L(lo3)

C Inner loop of the accumulating pass, four limbs per iteration.  X0 and X1
C alternate as the destination limb being updated.  Each is loaded from rp,
C receives the low product halves and the pending high halves, and is
C stored back.  Carries are propagated explicitly with adc $0.
	ALIGN(16)
L(top):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
L(lo2):	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	lea	32(rp), rp
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, -24(rp)
	adc	$0, w3
	add	w2, X0
	mov	-8(rp), X1
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo1):	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	add	w3, X0
	mov	X0, -16(rp)
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	add	w0, X1
	mov	-8(up), %rdx
	adc	$0, w2
L(lo0):	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mov	(rp), X0
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	mov	X1, -8(rp)
	adc	$0, w3
	mov	(up), %rdx
	add	w2, X0
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo3):	add	%rax, X0
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	w3, X0
	mov	8(rp), X1
	mov	X0, (rp)
	mov	16(rp), X0
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	lea	32(up), up
	inc	n
	jnz	L(top)

C Wind-down of one round: finish the last accumulated limb and store the
C two top limbs of this row, which are written rather than accumulated.
L(end):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rdx, %rax)
	add	w1, X1
	mov	X1, 8(rp)
	adc	$0, w3
	add	w2, %rdx
	adc	$0, %rax
	add	w3, %rdx
	mov	%rdx, 16(rp)
	adc	$0, %rax
	mov	%rax, 24(rp)

C Count down the multiplier limbs in the stack slot.  The three lea
C instructions do not touch the flags, so jnz still sees the result of addl.
	addl	$-2, vn
	lea	16(vp), vp
	lea	-16(up,un,8), up
	lea	32(rp,un,8), rp
	jnz	L(outer)

C Epilogue.  Paths that never entered the outer loop join at ret5 because
C they pushed neither the vn slot nor r15.
	pop	%rax		C deallocate vn slot
	pop	%r15
L(ret5):pop	%r14
L(ret4):pop	%r13
L(ret3):pop	%r12
L(ret2):pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()