1dnl  AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9      -
37C AMD K10        -
38C AMD bull       -
39C AMD pile       -
40C AMD steam      -
41C AMD excavator  -
42C AMD bobcat     -
43C AMD jaguar     -
44C Intel P4       -
45C Intel core2    -
46C Intel NHM      -
47C Intel SBR      -
48C Intel IBR      -
49C Intel HWL      1.59
50C Intel BWL      1.76
51C Intel SKL      1.54
52C Intel atom     -
53C Intel SLM      -
54C VIA nano       -
55
56C The loop of this code is the result of running a code generation and
57C optimisation tool suite written by David Harvey and Torbjorn Granlund.
58
C Symbolic register names used by the code below.  The register name in the
C trailing comment of each argument line is the register that carries the
C same argument position under the DOS64 (Microsoft x64) convention.
59define(`rp',      `%rdi')   C rcx  base address for all stores of the result
60define(`up',      `%rsi')   C rdx  base address for all mulx source operands
61define(`n_param', `%rdx')   C r8   incoming limb count
62define(`v0_param',`%rcx')   C r9   incoming single-limb multiplier
63
C Working registers.  v0 is the same register as n_param, so the count has to
C be copied to n and its low two bits tested before v0 is loaded.
64define(`n',       `%rbp')   C n_param shifted right by 2, counted down by the loop
65define(`v0',      `%rdx')   C multiplier; NOTE(review): %rdx is presumably required because BMI2 MULX takes one factor implicitly from it
66
C NOTE(review): ABI_SUPPORT is defined outside this file; presumably these
C two lines declare that the code can be built for both calling conventions.
67ABI_SUPPORT(DOS64)
68ABI_SUPPORT(STD64)
69
C mpn_mul_1 -- multiply a vector of 64-bit limbs by a single limb.
C
C Inputs (STD64 registers):
C   rp       (%rdi)  where the product limbs are stored
C   up       (%rsi)  source limbs, read 8 bytes apart
C   n_param  (%rdx)  number of source limbs; must be >= 1 (a count of 0 would
C                    take the b00 path and let the dec at the loop bottom wrap)
C   v0_param (%rcx)  the single multiplier limb
C Output:
C   n_param limbs are written starting at rp; %rax holds the limb above them
C   (high half of the last product plus the final carry).
C Registers: %rbx, %rbp and %r12 are saved and restored.  The code visible
C   here also writes %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11 and the flags.
C
C Structure: the count is split as n_param = 4*n + r.  The feed-in code
C selected by r (labels b00, b01, b10, b11) issues the first few
C multiplications, biases rp so that the fixed store offsets used by the loop
C hit the right limbs, and then either enters the 4-way unrolled loop at one
C of lo0..lo3 or, when n is 0, jumps straight into the wind-down code at one
C of cj1..cj3.
C
C Product register pairs, written low:high as produced by mulx:
C   %r9:%r8   %r11:%r10   %rcx:%r12   %rbx:%rax
C Each stored limb is the low half of one product plus the high half of the
C previous product plus the running carry.  The carry flag stays live between
C consecutive adc instructions, including across the loop back-edge.  This
C relies on mov and lea not writing flags, on dec leaving CF unchanged, and on
C mulx() expanding to the BMI2 MULX instruction, which does not write flags
C (NOTE(review): the mulx macro is defined outside this file -- confirm).
70ASM_START()
71	TEXT
72	ALIGN(32)
73PROLOGUE(mpn_mul_1)
C NOTE(review): FUNC_ENTRY is defined elsewhere; presumably it moves the four
C DOS64 argument registers into the STD64 ones when building for DOS64.
74	FUNC_ENTRY(4)
75	push	%rbx			C save the callee-saved registers used below
76	push	%rbp
77	push	%r12
78
79	mov	n_param, n
80	shr	$2, n			C n = n_param / 4 = times the dec/jnz at the loop bottom runs
81
C Dispatch on the low two bits of the count.  R8() is defined elsewhere and
C presumably names the 8-bit part of the register.
82	test	$1, R8(n_param)
83	jnz	L(bx1)			C odd count: r is 1 or 3
84
85L(bx0):	test	$2, R8(n_param)		C r is 0 or 2; test also leaves CF clear
C Loading v0 overwrites n_param (both are %rdx); the test above is the last
C use of n_param, and mov does not disturb the flags consumed by the jnz.
86	mov	v0_param, v0
87	jnz	L(b10)
88
C r = 0: three products here, the fourth is done at lo0.  n is at least 1 on
C this path for any nonzero count, so no test of n is needed.  CF is still
C clear from the test at bx0 when the first adc is reached.
89L(b00):	mulx(	(up), %r9, %r8)
90	mulx(	8,(up), %r11, %r10)
91	mulx(	16,(up), %rcx, %r12)
92	lea	-32(rp), rp		C undone by the lea 32(rp) at top or end before the first store
93	jmp	L(lo0)
94
C r = 2: two products here.
95L(b10):	mulx(	(up), %rcx, %r12)
96	mulx(	8,(up), %rbx, %rax)
97	lea	-16(rp), rp		C first store uses offset 16
98	test	n, n			C also clears CF for the first adc
99	jz	L(cj2)			C count is exactly 2: only the wind-down is needed
100	mulx(	16,(up), %r9, %r8)	C third limb, in the pair the loop expects at lo2
101	lea	16(up), up		C so that 8(up) at lo1 is the fourth limb
102	jmp	L(lo2)
103
104L(bx1):	test	$2, R8(n_param)		C r is 1 or 3
105	mov	v0_param, v0		C same ordering constraint as at bx0
106	jnz	L(b11)
107
C r = 1: one product here.
108L(b01):	mulx(	(up), %rbx, %rax)
109	lea	-24(rp), rp		C first store uses offset 24
110	test	n, n			C also clears CF for the first adc
111	jz	L(cj1)			C count is exactly 1
112	mulx(	8,(up), %r9, %r8)	C second limb
113	lea	8(up), up		C so that 8(up) at lo1 is the third limb
114	jmp	L(lo1)
115
C r = 3: three products here.
116L(b11):	mulx(	(up), %r11, %r10)
117	mulx(	8,(up), %rcx, %r12)
118	mulx(	16,(up), %rbx, %rax)
119	lea	-8(rp), rp		C first store uses offset 8
120	test	n, n			C also clears CF for the first adc
121	jz	L(cj3)			C count is exactly 3
122	lea	24(up), up		C so that (up) at lo3 is the fourth limb
123	jmp	L(lo3)
124
C Main loop: four limbs per pass.  Every mulx reloads a register pair only
C after its low half has been stored and its high half has been consumed by
C an adc.  The instruction order is the output of an optimisation tool (see
C the note near the top of the file) and should not be rearranged casually.
125	ALIGN(32)
126L(top):	lea	32(rp), rp		C step to the next group of four result limbs
127	mov	%r9, (rp)		C r9 was completed by the adc at lo1 in the previous pass
128	adc	%r8, %r11
129L(lo3):	mulx(	(up), %r9, %r8)
130	mov	%r11, 8(rp)
131	adc	%r10, %rcx
132L(lo2):	mov	%rcx, 16(rp)
133	adc	%r12, %rbx
134L(lo1):	mulx(	8,(up), %r11, %r10)
135	adc	%rax, %r9		C carry out of this adc is consumed at top or end
136	mulx(	16,(up), %rcx, %r12)
137	mov	%rbx, 24(rp)
138L(lo0):	mulx(	24,(up), %rbx, %rax)
139	lea	32(up), up
140	dec	n			C dec rather than sub: CF must survive for the next adc
141	jnz	L(top)
142
C Wind-down: same stores and adc chain as the loop, without further
C multiplications.  The label end is reached by fall-through only; cj3, cj2
C and cj1 are also entered directly from the feed-in code when n is 0.
143L(end):	lea	32(rp), rp
144	mov	%r9, (rp)
145	adc	%r8, %r11
146L(cj3):	mov	%r11, 8(rp)
147	adc	%r10, %rcx
148L(cj2):	mov	%rcx, 16(rp)
149	adc	%r12, %rbx
150L(cj1):	mov	%rbx, 24(rp)
151	adc	$0, %rax		C return value: high half of the last product plus carry
152
153	pop	%r12			C restore in reverse order of the pushes
154	pop	%rbp
155	pop	%rbx
156	FUNC_EXIT()			C NOTE(review): defined elsewhere; presumably the counterpart of FUNC_ENTRY
157	ret
158EPILOGUE()
159ASM_END()
160