1dnl  X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
6dnl  Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C	     cycles/limb
37C AMD K8,K9
38C AMD K10
39C AMD bull
40C AMD pile
41C AMD steam
42C AMD excavator
43C AMD bobcat
44C AMD jaguar
45C Intel P4
46C Intel core2
47C Intel NHM
48C Intel SBR      2.49
49C Intel IBR      2.32
50C Intel HWL      2.44
51C Intel BWL      2.43
52C Intel SKL      2.47
53C Intel atom
54C Intel SLM
55C VIA nano
56
57C The loop of this code is the result of running a code generation and
58C optimisation tool suite written by David Harvey and Torbjorn Granlund.
59
C Symbolic names for the arguments.  Each define below gives the STD64
C register; the trailing comment on the same line gives the DOS64 location
C that the IFDOS overrides further down install.
60define(`rp',      `%rdi')   C rcx
61define(`up_param',`%rsi')   C rdx
62define(`n_param', `%rdx')   C r8
63define(`v0',      `%rcx')   C r9
64define(`cin',     `%r8')    C stack
65
C Working registers used after the prologue: up is the source pointer and
C n is the limb counter, which the prologues negate.
66define(`up',      `%rsi')   C same as up_param (STD64)
67define(`n',	  `%r9')
68
69ABI_SUPPORT(DOS64)
70ABI_SUPPORT(STD64)
71
C DOS64 overrides (IFDOS is defined outside this file; presumably it only
C expands its argument for the DOS64 build).  cin is read from the stack.
72IFDOS(`	define(`rp',      `%rcx')')
73IFDOS(`	define(`up_param',`%rdx')')
74IFDOS(`	define(`n_param', `%r8')')
75IFDOS(`	define(`v0',      `%r9')')
76IFDOS(`	define(`cin',     `48(%rsp)')')
77
C With these overrides n names the same register as n_param (r8), which is
C why the prologues wrap the copy of n_param into n in IFSTD.  up stays in
C rsi, so the DOS64 prologues push it and the exits pop it.
78IFDOS(`	define(`up',      `%rsi')')
79IFDOS(`	define(`n',       `%r8')')
80
81ASM_START()
82	TEXT
C mpn_mul_1 -- multiply the n_param limbs (8 bytes each) at up_param by the
C single limb v0, store the n_param low result limbs at rp, and return the
C most significant limb in rax.
C
C The first source limb is loaded before n_param is examined, so a count of
C zero is not handled.
C
C Register use after the prologue:
C   up        one past the last source limb
C   rp        address of the last destination limb
C   n         negative limb index, counting up towards zero
C   r10, r11  alternate roles: one holds the limb about to be stored, the
C             other the high half of the latest product
C   rax, rdx  written by every mul
C
C The labels L(L1), L(L0c), L(L3c) and L(L2c) inside the loop are also jump
C targets for mpn_mul_1c further down in this file.
83	ALIGN(16)
84PROLOGUE(mpn_mul_1)
85IFDOS(`	push	%rsi		')
86	mov	(up_param), %rax	C rax = up[0]
87IFSTD(`	mov	n_param, n	')
88	lea	(up_param,n_param,8), up	C up = one past the last source limb
89	lea	-8(rp,n_param,8), rp	C rp = address of the last destination limb
90	neg	n	C n = -count, used as a negative index
91	mul	v0	C rdx:rax = up[0] * v0
92
C Dispatch on the two low bits of the negated count to pick the entry point
C into the 4-way unrolled loop.  R8 is defined outside this file; presumably
C it yields the 8-bit name of the register.
93	test	$1, R8(n)
94	jz	L(x0)	C even count
95L(x1):	mov	%rax, %r11	C odd count: r11 = low half, r10 = high half
96	mov	%rdx, %r10
97	test	$2, R8(n)
98	jnz	L(01)
99
100L(11):	mov	8(up,n,8), %rax	C count = 3 mod 4: fetch up[1]
101	dec	n	C make n a multiple of 4
102	jmp	L(L3)
103
104L(01):	inc	n	C count = 1 mod 4; CF is still clear from the test
105	jnz	L(L1)
C Count was exactly 1: store the single low limb and return the high limb.
106	mov	%rax, (rp)
107	mov	%rdx, %rax
108IFDOS(`	pop	%rsi		')
109	ret
110
111L(x0):	mov	%rax, %r10	C even count: r10 = low half, r11 = high half
112	mov	%rdx, %r11
113	mov	8(up,n,8), %rax	C fetch up[1]
114	test	$2, R8(n)
115	jz	L(L0)	C count = 0 mod 4
116
117L(10):	add	$-2, n	C count = 2 mod 4: make n a multiple of 4
118	jmp	L(L2)
119
C Main loop: four mul and four stores per iteration.
120	ALIGN(8)
121L(top):	mov	%rdx, %r10
122	add	%rax, %r11
123L(L1):	mov	0(up,n,8), %rax
124	adc	$0, %r10
125	mul	v0
126	add	%rax, %r10
127	mov	%r11, 0(rp,n,8)
128	mov	8(up,n,8), %rax
129	mov	%rdx, %r11
130L(L0c):	adc	$0, %r11
131L(L0):	mul	v0
132	mov	%r10, 8(rp,n,8)
133	add	%rax, %r11
134	mov	%rdx, %r10
135L(L3c):	mov	16(up,n,8), %rax
136	adc	$0, %r10
137L(L3):	mul	v0
138	mov	%r11, 16(rp,n,8)
139	mov	%rdx, %r11
140	add	%rax, %r10
141L(L2c):	mov	24(up,n,8), %rax
142	adc	$0, %r11
143L(L2):	mul	v0
144	mov	%r10, 24(rp,n,8)
145	add	$4, n	C n is a negative multiple of 4 here
146	jnc	L(top)	C carry out only when n wraps to zero
147
C Wind down: fold the last low half into the pending limb, store it as the
C final result limb, and return the last high half plus carry.
148L(end):	add	%rax, %r11
149	mov	%rdx, %rax
150	adc	$0, %rax
151	mov	%r11, (rp)
152
153IFDOS(`	pop	%rsi		')
154	ret
155EPILOGUE()
156
C mpn_mul_1c -- same operation as mpn_mul_1 with an extra carry-in limb cin
C added to the least significant product limb.  The most significant limb
C is returned in rax.
C
C Only the prologue and the count-mod-4 dispatch live here; every path with
C more than one limb jumps into the loop of mpn_mul_1 above, through L(L1),
C L(L0c), L(L3c) or L(L2c).  Each of those targets reaches an adc before any
C other flag-writing instruction, so the carry from adding cin must still be
C in CF on arrival; the instructions between the add and the jump (inc, dec,
C mov) leave CF untouched.
C
C On DOS64 cin is 48(%rsp) after the push of rsi below (8 bytes saved rsi
C plus 8 bytes return address plus the 32 byte shadow area).
C
C As in mpn_mul_1, the first source limb is loaded before n_param is
C examined, so a count of zero is not handled.
157	ALIGN(16)
158PROLOGUE(mpn_mul_1c)
159IFDOS(`	push	%rsi		')
160	mov	(up_param), %rax	C rax = up[0]
161IFSTD(`	mov	n_param, n	')
162	lea	(up_param,n_param,8), up	C up = one past the last source limb
163	lea	-8(rp,n_param,8), rp	C rp = address of the last destination limb
164	neg	n	C n = -count, used as a negative index
165	mul	v0	C rdx:rax = up[0] * v0
166
167	test	$1, R8(n)
168	jz	L(x0c)	C even count
169L(x1c):	mov	%rax, %r11	C odd count: r11 = low half, r10 = high half
170	mov	%rdx, %r10
171	test	$2, R8(n)
172	jnz	L(01c)
173
174L(11c):	add	cin, %r11	C count = 3 mod 4: add carry-in to the low limb
175	dec	n	C make n a multiple of 4; CF is preserved
176	jmp	L(L3c)
177
178L(01c):	add	cin, %r11	C count = 1 mod 4: add carry-in to the low limb
179	inc	n	C CF from the add is preserved
180	jnz	L(L1)
C Count was exactly 1: store the low limb and return high half plus carry.
181	mov	%r11, (rp)
182	mov	%rdx, %rax
183	adc	$0, %rax
184IFDOS(`	pop	%rsi		')
185	ret
186
187L(x0c):	mov	%rax, %r10	C even count: r10 = low half, r11 = high half
188	mov	%rdx, %r11
189	test	$2, R8(n)
190	jz	L(00c)
191
C The index adjustment comes before the add so that CF at L(L2c) is the
C carry from adding cin.
192L(10c):	add	$-2, n	C count = 2 mod 4: make n a multiple of 4
193	add	cin, %r10
194	jmp	L(L2c)
195
196L(00c):	add	cin, %r10	C count = 0 mod 4: add carry-in to the low limb
197	mov	8(up,n,8), %rax	C fetch up[1]; mov leaves CF intact
198	jmp	L(L0c)
199EPILOGUE()
200