dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Shared m4 definitions from the parent directory.  The macros used below
dnl  that this file does not define itself (C, PROLOGUE, EPILOGUE, IFDOS,
dnl  IFSTD, L, R32, ...) presumably come from here -- verify in the build tree.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9      3.65
C AMD K10        3.30    3.68
C AMD bull       4.04    4.29
C AMD pile       4.33
C AMD steam
C AMD excavator
C AMD bobcat     5.73
C AMD jaguar     5.87
C Intel P4      12.5
C Intel core2    4.38
C Intel NHM      4.28
C Intel SBR      2.69
C Intel IBR      2.55
C Intel HWL      2.41
C Intel BWL      2.49
C Intel SKL      2.50
C Intel atom    20.3
C Intel SLM      7.8
C VIA nano       4.25

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Move loop code into feed-in blocks, to save insn for zeroing regs.

C Argument names.  Each define gives the STD64 register; the trailing comment
C names the register that carries the same argument under DOS64.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

C Working copy of the limb count.  n_param arrives in rdx, which mul
C overwrites, so the count is moved here before the loop.
define(`n',       `%rbx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 overrides.  rp, v0 and n stay in their incoming registers (rcx, r9,
C r8); up is still rsi because the prologues copy rdx into it.  Since r9 and
C r8 are then occupied, the bare words r9 and r8 are redefined so that the
C scratch uses of those registers in the code below assemble as rdi and rbx.
C The doubled quotes appear intended to keep v0 and n expanding to the
C literal register names rather than being caught by that renaming -- confirm
C against m4 rescanning rules before changing the quoting.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``rbx'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
C mpn_mul_1c -- entry point taking an extra carry-in limb.
C
C Same register arguments as mpn_mul_1 (rp, up, n, v0).  The carry-in is read
C from r8 under STD64 and from 64(%rsp) under DOS64 (offset taken after the
C three pushes below).  It is added to the first product up[0]*v0, the carry
C out of that addition is folded into the high half, and control then
C transfers to L(common) in the mpn_mul_1 body that follows, which also
C performs the register restores and the return.
PROLOGUE(mpn_mul_1c)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %r11		')
	mul	v0

IFSTD(` add	%r8, %rax	')
IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
	adc	$0, %rdx		C propagate carry into the high limb
	jmp	L(common)

EPILOGUE()

98279377Simp	ALIGN(16)
99279377SimpPROLOGUE(mpn_mul_1)
100279377SimpIFDOS(``push	%rsi		'')
101279377SimpIFDOS(``push	%rdi		'')
102279377SimpIFDOS(``mov	%rdx, %rsi	'')
103279377Simp
104279377Simp	mov	(up), %rax		C read first u limb early
105279377Simp	push	%rbx
106279377SimpIFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
107279377SimpIFDOS(`	mov	n, %r11		')
108279377Simp	mul	v0
109279377Simp
110279377SimpL(common):
111279377SimpIFSTD(`	mov	%r11, n		')
112279377Simp
113279377Simp	and	$3, R32(%r11)
114279377Simp	lea	-16(rp,n,8), rp
115279377Simp	jz	L(b0)
116279377Simp	cmp	$2, R32(%r11)
117279377Simp	jb	L(b1)
118279377Simp	jz	L(b2)
119279377Simp
120279377SimpL(b3):	mov	%rax, %r10
121279377Simp	mov	%rdx, %r11
122279377Simp	mov	8(up), %rax
123279377Simp	mul	v0
124279377Simp	lea	(up,n,8), up
125279377Simp	not	n
126279377Simp	jmp	L(L3)
127279377Simp
128279377SimpL(b0):	mov	%rax, %r9
129279377Simp	mov	%rdx, %r10
130279377Simp	mov	8(up), %rax
131279377Simp	lea	(up,n,8), up
132279377Simp	neg	n
133279377Simp	jmp	L(L0)
134279377Simp
135279377SimpL(b1):	mov	%rax, %r8
136279377Simp	cmp	$1, n
137279377Simp	jz	L(n1)
138279377Simp	mov	%rdx, %r9
139279377Simp	lea	(up,n,8), up
140279377Simp	neg	n
141279377Simp	mov	%r8, 16(rp,n,8)
142279377Simp	inc	n
143279377Simp	jmp	L(L1)
144279377Simp
145279377SimpL(b2):	mov	%rax, %r11
146279377Simp	mov	%rdx, %r8
147279377Simp	mov	8(up), %rax
148279377Simp	lea	(up,n,8), up
149279377Simp	neg	n
150279377Simp	add	$2, n
151279377Simp	jns	L(end)
152279377Simp
153279377Simp	ALIGN(16)
154279377SimpL(top):	mul	v0
155279377Simp	mov	%rdx, %r9
156279377Simp	add	%rax, %r8
157279377Simp	adc	$0, %r9
158279377Simp	mov	%r8, 8(rp,n,8)
159279377Simp	mov	%r11, (rp,n,8)
160279377SimpL(L1):	mov	(up,n,8), %rax
161279377Simp	mul	v0
162279377Simp	add	%rax, %r9
163279377Simp	mov	%rdx, %r10
164279377Simp	mov	8(up,n,8), %rax
165279377Simp	adc	$0, %r10
166279377SimpL(L0):	mul	v0
167279377Simp	add	%rax, %r10
168279377Simp	mov	%rdx, %r11
169279377Simp	mov	16(up,n,8), %rax
170279377Simp	adc	$0, %r11
171279377Simp	mul	v0
172279377Simp	mov	%r9, 16(rp,n,8)
173279377SimpL(L3):	add	%rax, %r11
174279377Simp	mov	%r10, 24(rp,n,8)
175279377Simp	mov	%rdx, %r8
176279377Simp	adc	$0, %r8
177279377Simp	add	$4, n
178279377Simp	mov	-8(up,n,8), %rax
179279377Simp	js	L(top)
180279377Simp
181279377SimpL(end):	mul	v0
182279377Simp	add	%rax, %r8
183279377Simp	adc	$0, %rdx
184279377Simp	mov	%r11, (rp)
185279377SimpL(n1):	mov	%r8, 8(rp)
186279377Simp	mov	%rdx, %rax
187
188	pop	%rbx
189IFDOS(``pop	%rdi		'')
190IFDOS(``pop	%rsi		'')
191	ret
192EPILOGUE()
193ASM_END()
194