1dnl  AMD64 mpn_mul_1.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9      2.54
35C AMD K10        2.54
36C AMD bull       4.98
37C AMD pile       4.80
38C AMD steam
39C AMD excavator
40C AMD bobcat     5.37
41C AMD jaguar     6.16
42C Intel P4      12.6
43C Intel core2    4.05
44C Intel NHM      4.0
45C Intel SBR      2.91
46C Intel IBR      2.73
47C Intel HWL      2.44
48C Intel BWL      2.39
49C Intel SKL      2.44
50C Intel atom    19.8
51C Intel SLM      9.0
52C VIA nano       4.25
53
54C The loop of this code is the result of running a code generation and
55C optimization tool suite written by David Harvey and Torbjorn Granlund.
56
57C TODO
58C  * The loop is great, but the prologue and epilogue code was quickly written.
59C    Tune it!
60
C Symbolic register names for the operands.  These first definitions are the
C defaults; the IFDOS lines further down override some of them.  The register
C named in each trailing comment is presumably where the same argument arrives
C under the DOS64 ABI -- TODO confirm.
61define(`rp',      `%rdi')   C rcx
62define(`up',      `%rsi')   C rdx
63define(`n_param', `%rdx')   C r8
64define(`vl',      `%rcx')   C r9
65
C Working copy of the limb count.  mul overwrites rdx, so the count cannot stay
C in n_param.
66define(`n',       `%r11')
67
68ABI_SUPPORT(DOS64)
69ABI_SUPPORT(STD64)
70
C IFDOS overrides.  rp, vl and n are bound to rcx, r9 and r8, and up stays in
C rsi (the IFDOS prologues copy rdx into rsi).  In addition the bare tokens r9
C and r8 are redefined as rdi and r11, so the scratch registers written as
C %r9 and %r8 in the code body presumably assemble as %rdi and %r11 and do not
C collide with vl and n.  NOTE(review): this relies on m4 quoting depth and on
C how IFDOS is defined in config.m4 -- confirm before touching these lines.
71IFDOS(`	define(`up', ``%rsi'')	') dnl
72IFDOS(`	define(`rp', ``%rcx'')	') dnl
73IFDOS(`	define(`vl', ``%r9'')	') dnl
74IFDOS(`	define(`r9', ``rdi'')	') dnl
75IFDOS(`	define(`n',  ``%r8'')	') dnl
76IFDOS(`	define(`r8', ``r11'')	') dnl
77
78ASM_START()
79	TEXT
80	ALIGN(16)
C mpn_mul_1c -- entry point that takes an extra carry-in limb.
C
C Uses the same rp, up, n and vl operands as mpn_mul_1 plus a carry-in, which
C is read from r8 under IFSTD and from the stack under IFDOS.  The carry-in is
C placed in r10 and control joins mpn_mul_1 at the common label, where it is
C added to the first product.  There is no return path here: the pops and the
C ret are the ones at the end of mpn_mul_1, so the pushes below must match the
C pushes made there.
81PROLOGUE(mpn_mul_1c)
C IFDOS only: save rsi and rdi, then copy rdx into rsi, the register that up
C is bound to.
82IFDOS(``push	%rsi		'')
83IFDOS(``push	%rdi		'')
84IFDOS(``mov	%rdx, %rsi	'')
85	push	%rbx			C rbx is used as scratch below
86IFSTD(`	mov	%r8, %r10')		C carry-in to r10
87IFDOS(`	mov	64(%rsp), %r10')	C 40 + 3*8  (3 push insns)
88	jmp	L(common)		C continue in mpn_mul_1 with r10 = carry-in
89EPILOGUE()
90
C mpn_mul_1 -- multiply the n-limb operand at up by the single limb vl.
C
C Each 64-bit limb read from up is multiplied by vl.  The low half of each
C product, plus the high half carried over from the previous one, is stored
C as one limb at rp, n limbs in total, and the final high limb is returned in
C rax.  mpn_mul_1c joins at the common label with a carry-in in r10 that is
C added to the first product; this entry uses a carry-in of zero.
C
C The first limb is read and the feed-in code entered without any test for
C n = 0, so n is presumably required to be at least 1 -- confirm against the
C mpn documentation.
C
C rbx is saved and restored around the body; the IFDOS build also saves and
C restores rsi and rdi.
91PROLOGUE(mpn_mul_1)
C IFDOS only: save rsi and rdi, then copy rdx into rsi, the register that up
C is bound to.
92IFDOS(``push	%rsi		'')
93IFDOS(``push	%rdi		'')
94IFDOS(``mov	%rdx, %rsi	'')
95
96	push	%rbx
97	xor	%r10, %r10		C no carry-in for this entry point
98L(common):
99	mov	(up), %rax		C read first u limb early
100IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
101IFDOS(`	mov	n, %rbx         ')
102	mul	vl			C rdx:rax = first limb times vl
103IFSTD(`	mov	%rbx, n         ')
104
105	add	%r10, %rax		C add the carry-in to the low half
106	adc	$0, %rdx		C and propagate into the high half
107
C Dispatch on n mod 4 to the feed-in code for the matching loop entry.
108	and	$3, R32(%rbx)
109	jz	L(b0)
110	cmp	$2, R32(%rbx)
111	jz	L(b2)
112	jg	L(b3)			C remainder 3; remainder 1 falls through
113
C n mod 4 = 1.  When n is exactly 1 the single product is already complete.
114L(b1):	dec	n
115	jne	L(gt1)
116	mov	%rax, (rp)		C only limb of the result
117	jmp	L(ret)			C high half in rdx becomes the return value
C More limbs follow.  In this and the other feed-in blocks up and rp are
C rebased toward the ends of the operands and n is negated, so that indexing
C with n walks forward through the limbs as n rises toward zero.  The first
C product is then seeded into the accumulator pair the chosen loop entry
C expects.
118L(gt1):	lea	8(up,n,8), up
119	lea	-8(rp,n,8), rp
120	neg	n
121	xor	%r10, %r10
122	xor	R32(%rbx), R32(%rbx)
123	mov	%rax, %r9		C low half of first product
124	mov	(up,n,8), %rax		C second u limb, multiplied at L1
125	mov	%rdx, %r8		C high half of first product
126	jmp	L(L1)
127
C n mod 4 = 0.
128L(b0):	lea	(up,n,8), up
129	lea	-16(rp,n,8), rp
130	neg	n
131	xor	%r10, %r10
132	mov	%rax, %r8		C low half of first product
133	mov	%rdx, %rbx		C high half of first product
134	jmp	 L(L0)
135
C n mod 4 = 3.  No registers are cleared here; the L3 segment zeroes r8, rbx
C and r9 itself.
136L(b3):	lea	-8(up,n,8), up
137	lea	-24(rp,n,8), rp
138	neg	n
139	mov	%rax, %rbx		C low half of first product
140	mov	%rdx, %r10		C high half of first product
141	jmp	L(L3)
142
C n mod 4 = 2.
143L(b2):	lea	-16(up,n,8), up
144	lea	-32(rp,n,8), rp
145	neg	n
146	xor	%r8, %r8
147	xor	R32(%rbx), R32(%rbx)
148	mov	%rax, %r10		C low half of first product
149	mov	24(up,n,8), %rax	C second u limb, multiplied at L2
150	mov	%rdx, %r9		C high half of first product
151	jmp	L(L2)
152
C Main loop, four limbs per pass, with one entry label per value of n mod 4.
C r10, r9, r8 and rbx take turns as accumulator: each receives the high half
C of one product and the low half of the next, and is then stored.  n is
C negative and rises by 4 per pass; the loop ends once it is no longer
C negative.
153	ALIGN(16)
154L(top):	mov	%r10, (rp,n,8)
155	add	%rax, %r9
156	mov	(up,n,8), %rax
157	adc	%rdx, %r8
158	mov	$0, R32(%r10)		C r10 was stored above; clear it for reuse
159L(L1):	mul	vl
160	mov	%r9, 8(rp,n,8)
161	add	%rax, %r8
162	adc	%rdx, %rbx
163L(L0):	mov	8(up,n,8), %rax
164	mul	vl
165	mov	%r8, 16(rp,n,8)
166	add	%rax, %rbx
167	adc	%rdx, %r10
168L(L3):	mov	16(up,n,8), %rax
169	mul	vl
170	mov	%rbx, 24(rp,n,8)
171	mov	$0, R32(%r8)		C zero
172	mov	%r8, %rbx		C zero
173	add	%rax, %r10
174	mov	24(up,n,8), %rax
175	mov	%r8, %r9		C zero
176	adc	%rdx, %r9		C mov leaves flags alone, so this sees the carry of the add above
177L(L2):	mul	vl
178	add	$4, n
179	js	 L(top)
180
C Wind down.  The product of the last limb is in rdx:rax, and r8 is zero on
C both paths that reach L2 (cleared in b2 or in the L3 segment).
181	mov	%r10, (rp,n,8)
182	add	%rax, %r9
183	adc	%r8, %rdx		C r8 is zero: only the carry is added
184	mov	%r9, 8(rp,n,8)
185	add	%r8, %rdx		C r8 is zero: rdx is unchanged
186L(ret):	mov	%rdx, %rax		C return the final high limb
187
C Restore in reverse order of the pushes made at either entry point.
188	pop	%rbx
189IFDOS(``pop	%rdi		'')
190IFDOS(``pop	%rsi		'')
191	ret
192EPILOGUE()
193