1dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9      3.30    3.58
35C AMD K10        3.09
36C AMD bull       4.47    4.72
37C AMD pile       4.66
38C AMD steam
39C AMD excavator
40C AMD bobcat     6.30
41C AMD jaguar     6.29
42C Intel P4      17.3    17.8
43C Intel core2    5.13
44C Intel NHM      4.85
45C Intel SBR      3.83
46C Intel IBR      3.75
47C Intel HWL      3.45
48C Intel BWL      2.56
49C Intel SKL      2.53
50C Intel atom    20.3
51C Intel SLM      9
52C VIA nano
53
54C The loop of this code is the result of running a code generation and
55C optimisation tool suite written by David Harvey and Torbjorn Granlund.
56
57C TODO
58C  * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.
59
C Argument registers, named for the STD64 build:
C   rp       limbs that are read, combined with the product and written back
C   up       source limbs that are multiplied
C   n_param  limb count as passed in
C   v0       the single limb every source limb is multiplied by
C The register in each trailing comment is where the same argument arrives in
C the DOS64 build.
60define(`rp',      `%rdi')   C rcx
61define(`up',      `%rsi')   C rdx
62define(`n_param', `%rdx')   C r8
63define(`v0',      `%rcx')   C r9
64
C Working limb count and loop index for the STD64 build.  The count is moved
C out of n_param because mul overwrites %rdx.
65define(`n',       `%r11')
66
C Operation selection.  When OPERATION_addmul_1 is defined, ADDSUB expands to
C add and func to mpn_addmul_1.  When OPERATION_submul_1 is defined, ADDSUB
C expands to sub and func to mpn_submul_1.  There is no fallback in this file:
C with neither symbol defined, ADDSUB and func are left undefined here.
67ifdef(`OPERATION_addmul_1',`
68      define(`ADDSUB',        `add')
69      define(`func',  `mpn_addmul_1')
70')
71ifdef(`OPERATION_submul_1',`
72      define(`ADDSUB',        `sub')
73      define(`func',  `mpn_submul_1')
74')
75
C ABI_SUPPORT and MULFUNC_PROLOGUE are not defined in this file; they come
C from the included m4 support.  Presumably they declare which ABIs this
C source handles and which entry points it can provide -- TODO confirm.
76ABI_SUPPORT(DOS64)
77ABI_SUPPORT(STD64)
78
79MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
80
C DOS64 register remapping, active only where IFDOS emits its argument.
C rp, v0 and n are pointed at %rcx, %r9 and %r8.  up stays %rsi; the function
C prologue copies %rdx into %rsi for this build.
C The bare names r8 and r9 are then redefined as r11 and rdi.  The intent
C appears to be that %r8 and %r9 written as accumulators in the function body
C come out as %r11 and %rdi, leaving the real %r8 and %r9 to n and v0, whose
C definitions carry an extra level of quoting -- verify against the m4 output.
81IFDOS(`	define(`up', ``%rsi'')	') dnl
82IFDOS(`	define(`rp', ``%rcx'')	') dnl
83IFDOS(`	define(`v0', ``%r9'')	') dnl
84IFDOS(`	define(`r9', ``rdi'')	') dnl
85IFDOS(`	define(`n',  ``%r8'')	') dnl
86IFDOS(`	define(`r8', ``r11'')	') dnl
87
C ---------------------------------------------------------------------------
C func: mpn_addmul_1 or mpn_submul_1, depending on which OPERATION_* symbol
C is defined when this file is processed.
C
C Multiplies each of the n limbs at up by the single limb v0 and combines the
C running product into the n limbs at rp, in place, using ADDSUB.
C
C In, STD64:  rp = %rdi, up = %rsi, n_param = %rdx, v0 = %rcx.
C Out:        %rax = high word of the final product plus the carry out of
C             the last limb updates, i.e. the limb that did not fit in rp.
C Preserved:  %rbx, and in the DOS64 build also %rsi and %rdi, by push/pop.
C Modified:   %rax, %rdx, %rsi, %rdi, %r8 to %r11 in the STD64 build, flags.
C
C NOTE review: the first limb is loaded and multiplied before n is examined
C and no path tests for n = 0, so this presumably requires n >= 1.  Confirm
C against the caller contract.
C
C Layout after setup: rp points 16 bytes before the end of the destination,
C up points just past the end of the source, and n is a non-positive index
C that counts up by 4 per loop pass.  The loop is unrolled four ways; L1, L0
C and L3 are mid-loop entry points and b2 falls into top, chosen by n mod 4.
C %r10, %r9, %r8 and %rbx rotate through three roles: word about to be
C combined into memory, word receiving the low product half, and freshly
C zeroed word receiving the high product half.
C ---------------------------------------------------------------------------
88ASM_START()
89	TEXT
90	ALIGN(16)
91PROLOGUE(func)
C DOS64 only: save %rsi and %rdi before they are overwritten, then move the
C source pointer from its arrival register %rdx into %rsi.
92IFDOS(``push	%rsi		'')
93IFDOS(``push	%rdi		'')
94IFDOS(``mov	%rdx, %rsi	'')
95
96	mov	(up), %rax		C read first u limb early
97	push	%rbx			C %rbx is used as scratch, restored before ret
98IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
99IFDOS(`	mov	n, %rbx		')
100	mul	v0			C %rdx:%rax = first source limb times v0
101
102IFSTD(`	mov	%rbx, n		')	C STD64: keep the full count in n
103
C Dispatch on n mod 4.  lea does not modify flags, so the zero flag set by
C the and is still valid at the jz that follows it.
104	and	$3, R32(%rbx)
105	lea	-16(rp,n,8), rp		C rp += 8*n - 16
106	jz	L(b0)			C n mod 4 == 0
107	cmp	$2, R32(%rbx)
108	jb	L(b1)			C n mod 4 == 1
109	jz	L(b2)			C n mod 4 == 2
110
C n mod 4 == 3.  Low half of the first product goes to %rbx, high half to
C %r10; enter the loop at L3.
111L(b3):	mov	$0, R32(%r8)
112	mov	%rax, %rbx
113	mov	$0, R32(%r9)
114	mov	8(up), %rax		C second source limb
115	mov	%rdx, %r10
116	lea	(up,n,8), up		C up now points just past the source
117	not	n			C n = -n - 1
118	jmp	L(L3)
119
C n mod 4 == 0.  Low half of the first product goes to %r8, high half to
C %rbx; enter the loop at L0.
120L(b0):	mov	$0, R32(%r10)
121	mov	%rax, %r8
122	mov	%rdx, %rbx
123	mov	8(up), %rax		C second source limb
124	lea	(up,n,8), up		C up now points just past the source
125	neg	n			C n = -n
126	jmp	L(L0)
127
C n mod 4 == 1.  A count of exactly 1 skips straight to the final store with
C the first product still in %rdx:%rax.  Otherwise the low half goes to %r9,
C the high half to %r8, and the loop is entered at L1.
128L(b1):	cmp	$1, n
129	jz	L(n1)
130	mov	%rax, %r9
131	mov	8(up), %rax		C second source limb
132	mov	%rdx, %r8
133	mov	$0, R32(%rbx)
134	lea	(up,n,8), up		C up now points just past the source
135	neg	n
136	inc	n			C n = 1 - n
137	jmp	L(L1)
138
C n mod 4 == 2.  Low half of the first product goes to %r10, high half to
C %r9.  After n = 2 - n, a non-negative n means the count was exactly 2 and
C only the tail remains; otherwise execution falls through into top.
139L(b2):	mov	$0, R32(%rbx)
140	mov	%rax, %r10
141	mov	%rdx, %r9
142	mov	8(up), %rax		C second source limb
143	mov	$0, R32(%r8)
144	lea	(up,n,8), up		C up now points just past the source
145	neg	n
146	add	$2, n
147	jns	L(end)
148
C Main loop, four limbs per pass.  Each stage multiplies the limb already
C loaded in %rax, combines one finished word into memory, adds the new
C product halves into the next two words with carry, and loads the next
C source limb.  Several of the mov $0 clears sit between an ADDSUB and the adc
C that consumes its carry; mov does not modify flags, so the carry survives.
149	ALIGN(32)
150L(top):	mul	v0
151	ADDSUB	%r10, (rp,n,8)
152	adc	%rax, %r9
153	mov	(up,n,8), %rax
154	adc	%rdx, %r8
155L(L1):	mul	v0
156	mov	$0, R32(%r10)
157	ADDSUB	%r9, 8(rp,n,8)
158	adc	%rax, %r8
159	adc	%rdx, %rbx
160	mov	8(up,n,8), %rax
161L(L0):	mul	v0
162	ADDSUB	%r8, 16(rp,n,8)
163	mov	$0, R32(%r8)
164	adc	%rax, %rbx
165	mov	$0, R32(%r9)
166	mov	16(up,n,8), %rax
167	adc	%rdx, %r10
168L(L3):	mul	v0
169	ADDSUB	%rbx, 24(rp,n,8)
170	mov	$0, R32(%rbx)
171	adc	%rax, %r10
172	adc	%rdx, %r9
173	mov	24(up,n,8), %rax
174	add	$4, n			C advance the index by one unrolled pass
175	js	L(top)			C loop while the index is still negative
176
C Tail: the last two destination limbs.  rp is used without an index here;
C it already points at the second to last destination limb.
177L(end):	mul	v0
178	ADDSUB	%r10, (rp)
179	adc	%r9, %rax		C pending high word plus carry into the last product
180	adc	%r8, %rdx
181L(n1):	ADDSUB	%rax, 8(rp)		C last destination limb
182	adc	$0, %rdx		C final carry into the high word
183	mov	%rdx, %rax		C return value
184
185	pop	%rbx
186IFDOS(``pop	%rdi		'')
187IFDOS(``pop	%rsi		'')
188	ret
189EPILOGUE()
190ASM_END()
191