1dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C	     cycles/limb
35C AMD K8,K9	 4.52		old measurement
36C AMD K10	 4.51		old measurement
37C AMD bd1	 4.66		old measurement
38C AMD bd2	 4.57		old measurement
39C AMD bd3	 ?
40C AMD bd4	 ?
41C AMD zen	 ?
42C AMD bt1	 5.04
43C AMD bt2	 5.07
44C Intel P4	16.8	18.6	old measurement
45C Intel PNR	 5.59		old measurement
46C Intel NHM	 5.39		old measurement
47C Intel SBR	 3.93		old measurement
48C Intel IBR	 3.59		old measurement
49C Intel HWL	 3.61		old measurement
50C Intel BWL	 2.76		old measurement
51C Intel SKL	 2.77		old measurement
52C Intel atom	23		old measurement
53C Intel SLM	 8		old measurement
54C Intel GLM	 ?
55C VIA nano	 5.63		old measurement
56
57C The ALIGNments here might look completely ad-hoc.  They are not.
58
C Declare the two ABIs this file is written for.  ABI_SUPPORT is defined
C outside this file; presumably it records that both variants may be built.
59ABI_SUPPORT(DOS64)
60ABI_SUPPORT(STD64)
61
C Build-time operation selection.
C   OPERATION_addmul_1 defined:  ADDSUB is add, func is mpn_addmul_1
C   OPERATION_submul_1 defined:  ADDSUB is sub, func is mpn_submul_1
C ADDSUB is the instruction applied to the rp limbs in the code below, and
C func is the name handed to PROLOGUE.
62ifdef(`OPERATION_addmul_1',`
63      define(`ADDSUB',        `add')
64      define(`func',  `mpn_addmul_1')
65')
66ifdef(`OPERATION_submul_1',`
67      define(`ADDSUB',        `sub')
68      define(`func',  `mpn_submul_1')
69')
70
C Names the two entry points this source can produce.  MULFUNC_PROLOGUE is
C defined outside this file; presumably the build system reads this line.
71MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
72
C Register assignments.  The STD64 names are defined first; the IFDOS lines
C further down redefine every name (IFDOS is defined outside this file and
C is presumably active only for the DOS64 ABI).
C
C STD64 note:
C  - n_param is rdx, which mul overwrites with the high product limb, so the
C    count is copied to n before the first mul.
C DOS64 notes:
C  - up is rsi rather than the incoming argument register; the function
C    prologue copies rdx into rsi.
C  - w1 is rdi because r9 holds v0.
C  - n_param and w0 share r8; n_param is last read before w0 is first written.
73C Standard parameters
74define(`rp',              `%rdi')
75define(`up',              `%rsi')
76define(`n_param',         `%rdx')
77define(`v0',              `%rcx')
78C Standard allocations
79define(`n',               `%rbx')
80define(`w0',              `%r8')
81define(`w1',              `%r9')
82define(`w2',              `%r10')
83define(`w3',              `%r11')
84
85C DOS64 parameters
86IFDOS(` define(`rp',      `%rcx')    ') dnl
87IFDOS(` define(`up',      `%rsi')    ') dnl
88IFDOS(` define(`n_param', `%r8')     ') dnl
89IFDOS(` define(`v0',      `%r9')     ') dnl
90C DOS64 allocations
91IFDOS(` define(`n',       `%rbx')    ') dnl
92IFDOS(` define(`w0',      `%r8')     ') dnl
93IFDOS(` define(`w1',      `%rdi')    ') dnl
94IFDOS(` define(`w2',      `%r10')    ') dnl
95IFDOS(` define(`w3',      `%r11')    ') dnl
96
C func -- mpn_addmul_1 or mpn_submul_1, as selected by the OPERATION_* define.
C
C Multiplies the n_param 64-bit limbs at up by the single limb v0 and applies
C ADDSUB of that product to the n_param limbs at rp, in place.  The value
C left in rax on return is the high limb of the last product plus the final
C carry (add) or borrow (sub).
C
C n_param must be at least 1: up[0] is loaded before n_param is examined and
C no path handles n_param = 0.
C
C After setup, rp and up point just past their last limb and n is a negative
C limb index.  The loop is unrolled 4 ways; n_param mod 4 picks the entry:
C   0 -> L0,   1 -> top (or n1 when n_param = 1),   2 -> L2,   3 -> L3
C Products alternate between the register pairs w1:w0 and w3:w2.
97ASM_START()
98	TEXT
99	ALIGN(64)
100PROLOGUE(func)
C DOS64 only: save rsi and rdi (used below as up and w1) and copy the second
C argument from rdx into the register used for up.
101IFDOS(`	push	%rsi		')
102IFDOS(`	push	%rdi		')
103IFDOS(`	mov	%rdx, %rsi	')
104
C rbx is used as n, so the caller value is saved.  The first limb of up is
C loaded into rax right away for the first mul.
105	push	%rbx
106	mov	(up), %rax
107
C Point rp and up just past their last limb and keep the limb count in n.
108	lea	(rp,n_param,8), rp
109	lea	(up,n_param,8), up
110	mov	n_param, n
111
C Dispatch on bit 0 of the limb count.
112	test	$1, R8(n_param)
113	jne	L(bx1)
114
C Even count: the first product goes to w1:w0 and n becomes -n_param.
C Bit 1 of the negated count is set exactly when n_param = 2 mod 4, in
C which case the loop is entered at L2.
115L(bx0):	mul	v0
116	neg	n
117	mov	%rax, w0
118	mov	%rdx, w1
119	test	$2, R8(n)
120	jne	L(L2)
121
C n_param = 0 mod 4: n = 2 - n_param, enter at L0.
122L(b00):	add	$2, n
123	jmp	L(L0)
124
C Odd count.  n is still the positive count here, so bit 1 set means
C n_param = 3 mod 4.
125	ALIGN(16)
126L(bx1):	mul	v0
127	test	$2, R8(n)
128	je	L(b01)
129
C n_param = 3 mod 4: the first product goes to w3:w2, n = 1 - n_param,
C enter at L3.
130L(b11):	mov	%rax, w2
131	mov	%rdx, w3
132	neg	n
133	inc	n
134	jmp	L(L3)
135
C n_param = 1 mod 4.  The subtraction borrows only when n_param = 1, which is
C finished at n1.  Otherwise the first product goes to w3:w2, n becomes
C 3 - n_param, and control falls through into top.
136	ALIGN(16)
137L(b01):	sub	$3, n
138	jc	L(n1)
139	mov	%rax, w2
140	mov	%rdx, w3
141	neg	n
142
C Main loop, four limbs per iteration.  Each step loads the next up limb,
C multiplies it by v0, applies ADDSUB of the previous low limb to its rp
C limb, then uses adc to fold the resulting carry or borrow together with
C the previous high limb into the new product.  n advances by 4 per
C iteration and the loop repeats while n is negative.
143	ALIGN(16)
144L(top):	mov	-16(up,n,8), %rax
145	mul	v0
146	mov	%rax, w0
147	mov	%rdx, w1
148	ADDSUB	w2, -24(rp,n,8)
149	adc	w3, w0
150	adc	$0, w1
151L(L0):	mov	-8(up,n,8), %rax
152	mul	v0
153	mov	%rax, w2
154	mov	%rdx, w3
155	ADDSUB	w0, -16(rp,n,8)
156	adc	w1, w2
157	adc	$0, w3
158L(L3):	mov	(up,n,8), %rax
159	mul	v0
160	mov	%rax, w0
161	mov	%rdx, w1
162	ADDSUB	w2, -8(rp,n,8)
163	adc	w3, w0
164	adc	$0, w1
165L(L2):	mov	8(up,n,8), %rax
166	mul	v0
167	mov	%rax, w2
168	mov	%rdx, w3
169	ADDSUB	w0, (rp,n,8)
170	adc	w1, w2
171	adc	$0, w3
172	add	$4, n
173	js	L(top)
174
C Wind-down: apply the last low limb (w2) to the top rp limb and return the
C last high limb (w3) plus the final carry or borrow.  rax is cleared with
C xor before ADDSUB, since xor would otherwise destroy the carry flag that
C the adc needs.
175L(end):	xor	R32(%rax), R32(%rax)
176	ADDSUB	w2, -8(rp)
177	adc	w3, %rax
178	pop	%rbx
179IFDOS(`	pop	%rdi		')
180IFDOS(`	pop	%rsi		')
181	ret
182
C Single-limb case: rax and rdx still hold the product of up[0] and v0.
C rax is cleared with mov rather than xor because mov leaves the carry flag
C from ADDSUB intact for the adc that forms the return value.
183	ALIGN(32)
184L(n1):	ADDSUB	%rax, -8(rp)
185	mov	$0, R32(%rax)
186	adc	%rdx, %rax
187	pop	%rbx
188IFDOS(`	pop	%rdi		')
189IFDOS(`	pop	%rsi		')
190	ret
191EPILOGUE()
192