1290001Sglebiusdnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.
2290001Sglebius
3290001Sglebiusdnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
4290001Sglebiusdnl  Foundation, Inc.
5290001Sglebius
6290001Sglebiusdnl  This file is part of the GNU MP Library.
7290001Sglebiusdnl
8290001Sglebiusdnl  The GNU MP Library is free software; you can redistribute it and/or modify
9290001Sglebiusdnl  it under the terms of either:
10290001Sglebiusdnl
11290001Sglebiusdnl    * the GNU Lesser General Public License as published by the Free
12290001Sglebiusdnl      Software Foundation; either version 3 of the License, or (at your
13290001Sglebiusdnl      option) any later version.
14290001Sglebiusdnl
15290001Sglebiusdnl  or
16290001Sglebiusdnl
17290001Sglebiusdnl    * the GNU General Public License as published by the Free Software
18290001Sglebiusdnl      Foundation; either version 2 of the License, or (at your option) any
19290001Sglebiusdnl      later version.
20290001Sglebiusdnl
21290001Sglebiusdnl  or both in parallel, as here.
22290001Sglebiusdnl
23290001Sglebiusdnl  The GNU MP Library is distributed in the hope that it will be useful, but
24290001Sglebiusdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25290001Sglebiusdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26290001Sglebiusdnl  for more details.
27290001Sglebiusdnl
28290001Sglebiusdnl  You should have received copies of the GNU General Public License and the
29290001Sglebiusdnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30290001Sglebiusdnl  see https://www.gnu.org/licenses/.
31290001Sglebius
32290001Sglebiusinclude(`../config.m4')
33290001Sglebius
34290001SglebiusC	     cycles/limb
35290001SglebiusC AMD K8,K9	 4.52		old measurement
36290001SglebiusC AMD K10	 4.51		old measurement
37290001SglebiusC AMD bd1	 4.66		old measurement
38290001SglebiusC AMD bd2	 4.57		old measurement
39290001SglebiusC AMD bd3	 ?
40290001SglebiusC AMD bd4	 ?
41290001SglebiusC AMD zen	 ?
42290001SglebiusC AMD bt1	 5.04
43290001SglebiusC AMD bt2	 5.07
44290001SglebiusC Intel P4	16.8	18.6	old measurement
45290001SglebiusC Intel PNR	 5.59		old measurement
46290001SglebiusC Intel NHM	 5.39		old measurement
47290001SglebiusC Intel SBR	 3.93		old measurement
48290001SglebiusC Intel IBR	 3.59		old measurement
49290001SglebiusC Intel HWL	 3.61		old measurement
50290001SglebiusC Intel BWL	 2.76		old measurement
51290001SglebiusC Intel SKL	 2.77		old measurement
52290001SglebiusC Intel atom	23		old measurement
53290001SglebiusC Intel SLM	 8		old measurement
54290001SglebiusC Intel GLM	 ?
55290001SglebiusC VIA nano	 5.63		old measurement
56290001Sglebius
57290001SglebiusC The ALIGNments here might look completely ad-hoc.  They are not.
58290001Sglebius
C ABIs this file handles; the DOS64-specific pieces are the IFDOS lines
C further down.  (ABI_SUPPORT is defined outside this file.)
59290001SglebiusABI_SUPPORT(DOS64)
60290001SglebiusABI_SUPPORT(STD64)
61290001Sglebius
C One source, two functions.  Whichever OPERATION_* symbol is defined when
C this file is processed sets ADDSUB, the instruction applied to the rp
C limbs (add or sub), and func, the name handed to PROLOGUE.
62290001Sglebiusifdef(`OPERATION_addmul_1',`
63290001Sglebius      define(`ADDSUB',        `add')
64290001Sglebius      define(`func',  `mpn_addmul_1')
65290001Sglebius')
66290001Sglebiusifdef(`OPERATION_submul_1',`
67290001Sglebius      define(`ADDSUB',        `sub')
68290001Sglebius      define(`func',  `mpn_submul_1')
69290001Sglebius')
70290001Sglebius
C Names both functions this file can provide.  MULFUNC_PROLOGUE is defined
C outside this file -- presumably read by the build to learn what is
C available here; TODO confirm.
71290001SglebiusMULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
72290001Sglebius
C Register roles.  The names are m4 macros, so one code body serves both
C ABIs.
73290001SglebiusC Standard parameters
C rp: limbs updated in place by ADDSUB.  up: limbs multiplied by v0.
C n_param: limb count (limbs are 8 bytes).  v0: the single-limb multiplier.
74290001Sglebiusdefine(`rp',              `%rdi')
75290001Sglebiusdefine(`up',              `%rsi')
76290001Sglebiusdefine(`n_param',         `%rdx')
77290001Sglebiusdefine(`v0',              `%rcx')
78290001SglebiusC Standard allocations
C n: loop index, kept in %rbx which the function pushes and pops.
C w0/w1 and w2/w3: low/high halves of two in-flight products.
79290001Sglebiusdefine(`n',               `%rbx')
80290001Sglebiusdefine(`w0',              `%r8')
81290001Sglebiusdefine(`w1',              `%r9')
82290001Sglebiusdefine(`w2',              `%r10')
83290001Sglebiusdefine(`w3',              `%r11')
84290001Sglebius
85290001SglebiusC DOS64 parameters
C Redefinitions for DOS64.  IFDOS is defined outside this file; presumably
C it expands its argument only for DOS64 builds.  up stays %rsi because the
C function entry copies the incoming %rdx there before the first mul.
86290001SglebiusIFDOS(` define(`rp',      `%rcx')    ') dnl
87290001SglebiusIFDOS(` define(`up',      `%rsi')    ') dnl
88290001SglebiusIFDOS(` define(`n_param', `%r8')     ') dnl
89290001SglebiusIFDOS(` define(`v0',      `%r9')     ') dnl
90290001SglebiusC DOS64 allocations
C w0 shares %r8 with n_param; n_param is copied into n before w0 is first
C written.  w1 is %rdi here, as %r9 (its standard register) holds v0.
91290001SglebiusIFDOS(` define(`n',       `%rbx')    ') dnl
92290001SglebiusIFDOS(` define(`w0',      `%r8')     ') dnl
93290001SglebiusIFDOS(` define(`w1',      `%rdi')    ') dnl
94290001SglebiusIFDOS(` define(`w2',      `%r10')    ') dnl
95290001SglebiusIFDOS(` define(`w3',      `%r11')    ') dnl
96290001Sglebius
C func(rp, up, n_param, v0) -- func is mpn_addmul_1 or mpn_submul_1.
C
C Multiplies each of the n_param 8-byte limbs at up by v0 and applies
C ADDSUB (add or sub) of the running product to the limbs at rp, least
C significant limb first.  The final high product limb plus the last
C carry/borrow is returned in %rax.
C
C n_param must be at least 1: the first limb is loaded and multiplied
C before the count is examined, and no path handles a zero count.
C
C Modifies rp, up, %rax, %rdx, w0-w3 and the flags.  %rbx is saved and
C restored, as are %rsi and %rdi in the DOS64 variant.
97290001SglebiusASM_START()
98290001Sglebius	TEXT
99290001Sglebius	ALIGN(64)
100290001SglebiusPROLOGUE(func)
C DOS64 only: %rsi and %rdi are used below (as up and w1), so they are
C saved, and up is copied out of %rdx, which every mul overwrites.
101290001SglebiusIFDOS(`	push	%rsi		')
102290001SglebiusIFDOS(`	push	%rdi		')
103290001SglebiusIFDOS(`	mov	%rdx, %rsi	')
104290001Sglebius
C %rbx becomes the loop index n.  The first source limb is fetched now,
C while up still points at it.
105290001Sglebius	push	%rbx
106290001Sglebius	mov	(up), %rax
107290001Sglebius
C Advance rp and up to just past their last limb.  Limbs are then addressed
C through n, which holds a negative index that counts up towards zero.
108290001Sglebius	lea	(rp,n_param,8), rp
109290001Sglebius	lea	(up,n_param,8), up
110290001Sglebius	mov	n_param, n
111290001Sglebius
C Dispatch on the count modulo 4 to choose the loop entry.  Bit 0 first.
112290001Sglebius	test	$1, R8(n_param)
113290001Sglebius	jne	L(bx1)
114290001Sglebius
C Even count.  The product of limb 0 goes to w0 (low) and w1 (high).  After
C the negation, bit 1 of n is set exactly when the count is 2 mod 4; that
C case enters at L2 with n = -count.
115290001SglebiusL(bx0):	mul	v0
116290001Sglebius	neg	n
117290001Sglebius	mov	%rax, w0
118290001Sglebius	mov	%rdx, w1
119290001Sglebius	test	$2, R8(n)
120290001Sglebius	jne	L(L2)
121290001Sglebius
C Count is 0 mod 4: n = 2 - count, enter at L0.
122290001SglebiusL(b00):	add	$2, n
123290001Sglebius	jmp	L(L0)
124290001Sglebius
125290001Sglebius	ALIGN(16)
C Odd count.  n is still positive here; its bit 1 separates 3 mod 4 from
C 1 mod 4.
126290001SglebiusL(bx1):	mul	v0
127290001Sglebius	test	$2, R8(n)
128290001Sglebius	je	L(b01)
129290001Sglebius
C Count is 3 mod 4: the product of limb 0 goes to w2/w3, n = 1 - count,
C enter at L3.
130290001SglebiusL(b11):	mov	%rax, w2
131290001Sglebius	mov	%rdx, w3
132290001Sglebius	neg	n
133290001Sglebius	inc	n
134290001Sglebius	jmp	L(L3)
135290001Sglebius
136290001Sglebius	ALIGN(16)
C Count is 1 mod 4.  A borrow from n - 3 can only mean count = 1, which is
C finished at n1 without entering the loop.  Otherwise the product of limb 0
C goes to w2/w3, n = 3 - count, and execution falls into the loop top.
137290001SglebiusL(b01):	sub	$3, n
138290001Sglebius	jc	L(n1)
139290001Sglebius	mov	%rax, w2
140290001Sglebius	mov	%rdx, w3
141290001Sglebius	neg	n
142290001Sglebius
143290001Sglebius	ALIGN(16)
C Main loop, four limbs per pass.  Each stage loads one source limb,
C multiplies it by v0 (mul leaves the product in %rdx:%rax) and parks it in
C one register pair, then ADDSUBs the low limb held in the other pair into
C rp.  The two adc instructions fold that other pair's high limb and the
C carry/borrow from the ADDSUB into the newly parked product.  The pairs
C w0/w1 and w2/w3 alternate from stage to stage.
144290001SglebiusL(top):	mov	-16(up,n,8), %rax
145290001Sglebius	mul	v0
146290001Sglebius	mov	%rax, w0
147290001Sglebius	mov	%rdx, w1
148290001Sglebius	ADDSUB	w2, -24(rp,n,8)
149290001Sglebius	adc	w3, w0
150290001Sglebius	adc	$0, w1
C Entry point for count = 0 mod 4.
151290001SglebiusL(L0):	mov	-8(up,n,8), %rax
152290001Sglebius	mul	v0
153290001Sglebius	mov	%rax, w2
154290001Sglebius	mov	%rdx, w3
155290001Sglebius	ADDSUB	w0, -16(rp,n,8)
156290001Sglebius	adc	w1, w2
157290001Sglebius	adc	$0, w3
C Entry point for count = 3 mod 4.
158290001SglebiusL(L3):	mov	(up,n,8), %rax
159290001Sglebius	mul	v0
160290001Sglebius	mov	%rax, w0
161290001Sglebius	mov	%rdx, w1
162290001Sglebius	ADDSUB	w2, -8(rp,n,8)
163290001Sglebius	adc	w3, w0
164290001Sglebius	adc	$0, w1
C Entry point for count = 2 mod 4.
165290001SglebiusL(L2):	mov	8(up,n,8), %rax
166290001Sglebius	mul	v0
167290001Sglebius	mov	%rax, w2
168290001Sglebius	mov	%rdx, w3
169290001Sglebius	ADDSUB	w0, (rp,n,8)
170290001Sglebius	adc	w1, w2
171290001Sglebius	adc	$0, w3
C Step to the next group of four limbs; repeat while n is still negative.
172290001Sglebius	add	$4, n
173290001Sglebius	js	L(top)
174290001Sglebius
C Loop exit: w2 still holds the low limb destined for the last rp limb.
C %rax is cleared before the ADDSUB so that the adc sees the flag the
C ADDSUB produces; the return value is w3 plus that carry/borrow.
C Registers are restored in the reverse order of the pushes.
175290001SglebiusL(end):	xor	R32(%rax), R32(%rax)
176290001Sglebius	ADDSUB	w2, -8(rp)
177290001Sglebius	adc	w3, %rax
178290001Sglebius	pop	%rbx
179290001SglebiusIFDOS(`	pop	%rdi		')
180290001SglebiusIFDOS(`	pop	%rsi		')
181290001Sglebius	ret
182290001Sglebius
183290001Sglebius	ALIGN(32)
C Single-limb case: %rdx:%rax is the only product.  %rax is zeroed with mov
C rather than xor because mov leaves the carry flag from the ADDSUB intact
C for the adc that forms the return value.
184290001SglebiusL(n1):	ADDSUB	%rax, -8(rp)
185290001Sglebius	mov	$0, R32(%rax)
186290001Sglebius	adc	%rdx, %rax
187290001Sglebius	pop	%rbx
188290001SglebiusIFDOS(`	pop	%rdi		')
189290001SglebiusIFDOS(`	pop	%rsi		')
190290001Sglebius	ret
191290001SglebiusEPILOGUE()
192290001Sglebius