1dnl  AMD64 mpn_addmul_1 and mpn_submul_1.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9	 2.52
35C AMD K10	 2.51
36C AMD bd1	 4.43
37C AMD bd2	 5.03	 5.63
38C AMD bd3	 ?
39C AMD bd4	 ?
40C AMD zen	 ?
41C AMD bobcat	 6.20
42C AMD jaguar	 5.57	 6.56
43C Intel P4	14.9	17.1
44C Intel core2	 5.15
45C Intel NHM	 4.93
46C Intel SBR	 3.95
47C Intel IBR	 3.75
48C Intel HWL	 3.62
49C Intel BWL	 2.53
50C Intel SKL	 2.53
51C Intel atom	21.3
52C Intel SLM	 9.0
53C VIA nano	 5.0
54
55C The loop of this code is the result of running a code generation and
56C optimization tool suite written by David Harvey and Torbjorn Granlund.
57
58C TODO
59C  * The loop is great, but the prologue and epilogue code was quickly written.
60C    Tune it!
61
C Argument registers, named by role.  The defaults follow the System V
C argument order rdi, rsi, rdx, rcx; the trailing comment on each line names
C the register that carries the same argument under the Microsoft x64
C convention.
62define(`rp',      `%rdi')   C rcx
63define(`up',      `%rsi')   C rdx
64define(`n_param', `%rdx')   C r8
65define(`vl',      `%rcx')   C r9
66
C Working copy of the limb count.  The standard-ABI entry code moves the
C count here because mul overwrites rdx.
67define(`n',       `%r11')
68
C Build-time operation selection.  Whichever of OPERATION_addmul_1 or
C OPERATION_submul_1 is defined decides which instruction ADDSUB expands to
C and which symbol name func expands to.  If neither is defined, this file
C leaves both macros undefined.
69ifdef(`OPERATION_addmul_1',`
70      define(`ADDSUB',        `add')
71      define(`func',  `mpn_addmul_1')
72')
73ifdef(`OPERATION_submul_1',`
74      define(`ADDSUB',        `sub')
75      define(`func',  `mpn_submul_1')
76')
77
C ABI declarations.  ABI_SUPPORT and MULFUNC_PROLOGUE are defined outside
C this file; presumably they record that both calling conventions are
C handled and that this one source provides both listed entry points --
C confirm against their definitions.
78ABI_SUPPORT(DOS64)
79ABI_SUPPORT(STD64)
80
81MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
82
C DOS64 register remapping.
C   up  stays rsi; the entry code copies the incoming rdx there.
C   rp, vl, n  are used in place in rcx, r9 and r8.
C   r9, r8     as bare names are redefined to rdi and r11, so that the
C              scratch uses of %r9 and %r8 in the loop do not collide with
C              vl and n, which now occupy the real r9 and r8.
C The replacement text is double quoted, presumably so that it is still
C quoted after IFDOS strips one level and is therefore not rescanned
C against the r9 and r8 macros -- confirm against the IFDOS definition.
83IFDOS(`	define(`up', ``%rsi'')	') dnl
84IFDOS(`	define(`rp', ``%rcx'')	') dnl
85IFDOS(`	define(`vl', ``%r9'')	') dnl
86IFDOS(`	define(`r9', ``rdi'')	') dnl
87IFDOS(`	define(`n',  ``%r8'')	') dnl
88IFDOS(`	define(`r8', ``r11'')	') dnl
89
C func -- expands to mpn_addmul_1 or mpn_submul_1.
C
C Multiplies each limb read from up by vl and applies ADDSUB of the running
C product to the corresponding limb at rp.  The value returned in rax is the
C high product limb left over at the end plus the final carry or borrow.
C
C Register use, by the macro names defined earlier in this file:
C   rp, up    destination and source pointers, biased below so that the
C             negated count can serve as an index that rises toward zero
C   vl        multiplier limb, the explicit operand of every mul
C   n         limb count, later the negative loop index
C   rax, rdx  low and high halves of each product, written by mul
C   rbx, r8, r9, r10  rotating accumulators for limbs awaiting ADDSUB
C
C The first limb is loaded and multiplied before n is tested, so the code
C assumes n is at least 1.
90ASM_START()
91	TEXT
92	ALIGN(16)
93PROLOGUE(func)
94
C DOS64 only: rsi and rdi are callee-saved under that ABI and are both
C written by this function, so save them, then copy the second argument
C from rdx into up.
95IFDOS(``push	%rsi		'')
96IFDOS(``push	%rdi		'')
97IFDOS(``mov	%rdx, %rsi	'')
98
99	mov	(up), %rax		C read first u limb early
100	push	%rbx			C callee-saved, used as an accumulator
101IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
102IFDOS(`	mov	n, %rbx         ')
103	mul	vl			C rdx:rax = first limb times vl
104IFSTD(`	mov	%rbx, n         ')
105
C Dispatch on n mod 4 to choose the entry point into the 4-way unrolled loop.
106	and	$3, R32(%rbx)
107	jz	L(b0)			C n mod 4 == 0
108	cmp	$2, R32(%rbx)
109	jz	L(b2)			C n mod 4 == 2
110	jg	L(b3)			C n mod 4 == 3
111
C n mod 4 == 1.  A count of exactly one needs no loop: combine the low
C product limb with the single destination limb and let L(ret) fold the
C resulting carry or borrow into the high limb.
112L(b1):	dec	n
113	jne	L(gt1)
114	ADDSUB	%rax, (rp)
115	jmp	L(ret)
C More than one limb.  n now holds count-1.  Bias the pointers, negate n,
C seed r9 and r8 with the first product, and fetch the second source limb
C because entry L(L1) begins at its mul.
116L(gt1):	lea	8(up,n,8), up
117	lea	-8(rp,n,8), rp
118	neg	n
119	xor	%r10, %r10
120	xor	R32(%rbx), R32(%rbx)
121	mov	%rax, %r9
122	mov	(up,n,8), %rax
123	mov	%rdx, %r8
124	jmp	L(L1)
125
C n mod 4 == 0.  Seed r8 and rbx with the first product; entry L(L0) does
C its own load of the next source limb.
126L(b0):	lea	(up,n,8), up
127	lea	-16(rp,n,8), rp
128	neg	n
129	xor	%r10, %r10
130	mov	%rax, %r8
131	mov	%rdx, %rbx
132	jmp	 L(L0)
133
C n mod 4 == 3.  Seed rbx and r10 with the first product; entry L(L3)
C clears r8 and r9 itself before they are used.
134L(b3):	lea	-8(up,n,8), up
135	lea	-24(rp,n,8), rp
136	neg	n
137	mov	%rax, %rbx
138	mov	%rdx, %r10
139	jmp	L(L3)
140
C n mod 4 == 2.  Seed r10 and r9 with the first product and fetch the second
C source limb, because entry L(L2) begins at its mul.  r8 must be zero here
C since the wind-down code adds it into rdx.
141L(b2):	lea	-16(up,n,8), up
142	lea	-32(rp,n,8), rp
143	neg	n
144	xor	%r8, %r8
145	xor	R32(%rbx), R32(%rbx)
146	mov	%rax, %r10
147	mov	24(up,n,8), %rax
148	mov	%rdx, %r9
149	jmp	L(L2)
150
C Main loop, four limbs per pass.  Each ADDSUB writes one finished limb to
C memory; the adc pair after it folds the carry or borrow from that ADDSUB,
C together with the newest product rdx:rax, into the next two accumulators.
C n is negative throughout and the loop exits once add $4 makes it
C non-negative.
151	ALIGN(16)
152L(top):	ADDSUB	%r10, (rp,n,8)
153	adc	%rax, %r9
154	mov	(up,n,8), %rax
155	adc	%rdx, %r8
156	mov	$0, R32(%r10)		C r10 restarts as an empty accumulator
157L(L1):	mul	vl
158	ADDSUB	%r9, 8(rp,n,8)
159	adc	%rax, %r8
160	adc	%rdx, %rbx
161L(L0):	mov	8(up,n,8), %rax
162	mul	vl
163	ADDSUB	%r8, 16(rp,n,8)
164	adc	%rax, %rbx
165	adc	%rdx, %r10
166L(L3):	mov	16(up,n,8), %rax
167	mul	vl
168	ADDSUB	%rbx, 24(rp,n,8)
C The three clears below use mov rather than xor: mov leaves the carry flag
C untouched, and that flag is live from the ADDSUB above through both adc.
169	mov	$0, R32(%r8)		C zero
170	mov	%r8, %rbx		C zero
171	adc	%rax, %r10
172	mov	24(up,n,8), %rax
173	mov	%r8, %r9		C zero
174	adc	%rdx, %r9
175L(L2):	mul	vl
176	add	$4, n
177	js	 L(top)
178
C Wind-down: write out the last two pending limbs.  r8 is zero on every path
C that reaches here, so the adc with r8 only adds the carry into rdx.
179	ADDSUB	%r10, (rp,n,8)
180	adc	%rax, %r9
181	adc	%r8, %rdx
182	ADDSUB	%r9, 8(rp,n,8)
C Fold the carry or borrow of the final ADDSUB into the high limb and return
C it in rax.
183L(ret):	adc	$0, %rdx
184	mov	%rdx, %rax
185
186	pop	%rbx
187IFDOS(``pop	%rdi		'')
188IFDOS(``pop	%rsi		'')
189	ret
190EPILOGUE()
191