1dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 -
37C AMD K10	 -
38C AMD bull	 -
39C AMD pile	 -
40C AMD steam	 -
41C AMD excavator	 -
42C AMD bobcat	 -
43C AMD jaguar	 -
44C Intel P4	 -
45C Intel core2	 -
46C Intel NHM	 -
47C Intel SBR	 -
48C Intel IBR	 -
49C Intel HWL	 2.32
50C Intel BWL	 2.04
51C Intel SKL	 1.95
52C Intel atom	 -
53C Intel SLM	 -
54C VIA nano	 -
55
56C The loop of this code is the result of running a code generation and
57C optimisation tool suite written by David Harvey and Torbjörn Granlund.
58
59C TODO
60C  * Handle small n separately, for lower overhead.
61
C Incoming arguments, named after the registers they are read from below.
C The trailing register on each line is presumably the register the same
C argument arrives in under the DOS64 ABI (TODO confirm against FUNC_ENTRY).
62define(`rp',      `%rdi')   C rcx
63define(`up',      `%rsi')   C rdx
64define(`n_param', `%rdx')   C r8
65define(`v0_param',`%rcx')   C r9
66
C Working registers.  v0 is placed in %rdx, which is also the register n
C arrives in, so the function entry copies n to %rbp before loading v0.
67define(`n',       `%rbp')
68define(`v0',      `%rdx')
69
C Build-time selection of the operation.  One of OPERATION_addmul_1 and
C OPERATION_submul_1 is expected to be defined by the build (TODO confirm).
C   ADDSUB  instruction that starts each rp update chain (add or sub)
C   ADCSBB  instruction that continues the chain with CF (adc or sbb)
C   func    name of the function emitted by PROLOGUE(func)
70ifdef(`OPERATION_addmul_1',`
71  define(`ADDSUB',        `add')
72  define(`ADCSBB',        `adc')
73  define(`func',  `mpn_addmul_1')
74')
75ifdef(`OPERATION_submul_1',`
76  define(`ADDSUB',        `sub')
77  define(`ADCSBB',        `sbb')
78  define(`func',  `mpn_submul_1')
79')
80
C NOTE(review): these macros are defined outside this file.  Presumably
C ABI_SUPPORT declares the calling conventions this file can be built for,
C and MULFUNC_PROLOGUE lists the entry points this one source can provide
C (selected through the OPERATION_ symbols tested above) -- confirm against
C the m4 definitions pulled in by config.m4.
81ABI_SUPPORT(DOS64)
82ABI_SUPPORT(STD64)
83
84MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
85
86ASM_START()
87	TEXT
88	ALIGN(16)
C -----------------------------------------------------------------------
C func(rp, up, n, v0) -- assembled as mpn_addmul_1 or mpn_submul_1.
C
C For each of the n limbs, multiplies up[i] by v0 with mulx and adds the
C product to rp[i] (addmul) or subtracts it from rp[i] (submul), carrying
C between limbs.  The limb that falls off the top is returned in %rax.
C
C In:     rp = %rdi, up = %rsi, n = %rdx, v0 = %rcx
C         (FUNC_ENTRY presumably puts DOS64 arguments in these registers)
C Out:    %rax = last high product word plus the final carries/borrow
C Saved:  %rbx, %rbp, %r12, %r13 are pushed here and popped at L(ret)
C Uses:   %rax, %rcx, %rdx, %r8-%r11 and flags; rp and up are advanced
C
C mulx operands as used here: the middle operand receives the low product
C word and the last operand the high word; v0 is taken from %rdx.
C
C Structure: n mod 4 selects one of four feed-in blocks (b00, b01, b11,
C b10), which enter a 4-limb unrolled loop at lo0, lo1, lo3 or top.
C L(end) finishes the final two limbs, whose products have already been
C computed into %r9/%r8 and %r11/%r10.
C
C Flags: the carry chain relies on mulx, mov, lea, inc and dec leaving CF
C untouched, and ZF from the initial shr is consumed several instructions
C later.  Do not replace the mov $0 zeroing with xor or reorder around it.
C
C NOTE(review): n = 0 is not handled.  It would take the b00 path, update
C rp[0] and rp[1], and decrement n past zero.  Callers presumably always
C pass n >= 1 -- confirm.
C -----------------------------------------------------------------------
89PROLOGUE(func)
90	FUNC_ENTRY(4)
91	push	%rbx			C save the callee-saved registers used below
92	push	%rbp
93	push	%r12
94	push	%r13
95
96	mov	n_param, n		C move n out of %rdx first ...
97	mov	v0_param, v0		C ... because v0 is loaded into %rdx
98
99	test	$1, R8(n)		C bit 0 of n: odd counts go to bx1
100	jnz	L(bx1)
101
C n even.  After shr: n = old n / 4, CF = bit 1 of old n, ZF = (n == 0).
102L(bx0):	shr	$2, n
103	jc	L(b10)			C n mod 4 == 2
104
C b00: n mod 4 == 0.  Feed-in handles limbs 0 and 1 and preloads limb 2.
105L(b00):	mulx(	(up), %r13, %r12)	C limb 0: %r13 low, %r12 high
106	mulx(	8,(up), %rbx, %rax)	C limb 1: %rbx low, %rax high
107	add	%r12, %rbx		C high word of limb 0 into low word of limb 1
108	adc	$0, %rax
109	mov	(rp), %r12
110	mov	8(rp), %rcx
111	mulx(	16,(up), %r9, %r8)	C limb 2, consumed at L(top) or L(end)
112	lea	-16(rp), rp		C bias rp: stores at 16(rp)/24(rp) hit limbs 0/1
113	lea	16(up), up
114	ADDSUB	%r13, %r12		C starts the CF chain continued at lo0
115	jmp	L(lo0)
116
C n odd.  After shr: n = old n / 4, CF = bit 1 of old n, ZF = (n == 0).
117L(bx1):	shr	$2, n
118	jc	L(b11)			C n mod 4 == 3
119
C b01: n mod 4 == 1.
120L(b01):	mulx(	(up), %r11, %r10)	C limb 0: %r11 low, %r10 high
121	jnz	L(gt1)			C ZF still from shr: nonzero n/4 means n > 1
C n == 1: update the single limb in memory and return.
122L(n1):	ADDSUB	%r11, (rp)
123	mov	$0, R32(%rax)		C mov, not xor: CF from ADDSUB must survive
124	adc	%r10, %rax		C return high word plus carry/borrow
125	jmp	L(ret)
126
C n = 4k+1 with k >= 1: feed-in handles limbs 0..2, then enters at lo1.
127L(gt1):	mulx(	8,(up), %r13, %r12)	C limb 1
128	mulx(	16,(up), %rbx, %rax)	C limb 2
129	lea	24(up), up
130	add	%r10, %r13		C chain each high word into the next low word
131	adc	%r12, %rbx
132	adc	$0, %rax
133	mov	(rp), %r10
134	mov	8(rp), %r12
135	mov	16(rp), %rcx
136	lea	-8(rp), rp		C bias rp: stores at 8/16/24(rp) hit limbs 0/1/2
137	ADDSUB	%r11, %r10		C starts the CF chain continued at lo1
138	jmp	L(lo1)
139
C b11: n mod 4 == 3.  Feed-in handles limb 0 and preloads limb 1.
140L(b11):	mulx(	(up), %rbx, %rax)	C limb 0: %rbx low, %rax high
141	mov	(rp), %rcx
142	mulx(	8,(up), %r9, %r8)	C limb 1, consumed at L(top) or L(end)
143	lea	8(up), up
144	lea	-24(rp), rp		C bias rp: store at 24(rp) hits limb 0
145	inc	n			C adjust n: offsets the dec n that follows lo3
146	ADDSUB	%rbx, %rcx		C starts the CF chain continued at lo3
147	jmp	L(lo3)
148
C b10: n mod 4 == 2.  No limb is finished here; limbs 0 and 1 are preloaded.
149L(b10):	mulx(	(up), %r9, %r8)
150	mulx(	8,(up), %r11, %r10)
151	lea	-32(rp), rp		C L(top) adds 32 back; L(end) uses 32/40(rp)
152	mov	$0, R32(%rax)		C no pending high word; mov keeps ZF from shr
153	clc				C clear cf: no pending carry/borrow either
154	jz	L(end)			C depends on old shift: n/4 == 0, so n == 2
155
C Main loop, four limbs per pass.  State on entry at L(top):
C   %r9 / %r8    low / high product words of the next limb
C   %r11 / %r10  low / high product words of the limb after that
C   %rax         pending high word from the previous group
C   CF           pending carry/borrow from the previous rp update
156	ALIGN(16)
157L(top):	adc	%rax, %r9		C pending high word and CF into next low word
158	lea	32(rp), rp
159	adc	%r8, %r11
160	mulx(	16,(up), %r13, %r12)
161	mov	(rp), %r8
162	mulx(	24,(up), %rbx, %rax)
163	lea	32(up), up
164	adc	%r10, %r13
165	adc	%r12, %rbx
166	adc	$0, %rax		C close the product chain into the new high word
167	mov	8(rp), %r10
168	mov	16(rp), %r12
169	ADDSUB	%r9, %r8		C rp update chain: ADDSUB first, ADCSBB after
170	mov	24(rp), %rcx
171	mov	%r8, (rp)
172	ADCSBB	%r11, %r10
173L(lo1):	mulx(	(up), %r9, %r8)		C entry from gt1; preloads for next pass or end
174	mov	%r10, 8(rp)
175	ADCSBB	%r13, %r12
176L(lo0):	mov	%r12, 16(rp)		C entry from b00
177	ADCSBB	%rbx, %rcx
178L(lo3):	mulx(	8,(up), %r11, %r10)	C entry from b11; second preload
179	mov	%rcx, 24(rp)
180	dec	n			C dec leaves CF alone, so the chain continues
181	jnz	L(top)
182
C Wind-down: finish the two limbs whose products are in %r9/%r8 and
C %r11/%r10; their rp limbs live at 32(rp) and 40(rp).
183L(end):	adc	%rax, %r9
184	adc	%r8, %r11
185	mov	32(rp), %r8
186	mov	%r10, %rax		C last high word becomes the return value ...
187	adc	$0, %rax		C ... plus the carry out of the product chain
188	mov	40(rp), %r10
189	ADDSUB	%r9, %r8
190	mov	%r8, 32(rp)
191	ADCSBB	%r11, %r10
192	mov	%r10, 40(rp)
193	adc	$0, %rax		C ... plus the final carry/borrow of the rp update
194
195L(ret):	pop	%r13			C restore in reverse order of the pushes
196	pop	%r12
197	pop	%rbp
198	pop	%rbx
199	FUNC_EXIT()
200	ret
201EPILOGUE()
202