1dnl  AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9	 4.5
35C AMD K10	 4.5
36C AMD bull	 4.73
37C AMD pile	 4.60	 4.80
38C AMD steam
39C AMD excavator
40C AMD bobcat	 5.48
41C AMD jaguar	 5.61
42C Intel P4	16.6
43C Intel core2	 5.09
44C Intel NHM	 4.79
45C Intel SBR	 3.88
46C Intel IBR	 3.65
47C Intel HWL	 3.53
48C Intel BWL	 2.75
49C Intel SKL	 2.76
50C Intel atom	19.4
51C Intel SLM	 8
52C VIA nano
53
54C The loop of this code is the result of running a code generation and
55C optimisation tool suite written by David Harvey and Torbjorn Granlund.
56
C Symbolic names for the four incoming arguments.
C NOTE(review): the register named in each trailing comment is presumably the
C DOS64 register carrying the same argument on entry, which FUNC_ENTRY is
C expected to remap -- confirm against config.m4.
57define(`rp',      `%rdi')   C rcx
58define(`up',      `%rsi')   C rdx
59define(`n_param', `%rdx')   C r8
60define(`v0',      `%rcx')   C r9
61
C Working limb index.  n_param arrives in %rdx, which every mul overwrites
C with the high product half, so each entry path derives n from n_param
C before its first mul.  %rbx is pushed on entry and popped before return.
62define(`n',       `%rbx')
63
C Pick the read-modify-write instruction applied to rp (ADDSUB) and the name
C of the generated function (func) from whichever OPERATION_* symbol is
C defined.  Neither symbol is defined in this file, so they must be supplied
C from outside.
64ifdef(`OPERATION_addmul_1',`
65  define(`ADDSUB', `add')
66  define(`func',   `mpn_addmul_1')
67')
68ifdef(`OPERATION_submul_1',`
69  define(`ADDSUB', `sub')
70  define(`func',   `mpn_submul_1')
71')
72
C NOTE(review): these macros are defined outside this file.  Presumably
C ABI_SUPPORT declares the calling conventions this code can be built for
C and MULFUNC_PROLOGUE lists the functions this one source file can
C provide -- confirm against config.m4 and the build scripts.
73ABI_SUPPORT(DOS64)
74ABI_SUPPORT(STD64)
75
76MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
77
C func(rp, up, n_param, v0)	-- mpn_addmul_1 or mpn_submul_1
C
C For each of the n_param limbs, multiply up[i] by v0 and ADDSUB the low half
C of the product, together with the high half carried over from the previous
C limb, into rp[i].  The final carry-out limb is returned in %rax.
C
C Register roles:
C   %rax, %rdx	low and high half of the most recent mul
C   %r8, %r11	low halves waiting to be applied to rp (roles alternate)
C   %r9, %r10	high halves waiting to be folded in (roles alternate)
C   n (%rbx)	negative limb index, stepped by 4; it is 1 in the wind-down
C
C The code assumes n_param >= 1: up[0] is loaded unconditionally and no
C entry path tests for a zero count.
C
C The loop is tool-generated (see the note near the top of the file) and
C depends on exact instruction order: CF produced by each ADDSUB is consumed
C by the adc pair that follows, and only flag-preserving mov instructions sit
C in between.
78ASM_START()
79	TEXT
80	ALIGN(16)
81PROLOGUE(func)
82	FUNC_ENTRY(4)
83	push	%rbx			C n lives in %rbx; restored before ret
84
85	mov	(up), %rax		C up[0], fetched before up is moved
86	lea	-8(up,n_param,8), up	C up -> last source limb
87	lea	-16(rp,n_param,8), rp	C rp -> second-to-last destination limb
88
C Dispatch on n_param mod 4 so the 4-way unrolled loop is entered at the
C stage that leaves a whole number of iterations.
89	test	$1, R8(n_param)
90	jnz	L(bx1)
91
92L(bx0):	test	$2, R8(n_param)
93	jnz	L(b10)
94
C n_param = 0 mod 4: n = 1 - n_param, enter at L(lo0).
95L(b00):	mov	$1, R32(n)
96	sub	n_param, n
97	mul	v0			C up[0] * v0
98	mov	%rax, %r11
99	mov	8(up,n,8), %rax		C up[1]
100	mov	%rdx, %r10
101	mul	v0			C up[1] * v0
102	mov	%rax, %r8
103	mov	16(up,n,8), %rax	C up[2]
104	jmp	L(lo0)
105
C n_param = 2 mod 4: n = 3 - n_param, enter at L(lo2).
106L(b10):	mov	$3, R32(n)
107	sub	n_param, n
108	mul	v0			C up[0] * v0
109	mov	%rax, %r11
110	mov	-8(up,n,8), %rax	C up[1]
111	mov	%rdx, %r10
112	mul	v0			C up[1] * v0
113	test	n, n
114	jns	L(cj2)			C n_param = 2: both products done, wind down
115	mov	%rax, %r8
116	mov	(up,n,8), %rax		C up[2]
117	mov	%rdx, %r9
118	jmp	L(lo2)
119
120L(bx1):	test	$2, R8(n_param)
121	jnz	L(b11)
122
C n_param = 1 mod 4: n = 2 - n_param, enter at L(lo1).
123L(b01):	mov	$2, R32(n)
124	sub	n_param, n
125	mul	v0			C up[0] * v0
126	test	n, n
127	jns	L(cj1)			C n_param = 1: single product, wind down
128	mov	%rax, %r8
129	mov	(up,n,8), %rax		C up[1]
130	mov	%rdx, %r9
131	mul	v0			C up[1] * v0
132	mov	%rax, %r11
133	mov	8(up,n,8), %rax		C up[2]
134	mov	%rdx, %r10
135	jmp	L(lo1)
136
C n_param = 3 mod 4: n = -n_param, enter at L(lo3).
137L(b11):	xor	R32(n), R32(n)
138	sub	n_param, n
139	mul	v0			C up[0] * v0
140	mov	%rax, %r8
141	mov	16(up,n,8), %rax	C up[1]
142	mov	%rdx, %r9
143	mul	v0			C up[1] * v0
144	mov	%rax, %r11
145	mov	24(up,n,8), %rax	C up[2]
146	jmp	L(lo3)
147
C Main loop, four limbs per iteration.  Each stage starts the next product,
C applies the oldest pending low half to rp, then folds the pending high half
C and the carry/borrow from ADDSUB into the next low half.  The four stages
C do the same work with the r8/r11 and r9/r10 roles swapped each stage.
148	ALIGN(16)
149L(top):	mul	v0
150	ADDSUB	%r8, -16(rp,n,8)	C apply pending low half; CF = carry/borrow
151	mov	%rax, %r8		C new low half
152	mov	(up,n,8), %rax		C next source limb
153	adc	%r9, %r11		C fold high half + CF into next low half
154	mov	%rdx, %r9		C new high half
155	adc	$0, %r10		C propagate carry into the other high half
156L(lo2):	mul	v0
157	ADDSUB	%r11, -8(rp,n,8)
158	mov	%rax, %r11
159	mov	8(up,n,8), %rax
160	adc	%r10, %r8
161	mov	%rdx, %r10
162	adc	$0, %r9
163L(lo1):	mul	v0
164	ADDSUB	%r8, (rp,n,8)
165	mov	%rax, %r8
166	adc	%r9, %r11
167	mov	16(up,n,8), %rax
168	adc	$0, %r10
169L(lo0):	mov	%rdx, %r9		C save high half before the next mul overwrites %rdx
170	mul	v0
171	ADDSUB	%r11, 8(rp,n,8)
172	mov	%rax, %r11
173	adc	%r10, %r8
174	mov	24(up,n,8), %rax
175	adc	$0, %r9
176L(lo3):	add	$4, n			C advance index; sets SF for the js below
177	mov	%rdx, %r10		C mov leaves the flags from add intact
178	js	L(top)			C loop while n is still negative
179
C Wind-down: n = 1 here.  One last product, then the three pending limbs are
C applied without loading any further source limb.
180L(end):	mul	v0
181	ADDSUB	%r8, -16(rp,n,8)
182	adc	%r9, %r11
183	adc	$0, %r10
184L(cj2):	ADDSUB	%r11, -8(rp,n,8)
185	adc	%r10, %rax
186	adc	$0, %rdx
187L(cj1):	ADDSUB	%rax, (rp,n,8)		C last destination limb
188	mov	$0, R32(%rax)		C mov rather than xor: CF must survive for the adc
189	adc	%rdx, %rax		C return value = last high half + carry/borrow
190	pop	%rbx
191	FUNC_EXIT()
192	ret
193EPILOGUE()
194ASM_END()
195