dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  add the result to a third limb vector.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.375
C K10:		 2.375
C P4:		 ?
C P6 core2:	 4.45
C P6 corei7:	 4.35

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?

C INPUT PARAMETERS
define(`rp',     `%rdi')
define(`up',     `%rsi')
define(`n_param',`%rdx')
define(`vp',     `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

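C In GMP terms this routine computes, with B = 2^64 and {vp,2} = v0 + v1*B,
C   {rp,n} + {up,n} * (v0 + v1*B),
C storing the low n+1 limbs at rp and returning the most significant limb.
C A rough C sketch of that operation in terms of GMP's mpn_addmul_1 (the
C ref_addmul_2 name is just for illustration; it is not part of this file):
C
C	mp_limb_t
C	ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);      /* add up*v0 at rp   */
C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);   /* add up*v1 at rp+1 */
C	}
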
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_2)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	n_param, n
	neg	n
	lea	-32(up,n_param,8), up
	lea	-32(rp,n_param,8), rp

	and	$3, R32(n_param)
	jz	L(am2p0)
	cmp	$2, R32(n_param)
	jc	L(am2p1)
	jz	L(am2p2)
L(am2p3):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w1
	mov	32(up,n,8), %rax
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	add	$2, n
	jmp	L(am3)
L(am2p0):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	32(up,n,8), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	add	$3, n
	jmp	L(am0)
L(am2p1):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w3
	mov	32(up,n,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(am1)
L(am2p2):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	32(up,n,8), %rax
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(am2)

	ALIGN(32)
L(top):
	add	w3, (rp,n,8)		C 0 21
	adc	%rax, w0		C 1 24
	mov	8(up,n,8), %rax
	adc	%rdx, w1		C 3 26
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0		C 2 26
	mov	8(up,n,8), %rax
	adc	%rdx, w1		C 4 28
	adc	$0, R32(w2)		C 6 30
L(am0):	mul	v1
	add	w0, 8(rp,n,8)		C 3 27
	adc	%rax, w1		C 6 30
	adc	%rdx, w2		C 8 32
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1		C 8
	mov	16(up,n,8), %rax
	adc	%rdx, w2		C 10
	adc	$0, R32(w3)		C 12
L(am3):	mul	v1
	add	w1, 16(rp,n,8)		C 9
	adc	%rax, w2		C 12
	mov	24(up,n,8), %rax
	adc	%rdx, w3		C 14
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2		C 14
	adc	%rdx, w3		C 16
	mov	$0, R32(w1)
	mov	24(up,n,8), %rax
	adc	$0, R32(w0)		C 18
L(am2):	mul	v1
	add	w2, 24(rp,n,8)		C 15
	adc	%rax, w3		C 18
	adc	%rdx, w0		C 20
	mov	32(up,n,8), %rax
	mul	v0
	add	%rax, w3		C 20
	mov	32(up,n,8), %rax
	adc	%rdx, w0		C 22
	adc	$0, R32(w1)		C 24
L(am1):	mul	v1
	add	$4, n
	js	L(top)

	add	w3, (rp,n,8)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp,n,8)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()