1dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2005, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23C TODO:
C  * Tweak eax/edx offsets in the loop so as to save some lea's
25C  * Perhaps software pipeline small-case code
26
27C                           cycles/limb
C P6 model 0-8,10-12            -
29C P6 model 9   (Banias)         ?
30C P6 model 13  (Dothan)         5.24
31C P4 model 0-1 (Willamette):    5
32C P4 model 2   (Northwood):     5
33C P4 model 3-4 (Prescott):      5
34
35C INPUT PARAMETERS
36C rp		sp + 4
37C up		sp + 8
38C n		sp + 12
39C v0		sp + 16
40
41	TEXT
42	ALIGN(16)
C mpn_addmul_1c -- entry point into mpn_addmul_1 that supplies a starting carry.
C
C Reads the same four stack arguments as mpn_addmul_1 (rp, up, n, v0) plus a
C fifth word at 20(%esp).  That word is placed in mm6, the register which
C mpn_addmul_1 clears and then uses as its running carry.  Control joins
C mpn_addmul_1 at its ent label, so this stub never returns by itself.
C
C The five loads are independent of each other; only the register contents at
C the jump matter:
C   edx = rp, eax = up, ecx = n, mm7 = v0, mm6 = carry-in
PROLOGUE(mpn_addmul_1c)
	movd	20(%esp), %mm6		C carry-in (fifth argument)
	movd	16(%esp), %mm7		C v0
	mov	12(%esp), %ecx		C n
	mov	8(%esp), %eax		C up
	mov	4(%esp), %edx		C rp
	jmp	L(ent)			C continue in the shared body
EPILOGUE()
51	ALIGN(16)
C mpn_addmul_1 -- multiply the n limbs at up by the limb v0, add the product
C to the n limbs at rp in place, and return the carry-out limb in eax.
C
C Arguments are read from the stack: rp 4(%esp), up 8(%esp), n 12(%esp),
C v0 16(%esp).  n must be at least 1, since the small-case loop runs its body
C before it tests the count.
C
C Register use:
C   edx      rp pointer
C   eax      up pointer, then the returned carry
C   ecx      limb count (small case) or negative loop counter (unrolled loop)
C   mm7      v0
C   mm6      running carry; the low 32 bits are stored as a result limb and
C            the rest is shifted down to become the carry for the next limb
C   mm0-mm3  up limbs and their 64-bit products with v0
C   mm4,mm5  rp limbs, to which the products are added
C
C up[i]*v0 + rp[i] + carry is at most 2^64-1 for 32-bit limbs, so the 64-bit
C paddq accumulation cannot overflow.
C
C Clobbers eax, ecx, edx, flags and mm0-mm7; emms is executed before return.
PROLOGUE(mpn_addmul_1)
	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	pxor	%mm6, %mm6		C carry-in = 0
C mpn_addmul_1c jumps here with the same registers loaded and its own carry-in
C already in mm6.
L(ent):	cmp	$4, %ecx
	jnc	L(big)			C n >= 4: use the 4-way unrolled loop

C Small case, n < 4: one limb per iteration.
L(lp0):	movd	(%eax), %mm0		C up[i]
	lea	4(%eax), %eax
	movd	(%edx), %mm4		C rp[i]
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C up[i] * v0
	paddq	%mm0, %mm4
	paddq	%mm4, %mm6		C add into the running carry
	movd	%mm6, -4(%edx)		C store rp[i] (edx was already advanced)
	psrlq	$32, %mm6		C high part becomes the next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return the carry-out
	emms
	ret

C n >= 4.  Dispatch on n mod 4 to the matching entry point of the unrolled
C loop.  Each feed-in block below loads and multiplies the first limbs, biases
C eax (and edx where needed) for the loop stage it enters, and sets
C ecx = (n mod 4) - n, a negative multiple of 4 that the loop counts up to
C zero.  The n mod 4 == 2 block sits directly after the dispatch so that one
C case needs no jump at all.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	jne	L(3)			C n mod 4 == 2 falls through

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

C n mod 4 == 3 enters the loop at its top by falling through.
L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5

C Main loop, four limbs per pass.  Every stage multiplies an up limb loaded by
C an earlier stage, folds one completed rp+product sum into the carry in mm6,
C stores one result limb, and loads operands for the stages that follow.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx		C counter reaches zero after the last pass
	jnz	L(top)

C Wind-down: no more up limbs are loaded; the three limbs still in flight are
C completed and stored, then the final carry is returned.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return the carry-out
	emms
	ret
EPILOGUE()
184