1dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C TODO:
34C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
35C    scheduling could improve things by several cycles per outer iteration.
36C  * In code for un <= 3, try keeping accumulation operands in registers,
37C    without storing intermediates to rp.
38C  * We might want to keep 32 in a free mm register, since the register form is
39C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
40C  * Look into different loop alignment, we now expand the code about 50 bytes
41C    with possibly needless alignment.
42C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
43C  * Use OSP, should solve feed-in latency problems.
44C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
45C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
46C    so that they can share feed-in code, and changing the branch targets from
47C    L<n> to Lm<nn>.
48
49C                           cycles/limb
50C P6 model 9   (Banias)         ?
51C P6 model 13  (Dothan)         5.24
52C P6 model 14  (Yonah)          ?
53C P4 model 0-1 (Willamette):    5
54C P4 model 2   (Northwood):     4.60 at 32 limbs
55C P4 model 3-4 (Prescott):      4.94 at 32 limbs
56
57C INPUT PARAMETERS
58C rp		sp + 4
59C up		sp + 8
60C un		sp + 12
61C vp		sp + 16
62C vn		sp + 20
63
C ---------------------------------------------------------------------------
C mpn_mul_basecase: entry, and straight-line code for un = 1, 2 and 3.
C
C The five arguments (rp, up, un, vp, vn) are read from the stack; after the
C two pushes below they sit at 12..28(%esp).
C
C Register use in this part:
C   edx = rp   eax = up   ecx = un   esi = vp   ebx = vn
C   mm7 = v limb for the current row
C   mm6 = 64-bit running sum; its low 32 bits are stored as a result limb and
C         psrlq $32 then leaves the carry for the next limb
C
C For un <= 3 every row of products is fully unrolled.  The first row writes
C un+1 limbs at rp; each further row reads back limbs already stored at rp,
C adds the new products, and stores one limb further up.  vn is tested only
C by the dec/jz pairs between rows, so these paths do at most un rows
C (presumably the caller guarantees 1 <= vn <= un; nothing here checks it).
C For un > 3 control goes to L(big).
C ---------------------------------------------------------------------------
64	TEXT
65	ALIGN(16)
66PROLOGUE(mpn_mul_basecase)
67	push	%esi
68	push	%ebx
69	mov	12(%esp), %edx		C rp
70	mov	16(%esp), %eax		C up
71	mov	20(%esp), %ecx		C un
72	mov	24(%esp), %esi		C vp
73	mov	28(%esp), %ebx		C vn
74	movd	(%esi), %mm7		C
75L(ent):	cmp	$3, %ecx
76	ja	L(big)
C The two MMX instructions below do not modify EFLAGS, so the jz after them
C still tests the cmp $3 above and is taken exactly when un is 3.
C mm6 = up[0] * v[0], the first product needed by all three small cases.
77	movd	(%eax), %mm6
78	pmuludq	%mm7, %mm6
79	jz	L(un3)
C un is below 3 here: 2 goes to L(un2), anything else falls into L(un1).
80	cmp	$2, %ecx
81	jz	L(un2)
82
C un = 1: store the low and high halves of the single product.
83L(un1):	movd	%mm6, (%edx)		C				un=1
84	psrlq	$32, %mm6		C				un=1
85	movd	%mm6, 4(%edx)		C				un=1
86	jmp	L(rtr)			C				un=1
87
C un = 2, row 0: rp[0..2] = up[0..1] * v[0].
88L(un2):	movd	4(%eax), %mm1		C				un=2
89	pmuludq	%mm7, %mm1		C				un=2
90	movd	%mm6, (%edx)		C				un=2
91	psrlq	$32, %mm6		C				un=2
92	paddq	%mm1, %mm6		C				un=2
93	movd	%mm6, 4(%edx)		C				un=2
94	psrlq	$32, %mm6		C				un=2
95	movd	%mm6, 8(%edx)		C				un=2
C Finished if that was the only v limb.
96      dec	%ebx			C				un=2
97      jz	L(rtr)			C				un=2
C un = 2, row 1: rp[1..3] = rp[1..2] + up[0..1] * v[1].
98	movd	4(%esi), %mm7		C				un=2
99	movd	(%eax), %mm6		C				un=2
100	pmuludq	%mm7, %mm6		C				un=2
101	movd	4(%eax), %mm1		C				un=2
102	movd	4(%edx), %mm4		C				un=2
103	pmuludq	%mm7, %mm1		C				un=2
104	movd	8(%edx), %mm5		C				un=2
105	paddq	%mm4, %mm6		C				un=2
106	paddq	%mm1, %mm5		C				un=2
107	movd	%mm6, 4(%edx)		C				un=2
108	psrlq	$32, %mm6		C				un=2
109	paddq	%mm5, %mm6		C				un=2
110	movd	%mm6, 8(%edx)		C				un=2
111	psrlq	$32, %mm6		C				un=2
112	movd	%mm6, 12(%edx)		C				un=2
C Common exit for the un <= 3 paths: leave MMX state and restore the two
C registers pushed at entry, in reverse order.
113L(rtr):	emms
114	pop	%ebx
115	pop	%esi
116	ret
117
C un = 3, row 0: rp[0..3] = up[0..2] * v[0].
118L(un3):	movd	4(%eax), %mm1		C				un=3
119	pmuludq	%mm7, %mm1		C				un=3
120	movd	8(%eax), %mm2		C				un=3
121	pmuludq	%mm7, %mm2		C				un=3
122	movd	%mm6, (%edx)		C				un=3
123	psrlq	$32, %mm6		C				un=3
124	paddq	%mm1, %mm6		C				un=3
125	movd	%mm6, 4(%edx)		C				un=3
126	psrlq	$32, %mm6		C				un=3
127	paddq	%mm2, %mm6		C				un=3
128	movd	%mm6, 8(%edx)		C				un=3
129	psrlq	$32, %mm6		C				un=3
130	movd	%mm6, 12(%edx)		C				un=3
C Finished if that was the only v limb.
131      dec	%ebx			C				un=3
132      jz	L(rtr)			C				un=3
C un = 3, row 1: rp[1..4] = rp[1..3] + up[0..2] * v[1].
133	movd	4(%esi), %mm7		C				un=3
134	movd	(%eax), %mm6		C				un=3
135	pmuludq	%mm7, %mm6		C				un=3
136	movd	4(%eax), %mm1		C				un=3
137	movd	4(%edx), %mm4		C				un=3
138	pmuludq	%mm7, %mm1		C				un=3
139	movd	8(%eax), %mm2		C				un=3
140	movd	8(%edx), %mm5		C				un=3
141	pmuludq	%mm7, %mm2		C				un=3
142	paddq	%mm4, %mm6		C				un=3
143	paddq	%mm1, %mm5		C				un=3
144	movd	12(%edx), %mm4		C				un=3
145	movd	%mm6, 4(%edx)		C				un=3
146	psrlq	$32, %mm6		C				un=3
147	paddq	%mm5, %mm6		C				un=3
148	paddq	%mm2, %mm4		C				un=3
149	movd	%mm6, 8(%edx)		C				un=3
150	psrlq	$32, %mm6		C				un=3
151	paddq	%mm4, %mm6		C				un=3
152	movd	%mm6, 12(%edx)		C				un=3
153	psrlq	$32, %mm6		C				un=3
154	movd	%mm6, 16(%edx)		C				un=3
C Finished if there were only two v limbs.
155      dec	%ebx			C				un=3
156      jz	L(rtr)			C				un=3
C un = 3, row 2: rp[2..5] = rp[2..4] + up[0..2] * v[2].
157	movd	8(%esi), %mm7		C				un=3
158	movd	(%eax), %mm6		C				un=3
159	pmuludq	%mm7, %mm6		C				un=3
160	movd	4(%eax), %mm1		C				un=3
161	movd	8(%edx), %mm4		C				un=3
162	pmuludq	%mm7, %mm1		C				un=3
163	movd	8(%eax), %mm2		C				un=3
164	movd	12(%edx), %mm5		C				un=3
165	pmuludq	%mm7, %mm2		C				un=3
166	paddq	%mm4, %mm6		C				un=3
167	paddq	%mm1, %mm5		C				un=3
168	movd	16(%edx), %mm4		C				un=3
169	movd	%mm6, 8(%edx)		C				un=3
170	psrlq	$32, %mm6		C				un=3
171	paddq	%mm5, %mm6		C				un=3
172	paddq	%mm2, %mm4		C				un=3
173	movd	%mm6, 12(%edx)		C				un=3
174	psrlq	$32, %mm6		C				un=3
175	paddq	%mm4, %mm6		C				un=3
176	movd	%mm6, 16(%edx)		C				un=3
177	psrlq	$32, %mm6		C				un=3
178	movd	%mm6, 20(%edx)		C				un=3
179	jmp	L(rtr)
180
181
C L(big): un > 3.  Save a third register (so the stack arguments move up by
C another 4 bytes), clear the running sum in mm6, step vp past v[0] (which is
C already held in mm7), and dispatch on un mod 4 to one of four code paths.
182L(big):	push	%edi
183	pxor	%mm6, %mm6
184	lea	4(%esi), %esi
C ecx = un mod 4.  Zero goes to L(0); otherwise cmp $2 separates the rest:
C borrow (ecx = 1) -> L(1), equal (ecx = 2) -> L(2), remaining case -> L(3).
185	and	$3, %ecx
186	jz	L(0)
187	cmp	$2, %ecx
188	jc	L(1)
189	jz	L(2)
190	jmp	L(3)			C FIXME: one case should fall through
191
192
C ---------------------------------------------------------------------------
C Code for un = 0 mod 4 (reached from L(big) with ecx = 0).
C
C Stack here (three registers pushed): 16(%esp) = rp, 20(%esp) = up,
C 24(%esp) = un on entry, overwritten below with the negative loop count.
C
C Registers:
C   eax = up pointer, edx = rp pointer (both advance 16 bytes per iteration)
C   ecx = negative count, stepped by 4 towards zero
C   mm7 = current v limb, mm6 = running sum / carry
C   edi = rp base of the current outer pass, esi = pointer to the next v limb
C   ebx = outer passes still to do
C
C Lines tagged "m 0":  first pass, rp[] = up[] * v[0] (products only).
C Lines tagged "am 0": later passes, which add up[] * v[j] to the limbs
C                      already stored starting one limb further up each pass.
C Both passes enter their 4-way unrolled loop in the middle (L(m00), L(am00))
C and share the tail at L(x0).
C ---------------------------------------------------------------------------
193L(0):	movd	(%eax), %mm3		C				m 0
194	sub	24(%esp), %ecx		C inner loop count		m 0
195	mov	%ecx, 24(%esp)		C update loop count for later	m 0
196	pmuludq	%mm7, %mm3		C				m 0
197	movd	4(%eax), %mm0		C				m 0
198	pmuludq	%mm7, %mm0		C				m 0
199	movd	8(%eax), %mm1		C				m 0
200	jmp	L(m00)			C				m 0
201	ALIGN(16)			C				m 0
C First-pass loop.  Products are formed in mm3, mm0, mm1, mm4 while earlier
C ones are added into mm6, whose low word is stored and then shifted out.
202L(lpm0):
203	pmuludq	%mm7, %mm4		C				m 0
204	paddq	%mm0, %mm6		C				m 0
205	movd	(%eax), %mm3		C				m 0
206	movd	%mm6, -12(%edx)		C				m 0
207	psrlq	$32, %mm6		C				m 0
208	pmuludq	%mm7, %mm3		C				m 0
209	paddq	%mm1, %mm6		C				m 0
210	movd	4(%eax), %mm0		C				m 0
211	movd	%mm6, -8(%edx)		C				m 0
212	psrlq	$32, %mm6		C				m 0
213	pmuludq	%mm7, %mm0		C				m 0
214	paddq	%mm4, %mm6		C				m 0
215	movd	8(%eax), %mm1		C				m 0
216	movd	%mm6, -4(%edx)		C				m 0
217	psrlq	$32, %mm6		C				m 0
218L(m00):	pmuludq	%mm7, %mm1		C				m 0
219	paddq	%mm3, %mm6		C				m 0
220	movd	12(%eax), %mm4		C				m 0
221	movd	%mm6, (%edx)		C				m 0
222	psrlq	$32, %mm6		C				m 0
223	lea	16(%eax), %eax		C				m 0
224	lea	16(%edx), %edx		C				m 0
C The add carries out only when ecx reaches zero, so ja (CF=0 and ZF=0)
C loops while the count is still negative.
225	add	$4, %ecx		C				m 0
226	ja	L(lpm0)			C				m 0
C Loop done with three products still pending (mm0, mm1, mm4): store one
C more limb here, set edi = rp for the outer loop, and finish at L(x0).
227	pmuludq	%mm7, %mm4		C				m 0
228	paddq	%mm0, %mm6		C				m 0
229	movd	%mm6, -12(%edx)		C				m 0
230	psrlq	$32, %mm6		C				m 0
231	paddq	%mm1, %mm6		C				m 0
232	mov	16(%esp), %edi		C rp				  0
233	jmp	L(x0)
234
C Outer loop: move the rp base up one limb, fetch the next v limb, reload up
C and the saved loop count, clear the carry, and start the first products
C and rp limb loads before jumping into the middle of the inner loop.
235L(olp0):
236	lea	4(%edi), %edi		C				am 0
237	movd	(%esi), %mm7		C				am 0
238	lea	4(%esi), %esi		C				am 0
239	mov	%edi, %edx		C rp				am 0
240	mov	20(%esp), %eax		C up				am 0
241	movd	(%eax), %mm3		C				am 0
242	mov	24(%esp), %ecx		C inner loop count		am 0
243	pxor	%mm6, %mm6		C				am 0
244	pmuludq	%mm7, %mm3		C				am 0
245	movd	4(%eax), %mm0		C				am 0
246	movd	(%edx), %mm5		C				am 0
247	pmuludq	%mm7, %mm0		C				am 0
248	movd	8(%eax), %mm1		C				am 0
249	paddq	%mm3, %mm5		C				am 0
250	movd	4(%edx), %mm4		C				am 0
251	jmp	L(am00)			C				am 0
252	ALIGN(16)			C				am 0
C Accumulating loop.  Products are formed in mm0-mm3; each is first added to
C the existing rp limb (loaded into mm4 or mm5) and that sum is then added
C into mm6, whose low word is stored back to rp.
253L(lam0):
254	pmuludq	%mm7, %mm2		C				am 0
255	paddq	%mm4, %mm6		C				am 0
256	movd	(%eax), %mm3		C				am 0
257	paddq	%mm1, %mm5		C				am 0
258	movd	-4(%edx), %mm4		C				am 0
259	movd	%mm6, -12(%edx)		C				am 0
260	psrlq	$32, %mm6		C				am 0
261	pmuludq	%mm7, %mm3		C				am 0
262	paddq	%mm5, %mm6		C				am 0
263	movd	4(%eax), %mm0		C				am 0
264	paddq	%mm2, %mm4		C				am 0
265	movd	(%edx), %mm5		C				am 0
266	movd	%mm6, -8(%edx)		C				am 0
267	psrlq	$32, %mm6		C				am 0
268	pmuludq	%mm7, %mm0		C				am 0
269	paddq	%mm4, %mm6		C				am 0
270	movd	8(%eax), %mm1		C				am 0
271	paddq	%mm3, %mm5		C				am 0
272	movd	4(%edx), %mm4		C				am 0
273	movd	%mm6, -4(%edx)		C				am 0
274	psrlq	$32, %mm6		C				am 0
275L(am00):
276	pmuludq	%mm7, %mm1		C				am 0
277	paddq	%mm5, %mm6		C				am 0
278	movd	12(%eax), %mm2		C				am 0
279	paddq	%mm0, %mm4		C				am 0
280	movd	8(%edx), %mm5		C				am 0
281	movd	%mm6, (%edx)		C				am 0
282	psrlq	$32, %mm6		C				am 0
283	lea	16(%eax), %eax		C				am 0
284	lea	16(%edx), %edx		C				am 0
285	add	$4, %ecx		C				am 0
286	jnz	L(lam0)			C				am 0
C Loop done: finish the pending product/rp-limb sums and store one limb.
287	pmuludq	%mm7, %mm2		C				am 0
288	paddq	%mm4, %mm6		C				am 0
289	paddq	%mm1, %mm5		C				am 0
290	movd	-4(%edx), %mm4		C				am 0
291	movd	%mm6, -12(%edx)		C				am 0
292	psrlq	$32, %mm6		C				am 0
293	paddq	%mm5, %mm6		C				am 0
294	paddq	%mm2, %mm4		C				am 0
C Shared tail for both kinds of pass: store the last two sum limbs and the
C final carry limb, then count down the outer loop.
295L(x0):	movd	%mm6, -8(%edx)		C				am 0
296	psrlq	$32, %mm6		C				am 0
297	paddq	%mm4, %mm6		C				am 0
298	movd	%mm6, -4(%edx)		C				am 0
299	psrlq	$32, %mm6		C				am 0
300	movd	%mm6, (%edx)		C				am 0
301	dec	%ebx			C				am 0
302	jnz	L(olp0)			C				am 0
C All v limbs done: leave MMX state, restore the three saved registers in
C reverse push order, and return.
303L(oel0):
304	emms				C				   0
305	pop	%edi			C				   0
306	pop	%ebx			C				   0
307	pop	%esi			C				   0
308	ret				C				   0
309
310
C ---------------------------------------------------------------------------
C Code for un = 1 mod 4 (reached from L(big) with ecx = 1).
C
C Stack here (three registers pushed): 16(%esp) = rp, 20(%esp) = up,
C 24(%esp) = un on entry, overwritten below with the loop count 1 - un.
C
C Registers:
C   eax = up pointer, edx = rp pointer (both advance 16 bytes per iteration)
C   ecx = negative count, stepped by 4 towards zero
C   mm7 = current v limb, mm6 = running sum / carry
C   edi = rp base of the current outer pass, esi = pointer to the next v limb
C   ebx = outer passes still to do
C
C Lines tagged "m 1":  first pass, rp[] = up[] * v[0] (products only).
C Lines tagged "am 1": later passes, which add up[] * v[j] to the limbs
C                      already stored starting one limb further up each pass.
C Both passes enter their 4-way unrolled loop in the middle (L(m01), L(am01))
C and share the tail at L(x1).
C ---------------------------------------------------------------------------
311L(1):	movd	(%eax), %mm4		C				m 1
312	sub	24(%esp), %ecx		C				m 1
313	mov	%ecx, 24(%esp)		C update loop count for later	m 1
314	pmuludq	%mm7, %mm4		C				m 1
315	movd	4(%eax), %mm3		C				m 1
316	pmuludq	%mm7, %mm3		C				m 1
317	movd	8(%eax), %mm0		C				m 1
318	jmp	L(m01)			C				m 1
319	ALIGN(16)			C				m 1
C First-pass loop.  Products are formed in mm4, mm3, mm0, mm1 while earlier
C ones are added into mm6, whose low word is stored and then shifted out.
320L(lpm1):
321	pmuludq	%mm7, %mm4		C				m 1
322	paddq	%mm0, %mm6		C				m 1
323	movd	4(%eax), %mm3		C				m 1
324	movd	%mm6, -8(%edx)		C				m 1
325	psrlq	$32, %mm6		C				m 1
326	pmuludq	%mm7, %mm3		C				m 1
327	paddq	%mm1, %mm6		C				m 1
328	movd	8(%eax), %mm0		C				m 1
329	movd	%mm6, -4(%edx)		C				m 1
330	psrlq	$32, %mm6		C				m 1
331L(m01):	pmuludq	%mm7, %mm0		C				m 1
332	paddq	%mm4, %mm6		C				m 1
333	movd	12(%eax), %mm1		C				m 1
334	movd	%mm6, (%edx)		C				m 1
335	psrlq	$32, %mm6		C				m 1
336	pmuludq	%mm7, %mm1		C				m 1
337	paddq	%mm3, %mm6		C				m 1
338	movd	16(%eax), %mm4		C				m 1
339	movd	%mm6, 4(%edx)		C				m 1
340	psrlq	$32, %mm6		C				m 1
341	lea	16(%eax), %eax		C				m 1
342	lea	16(%edx), %edx		C				m 1
C The add carries out only when ecx reaches zero, so ja (CF=0 and ZF=0)
C loops while the count is still negative.
343	add	$4, %ecx		C				m 1
344	ja	L(lpm1)			C				m 1
C Loop done with three products still pending (mm0, mm1, mm4): store one
C more limb here, set edi = rp for the outer loop, and finish at L(x1).
345	pmuludq	%mm7, %mm4		C				m 1
346	paddq	%mm0, %mm6		C				m 1
347	movd	%mm6, -8(%edx)		C				m 1
348	psrlq	$32, %mm6		C				m 1
349	paddq	%mm1, %mm6		C				m 1
350	mov	16(%esp), %edi		C rp				  1
351	jmp	L(x1)
352
C Outer loop: move the rp base up one limb, fetch the next v limb, reload up
C and the saved loop count, clear the carry, and start the first products
C and rp limb loads before jumping into the middle of the inner loop.
353L(olp1):
354	lea	4(%edi), %edi		C				am 1
355	movd	(%esi), %mm7		C				am 1
356	lea	4(%esi), %esi		C				am 1
357	mov	%edi, %edx		C rp				am 1
358	mov	20(%esp), %eax		C up				am 1
359	movd	(%eax), %mm2		C				am 1
360	mov	24(%esp), %ecx		C inner loop count		am 1
361	pxor	%mm6, %mm6		C				am 1
362	pmuludq	%mm7, %mm2		C				am 1
363	movd	4(%eax), %mm3		C				am 1
364	movd	(%edx), %mm4		C				am 1
365	pmuludq	%mm7, %mm3		C				am 1
366	movd	8(%eax), %mm0		C				am 1
367	paddq	%mm2, %mm4		C				am 1
368	movd	4(%edx), %mm5		C				am 1
369	jmp	L(am01)			C				am 1
370	ALIGN(16)			C				am 1
C Accumulating loop.  Products are formed in mm0-mm3; each is first added to
C the existing rp limb (loaded into mm4 or mm5) and that sum is then added
C into mm6, whose low word is stored back to rp.
371L(lam1):
372	pmuludq	%mm7, %mm2		C				am 1
373	paddq	%mm4, %mm6		C				am 1
374	movd	4(%eax), %mm3		C				am 1
375	paddq	%mm1, %mm5		C				am 1
376	movd	(%edx), %mm4		C				am 1
377	movd	%mm6, -8(%edx)		C				am 1
378	psrlq	$32, %mm6		C				am 1
379	pmuludq	%mm7, %mm3		C				am 1
380	paddq	%mm5, %mm6		C				am 1
381	movd	8(%eax), %mm0		C				am 1
382	paddq	%mm2, %mm4		C				am 1
383	movd	4(%edx), %mm5		C				am 1
384	movd	%mm6, -4(%edx)		C				am 1
385	psrlq	$32, %mm6		C				am 1
386L(am01):
387	pmuludq	%mm7, %mm0		C				am 1
388	paddq	%mm4, %mm6		C				am 1
389	movd	12(%eax), %mm1		C				am 1
390	paddq	%mm3, %mm5		C				am 1
391	movd	8(%edx), %mm4		C				am 1
392	movd	%mm6, (%edx)		C				am 1
393	psrlq	$32, %mm6		C				am 1
394	pmuludq	%mm7, %mm1		C				am 1
395	paddq	%mm5, %mm6		C				am 1
396	movd	16(%eax), %mm2		C				am 1
397	paddq	%mm0, %mm4		C				am 1
398	movd	12(%edx), %mm5		C				am 1
399	movd	%mm6, 4(%edx)		C				am 1
400	psrlq	$32, %mm6		C				am 1
401	lea	16(%eax), %eax		C				am 1
402	lea	16(%edx), %edx		C				am 1
403	add	$4, %ecx		C				am 1
404	jnz	L(lam1)			C				am 1
C Loop done: finish the pending product/rp-limb sums and store one limb.
405	pmuludq	%mm7, %mm2		C				am 1
406	paddq	%mm4, %mm6		C				am 1
407	paddq	%mm1, %mm5		C				am 1
408	movd	(%edx), %mm4		C				am 1
409	movd	%mm6, -8(%edx)		C				am 1
410	psrlq	$32, %mm6		C				am 1
411	paddq	%mm5, %mm6		C				am 1
412	paddq	%mm2, %mm4		C				am 1
C Shared tail for both kinds of pass: store the last two sum limbs and the
C final carry limb, then count down the outer loop.
413L(x1):	movd	%mm6, -4(%edx)		C				am 1
414	psrlq	$32, %mm6		C				am 1
415	paddq	%mm4, %mm6		C				am 1
416	movd	%mm6, (%edx)		C				am 1
417	psrlq	$32, %mm6		C				am 1
418	movd	%mm6, 4(%edx)		C				am 1
419	dec	%ebx			C				am 1
420	jnz	L(olp1)			C				am 1
C All v limbs done: leave MMX state, restore the three saved registers in
C reverse push order, and return.
421L(oel1):
422	emms				C				   1
423	pop	%edi			C				   1
424	pop	%ebx			C				   1
425	pop	%esi			C				   1
426	ret				C				   1
427
428
C ---------------------------------------------------------------------------
C Code for un = 2 mod 4 (reached from L(big) with ecx = 2).
C
C Stack here (three registers pushed): 16(%esp) = rp, 20(%esp) = up,
C 24(%esp) = un on entry, overwritten below with the loop count 2 - un.
C
C Registers:
C   eax = up pointer, edx = rp pointer (both advance 16 bytes per iteration)
C   ecx = negative count, stepped by 4 towards zero
C   mm7 = current v limb, mm6 = running sum / carry
C   edi = rp base of the current outer pass, esi = pointer to the next v limb
C   ebx = outer passes still to do
C
C Lines tagged "m 2":  first pass, rp[] = up[] * v[0] (products only).
C Lines tagged "am 2": later passes, which add up[] * v[j] to the limbs
C                      already stored starting one limb further up each pass.
C Both passes enter their 4-way unrolled loop in the middle (L(m10), L(am10))
C and share the tail at L(x2).
C ---------------------------------------------------------------------------
429L(2):	movd	(%eax), %mm1		C				m 2
430	sub	24(%esp), %ecx		C				m 2
431	mov	%ecx, 24(%esp)		C update loop count for later	m 2
432	pmuludq	%mm7, %mm1		C				m 2
433	movd	4(%eax), %mm4		C				m 2
434	pmuludq	%mm7, %mm4		C				m 2
435	movd	8(%eax), %mm3		C				m 2
436	jmp	L(m10)			C				m 2
437	ALIGN(16)			C				m 2
C First-pass loop.  Products are formed in mm1, mm4, mm3, mm0 while earlier
C ones are added into mm6, whose low word is stored and then shifted out.
438L(lpm2):
439	pmuludq	%mm7, %mm4		C				m 2
440	paddq	%mm0, %mm6		C				m 2
441	movd	8(%eax), %mm3		C				m 2
442	movd	%mm6, -4(%edx)		C				m 2
443	psrlq	$32, %mm6		C				m 2
444L(m10):	pmuludq	%mm7, %mm3		C				m 2
445	paddq	%mm1, %mm6		C				m 2
446	movd	12(%eax), %mm0		C				m 2
447	movd	%mm6, (%edx)		C				m 2
448	psrlq	$32, %mm6		C				m 2
449	pmuludq	%mm7, %mm0		C				m 2
450	paddq	%mm4, %mm6		C				m 2
451	movd	16(%eax), %mm1		C				m 2
452	movd	%mm6, 4(%edx)		C				m 2
453	psrlq	$32, %mm6		C				m 2
454	pmuludq	%mm7, %mm1		C				m 2
455	paddq	%mm3, %mm6		C				m 2
456	movd	20(%eax), %mm4		C				m 2
457	movd	%mm6, 8(%edx)		C				m 2
458	psrlq	$32, %mm6		C				m 2
459	lea	16(%eax), %eax		C				m 2
460	lea	16(%edx), %edx		C				m 2
C The add carries out only when ecx reaches zero, so ja (CF=0 and ZF=0)
C loops while the count is still negative.
461	add	$4, %ecx		C				m 2
462	ja	L(lpm2)			C				m 2
C Loop done with three products still pending (mm0, mm1, mm4): store one
C more limb here, set edi = rp for the outer loop, and finish at L(x2).
463	pmuludq	%mm7, %mm4		C				m 2
464	paddq	%mm0, %mm6		C				m 2
465	movd	%mm6, -4(%edx)		C				m 2
466	psrlq	$32, %mm6		C				m 2
467	paddq	%mm1, %mm6		C				m 2
468	mov	16(%esp), %edi		C rp				  2
469	jmp	L(x2)
470
C Outer loop: move the rp base up one limb, fetch the next v limb, reload up
C and the saved loop count, clear the carry, and start the first products
C and rp limb loads before jumping into the middle of the inner loop.
471L(olp2):
472	lea	4(%edi), %edi		C				am 2
473	movd	(%esi), %mm7		C				am 2
474	lea	4(%esi), %esi		C				am 2
475	mov	%edi, %edx		C rp				am 2
476	mov	20(%esp), %eax		C up				am 2
477	movd	(%eax), %mm1		C				am 2
478	mov	24(%esp), %ecx		C inner loop count		am 2
479	pxor	%mm6, %mm6		C				am 2
480	pmuludq	%mm7, %mm1		C				am 2
481	movd	4(%eax), %mm2		C				am 2
482	movd	(%edx), %mm5		C				am 2
483	pmuludq	%mm7, %mm2		C				am 2
484	movd	8(%eax), %mm3		C				am 2
485	paddq	%mm1, %mm5		C				am 2
486	movd	4(%edx), %mm4		C				am 2
487	jmp	L(am10)			C				am 2
488	ALIGN(16)			C				am 2
C Accumulating loop.  Products are formed in mm0-mm3; each is first added to
C the existing rp limb (loaded into mm4 or mm5) and that sum is then added
C into mm6, whose low word is stored back to rp.
489L(lam2):
490	pmuludq	%mm7, %mm2		C				am 2
491	paddq	%mm4, %mm6		C				am 2
492	movd	8(%eax), %mm3		C				am 2
493	paddq	%mm1, %mm5		C				am 2
494	movd	4(%edx), %mm4		C				am 2
495	movd	%mm6, -4(%edx)		C				am 2
496	psrlq	$32, %mm6		C				am 2
497L(am10):
498	pmuludq	%mm7, %mm3		C				am 2
499	paddq	%mm5, %mm6		C				am 2
500	movd	12(%eax), %mm0		C				am 2
501	paddq	%mm2, %mm4		C				am 2
502	movd	8(%edx), %mm5		C				am 2
503	movd	%mm6, (%edx)		C				am 2
504	psrlq	$32, %mm6		C				am 2
505	pmuludq	%mm7, %mm0		C				am 2
506	paddq	%mm4, %mm6		C				am 2
507	movd	16(%eax), %mm1		C				am 2
508	paddq	%mm3, %mm5		C				am 2
509	movd	12(%edx), %mm4		C				am 2
510	movd	%mm6, 4(%edx)		C				am 2
511	psrlq	$32, %mm6		C				am 2
512	pmuludq	%mm7, %mm1		C				am 2
513	paddq	%mm5, %mm6		C				am 2
514	movd	20(%eax), %mm2		C				am 2
515	paddq	%mm0, %mm4		C				am 2
516	movd	16(%edx), %mm5		C				am 2
517	movd	%mm6, 8(%edx)		C				am 2
518	psrlq	$32, %mm6		C				am 2
519	lea	16(%eax), %eax		C				am 2
520	lea	16(%edx), %edx		C				am 2
521	add	$4, %ecx		C				am 2
522	jnz	L(lam2)			C				am 2
C Loop done: finish the pending product/rp-limb sums and store one limb.
523	pmuludq	%mm7, %mm2		C				am 2
524	paddq	%mm4, %mm6		C				am 2
525	paddq	%mm1, %mm5		C				am 2
526	movd	4(%edx), %mm4		C				am 2
527	movd	%mm6, -4(%edx)		C				am 2
528	psrlq	$32, %mm6		C				am 2
529	paddq	%mm5, %mm6		C				am 2
530	paddq	%mm2, %mm4		C				am 2
C Shared tail for both kinds of pass: store the last two sum limbs and the
C final carry limb, then count down the outer loop.
531L(x2):	movd	%mm6, (%edx)		C				am 2
532	psrlq	$32, %mm6		C				am 2
533	paddq	%mm4, %mm6		C				am 2
534	movd	%mm6, 4(%edx)		C				am 2
535	psrlq	$32, %mm6		C				am 2
536	movd	%mm6, 8(%edx)		C				am 2
537	dec	%ebx			C				am 2
538	jnz	L(olp2)			C				am 2
C All v limbs done: leave MMX state, restore the three saved registers in
C reverse push order, and return.
539L(oel2):
540	emms				C				   2
541	pop	%edi			C				   2
542	pop	%ebx			C				   2
543	pop	%esi			C				   2
544	ret				C				   2
545
546
C ---------------------------------------------------------------------------
C Code for un = 3 mod 4 (reached from L(big) with ecx = 3).
C
C Stack here (three registers pushed): 16(%esp) = rp, 20(%esp) = up,
C 24(%esp) = un on entry, overwritten below with the loop count 3 - un.
C
C Registers:
C   eax = up pointer, edx = rp pointer (both advance 16 bytes per iteration)
C   ecx = negative count, stepped by 4 towards zero
C   mm7 = current v limb, mm6 = running sum / carry
C   edi = rp base of the current outer pass, esi = pointer to the next v limb
C   ebx = outer passes still to do
C
C Lines tagged "m 3":  first pass, rp[] = up[] * v[0] (products only).
C Lines tagged "am 3": later passes, which add up[] * v[j] to the limbs
C                      already stored starting one limb further up each pass.
C Unlike the other three residues, both loops here are entered at their top
C (L(lpm3), L(lam3)); the jmp before each only skips the alignment padding.
C The two kinds of pass share the tail at L(x3).
C ---------------------------------------------------------------------------
547L(3):	movd	(%eax), %mm0		C				m 3
548	sub	24(%esp), %ecx		C				m 3
549	mov	%ecx, 24(%esp)		C update loop count for later	m 3
550	pmuludq	%mm7, %mm0		C				m 3
551	movd	4(%eax), %mm1		C				m 3
552	pmuludq	%mm7, %mm1		C				m 3
553	movd	8(%eax), %mm4		C				m 3
554	jmp	L(lpm3)			C				m 3
555	ALIGN(16)			C				m 3
C First-pass loop.  Products are formed in mm0, mm1, mm4, mm3 while earlier
C ones are added into mm6, whose low word is stored and then shifted out.
556L(lpm3):
557	pmuludq	%mm7, %mm4		C				m 3
558	paddq	%mm0, %mm6		C				m 3
559	movd	12(%eax), %mm3		C				m 3
560	movd	%mm6, (%edx)		C				m 3
561	psrlq	$32, %mm6		C				m 3
562	pmuludq	%mm7, %mm3		C				m 3
563	paddq	%mm1, %mm6		C				m 3
564	movd	16(%eax), %mm0		C				m 3
565	movd	%mm6, 4(%edx)		C				m 3
566	psrlq	$32, %mm6		C				m 3
567	pmuludq	%mm7, %mm0		C				m 3
568	paddq	%mm4, %mm6		C				m 3
569	movd	20(%eax), %mm1		C				m 3
570	movd	%mm6, 8(%edx)		C				m 3
571	psrlq	$32, %mm6		C				m 3
572	pmuludq	%mm7, %mm1		C				m 3
573	paddq	%mm3, %mm6		C				m 3
574	movd	24(%eax), %mm4		C				m 3
575	movd	%mm6, 12(%edx)		C				m 3
576	psrlq	$32, %mm6		C				m 3
577	lea	16(%eax), %eax		C				m 3
578	lea	16(%edx), %edx		C				m 3
C The add carries out only when ecx reaches zero, so ja (CF=0 and ZF=0)
C loops while the count is still negative.
579	add	$4, %ecx		C				m 3
580	ja	L(lpm3)			C				m 3
C Loop done with three products still pending (mm0, mm1, mm4): store one
C more limb here, set edi = rp for the outer loop, and finish at L(x3).
581	pmuludq	%mm7, %mm4		C				m 3
582	paddq	%mm0, %mm6		C				m 3
583	movd	%mm6, (%edx)		C				m 3
584	psrlq	$32, %mm6		C				m 3
585	paddq	%mm1, %mm6		C				m 3
586	mov	16(%esp), %edi		C rp				  3
587	jmp	L(x3)
588
C Outer loop: move the rp base up one limb, fetch the next v limb, reload up
C and the saved loop count, clear the carry, and start the first products
C and rp limb loads before entering the inner loop.
589L(olp3):
590	lea	4(%edi), %edi		C				am 3
591	movd	(%esi), %mm7		C				am 3
592	lea	4(%esi), %esi		C				am 3
593	mov	%edi, %edx		C rp				am 3
594	mov	20(%esp), %eax		C up				am 3
595	movd	(%eax), %mm0		C				am 3
596	mov	24(%esp), %ecx		C inner loop count		am 3
597	pxor	%mm6, %mm6		C				am 3
598	pmuludq	%mm7, %mm0		C				am 3
599	movd	4(%eax), %mm1		C				am 3
600	movd	(%edx), %mm4		C				am 3
601	pmuludq	%mm7, %mm1		C				am 3
602	movd	8(%eax), %mm2		C				am 3
603	paddq	%mm0, %mm4		C				am 3
604	movd	4(%edx), %mm5		C				am 3
605	jmp	L(lam3)			C				am 3
606	ALIGN(16)			C				am 3
C Accumulating loop.  Products are formed in mm0-mm3; each is first added to
C the existing rp limb (loaded into mm4 or mm5) and that sum is then added
C into mm6, whose low word is stored back to rp.
607L(lam3):
608	pmuludq	%mm7, %mm2		C				am 3
609	paddq	%mm4, %mm6		C				am 3
610	movd	12(%eax), %mm3		C				am 3
611	paddq	%mm1, %mm5		C				am 3
612	movd	8(%edx), %mm4		C				am 3
613	movd	%mm6, (%edx)		C				am 3
614	psrlq	$32, %mm6		C				am 3
615	pmuludq	%mm7, %mm3		C				am 3
616	paddq	%mm5, %mm6		C				am 3
617	movd	16(%eax), %mm0		C				am 3
618	paddq	%mm2, %mm4		C				am 3
619	movd	12(%edx), %mm5		C				am 3
620	movd	%mm6, 4(%edx)		C				am 3
621	psrlq	$32, %mm6		C				am 3
622	pmuludq	%mm7, %mm0		C				am 3
623	paddq	%mm4, %mm6		C				am 3
624	movd	20(%eax), %mm1		C				am 3
625	paddq	%mm3, %mm5		C				am 3
626	movd	16(%edx), %mm4		C				am 3
627	movd	%mm6, 8(%edx)		C				am 3
628	psrlq	$32, %mm6		C				am 3
629	pmuludq	%mm7, %mm1		C				am 3
630	paddq	%mm5, %mm6		C				am 3
631	movd	24(%eax), %mm2		C				am 3
632	paddq	%mm0, %mm4		C				am 3
633	movd	20(%edx), %mm5		C				am 3
634	movd	%mm6, 12(%edx)		C				am 3
635	psrlq	$32, %mm6		C				am 3
636	lea	16(%eax), %eax		C				am 3
637	lea	16(%edx), %edx		C				am 3
638	add	$4, %ecx		C				am 3
639	jnz	L(lam3)			C				am 3
C Loop done: finish the pending product/rp-limb sums and store one limb.
640	pmuludq	%mm7, %mm2		C				am 3
641	paddq	%mm4, %mm6		C				am 3
642	paddq	%mm1, %mm5		C				am 3
643	movd	8(%edx), %mm4		C				am 3
644	movd	%mm6, (%edx)		C				am 3
645	psrlq	$32, %mm6		C				am 3
646	paddq	%mm5, %mm6		C				am 3
647	paddq	%mm2, %mm4		C				am 3
C Shared tail for both kinds of pass: store the last two sum limbs and the
C final carry limb, then count down the outer loop.
648L(x3):	movd	%mm6, 4(%edx)		C				am 3
649	psrlq	$32, %mm6		C				am 3
650	paddq	%mm4, %mm6		C				am 3
651	movd	%mm6, 8(%edx)		C				am 3
652	psrlq	$32, %mm6		C				am 3
653	movd	%mm6, 12(%edx)		C				am 3
654	dec	%ebx			C				am 3
655	jnz	L(olp3)			C				am 3
C All v limbs done: leave MMX state, restore the three saved registers in
C reverse push order, and return.
656L(oel3):
657	emms				C				   3
658	pop	%edi			C				   3
659	pop	%ebx			C				   3
660	pop	%esi			C				   3
661	ret				C				   3
662EPILOGUE()
663