1dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C TODO:
23C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
24C    scheduling could improve things by several cycles per outer iteration.
25C  * In the Lam3...Lam1 code, keep accumulation operands in registers,
26C    without storing intermediates to rp.
27C  * We might want to keep 32 in a free mm register, since the register form is
28C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
29C  * Look into different loop alignment, we now expand the code about 50 bytes
30C    with possibly needless alignment.
31C  * Use OSP, should solve feed-in latency problems.
32C  * Address relative slowness for un<=3 for Pentium M.  The old code is
33C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
34
35C INPUT PARAMETERS
36C rp		sp + 4
37C up		sp + 8
38C un		sp + 12
39
C mpn_sqr_basecase(rp, up, un): entry and dispatch on the operand size.
C The three arguments are read from the stack into edx (rp), eax (up) and
C ecx (un).  un < 2 goes to L(un1), un = 2 to L(un2), un = 3 to L(un3),
C un = 4 to L(un4), and anything larger to the general code at L(big).
C Note that un = 0 takes the L(un1) path; presumably callers guarantee
C un >= 1 (to be confirmed against the callers).
40	TEXT
41	ALIGN(16)
42PROLOGUE(mpn_sqr_basecase)
43	mov	4(%esp), %edx		C rp
44	mov	8(%esp), %eax		C up
45	mov	12(%esp), %ecx		C un
46
47	cmp	$2, %ecx
48	jc	L(un1)			C un < 2 (unsigned compare)
49	jz	L(un2)			C un = 2
50	cmp	$4, %ecx
51	jc	L(un3)			C un = 3
52	jz	L(un4)			C un = 4
53	jmp	L(big)			C un >= 5
54
C un = 1: rp[0..1] = up[0]^2, computed with the integer mul instruction.
C No mm register is used on this path, hence the plain ret without emms.
55L(un1):	mov	(%eax), %eax		C eax = up[0]
56	mov	%edx, %ecx		C keep rp in ecx, mul overwrites edx
57	mul	%eax			C edx:eax = up[0]^2
58	mov	%eax, (%ecx)		C rp[0] = low limb
59	mov	%edx, 4(%ecx)		C rp[1] = high limb
60	ret
C un = 2: rp[0..3] = up[0]^2 + 2*up[0]*up[1]*2^32 + up[1]^2*2^64.
C The cross product up[0]*up[1] fills all 64 bits of mm2, so it cannot be
C doubled in place.  It is split instead: the low 31 bits are masked into
C mm7 and shifted left one place (added at weight 32), and the remaining
C high 33 bits are obtained by shifting mm2 right by 31 (added to the
C weight 64 term in mm1).
61L(un2):	movd	(%eax), %mm0		C				un=2
62	movd	(%eax), %mm2		C				un=2
63	movd	4(%eax), %mm1		C				un=2
64	pmuludq	%mm0, %mm0		C 64b weight 0			un=2
65	pmuludq	%mm1, %mm2		C 64b weight 32			un=2
66	pmuludq	%mm1, %mm1		C 64b weight 64			un=2
67	movd	%mm0, (%edx)		C				un=2
68	psrlq	$32, %mm0		C 32b weight 32			un=2
69	pcmpeqd	%mm7, %mm7		C				un=2
70	psrlq	$33, %mm7		C 0x000000007FFFFFFF		un=2
71	pand	%mm2, %mm7		C 31b weight 32			un=2
72	psrlq	$31, %mm2		C 33b weight 64			un=2
73	psllq	$1, %mm7		C 31b weight 33			un=2
74	paddq	%mm7, %mm0		C				un=2
75	movd	%mm0, 4(%edx)		C				un=2
76	psrlq	$32, %mm0		C				un=2
77	paddq	%mm2, %mm1		C				un=2
78	paddq	%mm0, %mm1		C				un=2
79	movd	%mm1, 8(%edx)		C				un=2
80	psrlq	$32, %mm1		C				un=2
81	movd	%mm1, 12(%edx)		C				un=2
82	emms
83	ret
C un = 3: first row of cross products, rp[1..3] = up[0] * up[1..2], with
C mm6 carrying between limbs.  rp[0] is not written here.  Both pointers
C are then advanced by one limb and control joins L(am1), which adds
C up[1]*up[2] and continues into the diagonal pass.
84L(un3):	movd	(%eax), %mm7		C				un=3
85	movd	4(%eax), %mm6		C				un=3
86	pmuludq	%mm7, %mm6		C				un=3
87	movd	8(%eax), %mm2		C				un=3
88	pmuludq	%mm7, %mm2		C				un=3
89	movd	%mm6, 4(%edx)		C				un=3
90	psrlq	$32, %mm6		C				un=3
91	paddq	%mm2, %mm6		C				un=3
92	movd	%mm6, 8(%edx)		C				un=3
93	psrlq	$32, %mm6		C				un=3
94	movd	%mm6, 12(%edx)		C				un=3
95	lea	4(%edx), %edx		C				un=3
96	lea	4(%eax), %eax		C				un=3
97	jmp	L(am1)
C un = 4: first row of cross products, rp[1..4] = up[0] * up[1..3], with
C mm6 carrying between limbs.  rp[0] is not written here.  Both pointers
C are then advanced by one limb and control joins L(am2), which adds the
C remaining cross products and continues into the diagonal pass.
98L(un4):	movd	(%eax), %mm7		C				un=4
99	movd	4(%eax), %mm6		C				un=4
100	pmuludq	%mm7, %mm6		C				un=4
101	movd	8(%eax), %mm0		C				un=4
102	pmuludq	%mm7, %mm0		C				un=4
103	movd	12(%eax), %mm1		C				un=4
104	pmuludq	%mm7, %mm1		C				un=4
105	movd	%mm6, 4(%edx)		C				un=4
106	psrlq	$32, %mm6		C				un=4
107	paddq	%mm0, %mm6		C				un=4
108	movd	%mm6, 8(%edx)		C				un=4
109	psrlq	$32, %mm6		C				un=4
110	paddq	%mm1, %mm6		C				un=4
111	movd	%mm6, 12(%edx)		C				un=4
112	psrlq	$32, %mm6		C				un=4
113	movd	%mm6, 16(%edx)		C				un=4
114	lea	4(%edx), %edx		C				un=4
115	lea	4(%eax), %eax		C				un=4
116	jmp	L(am2)
117
C General case, un >= 5.  Saves esi, ebx and edi (restored after the outer
C loop) and sets up:
C   mm7 = up[0], the multiplier limb for the first row
C   mm6 = 0, the running carry
C   esi = up+1 and edi = rp+1, the outer loop pointers
C   eax = up+1 and edx = rp+1, the inner loop pointers
C   ebx = un-4, from which each inner loop count is copied
C The first row is then started in one of four entry sequences selected by
C un mod 4: 0 -> L(3m), 1 -> L(0m), 2 -> L(1m), 3 -> L(2m).
118L(big):	push	%esi
119	push	%ebx
120	push	%edi
121	pxor	%mm6, %mm6
122	movd	(%eax), %mm7		C
123	lea	4(%eax), %esi		C init up, up++
124	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
125	lea	4(%edx), %edi		C init rp, rp++
126	lea	4(%edx), %edx		C rp2++
127	lea	-4(%ecx), %ebx		C loop count
128	and	$3, %ecx
129	jz	L(3m)			C un mod 4 = 0
130	cmp	$2, %ecx
131	ja	L(2m)			C un mod 4 = 3
132	jb	L(0m)			C un mod 4 = 1, else fall into L(1m)
133
C First row, reached when un mod 4 = 2.  Multiplies the limbs at eax by mm7
C and stores the result limbs at edx without adding the previous contents
C of rp; mm6 carries between limbs.  The loop is unrolled four ways and
C entered in the middle at L(m01).  The last limbs are finished in the
C shared tail at L(0), after which execution continues with the am 0 row.
134L(1m):
135	movd	(%eax), %mm4		C				m 1
136	lea	(%ebx), %ecx		C inner loop count		m 1
137	pmuludq	%mm7, %mm4		C				m 1
138	movd	4(%eax), %mm3		C				m 1
139	pmuludq	%mm7, %mm3		C				m 1
140	movd	8(%eax), %mm0		C				m 1
141	jmp	L(m01)			C				m 1
142	ALIGN(16)			C				m 1
143L(lpm1):
144	pmuludq	%mm7, %mm4		C				m 1
145	paddq	%mm0, %mm6		C				m 1
146	movd	4(%eax), %mm3		C				m 1
147	movd	%mm6, -8(%edx)		C				m 1
148	psrlq	$32, %mm6		C				m 1
149	pmuludq	%mm7, %mm3		C				m 1
150	paddq	%mm1, %mm6		C				m 1
151	movd	8(%eax), %mm0		C				m 1
152	movd	%mm6, -4(%edx)		C				m 1
153	psrlq	$32, %mm6		C				m 1
154L(m01):	pmuludq	%mm7, %mm0		C				m 1
155	paddq	%mm4, %mm6		C				m 1
156	movd	12(%eax), %mm1		C				m 1
157	movd	%mm6, (%edx)		C				m 1
158	psrlq	$32, %mm6		C				m 1
159	pmuludq	%mm7, %mm1		C				m 1
160	paddq	%mm3, %mm6		C				m 1
161	movd	16(%eax), %mm4		C				m 1
162	movd	%mm6, 4(%edx)		C				m 1
163	psrlq	$32, %mm6		C				m 1
164	lea	16(%eax), %eax		C				m 1
165	lea	16(%edx), %edx		C				m 1
166	sub	$4, %ecx		C				m 1
167	ja	L(lpm1)			C				m 1
C Wind-down: mm0, mm1 and mm4 still hold pending products; L(0) stores the
C remaining limbs.
168	pmuludq	%mm7, %mm4		C				m 1
169	paddq	%mm0, %mm6		C				m 1
170	movd	%mm6, -8(%edx)		C				m 1
171	psrlq	$32, %mm6		C				m 1
172	paddq	%mm1, %mm6		C				m 1
173	jmp	L(0)
174
C First row, reached when un mod 4 = 3.  Multiplies the limbs at eax by mm7
C and stores the result limbs at edx without adding the previous contents
C of rp; mm6 carries between limbs.  The loop is unrolled four ways and
C entered in the middle at L(m10).  The last limbs are finished in the
C shared tail at L(1), after which execution continues with the am 1 row.
175L(2m):
176	movd	(%eax), %mm1		C				m 2
177	lea	(%ebx), %ecx		C inner loop count		m 2
178	pmuludq	%mm7, %mm1		C				m 2
179	movd	4(%eax), %mm4		C				m 2
180	pmuludq	%mm7, %mm4		C				m 2
181	movd	8(%eax), %mm3		C				m 2
182	jmp	L(m10)			C				m 2
183	ALIGN(16)			C				m 2
184L(lpm2):
185	pmuludq	%mm7, %mm4		C				m 2
186	paddq	%mm0, %mm6		C				m 2
187	movd	8(%eax), %mm3		C				m 2
188	movd	%mm6, -4(%edx)		C				m 2
189	psrlq	$32, %mm6		C				m 2
190L(m10):	pmuludq	%mm7, %mm3		C				m 2
191	paddq	%mm1, %mm6		C				m 2
192	movd	12(%eax), %mm0		C				m 2
193	movd	%mm6, (%edx)		C				m 2
194	psrlq	$32, %mm6		C				m 2
195	pmuludq	%mm7, %mm0		C				m 2
196	paddq	%mm4, %mm6		C				m 2
197	movd	16(%eax), %mm1		C				m 2
198	movd	%mm6, 4(%edx)		C				m 2
199	psrlq	$32, %mm6		C				m 2
200	pmuludq	%mm7, %mm1		C				m 2
201	paddq	%mm3, %mm6		C				m 2
202	movd	20(%eax), %mm4		C				m 2
203	movd	%mm6, 8(%edx)		C				m 2
204	psrlq	$32, %mm6		C				m 2
205	lea	16(%eax), %eax		C				m 2
206	lea	16(%edx), %edx		C				m 2
207	sub	$4, %ecx		C				m 2
208	ja	L(lpm2)			C				m 2
C Wind-down: mm0, mm1 and mm4 still hold pending products; L(1) stores the
C remaining limbs.
209	pmuludq	%mm7, %mm4		C				m 2
210	paddq	%mm0, %mm6		C				m 2
211	movd	%mm6, -4(%edx)		C				m 2
212	psrlq	$32, %mm6		C				m 2
213	paddq	%mm1, %mm6		C				m 2
214	jmp	L(1)
215
C First row, reached when un mod 4 = 0.  Multiplies the limbs at eax by mm7
C and stores the result limbs at edx without adding the previous contents
C of rp; mm6 carries between limbs.  The loop is unrolled four ways and
C entered at its top, L(lpm3).  The last limbs are finished in the shared
C tail at L(2), after which execution continues with the am 2 row.
216L(3m):
217	movd	(%eax), %mm0		C				m 3
218	lea	(%ebx), %ecx		C inner loop count		m 3
219	pmuludq	%mm7, %mm0		C				m 3
220	movd	4(%eax), %mm1		C				m 3
221	pmuludq	%mm7, %mm1		C				m 3
222	movd	8(%eax), %mm4		C				m 3
223	jmp	L(lpm3)			C				m 3
224	ALIGN(16)			C				m 3
225L(lpm3):
226	pmuludq	%mm7, %mm4		C				m 3
227	paddq	%mm0, %mm6		C				m 3
228	movd	12(%eax), %mm3		C				m 3
229	movd	%mm6, (%edx)		C				m 3
230	psrlq	$32, %mm6		C				m 3
231	pmuludq	%mm7, %mm3		C				m 3
232	paddq	%mm1, %mm6		C				m 3
233	movd	16(%eax), %mm0		C				m 3
234	movd	%mm6, 4(%edx)		C				m 3
235	psrlq	$32, %mm6		C				m 3
236	pmuludq	%mm7, %mm0		C				m 3
237	paddq	%mm4, %mm6		C				m 3
238	movd	20(%eax), %mm1		C				m 3
239	movd	%mm6, 8(%edx)		C				m 3
240	psrlq	$32, %mm6		C				m 3
241	pmuludq	%mm7, %mm1		C				m 3
242	paddq	%mm3, %mm6		C				m 3
243	movd	24(%eax), %mm4		C				m 3
244	movd	%mm6, 12(%edx)		C				m 3
245	psrlq	$32, %mm6		C				m 3
246	lea	16(%eax), %eax		C				m 3
247	lea	16(%edx), %edx		C				m 3
248	sub	$4, %ecx		C				m 3
249	ja	L(lpm3)			C				m 3
C Wind-down: mm0, mm1 and mm4 still hold pending products; L(2) stores the
C remaining limbs.
250	pmuludq	%mm7, %mm4		C				m 3
251	paddq	%mm0, %mm6		C				m 3
252	movd	%mm6, (%edx)		C				m 3
253	psrlq	$32, %mm6		C				m 3
254	paddq	%mm1, %mm6		C				m 3
255	jmp	L(2)
256
C First row, reached when un mod 4 = 1.  Multiplies the limbs at eax by mm7
C and stores the result limbs at edx without adding the previous contents
C of rp; mm6 carries between limbs.  The loop is unrolled four ways and
C entered in the middle at L(m00).  The last limbs are finished in the
C shared tail at L(3), which is followed by the outer loop test.
257L(0m):
258	movd	(%eax), %mm3		C				m 0
259	lea	(%ebx), %ecx		C inner loop count		m 0
260	pmuludq	%mm7, %mm3		C				m 0
261	movd	4(%eax), %mm0		C				m 0
262	pmuludq	%mm7, %mm0		C				m 0
263	movd	8(%eax), %mm1		C				m 0
264	jmp	L(m00)			C				m 0
265	ALIGN(16)			C				m 0
266L(lpm0):
267	pmuludq	%mm7, %mm4		C				m 0
268	paddq	%mm0, %mm6		C				m 0
269	movd	(%eax), %mm3		C				m 0
270	movd	%mm6, -12(%edx)		C				m 0
271	psrlq	$32, %mm6		C				m 0
272	pmuludq	%mm7, %mm3		C				m 0
273	paddq	%mm1, %mm6		C				m 0
274	movd	4(%eax), %mm0		C				m 0
275	movd	%mm6, -8(%edx)		C				m 0
276	psrlq	$32, %mm6		C				m 0
277	pmuludq	%mm7, %mm0		C				m 0
278	paddq	%mm4, %mm6		C				m 0
279	movd	8(%eax), %mm1		C				m 0
280	movd	%mm6, -4(%edx)		C				m 0
281	psrlq	$32, %mm6		C				m 0
282L(m00):	pmuludq	%mm7, %mm1		C				m 0
283	paddq	%mm3, %mm6		C				m 0
284	movd	12(%eax), %mm4		C				m 0
285	movd	%mm6, (%edx)		C				m 0
286	psrlq	$32, %mm6		C				m 0
287	lea	16(%eax), %eax		C				m 0
288	lea	16(%edx), %edx		C				m 0
289	sub	$4, %ecx		C				m 0
290	ja	L(lpm0)			C				m 0
C Wind-down: mm0, mm1 and mm4 still hold pending products; L(3) stores the
C remaining limbs.
291	pmuludq	%mm7, %mm4		C				m 0
292	paddq	%mm0, %mm6		C				m 0
293	movd	%mm6, -12(%edx)		C				m 0
294	psrlq	$32, %mm6		C				m 0
295	paddq	%mm1, %mm6		C				m 0
296	jmp	L(3)
297
C Top of the outer loop and the am 3 row.  Each of the four rows that follow
C one another (am 3, am 2, am 1, am 0) advances rp (edi) by two limbs and
C up (esi) by one limb, loads the next multiplier limb into mm7, clears the
C carry mm6, and then adds mm7 times the limbs at eax into the limbs at edx.
C The inner loop count is copied from ebx, which is only decreased (by 4)
C after the am 0 row.
C In this row mm4 and mm5 alternately hold an rp limb plus a product, and
C the loop is entered at its top, L(lam3).  L(2) is also the join point for
C the tail of the L(3m) first-row code.
298L(outer):
299	lea	8(%edi), %edi		C rp += 2
300	movd	(%esi), %mm7		C				am 3
301	mov	%edi, %edx		C rp2 = rp			am 3
302	lea	4(%esi), %esi		C up++				am 3
303	lea	(%esi), %eax		C up2 = up			am 3
304	movd	(%eax), %mm0		C				am 3
305	lea	(%ebx), %ecx		C inner loop count		am 3
306	pxor	%mm6, %mm6		C				am 3
307	pmuludq	%mm7, %mm0		C				am 3
308	movd	4(%eax), %mm1		C				am 3
309	movd	(%edx), %mm4		C				am 3
310	pmuludq	%mm7, %mm1		C				am 3
311	movd	8(%eax), %mm2		C				am 3
312	paddq	%mm0, %mm4		C				am 3
313	movd	4(%edx), %mm5		C				am 3
314	jmp	L(lam3)			C				am 3
315	ALIGN(16)			C				am 3
316L(lam3):
317	pmuludq	%mm7, %mm2		C				am 3
318	paddq	%mm4, %mm6		C				am 3
319	movd	12(%eax), %mm3		C				am 3
320	paddq	%mm1, %mm5		C				am 3
321	movd	8(%edx), %mm4		C				am 3
322	movd	%mm6, (%edx)		C				am 3
323	psrlq	$32, %mm6		C				am 3
324	pmuludq	%mm7, %mm3		C				am 3
325	paddq	%mm5, %mm6		C				am 3
326	movd	16(%eax), %mm0		C				am 3
327	paddq	%mm2, %mm4		C				am 3
328	movd	12(%edx), %mm5		C				am 3
329	movd	%mm6, 4(%edx)		C				am 3
330	psrlq	$32, %mm6		C				am 3
331	pmuludq	%mm7, %mm0		C				am 3
332	paddq	%mm4, %mm6		C				am 3
333	movd	20(%eax), %mm1		C				am 3
334	paddq	%mm3, %mm5		C				am 3
335	movd	16(%edx), %mm4		C				am 3
336	movd	%mm6, 8(%edx)		C				am 3
337	psrlq	$32, %mm6		C				am 3
338	pmuludq	%mm7, %mm1		C				am 3
339	paddq	%mm5, %mm6		C				am 3
340	movd	24(%eax), %mm2		C				am 3
341	paddq	%mm0, %mm4		C				am 3
342	movd	20(%edx), %mm5		C				am 3
343	movd	%mm6, 12(%edx)		C				am 3
344	psrlq	$32, %mm6		C				am 3
345	lea	16(%eax), %eax		C				am 3
346	lea	16(%edx), %edx		C				am 3
347	sub	$4, %ecx		C				am 3
348	ja	L(lam3)			C				am 3
C Wind-down of the am 3 row: finish the last three limbs and store the
C final carry as a new top limb.
349	pmuludq	%mm7, %mm2		C				am 3
350	paddq	%mm4, %mm6		C				am 3
351	paddq	%mm1, %mm5		C				am 3
352	movd	8(%edx), %mm4		C				am 3
353	movd	%mm6, (%edx)		C				am 3
354	psrlq	$32, %mm6		C				am 3
355	paddq	%mm5, %mm6		C				am 3
356	paddq	%mm2, %mm4		C				am 3
357L(2):	movd	%mm6, 4(%edx)		C				am 3
358	psrlq	$32, %mm6		C				am 3
359	paddq	%mm4, %mm6		C				am 3
360	movd	%mm6, 8(%edx)		C				am 3
361	psrlq	$32, %mm6		C				am 3
362	movd	%mm6, 12(%edx)		C				am 3
363
C The am 2 row, reached by falling through from the am 3 row.  Advances rp
C (edi) by two limbs and up (esi) by one limb, loads the next multiplier
C limb into mm7, clears the carry mm6, and adds mm7 times the limbs at eax
C into the limbs at edx.  The loop is entered in the middle at L(am10).
C L(1) is also the join point for the tail of the L(2m) first-row code.
364	lea	8(%edi), %edi		C rp += 2
365	movd	(%esi), %mm7		C				am 2
366	mov	%edi, %edx		C rp2 = rp			am 2
367	lea	4(%esi), %esi		C up++				am 2
368	lea	(%esi), %eax		C up2 = up			am 2
369	movd	(%eax), %mm1		C				am 2
370	lea	(%ebx), %ecx		C inner loop count		am 2
371	pxor	%mm6, %mm6		C				am 2
372	pmuludq	%mm7, %mm1		C				am 2
373	movd	4(%eax), %mm2		C				am 2
374	movd	(%edx), %mm5		C				am 2
375	pmuludq	%mm7, %mm2		C				am 2
376	movd	8(%eax), %mm3		C				am 2
377	paddq	%mm1, %mm5		C				am 2
378	movd	4(%edx), %mm4		C				am 2
379	jmp	L(am10)			C				am 2
380	ALIGN(16)			C				am 2
381L(lam2):
382	pmuludq	%mm7, %mm2		C				am 2
383	paddq	%mm4, %mm6		C				am 2
384	movd	8(%eax), %mm3		C				am 2
385	paddq	%mm1, %mm5		C				am 2
386	movd	4(%edx), %mm4		C				am 2
387	movd	%mm6, -4(%edx)		C				am 2
388	psrlq	$32, %mm6		C				am 2
389L(am10):
390	pmuludq	%mm7, %mm3		C				am 2
391	paddq	%mm5, %mm6		C				am 2
392	movd	12(%eax), %mm0		C				am 2
393	paddq	%mm2, %mm4		C				am 2
394	movd	8(%edx), %mm5		C				am 2
395	movd	%mm6, (%edx)		C				am 2
396	psrlq	$32, %mm6		C				am 2
397	pmuludq	%mm7, %mm0		C				am 2
398	paddq	%mm4, %mm6		C				am 2
399	movd	16(%eax), %mm1		C				am 2
400	paddq	%mm3, %mm5		C				am 2
401	movd	12(%edx), %mm4		C				am 2
402	movd	%mm6, 4(%edx)		C				am 2
403	psrlq	$32, %mm6		C				am 2
404	pmuludq	%mm7, %mm1		C				am 2
405	paddq	%mm5, %mm6		C				am 2
406	movd	20(%eax), %mm2		C				am 2
407	paddq	%mm0, %mm4		C				am 2
408	movd	16(%edx), %mm5		C				am 2
409	movd	%mm6, 8(%edx)		C				am 2
410	psrlq	$32, %mm6		C				am 2
411	lea	16(%eax), %eax		C				am 2
412	lea	16(%edx), %edx		C				am 2
413	sub	$4, %ecx		C				am 2
414	ja	L(lam2)			C				am 2
C Wind-down of the am 2 row: finish the last three limbs and store the
C final carry as a new top limb.
415	pmuludq	%mm7, %mm2		C				am 2
416	paddq	%mm4, %mm6		C				am 2
417	paddq	%mm1, %mm5		C				am 2
418	movd	4(%edx), %mm4		C				am 2
419	movd	%mm6, -4(%edx)		C				am 2
420	psrlq	$32, %mm6		C				am 2
421	paddq	%mm5, %mm6		C				am 2
422	paddq	%mm2, %mm4		C				am 2
423L(1):	movd	%mm6, (%edx)		C				am 2
424	psrlq	$32, %mm6		C				am 2
425	paddq	%mm4, %mm6		C				am 2
426	movd	%mm6, 4(%edx)		C				am 2
427	psrlq	$32, %mm6		C				am 2
428	movd	%mm6, 8(%edx)		C				am 2
429
C The am 1 row, reached by falling through from the am 2 row.  Advances rp
C (edi) by two limbs and up (esi) by one limb, loads the next multiplier
C limb into mm7, clears the carry mm6, and adds mm7 times the limbs at eax
C into the limbs at edx.  The loop is entered in the middle at L(am01).
C L(0) is also the join point for the tail of the L(1m) first-row code.
430	lea	8(%edi), %edi		C rp += 2
431	movd	(%esi), %mm7		C				am 1
432	mov	%edi, %edx		C rp2 = rp			am 1
433	lea	4(%esi), %esi		C up++				am 1
434	lea	(%esi), %eax		C up2 = up			am 1
435	movd	(%eax), %mm2		C				am 1
436	lea	(%ebx), %ecx		C inner loop count		am 1
437	pxor	%mm6, %mm6		C				am 1
438	pmuludq	%mm7, %mm2		C				am 1
439	movd	4(%eax), %mm3		C				am 1
440	movd	(%edx), %mm4		C				am 1
441	pmuludq	%mm7, %mm3		C				am 1
442	movd	8(%eax), %mm0		C				am 1
443	paddq	%mm2, %mm4		C				am 1
444	movd	4(%edx), %mm5		C				am 1
445	jmp	L(am01)			C				am 1
446	ALIGN(16)			C				am 1
447L(lam1):
448	pmuludq	%mm7, %mm2		C				am 1
449	paddq	%mm4, %mm6		C				am 1
450	movd	4(%eax), %mm3		C				am 1
451	paddq	%mm1, %mm5		C				am 1
452	movd	(%edx), %mm4		C				am 1
453	movd	%mm6, -8(%edx)		C				am 1
454	psrlq	$32, %mm6		C				am 1
455	pmuludq	%mm7, %mm3		C				am 1
456	paddq	%mm5, %mm6		C				am 1
457	movd	8(%eax), %mm0		C				am 1
458	paddq	%mm2, %mm4		C				am 1
459	movd	4(%edx), %mm5		C				am 1
460	movd	%mm6, -4(%edx)		C				am 1
461	psrlq	$32, %mm6		C				am 1
462L(am01):
463	pmuludq	%mm7, %mm0		C				am 1
464	paddq	%mm4, %mm6		C				am 1
465	movd	12(%eax), %mm1		C				am 1
466	paddq	%mm3, %mm5		C				am 1
467	movd	8(%edx), %mm4		C				am 1
468	movd	%mm6, (%edx)		C				am 1
469	psrlq	$32, %mm6		C				am 1
470	pmuludq	%mm7, %mm1		C				am 1
471	paddq	%mm5, %mm6		C				am 1
472	movd	16(%eax), %mm2		C				am 1
473	paddq	%mm0, %mm4		C				am 1
474	movd	12(%edx), %mm5		C				am 1
475	movd	%mm6, 4(%edx)		C				am 1
476	psrlq	$32, %mm6		C				am 1
477	lea	16(%eax), %eax		C				am 1
478	lea	16(%edx), %edx		C				am 1
479	sub	$4, %ecx		C				am 1
480	ja	L(lam1)			C				am 1
C Wind-down of the am 1 row: finish the last three limbs and store the
C final carry as a new top limb.
481	pmuludq	%mm7, %mm2		C				am 1
482	paddq	%mm4, %mm6		C				am 1
483	paddq	%mm1, %mm5		C				am 1
484	movd	(%edx), %mm4		C				am 1
485	movd	%mm6, -8(%edx)		C				am 1
486	psrlq	$32, %mm6		C				am 1
487	paddq	%mm5, %mm6		C				am 1
488	paddq	%mm2, %mm4		C				am 1
489L(0):	movd	%mm6, -4(%edx)		C				am 1
490	psrlq	$32, %mm6		C				am 1
491	paddq	%mm4, %mm6		C				am 1
492	movd	%mm6, (%edx)		C				am 1
493	psrlq	$32, %mm6		C				am 1
494	movd	%mm6, 4(%edx)		C				am 1
495
C The am 0 row, reached by falling through from the am 1 row.  Advances rp
C (edi) by two limbs and up (esi) by one limb, loads the next multiplier
C limb into mm7, clears the carry mm6, and adds mm7 times the limbs at eax
C into the limbs at edx.  The loop is entered in the middle at L(am00).
C L(3) is also the join point for the tail of the L(0m) first-row code.
C After this row ebx is decreased by 4 and the outer loop repeats while the
C result is above zero.  On exit the outer pointers are copied to edx/eax
C and the saved registers are restored, leaving the stack as at entry.
496	lea	8(%edi), %edi		C rp += 2
497	movd	(%esi), %mm7		C				am 0
498	mov	%edi, %edx		C rp2 = rp			am 0
499	lea	4(%esi), %esi		C up++				am 0
500	lea	(%esi), %eax		C up2 = up			am 0
501	movd	(%eax), %mm3		C				am 0
502	lea	(%ebx), %ecx		C inner loop count		am 0
503	pxor	%mm6, %mm6		C				am 0
504	pmuludq	%mm7, %mm3		C				am 0
505	movd	4(%eax), %mm0		C				am 0
506	movd	(%edx), %mm5		C				am 0
507	pmuludq	%mm7, %mm0		C				am 0
508	movd	8(%eax), %mm1		C				am 0
509	paddq	%mm3, %mm5		C				am 0
510	movd	4(%edx), %mm4		C				am 0
511	jmp	L(am00)			C				am 0
512	ALIGN(16)			C				am 0
513L(lam0):
514	pmuludq	%mm7, %mm2		C				am 0
515	paddq	%mm4, %mm6		C				am 0
516	movd	(%eax), %mm3		C				am 0
517	paddq	%mm1, %mm5		C				am 0
518	movd	-4(%edx), %mm4		C				am 0
519	movd	%mm6, -12(%edx)		C				am 0
520	psrlq	$32, %mm6		C				am 0
521	pmuludq	%mm7, %mm3		C				am 0
522	paddq	%mm5, %mm6		C				am 0
523	movd	4(%eax), %mm0		C				am 0
524	paddq	%mm2, %mm4		C				am 0
525	movd	(%edx), %mm5		C				am 0
526	movd	%mm6, -8(%edx)		C				am 0
527	psrlq	$32, %mm6		C				am 0
528	pmuludq	%mm7, %mm0		C				am 0
529	paddq	%mm4, %mm6		C				am 0
530	movd	8(%eax), %mm1		C				am 0
531	paddq	%mm3, %mm5		C				am 0
532	movd	4(%edx), %mm4		C				am 0
533	movd	%mm6, -4(%edx)		C				am 0
534	psrlq	$32, %mm6		C				am 0
535L(am00):
536	pmuludq	%mm7, %mm1		C				am 0
537	paddq	%mm5, %mm6		C				am 0
538	movd	12(%eax), %mm2		C				am 0
539	paddq	%mm0, %mm4		C				am 0
540	movd	8(%edx), %mm5		C				am 0
541	movd	%mm6, (%edx)		C				am 0
542	psrlq	$32, %mm6		C				am 0
543	lea	16(%eax), %eax		C				am 0
544	lea	16(%edx), %edx		C				am 0
545	sub	$4, %ecx		C				am 0
546	ja	L(lam0)			C				am 0
C Wind-down of the am 0 row: finish the last three limbs and store the
C final carry as a new top limb.
547	pmuludq	%mm7, %mm2		C				am 0
548	paddq	%mm4, %mm6		C				am 0
549	paddq	%mm1, %mm5		C				am 0
550	movd	-4(%edx), %mm4		C				am 0
551	movd	%mm6, -12(%edx)		C				am 0
552	psrlq	$32, %mm6		C				am 0
553	paddq	%mm5, %mm6		C				am 0
554	paddq	%mm2, %mm4		C				am 0
555L(3):	movd	%mm6, -8(%edx)		C				am 0
556	psrlq	$32, %mm6		C				am 0
557	paddq	%mm4, %mm6		C				am 0
558	movd	%mm6, -4(%edx)		C				am 0
559	psrlq	$32, %mm6		C				am 0
560	movd	%mm6, (%edx)		C				am 0
561	sub	$4, %ebx		C				am 0
562	ja	L(outer)			C				am 0
563
C Outer loop done: hand the outer pointers to the tail code and undo the
C three pushes made at L(big), in reverse order.
564	mov	%edi, %edx
565	mov	%esi, %eax
566	pop	%edi
567	pop	%ebx
568	pop	%esi
569
C Straight-line addmul of three limbs: advances edx by two limbs, then adds
C the limb at (eax) times the three limbs at 4(eax)..12(eax) into the three
C limbs at (edx)..8(edx) and stores the carry out at 12(edx).  eax is then
C advanced by one limb and execution falls into L(am2).
570L(am3):	C up[un-1..un-3] x up[un-4]
571	lea	8(%edx), %edx		C rp2 += 2
572	movd	(%eax), %mm7
573	movd	4(%eax), %mm1
574	movd	8(%eax), %mm2
575	movd	12(%eax), %mm3
576	movd	(%edx), %mm4
577	pmuludq	%mm7, %mm1
578	movd	4(%edx), %mm5
579	pmuludq	%mm7, %mm2
580	movd	8(%edx), %mm6
581	pmuludq	%mm7, %mm3
582	paddq	%mm1, %mm4
583	paddq	%mm2, %mm5
584	paddq	%mm3, %mm6
585	movd	%mm4, (%edx)
586	psrlq	$32, %mm4
587	paddq	%mm5, %mm4
588	movd	%mm4, 4(%edx)
589	psrlq	$32, %mm4
590	paddq	%mm6, %mm4
591	movd	%mm4, 8(%edx)
592	psrlq	$32, %mm4
593	movd	%mm4, 12(%edx)		C FIXME feed through!
594	lea	4(%eax), %eax
595
C Straight-line addmul of two limbs: advances edx by two limbs, then adds
C the limb at (eax) times the two limbs at 4(eax) and 8(eax) into the limbs
C at (edx) and 4(edx) and stores the carry out at 8(edx).  eax is then
C advanced by one limb and execution falls into L(am1).  Also entered
C directly from the un = 4 code.
596L(am2):	C up[un-1..un-2] x up[un-3]
597	lea	8(%edx), %edx		C rp2 += 2
598	movd	(%eax), %mm7
599	movd	4(%eax), %mm1
600	movd	8(%eax), %mm2
601	movd	(%edx), %mm4
602	movd	4(%edx), %mm5
603	pmuludq	%mm7, %mm1
604	pmuludq	%mm7, %mm2
605	paddq	%mm1, %mm4
606	paddq	%mm2, %mm5
607	movd	%mm4, (%edx)
608	psrlq	$32, %mm4
609	paddq	%mm5, %mm4
610	movd	%mm4, 4(%edx)
611	psrlq	$32, %mm4
612	movd	%mm4, 8(%edx)		C FIXME feed through!
613	lea	4(%eax), %eax
614
C Last cross product: advances edx by two limbs, then adds the product of
C the limbs at (eax) and 4(eax) into the limb at (edx) and stores the carry
C out at 4(edx).  Execution falls into the diagonal pass.  Also entered
C directly from the un = 3 code.
615L(am1):	C up[un-1] x up[un-2]
616	lea	8(%edx), %edx		C rp2 += 2
617	movd	(%eax), %mm7
618	movd	4(%eax), %mm2
619	movd	(%edx), %mm4
620	pmuludq	%mm7, %mm2
621	paddq	%mm2, %mm4
622	movd	%mm4, (%edx)
623	psrlq	$32, %mm4
624	movd	%mm4, 4(%edx)
625
626C *** diag stuff, use elementary code for now
C Diagonal pass, common to every path with un >= 3.  The code above has
C left the sum of the cross products in rp[1] upwards.  Here the arguments
C are reloaded from the stack (so the stack must be balanced as at entry),
C and each stored limb is doubled and combined with the squares up[i]^2:
C the low half of a square is added to an even-numbered result limb and the
C high half to the following odd-numbered one.
C   mm7 = 0x00000000FFFFFFFF, mask selecting the low half of a square
C   mm2 = running carry between result limbs
C   ecx = un-2, the trip count of L(diag), at least 1 on this path
627
628	mov	4(%esp), %edx		C rp
629	mov	8(%esp), %eax		C up
630	mov	12(%esp), %ecx		C un
631
632	movd	(%eax), %mm2
633	pmuludq	%mm2, %mm2		C src[0]^2
634
635	pcmpeqd	%mm7, %mm7
636	psrlq	$32, %mm7		C mm7 = low 32 bit mask
637
638	movd	4(%edx), %mm3		C dst[1]
639
640	movd	%mm2, (%edx)		C dst[0] = low half of src[0]^2
641	psrlq	$32, %mm2
642
643	psllq	$1, %mm3		C 2*dst[1]
644	paddq	%mm3, %mm2
645	movd	%mm2, 4(%edx)
646	psrlq	$32, %mm2
647
648	sub	$2, %ecx
649
C Each trip squares one source limb and updates two result limbs.
650L(diag):
651	movd	4(%eax), %mm0		C src limb
652	add	$4, %eax
653	pmuludq	%mm0, %mm0
654	movq	%mm7, %mm1
655	pand	%mm0, %mm1		C diagonal low
656	psrlq	$32, %mm0		C diagonal high
657
658	movd	8(%edx), %mm3
659	psllq	$1, %mm3		C 2*dst[i]
660	paddq	%mm3, %mm1
661	paddq	%mm1, %mm2
662	movd	%mm2, 8(%edx)
663	psrlq	$32, %mm2
664
665	movd	12(%edx), %mm3
666	psllq	$1, %mm3		C 2*dst[i+1]
667	paddq	%mm3, %mm0
668	paddq	%mm0, %mm2
669	movd	%mm2, 12(%edx)
670	add	$8, %edx
671	psrlq	$32, %mm2
672
673	sub	$1, %ecx
674	jnz	L(diag)
675
C Last source limb: the top result limb is only the high half of the
C square plus the carry, with no previously stored value to double.
676	movd	4(%eax), %mm0		C src[size-1]
677	pmuludq	%mm0, %mm0
678	pand	%mm0, %mm7		C diagonal low
679	psrlq	$32, %mm0		C diagonal high
680
681	movd	8(%edx), %mm3		C dst[2*size-2]
682	psllq	$1, %mm3
683	paddq	%mm3, %mm7
684	paddq	%mm7, %mm2
685	movd	%mm2, 8(%edx)
686	psrlq	$32, %mm2
687
688	paddq	%mm0, %mm2
689	movd	%mm2, 12(%edx)		C dst[2*size-1]
690
691	emms				C leave MMX state before returning
692	ret
693
694EPILOGUE()
695