1dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C TODO:
34C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
35C    scheduling could improve things by several cycles per outer iteration.
36C  * In the Lam3...Lam1 code, keep accumulation operands in registers, without
37C    storing intermediates to rp.
38C  * We might want to keep 32 in a free mm register, since the register form is
39C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
40C  * Look into different loop alignment, we now expand the code about 50 bytes
41C    with possibly needless alignment.
42C  * Use OSP, should solve feed-in latency problems.
43C  * Address relative slowness for un<=3 for Pentium M.  The old code is
44C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
45
46C INPUT PARAMETERS
47C rp		sp + 4
48C up		sp + 8
49C un		sp + 12
50
51	TEXT
52	ALIGN(16)
C mpn_sqr_basecase: square the un-limb number at up, storing the result at rp.
C The three arguments are read from the stack into edx, eax and ecx.  Small
C sizes have their own entry paths; everything else goes through L(big).
53PROLOGUE(mpn_sqr_basecase)
54	mov	4(%esp), %edx		C rp
55	mov	8(%esp), %eax		C up
56	mov	12(%esp), %ecx		C un
57
C Dispatch on un.  After each cmp, jc is taken when un is below the compared
C value and jz when it is equal, so:
C   un < 2 -> L(un1), 2 -> L(un2), 3 -> L(un3), 4 -> L(un4), otherwise L(big).
58	cmp	$2, %ecx
59	jc	L(un1)
60	jz	L(un2)
61	cmp	$4, %ecx
62	jc	L(un3)
63	jz	L(un4)
64	jmp	L(big)
65
C Reached for un < 2.  Squares up[0] with the integer mul instruction and
C stores the 64-bit product to rp[0] and rp[1].  rp is moved to ecx first
C because mul writes the high half of its result to edx.
66L(un1):	mov	(%eax), %eax
67	mov	%edx, %ecx
68	mul	%eax
69	mov	%eax, (%ecx)
70	mov	%edx, 4(%ecx)
71	ret
C un = 2.  Forms up[0]^2 (mm0), up[0]*up[1] (mm2) and up[1]^2 (mm1), then adds
C the cross product in twice.  The doubling is done by splitting the cross
C product at bit 31: its low 31 bits are shifted left by one and added at
C limb 1, and the remaining high 33 bits are added at limb 2.  Carries are
C passed upwards by shifting each 64-bit sum right by 32 after its low limb
C has been stored.
72L(un2):	movd	(%eax), %mm0		C				un=2
73	movd	(%eax), %mm2		C				un=2
74	movd	4(%eax), %mm1		C				un=2
75	pmuludq	%mm0, %mm0		C 64b weight 0			un=2
76	pmuludq	%mm1, %mm2		C 64b weight 32			un=2
77	pmuludq	%mm1, %mm1		C 64b weight 64			un=2
78	movd	%mm0, (%edx)		C				un=2
79	psrlq	$32, %mm0		C 32b weight 32			un=2
C Build the 31-bit mask from an all-ones register.
80	pcmpeqd	%mm7, %mm7		C				un=2
81	psrlq	$33, %mm7		C 0x000000007FFFFFFF		un=2
82	pand	%mm2, %mm7		C 31b weight 32			un=2
83	psrlq	$31, %mm2		C 33b weight 65			un=2
84	psllq	$1, %mm7		C 31b weight 33			un=2
85	paddq	%mm7, %mm0		C				un=2
86	movd	%mm0, 4(%edx)		C				un=2
87	psrlq	$32, %mm0		C				un=2
88	paddq	%mm2, %mm1		C				un=2
89	paddq	%mm0, %mm1		C				un=2
90	movd	%mm1, 8(%edx)		C				un=2
91	psrlq	$32, %mm1		C				un=2
92	movd	%mm1, 12(%edx)		C				un=2
C Clear the MMX state before returning.
93	emms
94	ret
C un = 3.  Writes up[0]*up[1] and up[0]*up[2], with carry, to rp[1..3].  Both
C pointers are then advanced by one limb and control goes to L(am1), which
C adds the up[1]*up[2] product and continues into the diagonal code.
95L(un3):	movd	(%eax), %mm7		C				un=3
96	movd	4(%eax), %mm6		C				un=3
97	pmuludq	%mm7, %mm6		C				un=3
98	movd	8(%eax), %mm2		C				un=3
99	pmuludq	%mm7, %mm2		C				un=3
100	movd	%mm6, 4(%edx)		C				un=3
101	psrlq	$32, %mm6		C				un=3
102	paddq	%mm2, %mm6		C				un=3
103	movd	%mm6, 8(%edx)		C				un=3
104	psrlq	$32, %mm6		C				un=3
105	movd	%mm6, 12(%edx)		C				un=3
106	lea	4(%edx), %edx		C				un=3
107	lea	4(%eax), %eax		C				un=3
108	jmp	L(am1)
C un = 4.  Writes up[0] times up[1], up[2] and up[3], with carry, to
C rp[1..4].  Both pointers are then advanced by one limb and control goes to
C L(am2), which adds the products with up[1] and then falls into L(am1).
109L(un4):	movd	(%eax), %mm7		C				un=4
110	movd	4(%eax), %mm6		C				un=4
111	pmuludq	%mm7, %mm6		C				un=4
112	movd	8(%eax), %mm0		C				un=4
113	pmuludq	%mm7, %mm0		C				un=4
114	movd	12(%eax), %mm1		C				un=4
115	pmuludq	%mm7, %mm1		C				un=4
116	movd	%mm6, 4(%edx)		C				un=4
117	psrlq	$32, %mm6		C				un=4
118	paddq	%mm0, %mm6		C				un=4
119	movd	%mm6, 8(%edx)		C				un=4
120	psrlq	$32, %mm6		C				un=4
121	paddq	%mm1, %mm6		C				un=4
122	movd	%mm6, 12(%edx)		C				un=4
123	psrlq	$32, %mm6		C				un=4
124	movd	%mm6, 16(%edx)		C				un=4
125	lea	4(%edx), %edx		C				un=4
126	lea	4(%eax), %eax		C				un=4
127	jmp	L(am2)
128
C un > 4.  Saves esi, ebx and edi (they are popped again just before L(am3))
C and sets up the first pass:
C   mm7 = up[0] (multiplier), mm6 = 0 (running carry)
C   esi and eax = up + 1 limb, edi and edx = rp + 1 limb
C   ebx = un - 4
C The entry into the 4-way unrolled first-pass code is chosen on un mod 4:
C   0 -> L(3m), 1 -> L(0m), 2 -> L(1m) by falling through, 3 -> L(2m).
129L(big):	push	%esi
130	push	%ebx
131	push	%edi
132	pxor	%mm6, %mm6
133	movd	(%eax), %mm7		C
134	lea	4(%eax), %esi		C init up, up++
135	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
136	lea	4(%edx), %edi		C init rp, rp++
137	lea	4(%edx), %edx		C rp2++
138	lea	-4(%ecx), %ebx		C loop count
139	and	$3, %ecx
140	jz	L(3m)
141	cmp	$2, %ecx
142	ja	L(2m)
143	jb	L(0m)
144
C First pass, entry used when un mod 4 == 2.  Multiplies the limbs at eax by
C mm7 and stores the products at edx, carrying through mm6.  Nothing is read
C from the destination in this pass.  The unrolled loop is entered in the
C middle, at L(m01); ecx counts down by 4 per trip.  On exit the last limbs
C are still pending in mm6 and mm4 and are written by the code at L(0).
145L(1m):
146	movd	(%eax), %mm4		C				m 1
147	lea	(%ebx), %ecx		C inner loop count		m 1
148	pmuludq	%mm7, %mm4		C				m 1
149	movd	4(%eax), %mm3		C				m 1
150	pmuludq	%mm7, %mm3		C				m 1
151	movd	8(%eax), %mm0		C				m 1
152	jmp	L(m01)			C				m 1
153	ALIGN(16)			C				m 1
154L(lpm1):
155	pmuludq	%mm7, %mm4		C				m 1
156	paddq	%mm0, %mm6		C				m 1
157	movd	4(%eax), %mm3		C				m 1
158	movd	%mm6, -8(%edx)		C				m 1
159	psrlq	$32, %mm6		C				m 1
160	pmuludq	%mm7, %mm3		C				m 1
161	paddq	%mm1, %mm6		C				m 1
162	movd	8(%eax), %mm0		C				m 1
163	movd	%mm6, -4(%edx)		C				m 1
164	psrlq	$32, %mm6		C				m 1
165L(m01):	pmuludq	%mm7, %mm0		C				m 1
166	paddq	%mm4, %mm6		C				m 1
167	movd	12(%eax), %mm1		C				m 1
168	movd	%mm6, (%edx)		C				m 1
169	psrlq	$32, %mm6		C				m 1
170	pmuludq	%mm7, %mm1		C				m 1
171	paddq	%mm3, %mm6		C				m 1
172	movd	16(%eax), %mm4		C				m 1
173	movd	%mm6, 4(%edx)		C				m 1
174	psrlq	$32, %mm6		C				m 1
175	lea	16(%eax), %eax		C				m 1
176	lea	16(%edx), %edx		C				m 1
177	sub	$4, %ecx		C				m 1
178	ja	L(lpm1)			C				m 1
C Loop exit: drain the products already loaded, then join the am 1 tail.
179	pmuludq	%mm7, %mm4		C				m 1
180	paddq	%mm0, %mm6		C				m 1
181	movd	%mm6, -8(%edx)		C				m 1
182	psrlq	$32, %mm6		C				m 1
183	paddq	%mm1, %mm6		C				m 1
184	jmp	L(0)
185
C First pass, entry used when un mod 4 == 3.  Multiplies the limbs at eax by
C mm7 and stores the products at edx, carrying through mm6.  Nothing is read
C from the destination in this pass.  The unrolled loop is entered in the
C middle, at L(m10); ecx counts down by 4 per trip.  On exit the last limbs
C are still pending in mm6 and mm4 and are written by the code at L(1).
186L(2m):
187	movd	(%eax), %mm1		C				m 2
188	lea	(%ebx), %ecx		C inner loop count		m 2
189	pmuludq	%mm7, %mm1		C				m 2
190	movd	4(%eax), %mm4		C				m 2
191	pmuludq	%mm7, %mm4		C				m 2
192	movd	8(%eax), %mm3		C				m 2
193	jmp	L(m10)			C				m 2
194	ALIGN(16)			C				m 2
195L(lpm2):
196	pmuludq	%mm7, %mm4		C				m 2
197	paddq	%mm0, %mm6		C				m 2
198	movd	8(%eax), %mm3		C				m 2
199	movd	%mm6, -4(%edx)		C				m 2
200	psrlq	$32, %mm6		C				m 2
201L(m10):	pmuludq	%mm7, %mm3		C				m 2
202	paddq	%mm1, %mm6		C				m 2
203	movd	12(%eax), %mm0		C				m 2
204	movd	%mm6, (%edx)		C				m 2
205	psrlq	$32, %mm6		C				m 2
206	pmuludq	%mm7, %mm0		C				m 2
207	paddq	%mm4, %mm6		C				m 2
208	movd	16(%eax), %mm1		C				m 2
209	movd	%mm6, 4(%edx)		C				m 2
210	psrlq	$32, %mm6		C				m 2
211	pmuludq	%mm7, %mm1		C				m 2
212	paddq	%mm3, %mm6		C				m 2
213	movd	20(%eax), %mm4		C				m 2
214	movd	%mm6, 8(%edx)		C				m 2
215	psrlq	$32, %mm6		C				m 2
216	lea	16(%eax), %eax		C				m 2
217	lea	16(%edx), %edx		C				m 2
218	sub	$4, %ecx		C				m 2
219	ja	L(lpm2)			C				m 2
C Loop exit: drain the products already loaded, then join the am 2 tail.
220	pmuludq	%mm7, %mm4		C				m 2
221	paddq	%mm0, %mm6		C				m 2
222	movd	%mm6, -4(%edx)		C				m 2
223	psrlq	$32, %mm6		C				m 2
224	paddq	%mm1, %mm6		C				m 2
225	jmp	L(1)
226
C First pass, entry used when un mod 4 == 0.  Multiplies the limbs at eax by
C mm7 and stores the products at edx, carrying through mm6.  Nothing is read
C from the destination in this pass.  Here the unrolled loop is entered at
C its top: the jmp goes straight to L(lpm3), over the ALIGN(16).  ecx counts
C down by 4 per trip.  On exit the last limbs are still pending in mm6 and
C mm4 and are written by the code at L(2).
227L(3m):
228	movd	(%eax), %mm0		C				m 3
229	lea	(%ebx), %ecx		C inner loop count		m 3
230	pmuludq	%mm7, %mm0		C				m 3
231	movd	4(%eax), %mm1		C				m 3
232	pmuludq	%mm7, %mm1		C				m 3
233	movd	8(%eax), %mm4		C				m 3
234	jmp	L(lpm3)			C				m 3
235	ALIGN(16)			C				m 3
236L(lpm3):
237	pmuludq	%mm7, %mm4		C				m 3
238	paddq	%mm0, %mm6		C				m 3
239	movd	12(%eax), %mm3		C				m 3
240	movd	%mm6, (%edx)		C				m 3
241	psrlq	$32, %mm6		C				m 3
242	pmuludq	%mm7, %mm3		C				m 3
243	paddq	%mm1, %mm6		C				m 3
244	movd	16(%eax), %mm0		C				m 3
245	movd	%mm6, 4(%edx)		C				m 3
246	psrlq	$32, %mm6		C				m 3
247	pmuludq	%mm7, %mm0		C				m 3
248	paddq	%mm4, %mm6		C				m 3
249	movd	20(%eax), %mm1		C				m 3
250	movd	%mm6, 8(%edx)		C				m 3
251	psrlq	$32, %mm6		C				m 3
252	pmuludq	%mm7, %mm1		C				m 3
253	paddq	%mm3, %mm6		C				m 3
254	movd	24(%eax), %mm4		C				m 3
255	movd	%mm6, 12(%edx)		C				m 3
256	psrlq	$32, %mm6		C				m 3
257	lea	16(%eax), %eax		C				m 3
258	lea	16(%edx), %edx		C				m 3
259	sub	$4, %ecx		C				m 3
260	ja	L(lpm3)			C				m 3
C Loop exit: drain the products already loaded, then join the am 3 tail.
261	pmuludq	%mm7, %mm4		C				m 3
262	paddq	%mm0, %mm6		C				m 3
263	movd	%mm6, (%edx)		C				m 3
264	psrlq	$32, %mm6		C				m 3
265	paddq	%mm1, %mm6		C				m 3
266	jmp	L(2)
267
C First pass, entry used when un mod 4 == 1.  Multiplies the limbs at eax by
C mm7 and stores the products at edx, carrying through mm6.  Nothing is read
C from the destination in this pass.  The unrolled loop is entered in the
C middle, at L(m00); ecx counts down by 4 per trip.  On exit the last limbs
C are still pending in mm6 and mm4 and are written by the code at L(3).
268L(0m):
269	movd	(%eax), %mm3		C				m 0
270	lea	(%ebx), %ecx		C inner loop count		m 0
271	pmuludq	%mm7, %mm3		C				m 0
272	movd	4(%eax), %mm0		C				m 0
273	pmuludq	%mm7, %mm0		C				m 0
274	movd	8(%eax), %mm1		C				m 0
275	jmp	L(m00)			C				m 0
276	ALIGN(16)			C				m 0
277L(lpm0):
278	pmuludq	%mm7, %mm4		C				m 0
279	paddq	%mm0, %mm6		C				m 0
280	movd	(%eax), %mm3		C				m 0
281	movd	%mm6, -12(%edx)		C				m 0
282	psrlq	$32, %mm6		C				m 0
283	pmuludq	%mm7, %mm3		C				m 0
284	paddq	%mm1, %mm6		C				m 0
285	movd	4(%eax), %mm0		C				m 0
286	movd	%mm6, -8(%edx)		C				m 0
287	psrlq	$32, %mm6		C				m 0
288	pmuludq	%mm7, %mm0		C				m 0
289	paddq	%mm4, %mm6		C				m 0
290	movd	8(%eax), %mm1		C				m 0
291	movd	%mm6, -4(%edx)		C				m 0
292	psrlq	$32, %mm6		C				m 0
293L(m00):	pmuludq	%mm7, %mm1		C				m 0
294	paddq	%mm3, %mm6		C				m 0
295	movd	12(%eax), %mm4		C				m 0
296	movd	%mm6, (%edx)		C				m 0
297	psrlq	$32, %mm6		C				m 0
298	lea	16(%eax), %eax		C				m 0
299	lea	16(%edx), %edx		C				m 0
300	sub	$4, %ecx		C				m 0
301	ja	L(lpm0)			C				m 0
C Loop exit: drain the products already loaded, then join the am 0 tail.
302	pmuludq	%mm7, %mm4		C				m 0
303	paddq	%mm0, %mm6		C				m 0
304	movd	%mm6, -12(%edx)		C				m 0
305	psrlq	$32, %mm6		C				m 0
306	paddq	%mm1, %mm6		C				m 0
307	jmp	L(3)
308
C Outer loop.  One trip runs four accumulate passes in a row, tagged am 3,
C am 2, am 1 and am 0 in the right-hand column, before ebx is reduced by 4
C and the loop repeats.
C
C Pass am 3: advance rp (edi) by two limbs, load the next multiplier limb
C from up (esi) into mm7 and advance up by one limb.  edx and eax are the
C working copies used inside the pass, ecx is the inner loop count taken from
C ebx and mm6 is the carry, cleared at the start.  Unlike the first pass, each
C product is added to the limb already present at edx before being stored
C back.
309L(outer):
310	lea	8(%edi), %edi		C rp += 2
311	movd	(%esi), %mm7		C				am 3
312	mov	%edi, %edx		C rp2 = rp			am 3
313	lea	4(%esi), %esi		C up++				am 3
314	lea	(%esi), %eax		C up2 = up			am 3
315	movd	(%eax), %mm0		C				am 3
316	lea	(%ebx), %ecx		C inner loop count		am 3
317	pxor	%mm6, %mm6		C				am 3
318	pmuludq	%mm7, %mm0		C				am 3
319	movd	4(%eax), %mm1		C				am 3
320	movd	(%edx), %mm4		C				am 3
321	pmuludq	%mm7, %mm1		C				am 3
322	movd	8(%eax), %mm2		C				am 3
323	paddq	%mm0, %mm4		C				am 3
324	movd	4(%edx), %mm5		C				am 3
325	jmp	L(lam3)			C				am 3
326	ALIGN(16)			C				am 3
C Four limbs per trip; loads for later limbs are issued before the stores of
C earlier ones.
327L(lam3):
328	pmuludq	%mm7, %mm2		C				am 3
329	paddq	%mm4, %mm6		C				am 3
330	movd	12(%eax), %mm3		C				am 3
331	paddq	%mm1, %mm5		C				am 3
332	movd	8(%edx), %mm4		C				am 3
333	movd	%mm6, (%edx)		C				am 3
334	psrlq	$32, %mm6		C				am 3
335	pmuludq	%mm7, %mm3		C				am 3
336	paddq	%mm5, %mm6		C				am 3
337	movd	16(%eax), %mm0		C				am 3
338	paddq	%mm2, %mm4		C				am 3
339	movd	12(%edx), %mm5		C				am 3
340	movd	%mm6, 4(%edx)		C				am 3
341	psrlq	$32, %mm6		C				am 3
342	pmuludq	%mm7, %mm0		C				am 3
343	paddq	%mm4, %mm6		C				am 3
344	movd	20(%eax), %mm1		C				am 3
345	paddq	%mm3, %mm5		C				am 3
346	movd	16(%edx), %mm4		C				am 3
347	movd	%mm6, 8(%edx)		C				am 3
348	psrlq	$32, %mm6		C				am 3
349	pmuludq	%mm7, %mm1		C				am 3
350	paddq	%mm5, %mm6		C				am 3
351	movd	24(%eax), %mm2		C				am 3
352	paddq	%mm0, %mm4		C				am 3
353	movd	20(%edx), %mm5		C				am 3
354	movd	%mm6, 12(%edx)		C				am 3
355	psrlq	$32, %mm6		C				am 3
356	lea	16(%eax), %eax		C				am 3
357	lea	16(%edx), %edx		C				am 3
358	sub	$4, %ecx		C				am 3
359	ja	L(lam3)			C				am 3
C Loop exit: finish the three limbs that were already loaded.
360	pmuludq	%mm7, %mm2		C				am 3
361	paddq	%mm4, %mm6		C				am 3
362	paddq	%mm1, %mm5		C				am 3
363	movd	8(%edx), %mm4		C				am 3
364	movd	%mm6, (%edx)		C				am 3
365	psrlq	$32, %mm6		C				am 3
366	paddq	%mm5, %mm6		C				am 3
367	paddq	%mm2, %mm4		C				am 3
C L(2) is also the target of the jump that ends the first pass at L(3m).
C The final psrlq and movd write the carry out of the top limb.
368L(2):	movd	%mm6, 4(%edx)		C				am 3
369	psrlq	$32, %mm6		C				am 3
370	paddq	%mm4, %mm6		C				am 3
371	movd	%mm6, 8(%edx)		C				am 3
372	psrlq	$32, %mm6		C				am 3
373	movd	%mm6, 12(%edx)		C				am 3
374
C Pass am 2 of the outer loop: advance rp (edi) by two limbs, load the next
C multiplier limb from up (esi) into mm7 and advance up by one limb.  edx and
C eax are the working copies, ecx is the inner loop count taken from ebx and
C mm6 is the carry, cleared at the start.  Each product is added to the limb
C already present at edx before being stored back.  The unrolled loop is
C entered in the middle, at L(am10).
375	lea	8(%edi), %edi		C rp += 2
376	movd	(%esi), %mm7		C				am 2
377	mov	%edi, %edx		C rp2 = rp			am 2
378	lea	4(%esi), %esi		C up++				am 2
379	lea	(%esi), %eax		C up2 = up			am 2
380	movd	(%eax), %mm1		C				am 2
381	lea	(%ebx), %ecx		C inner loop count		am 2
382	pxor	%mm6, %mm6		C				am 2
383	pmuludq	%mm7, %mm1		C				am 2
384	movd	4(%eax), %mm2		C				am 2
385	movd	(%edx), %mm5		C				am 2
386	pmuludq	%mm7, %mm2		C				am 2
387	movd	8(%eax), %mm3		C				am 2
388	paddq	%mm1, %mm5		C				am 2
389	movd	4(%edx), %mm4		C				am 2
390	jmp	L(am10)			C				am 2
391	ALIGN(16)			C				am 2
392L(lam2):
393	pmuludq	%mm7, %mm2		C				am 2
394	paddq	%mm4, %mm6		C				am 2
395	movd	8(%eax), %mm3		C				am 2
396	paddq	%mm1, %mm5		C				am 2
397	movd	4(%edx), %mm4		C				am 2
398	movd	%mm6, -4(%edx)		C				am 2
399	psrlq	$32, %mm6		C				am 2
400L(am10):
401	pmuludq	%mm7, %mm3		C				am 2
402	paddq	%mm5, %mm6		C				am 2
403	movd	12(%eax), %mm0		C				am 2
404	paddq	%mm2, %mm4		C				am 2
405	movd	8(%edx), %mm5		C				am 2
406	movd	%mm6, (%edx)		C				am 2
407	psrlq	$32, %mm6		C				am 2
408	pmuludq	%mm7, %mm0		C				am 2
409	paddq	%mm4, %mm6		C				am 2
410	movd	16(%eax), %mm1		C				am 2
411	paddq	%mm3, %mm5		C				am 2
412	movd	12(%edx), %mm4		C				am 2
413	movd	%mm6, 4(%edx)		C				am 2
414	psrlq	$32, %mm6		C				am 2
415	pmuludq	%mm7, %mm1		C				am 2
416	paddq	%mm5, %mm6		C				am 2
417	movd	20(%eax), %mm2		C				am 2
418	paddq	%mm0, %mm4		C				am 2
419	movd	16(%edx), %mm5		C				am 2
420	movd	%mm6, 8(%edx)		C				am 2
421	psrlq	$32, %mm6		C				am 2
422	lea	16(%eax), %eax		C				am 2
423	lea	16(%edx), %edx		C				am 2
424	sub	$4, %ecx		C				am 2
425	ja	L(lam2)			C				am 2
C Loop exit: finish the three limbs that were already loaded.
426	pmuludq	%mm7, %mm2		C				am 2
427	paddq	%mm4, %mm6		C				am 2
428	paddq	%mm1, %mm5		C				am 2
429	movd	4(%edx), %mm4		C				am 2
430	movd	%mm6, -4(%edx)		C				am 2
431	psrlq	$32, %mm6		C				am 2
432	paddq	%mm5, %mm6		C				am 2
433	paddq	%mm2, %mm4		C				am 2
C L(1) is also the target of the jump that ends the first pass at L(2m).
C The final psrlq and movd write the carry out of the top limb.
434L(1):	movd	%mm6, (%edx)		C				am 2
435	psrlq	$32, %mm6		C				am 2
436	paddq	%mm4, %mm6		C				am 2
437	movd	%mm6, 4(%edx)		C				am 2
438	psrlq	$32, %mm6		C				am 2
439	movd	%mm6, 8(%edx)		C				am 2
440
C Pass am 1 of the outer loop: advance rp (edi) by two limbs, load the next
C multiplier limb from up (esi) into mm7 and advance up by one limb.  edx and
C eax are the working copies, ecx is the inner loop count taken from ebx and
C mm6 is the carry, cleared at the start.  Each product is added to the limb
C already present at edx before being stored back.  The unrolled loop is
C entered in the middle, at L(am01).
441	lea	8(%edi), %edi		C rp += 2
442	movd	(%esi), %mm7		C				am 1
443	mov	%edi, %edx		C rp2 = rp			am 1
444	lea	4(%esi), %esi		C up++				am 1
445	lea	(%esi), %eax		C up2 = up			am 1
446	movd	(%eax), %mm2		C				am 1
447	lea	(%ebx), %ecx		C inner loop count		am 1
448	pxor	%mm6, %mm6		C				am 1
449	pmuludq	%mm7, %mm2		C				am 1
450	movd	4(%eax), %mm3		C				am 1
451	movd	(%edx), %mm4		C				am 1
452	pmuludq	%mm7, %mm3		C				am 1
453	movd	8(%eax), %mm0		C				am 1
454	paddq	%mm2, %mm4		C				am 1
455	movd	4(%edx), %mm5		C				am 1
456	jmp	L(am01)			C				am 1
457	ALIGN(16)			C				am 1
458L(lam1):
459	pmuludq	%mm7, %mm2		C				am 1
460	paddq	%mm4, %mm6		C				am 1
461	movd	4(%eax), %mm3		C				am 1
462	paddq	%mm1, %mm5		C				am 1
463	movd	(%edx), %mm4		C				am 1
464	movd	%mm6, -8(%edx)		C				am 1
465	psrlq	$32, %mm6		C				am 1
466	pmuludq	%mm7, %mm3		C				am 1
467	paddq	%mm5, %mm6		C				am 1
468	movd	8(%eax), %mm0		C				am 1
469	paddq	%mm2, %mm4		C				am 1
470	movd	4(%edx), %mm5		C				am 1
471	movd	%mm6, -4(%edx)		C				am 1
472	psrlq	$32, %mm6		C				am 1
473L(am01):
474	pmuludq	%mm7, %mm0		C				am 1
475	paddq	%mm4, %mm6		C				am 1
476	movd	12(%eax), %mm1		C				am 1
477	paddq	%mm3, %mm5		C				am 1
478	movd	8(%edx), %mm4		C				am 1
479	movd	%mm6, (%edx)		C				am 1
480	psrlq	$32, %mm6		C				am 1
481	pmuludq	%mm7, %mm1		C				am 1
482	paddq	%mm5, %mm6		C				am 1
483	movd	16(%eax), %mm2		C				am 1
484	paddq	%mm0, %mm4		C				am 1
485	movd	12(%edx), %mm5		C				am 1
486	movd	%mm6, 4(%edx)		C				am 1
487	psrlq	$32, %mm6		C				am 1
488	lea	16(%eax), %eax		C				am 1
489	lea	16(%edx), %edx		C				am 1
490	sub	$4, %ecx		C				am 1
491	ja	L(lam1)			C				am 1
C Loop exit: finish the three limbs that were already loaded.
492	pmuludq	%mm7, %mm2		C				am 1
493	paddq	%mm4, %mm6		C				am 1
494	paddq	%mm1, %mm5		C				am 1
495	movd	(%edx), %mm4		C				am 1
496	movd	%mm6, -8(%edx)		C				am 1
497	psrlq	$32, %mm6		C				am 1
498	paddq	%mm5, %mm6		C				am 1
499	paddq	%mm2, %mm4		C				am 1
C L(0) is also the target of the jump that ends the first pass at L(1m).
C The final psrlq and movd write the carry out of the top limb.
500L(0):	movd	%mm6, -4(%edx)		C				am 1
501	psrlq	$32, %mm6		C				am 1
502	paddq	%mm4, %mm6		C				am 1
503	movd	%mm6, (%edx)		C				am 1
504	psrlq	$32, %mm6		C				am 1
505	movd	%mm6, 4(%edx)		C				am 1
506
C Pass am 0 of the outer loop: advance rp (edi) by two limbs, load the next
C multiplier limb from up (esi) into mm7 and advance up by one limb.  edx and
C eax are the working copies, ecx is the inner loop count taken from ebx and
C mm6 is the carry, cleared at the start.  Each product is added to the limb
C already present at edx before being stored back.  The unrolled loop is
C entered in the middle, at L(am00).
507	lea	8(%edi), %edi		C rp += 2
508	movd	(%esi), %mm7		C				am 0
509	mov	%edi, %edx		C rp2 = rp			am 0
510	lea	4(%esi), %esi		C up++				am 0
511	lea	(%esi), %eax		C up2 = up			am 0
512	movd	(%eax), %mm3		C				am 0
513	lea	(%ebx), %ecx		C inner loop count		am 0
514	pxor	%mm6, %mm6		C				am 0
515	pmuludq	%mm7, %mm3		C				am 0
516	movd	4(%eax), %mm0		C				am 0
517	movd	(%edx), %mm5		C				am 0
518	pmuludq	%mm7, %mm0		C				am 0
519	movd	8(%eax), %mm1		C				am 0
520	paddq	%mm3, %mm5		C				am 0
521	movd	4(%edx), %mm4		C				am 0
522	jmp	L(am00)			C				am 0
523	ALIGN(16)			C				am 0
524L(lam0):
525	pmuludq	%mm7, %mm2		C				am 0
526	paddq	%mm4, %mm6		C				am 0
527	movd	(%eax), %mm3		C				am 0
528	paddq	%mm1, %mm5		C				am 0
529	movd	-4(%edx), %mm4		C				am 0
530	movd	%mm6, -12(%edx)		C				am 0
531	psrlq	$32, %mm6		C				am 0
532	pmuludq	%mm7, %mm3		C				am 0
533	paddq	%mm5, %mm6		C				am 0
534	movd	4(%eax), %mm0		C				am 0
535	paddq	%mm2, %mm4		C				am 0
536	movd	(%edx), %mm5		C				am 0
537	movd	%mm6, -8(%edx)		C				am 0
538	psrlq	$32, %mm6		C				am 0
539	pmuludq	%mm7, %mm0		C				am 0
540	paddq	%mm4, %mm6		C				am 0
541	movd	8(%eax), %mm1		C				am 0
542	paddq	%mm3, %mm5		C				am 0
543	movd	4(%edx), %mm4		C				am 0
544	movd	%mm6, -4(%edx)		C				am 0
545	psrlq	$32, %mm6		C				am 0
546L(am00):
547	pmuludq	%mm7, %mm1		C				am 0
548	paddq	%mm5, %mm6		C				am 0
549	movd	12(%eax), %mm2		C				am 0
550	paddq	%mm0, %mm4		C				am 0
551	movd	8(%edx), %mm5		C				am 0
552	movd	%mm6, (%edx)		C				am 0
553	psrlq	$32, %mm6		C				am 0
554	lea	16(%eax), %eax		C				am 0
555	lea	16(%edx), %edx		C				am 0
556	sub	$4, %ecx		C				am 0
557	ja	L(lam0)			C				am 0
C Loop exit: finish the three limbs that were already loaded.
558	pmuludq	%mm7, %mm2		C				am 0
559	paddq	%mm4, %mm6		C				am 0
560	paddq	%mm1, %mm5		C				am 0
561	movd	-4(%edx), %mm4		C				am 0
562	movd	%mm6, -12(%edx)		C				am 0
563	psrlq	$32, %mm6		C				am 0
564	paddq	%mm5, %mm6		C				am 0
565	paddq	%mm2, %mm4		C				am 0
C L(3) is also the target of the jump that ends the first pass at L(0m).
C The final psrlq and movd write the carry out of the top limb.
566L(3):	movd	%mm6, -8(%edx)		C				am 0
567	psrlq	$32, %mm6		C				am 0
568	paddq	%mm4, %mm6		C				am 0
569	movd	%mm6, -4(%edx)		C				am 0
570	psrlq	$32, %mm6		C				am 0
571	movd	%mm6, (%edx)		C				am 0
C Four multiplier limbs have been handled.  Reduce the inner loop count by 4
C and go round again while the result is above zero (unsigned, no borrow).
572	sub	$4, %ebx		C				am 0
573	ja	L(outer)			C				am 0
574
C Hand the current rp and up positions over in edx and eax for the code that
C follows, and restore the registers pushed at L(big), in reverse order.
575	mov	%edi, %edx
576	mov	%esi, %eax
577	pop	%edi
578	pop	%ebx
579	pop	%esi
580
C Tail pass with three products.  edx is moved on by two limbs, mm7 is the
C limb at eax, and mm7 times each of the three limbs after it is added to the
C three result limbs at edx.  Four limbs are stored: the three sums and the
C carry out of the top one.  eax then moves on by one limb and control falls
C through to L(am2).
581L(am3):	C up[un-1..un-3] x up[un-4]
582	lea	8(%edx), %edx		C rp2 += 2
583	movd	(%eax), %mm7
584	movd	4(%eax), %mm1
585	movd	8(%eax), %mm2
586	movd	12(%eax), %mm3
587	movd	(%edx), %mm4
588	pmuludq	%mm7, %mm1
589	movd	4(%edx), %mm5
590	pmuludq	%mm7, %mm2
591	movd	8(%edx), %mm6
592	pmuludq	%mm7, %mm3
593	paddq	%mm1, %mm4
594	paddq	%mm2, %mm5
595	paddq	%mm3, %mm6
596	movd	%mm4, (%edx)
597	psrlq	$32, %mm4
598	paddq	%mm5, %mm4
599	movd	%mm4, 4(%edx)
600	psrlq	$32, %mm4
601	paddq	%mm6, %mm4
602	movd	%mm4, 8(%edx)
603	psrlq	$32, %mm4
604	movd	%mm4, 12(%edx)		C FIXME feed through!
605	lea	4(%eax), %eax
606
C Tail pass with two products; also the continuation of L(un4).  edx is moved
C on by two limbs, mm7 is the limb at eax, and mm7 times each of the two limbs
C after it is added to the two result limbs at edx.  Three limbs are stored:
C the two sums and the carry out of the top one.  eax then moves on by one
C limb and control falls through to L(am1).
607L(am2):	C up[un-1..un-2] x up[un-3]
608	lea	8(%edx), %edx		C rp2 += 2
609	movd	(%eax), %mm7
610	movd	4(%eax), %mm1
611	movd	8(%eax), %mm2
612	movd	(%edx), %mm4
613	movd	4(%edx), %mm5
614	pmuludq	%mm7, %mm1
615	pmuludq	%mm7, %mm2
616	paddq	%mm1, %mm4
617	paddq	%mm2, %mm5
618	movd	%mm4, (%edx)
619	psrlq	$32, %mm4
620	paddq	%mm5, %mm4
621	movd	%mm4, 4(%edx)
622	psrlq	$32, %mm4
623	movd	%mm4, 8(%edx)		C FIXME feed through!
624	lea	4(%eax), %eax
625
C Tail pass with a single product; also the continuation of L(un3).  edx is
C moved on by two limbs, the limb at eax is multiplied by the limb after it,
C and the product is added to the result limb at edx.  Two limbs are stored:
C the sum and its carry.  Control then falls through to the diagonal code.
626L(am1):	C up[un-1] x up[un-2]
627	lea	8(%edx), %edx		C rp2 += 2
628	movd	(%eax), %mm7
629	movd	4(%eax), %mm2
630	movd	(%edx), %mm4
631	pmuludq	%mm7, %mm2
632	paddq	%mm2, %mm4
633	movd	%mm4, (%edx)
634	psrlq	$32, %mm4
635	movd	%mm4, 4(%edx)
636
637C *** diag stuff, use elementary code for now
C
C Diagonal pass.  The code above has left the accumulated cross products in
C rp from limb 1 upwards.  This pass walks the result two limbs at a time,
C doubles each stored limb (psllq by 1) and adds the square of the matching
C source limb, with the running carry kept in mm2.
C
C The arguments are reloaded from the stack; esp is back at its entry value
C here because the registers pushed at L(big) have already been popped.
638
639	mov	4(%esp), %edx		C rp
640	mov	8(%esp), %eax		C up
641	mov	12(%esp), %ecx		C un
642
643	movd	(%eax), %mm2
644	pmuludq	%mm2, %mm2		C src[0]^2
645
C mm7 = 0x00000000FFFFFFFF, used to pick the low half of each square.
646	pcmpeqd	%mm7, %mm7
647	psrlq	$32, %mm7
648
649	movd	4(%edx), %mm3		C dst[1]
650
C dst[0] is the low half of src[0]^2; the high half is carried into dst[1].
651	movd	%mm2, (%edx)
652	psrlq	$32, %mm2
653
654	psllq	$1, %mm3		C 2*dst[1]
655	paddq	%mm3, %mm2
656	movd	%mm2, 4(%edx)
657	psrlq	$32, %mm2
658
C The loop runs un - 2 times; the last source limb is handled after it.
659	sub	$2, %ecx
660
C Each trip squares the next source limb and updates two result limbs, then
C moves edx on by two limbs and eax by one.
661L(diag):
662	movd	4(%eax), %mm0		C src limb
663	add	$4, %eax
664	pmuludq	%mm0, %mm0
665	movq	%mm7, %mm1
666	pand	%mm0, %mm1		C diagonal low
667	psrlq	$32, %mm0		C diagonal high
668
669	movd	8(%edx), %mm3
670	psllq	$1, %mm3		C 2*dst[i]
671	paddq	%mm3, %mm1
672	paddq	%mm1, %mm2
673	movd	%mm2, 8(%edx)
674	psrlq	$32, %mm2
675
676	movd	12(%edx), %mm3
677	psllq	$1, %mm3		C 2*dst[i+1]
678	paddq	%mm3, %mm0
679	paddq	%mm0, %mm2
680	movd	%mm2, 12(%edx)
681	add	$8, %edx
682	psrlq	$32, %mm2
683
684	sub	$1, %ecx
685	jnz	L(diag)
686
C Last source limb.  Only the lower of the two result limbs is read and
C doubled here; the top limb is written as the high half of the square plus
C the carry.  mm7 can be overwritten since the mask is not needed again.
687	movd	4(%eax), %mm0		C src[size-1]
688	pmuludq	%mm0, %mm0
689	pand	%mm0, %mm7		C diagonal low
690	psrlq	$32, %mm0		C diagonal high
691
692	movd	8(%edx), %mm3		C dst[2*size-2]
693	psllq	$1, %mm3
694	paddq	%mm3, %mm7
695	paddq	%mm7, %mm2
696	movd	%mm2, 8(%edx)
697	psrlq	$32, %mm2
698
699	paddq	%mm0, %mm2
700	movd	%mm2, 12(%edx)		C dst[2*size-1]
701
C Clear the MMX state before returning.
702	emms
703	ret
704
705EPILOGUE()
706