dnl  X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
dnl  It also seems good for Conroe/Wolfdale.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core	 4.9		4.18-4.25		 3.87
C Intel NHM	 3.8		4.06-4.2		 3.5
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C Code structure:
C
C
C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
C           |               |               |               |
C           |               |               |               |
C           |               |               |               |
C          \|/             \|/             \|/             \|/
C              ____________                   ____________
C             /            \                 /            \
C            \|/            \               \|/            \
C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
C            \            /|\                \            /|\
C             \____________/                  \____________/
C                       \                        /
C                        \                      /
C                         \                    /
C                       tail(0m2)          tail(1m2)
C                            \              /
C                             \            /
C                            sqr_diag_addlsh1
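C
C (Notation: the Nm4/Nm2 suffixes denote the four (resp. two) residue classes
C mod 4 (resp. mod 2) that select among the unrolled mul_2/addmul_2 feed-in
C variants and the two wind-down tails.)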

C TODO
C  * Tune.  None done so far.
C  * Currently 2761 bytes; making it smaller would be nice.
C  * Consider using a jumptab-based entry sequence.  One might even use a mask-
C    less sequence, if the table is large enough to support tuneup's needs.
C    The code would be, using non-PIC code,
C        lea tab(%rip),%rax; jmp *(n,%rax)
C    or,
C        lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
C    using PIC code.  The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
C    with the last four entries repeated a safe number of times.
C  * Consider expanding feed-in code in order to avoid zeroing registers.
C  * Zero consistently with xor.
C  * Check if using "lea (reg),reg" should be done in more places; we have some
C    explicit "mov %rax,reg" now.
C  * Try zeroing with xor in m2 loops.
C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
C    between loop header and wind-down code.
C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
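C For example, I(-8(rp),-8(rp,i,8)) expands to -8(rp) with the $1 setting
C above, and to the conservative -8(rp,i,8) form with $2.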

C Define this to $1 to use late loop index variable as zero, $2 to use an
C explicit $0.
define(`Z',`$1')
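C For example, Z(i,$0) expands to i (which is zero at that point) with the
C $1 setting above, and to an immediate $0 with $2.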

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`n_param',  `%rdx')

define(`n',        `%r8')

define(`v0',       `%r10')
define(`v1',       `%r11')
define(`w0',       `%rbx')
define(`w1',       `%rcx')
define(`w2',       `%rbp')
define(`w3',       `%r9')
define(`i',        `%r13')

define(`X0',       `%r12')
define(`X1',       `%r14')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
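C MOV(src,dst,mask) copies src to dst, emitting "mov src, dst" when the mask
C bit is clear in N and the equivalent "lea (src), dst" when it is set, so
C the form used at each call site can be selected via N.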

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$4, n_param
	jl	L(small)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	mov	(up), v0
	mov	8(up), %rax
	mov	%rax, v1

	mov	$1, R32(n)
	sub	n_param, n		C n = -n_param+1
	push	n

	lea	(up,n_param,8), up
	lea	(rp,n_param,8), rp

	mul	v0

	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	mov	%rax, (rp,n,8)
	jnz	L(b10)

L(b00):	lea	(n), i			C n = 5, 9, ...
	mov	%rdx, w1		C FIXME: Use lea?
	xor	R32(w2), R32(w2)
	jmp	L(m2e0)

L(b10):	lea	2(n), i			C n = 7, 11, ...
	mov	8(up,n,8), %rax
	mov	%rdx, w3		C FIXME: Use lea?
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(m2e2)

L(bx1):	test	$2, R8(n)
	mov	%rax, (rp,n,8)
	jz	L(b11)

L(b01):	lea	1(n), i			C n = 6, 10, ...
	mov	%rdx, w0		C FIXME: Use lea?
	xor	R32(w1), R32(w1)
	jmp	L(m2e1)

L(b11):	lea	-1(n), i		C n = 4, 8, 12, ...
	mov	%rdx, w2		C FIXME: Use lea?
	xor	R32(w3), R32(w3)
	jmp	L(m2e3)


	ALIGNx
L(m2top1):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
L(m2e1):mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top1)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	jmp	L(am2o3)

	ALIGNx
L(m2top3):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
L(m2e3):mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top3)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	cmp	$-1, n
	jz	L(cor1)			C jumps iff entry n = 4

L(am2o1):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	1(n), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 128)
	mov	(rp,n,8), w1
	xor	R32(w2), R32(w2)
	mov	8(up,n,8), %rax
	xor	R32(w3), R32(w3)
	jmp	L(lo1)

	ALIGNx
L(am2top1):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
L(lo1):	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top1)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n

L(am2o3):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	-1(n), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(rp,n,8), w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	8(up,n,8), %rax
	jmp	L(lo3)

	ALIGNx
L(am2top3):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
L(lo3):	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top3)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n
	cmp	$-1, n
	jnz	L(am2o1)

L(cor1):pop	n
	mov	%rdx, w3
	mov	-16(up), v0
	mov	-8(up), %rax
	mul	v0
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(sqr_diag_addlsh1)

	ALIGNx
L(m2top2):
L(m2e2):mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top2)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	jmp	L(am2o0)

	ALIGNx
L(m2top0):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
L(m2e0):mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top0)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	cmp	$-2, n
	jz	L(cor2)			C jumps iff entry n = 5

L(am2o2):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	-2(n), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	(rp,n,8), w0
	xor	R32(w1), R32(w1)
	xor	R32(w2), R32(w2)
	mov	8(up,n,8), %rax
	jmp	L(lo2)

	ALIGNx
L(am2top2):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
L(lo2):	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top2)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n

L(am2o0):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	0(n), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 2)
	xor	R32(w0), R32(w0)
	mov	(rp,n,8), w2
	xor	R32(w3), R32(w3)
	jmp	L(lo0)

	ALIGNx
L(am2top0):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo0):	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top0)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n
	cmp	$-2, n
	jnz	L(am2o2)

L(cor2):pop	n
	mov	-24(up), v0
	mov	%rax, w2
	mov	%rdx, w0
	mov	-16(up), %rax
	mov	%rax, v1
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	-8(up), %rax
	mul	v0
	add	w2, X0
	mov	X0, -16(rp)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	-8(up), %rax
	adc	$0, X0
	mul	v1
	add	w0, X1
	adc	$0, X0
	mov	X1, -8(rp)
	add	X0, %rax
	mov	%rax, (rp)
	adc	$0, %rdx
	mov	%rdx, 8(rp)
	lea	8(rp), rp

L(sqr_diag_addlsh1):
	mov	-8(up,n,8), %rax
	shl	n
	xor	R32(%rbx), R32(%rbx)
	mul	%rax
	mov	8(rp,n,8), %r11
	lea	(%rdx), %r10
	mov	16(rp,n,8), %r9
	add	%r11, %r11
	jmp	L(dm)

	ALIGNx
L(dtop):mul	%rax
	add	%r11, %r10
	mov	8(rp,n,8), %r11
	mov	%r10, -8(rp,n,8)
	adc	%r9, %rax
	lea	(%rdx,%rbx), %r10
	mov	16(rp,n,8), %r9
	adc	%r11, %r11
L(dm):	mov	%rax, (rp,n,8)
	mov	(up,n,4), %rax
	adc	%r9, %r9
	setc	R8(%rbx)
	add	$2, n
	js	L(dtop)

	mul	%rax
	add	%r11, %r10
	mov	%r10, -8(rp)
	adc	%r9, %rax
	lea	(%rdx,%rbx), %r10
	mov	%rax, (rp)
	adc	$0, %r10
	mov	%r10, 8(rp)

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

	ALIGN(16)
L(small):
	mov	(up), %rax
	cmp	$2, n_param
	jae	L(gt1)
L(n1):
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1):	jne	L(gt2)
L(n2):	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

L(gt2):
L(n3):	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret
EPILOGUE()