dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
32
include(`../config.m4')
34
C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use it when
C    iterating each outer loop.  ("Overlapping software pipelining")
C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
C    all pushes.
C  * Perhaps write special code for n < M, for some small M.
C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
C    with even less pipelined code.
C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
C    Consider breaking out earlier, saving the high cost of short loops.

C void mpn_sqr_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn);
51
C Register roles (32-bit x86, cdecl; ebx/ebp/esi/edi are callee-saved and
C pushed below).  mm7 holds the invariant U limb, mm6 the mul_1 carry limb.
define(`rp',  `%edi')
define(`up',  `%esi')
define(`n',   `%ecx')

define(`un',  `%ebp')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	push	%edi
	push	%esi
	mov	12(%esp), rp
	mov	16(%esp), up
	mov	20(%esp), n

	lea	4(rp), rp	C write triangular product starting at rp[1]
	dec	n
	movd	(up), %mm7	C mm7 = u[0], invariant limb for first pass

	jz	L(one)
	lea	4(up), up
	push	%ebx
	push	%ebp
	mov	n, %eax

	movd	(up), %mm0
	neg	n
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6
	mov	n, un

	and	$3, %eax	C (xn-1) mod 4 selects the mul_1 feed-in point
	jz	L(of0)
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C mul_1 of u[1..] by u[0]; loop for (xn-1) mod 4 == 3, entered at L(m3)
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 8(rp)
	lea	16(up), up
	js	L(lm3)

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)	C store final carry limb

	inc	n
C	jz	L(done)
	lea	-12(up), up
	lea	4(rp), rp
	jmp	L(ol2)

C ================================================================
C mul_1 loop for (xn-1) mod 4 == 0, entered at L(of0)
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	js	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)	C store final carry limb

	inc	n
C	jz	L(done)
	lea	-8(up), up
	lea	8(rp), rp
	jmp	L(ol3)

C ================================================================
C mul_1 loop for (xn-1) mod 4 == 1, entered at L(of1)
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	js	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)	C store final carry limb

	inc	n
	jz	L(done)		C goes away when we add special n=2 code
	lea	-20(up), up
	lea	-4(rp), rp
	jmp	L(ol0)

C ================================================================
C mul_1 loop for (xn-1) mod 4 == 2, entered at L(of2)
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	js	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)	C store final carry limb

	inc	n
C	jz	L(done)
	lea	-16(up), up
C	lea	(rp), rp
C	jmp	L(ol1)

C ================================================================
C Outer loops: inlined addmul_1 of the next invariant U limb into rp[],
C one variant per alignment class; they chain ol2 -> ol1 -> ol0 -> ol3 -> ol2.

L(ol1):	lea	4(up,n,4), up
	movd	(up), %mm7	C read next U invariant limb
	lea	8(rp,n,4), rp
	mov	n, un

	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	inc	un
	jz	L(re1)

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a1)

L(la1):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
L(a1):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la1)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol0):	lea	(up,n,4), up
	movd	4(up), %mm7	C read next U invariant limb
	lea	4(rp,n,4), rp
	mov	n, un

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	12(up), %mm1
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a0)

L(la0):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
L(a0):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la0)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol3):	lea	12(up,n,4), up
	movd	-8(up), %mm7	C read next U invariant limb
	lea	(rp,n,4), rp	C put rp back
	mov	n, un

	movd	-4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	movd	(up), %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a3)

L(la3):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
L(a3):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la3)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol2):	lea	8(up,n,4), up
	movd	-4(up), %mm7	C read next U invariant limb
	lea	12(rp,n,4), rp
	mov	n, un

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx
	sar	$2, un
	movd	4(up), %mm1
	test	un, un		C clear carry
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	inc	un
	jnz	L(a2)
	jmp	L(re2)

L(la2):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a2):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la2)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n
	jmp	L(ol1)

C ================================================================
C Wind-down for the final short (2x1 and 1x1) addmul_1 steps.
L(re2):	psrlq	$32, %mm0
	movd	(up), %mm7	C read next U invariant limb
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	lea	4(rp), rp
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	movd	4(up), %mm1
	adc	un, %eax
	add	%ebx, (rp)
	pmuludq	%mm7, %mm1
	adc	un, %eax
	mov	%eax, 4(rp)
	movd	%mm1, %ebx

L(re1):	psrlq	$32, %mm1
	add	%ebx, 4(rp)
	movd	%mm1, %eax
	adc	un, %eax
	xor	n, n		C make n zeroness assumption below true
	mov	%eax, 8(rp)

C Finishing pass: add the diagonal squares u[i]^2 into rp[] while doubling
C the triangular cross products in place; the loop carry is saved/restored
C in the low bit of n via rcr/adc.
L(done):			C n is zero here
	mov	24(%esp), up
	mov	28(%esp), %eax

	movd	(up), %mm0
	inc	%eax
	pmuludq	%mm0, %mm0	C u[0]^2
	lea	4(up), up
	mov	20(%esp), rp
	shr	%eax
	movd	%mm0, (rp)
	psrlq	$32, %mm0
	lea	-12(rp), rp
	mov	%eax, 28(%esp)	C loop count = ceil(xn/2)
	jnc	L(odd)

	movd	%mm0, %ebp
	movd	(up), %mm0
	lea	8(rp), rp
	pmuludq	%mm0, %mm0
	lea	-4(up), up
	add	8(rp), %ebp
	movd	%mm0, %edx
	adc	12(rp), %edx
	rcr	n		C save CF in n's top bit
	jmp	L(ent)

C	ALIGN(16)		C alignment seems irrelevant
L(top):	movd	(up), %mm1
	adc	n, n		C restore CF from n
	movd	%mm0, %eax
	pmuludq	%mm1, %mm1
	movd	4(up), %mm0
	adc	(rp), %eax
	movd	%mm1, %ebx
	pmuludq	%mm0, %mm0
	psrlq	$32, %mm1
	adc	4(rp), %ebx
	movd	%mm1, %ebp
	movd	%mm0, %edx
	adc	8(rp), %ebp
	adc	12(rp), %edx
	rcr	n		C FIXME: isn't this awfully slow on atom???
	adc	%eax, (rp)
	adc	%ebx, 4(rp)
L(ent):	lea	8(up), up
	adc	%ebp, 8(rp)
	psrlq	$32, %mm0
	adc	%edx, 12(rp)
L(odd):	decl	28(%esp)
	lea	16(rp), rp
	jnz	L(top)

L(end):	adc	n, n
	movd	%mm0, %eax
	adc	n, %eax
	mov	%eax, (rp)	C store final high limb

L(rtn):	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

L(one):	pmuludq	%mm7, %mm7	C xn = 1: result is just u[0]^2
	movq	%mm7, -4(rp)
	emms
	pop	%esi
	pop	%edi
	ret
EPILOGUE()
635