dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * This looks different from other current redc_1.asm variants.  Consider
C    adapting this to the mainstream style.
C  * Is this code really faster than other approaches which compute q0 later?
C    Is the use of a jump table faster?  Or is the edge of this due to the
C    inlined add_n code?
C  * Put initial m[0] x q0 computation in header.
C  * Put basecases at the file's end, single them out before the pushes.

define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r11')
define(`nneg',        `%r12')
define(`mp',          `%r13')
define(`q0',          `%rbp')
define(`vp',          `%rdx')

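C For orientation, the operation implemented here is Montgomery's REDC.  A
C rough C sketch of the semantics (a reference-only sketch, modelled on GMP's
C generic mpn_redc_1; not part of this file's build):
C
C	mp_limb_t
C	mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
C	{
C	  mp_size_t j;
C	  mp_limb_t cy;
C	  for (j = n; j > 0; j--)
C	    {
C	      cy = mpn_addmul_1 (up, mp, n, up[0] * u0inv);	/* makes up[0] zero */
C	      up[0] = cy;					/* park carry limb */
C	      up++;
C	    }
C	  cy = mpn_add_n (rp, up, up - n, n);	/* fold parked carries into rp */
C	  return cy;
C	}
C
C This file computes each q0 = up[0] * u0inv one iteration ahead of time and
C inlines both the addmul_1 passes and the final add_n.
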
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbp
	mov	(up), q0		C up[0]
	push	%rbx
	imul	u0inv, q0		C first q0, for all execution paths
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	n, nneg
	neg	nneg
	lea	(mp_param,n,8), mp	C mp += n
	lea	-16(up,n,8), up		C up += n

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	lea	4(%rax), %r9
	cmp	$4, R32(n)
	cmovg	%r9, %rax
	lea	L(tab)(%rip), %r9
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')
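
C The table index is n & 3 for the basecases (n <= 4) and (n & 3) + 4 for the
C unrolled loops (n > 4).  For example, n = 2 selects entry 2 = L(2), n = 6
C selects entry 6 = L(2m4), and n = 4 selects entry 0 = L(0), which is the
C same location as L(0m4).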

	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
	TEXT

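C Basecases: L(1), L(2) and L(3) handle n = 1, 2 and 3 completely inline;
C n = 4 enters the unrolled code at L(0)/L(0m4).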
	ALIGN(16)
L(1):	mov	(mp_param), %rax
	mul	q0
	add	8(up), %rax
	adc	16(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


	ALIGN(16)
L(2):	mov	(mp_param), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	8(up), %r9
	adc	$0, %r14
	mov	%r9, q0
	imul	u0inv, q0
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r11
	mul	q0
	add	%r9, %r10
	adc	%rax, %r11
	adc	%rdx, %rbx
	add	16(up), %r11
	adc	$0, %rbx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	24(up), %rbx
	mov	%r14, (rp)
	mov	%rbx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


L(3):	mov	(mp_param), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	-8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	(up), %r10
	mov	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, -8(up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	8(up), %r10
	mov	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, (up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	16(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	24(up), %r9
	adc	$0, %r14

	xor	R32(%rax), R32(%rax)
	add	-8(up), %r10
	adc	(up), %r9
	adc	32(up), %r14
	mov	%r10, (rp)
	mov	%r9, 8(rp)
	mov	%r14, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


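C Main unrolled code.  The four variants below differ only in how the software
C pipeline is primed for n mod 4.  Each pass of the outer loop (L(lo0)..L(lo3))
C performs one 4-way unrolled addmul_1 of {mp,n} by q0 into {up,n}, parks the
C resulting carry limb in the slot where up[0] just became zero, advances up by
C one limb, and computes the next q0 (kept in %r15) from the following limb
C while the current multiplies are still in flight.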
	ALIGN(16)
L(2m4):
L(lo2):	mov	(mp,nneg,8), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r9
	mul	q0
	add	16(up,nneg,8), %r10
	adc	%rax, %r9
	mov	16(mp,nneg,8), %rax
	adc	%rdx, %r14
	mul	q0
	mov	$0, R32(%r10)		C xor?
	lea	2(nneg), i
	add	%r9, %r15
	imul	u0inv, %r15
	jmp	 L(e2)

	ALIGN(16)
L(li2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
L(e2):	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li2)

L(le2):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo2)

	mov	nneg, n
	sar	$2, n
	lea	32(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-16(up), %r8
	mov	-8(up), %r9
	add	-16(vp), %r8
	adc	-8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	lea	16(rp), rp
	jmp	L(addx)


	ALIGN(16)
L(1m4):
L(lo1):	mov	(mp,nneg,8), %rax
	xor	%r9, %r9
	xor	R32(%rbx), R32(%rbx)
	mul	q0
	mov	%rax, %r9
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r14
	mov	$0, R32(%r10)		C xor?
	mul	q0
	add	16(up,nneg,8), %r9
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	16(mp,nneg,8), %rax
	mul	q0
	lea	1(nneg), i
	add	%r14, %r15
	imul	u0inv, %r15
	jmp	 L(e1)

	ALIGN(16)
L(li1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
L(e1):	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li1)

L(le1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo1)

	mov	nneg, n
	sar	$2, n
	lea	24(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-8(up), %r8
	add	-8(vp), %r8
	mov	%r8, (rp)
	lea	8(rp), rp
	jmp	L(addx)


	ALIGN(16)
L(0):
L(0m4):
L(lo0):	mov	(mp,nneg,8), %rax
	mov	nneg, i
	mul	q0
	xor	R32(%r10), R32(%r10)
	mov	%rax, %r14
	mov	%rdx, %rbx
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %r14
	adc	%rax, %rbx
	adc	%rdx, %r10
	add	%rbx, %r15
	imul	u0inv, %r15
	jmp	L(e0)

	ALIGN(16)
L(li0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(e0):	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li0)

L(le0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo0)

	mov	nneg, n
	sar	$2, n
	clc
	lea	16(up,nneg,8), up
	lea	(up,nneg,8), vp
	jmp	L(addy)


	ALIGN(16)
L(3m4):
L(lo3):	mov	(mp,nneg,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %rbx	C result is zero, might carry
	mov	$0, R32(%rbx)		C zero
	mov	%rbx, %r14		C zero
	adc	%rax, %r10
	mov	16(mp,nneg,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	add	%r10, %r15
	mul	q0
	lea	3(nneg), i
	imul	u0inv, %r15
C	jmp	L(li3)

	ALIGN(16)
L(li3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li3)

L(le3):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	mov	%r15, q0
	lea	8(up), up
	dec	n
	jnz	L(lo3)


C ==== Addition code ====
	mov	nneg, n
	sar	$2, n
	lea	40(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-24(up), %r8
	mov	-16(up), %r9
	mov	-8(up), %r10
	add	-24(vp), %r8
	adc	-16(vp), %r9
	adc	-8(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	lea	24(rp), rp

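C Common tail of the inlined add_n: each variant above has already added and
C stored its n mod 4 leading limbs, then falls into L(addx)/L(addy), which add
C the remaining limbs of {up,.} and {vp,.} four at a time into rp.  The final
C carry is materialised in %rax at L(ad3) and becomes the return value.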
L(addx):inc	n
	jz	L(ad3)

L(addy):mov	(up), %r8
	mov	8(up), %r9
	inc	n
	jmp	L(mid)

C	ALIGN(16)
L(al3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	inc	n
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(al3)

L(ae3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

L(ad3):	mov	R32(n), R32(%rax)	C zero
	adc	R32(%rax), R32(%rax)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()