dnl  X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.
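
C In outline, this routine performs Montgomery reduction (redc) with a
C one-limb inverse.  The loop below is only a sketch of the computation,
C not code lifted from GMP:
C
C	for (j = n; j > 0; j--)
C	  {
C	    q0 = up[0] * u0inv;                   /* mod B */
C	    up[0] = mpn_addmul_1 (up, mp, n, q0); /* low limb cancels; park cy */
C	    up++;
C	  }
C	return mpn_add_n (rp, up, up - n, n);     /* fold in the parked carries */
C
C The real code unrolls the inner addmul by 4 limbs and computes the next
C q0 one outer iteration early (kept in %rbx).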

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
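C For example, the wind-down stores below pass two operands to I, -8(up)
C and -24(up,i,8); the default $1 keeps the plain -8(up) form, while $2
C selects the conservative indexed form.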
define(`I',`$1')

define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp
	lea	(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

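C Dispatch on n mod 4 (n has been negated at this point); operand sizes
C 1, 2 and 3 branch to dedicated basecases.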
	test	$1, R8(n)
	jz	L(bx0)

L(bx1):	test	$2, R8(n)
	jz	L(b3)

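C n = 1 (mod 4)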
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

L(otp1):lea	3(n), i
	mov	(mp,n,8), %rax
	mov	(up,n,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	8(mp,n,8), %rax
	adc	%rdx, %r9
	mul	q0
	mov	$0, R32(%r11)
	mov	8(up,n,8), %rbx
	add	%rax, %rbx
	mov	16(mp,n,8), %rax
	adc	%rdx, %r11
	add	%r9, %rbx
	adc	$0, %r11
	mov	16(up,n,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	24(mp,n,8), %rax
	adc	%rdx, %r9
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

	ALIGNx
L(tp1):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
L(e1):	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp1)

L(ed1):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

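C n = 3 (mod 4)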
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

L(otp3):lea	5(n), i
	mov	(mp,n,8), %rax
	mov	(up,n,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	8(mp,n,8), %rax
	adc	%rdx, %r9
	mul	q0
	mov	8(up,n,8), %rbx
	mov	$0, R32(%r11)
	add	%rax, %rbx
	mov	16(mp,n,8), %rax
	adc	%rdx, %r11
	add	%r9, %rbx
	adc	$0, %r11
	mov	16(up,n,8), %rbp
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
C	jmp	L(tp3)

	ALIGNx
L(tp3):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp3)

L(ed3):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

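C All four paths join here.  mpn_add_n adds the n high limbs of the product
C area to the n carry limbs parked below them, writes the sum to rp and
C leaves the return carry in %rax.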
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)

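C n = 0 (mod 4)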
L(b0):
L(otp0):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	$0, R32(%r11)
	mov	(up,n,8), %r10
	add	%rax, %r10
	mov	8(mp,n,8), %rax
	adc	%rdx, %r11
	mov	8(up,n,8), %rbx
	mul	q0
	add	%rax, %rbx
	mov	$0, R32(%r9)
	mov	16(mp,n,8), %rax
	adc	%rdx, %r9
	add	%r11, %rbx
	adc	$0, %r9
	mul	q0
	mov	16(up,n,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	24(mp,n,8), %rax
	adc	%rdx, %r11
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
L(e0):	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp0)

L(ed0):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

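C n = 2 (mod 4)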
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

L(otp2):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	(up,n,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,n,8), %rax
	adc	%rdx, %r11
	mov	8(up,n,8), %rbx
	mul	q0
	add	%rax, %rbx
	mov	$0, R32(%r9)
	mov	16(mp,n,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbx
	mov	$0, R32(%r11)
	mov	16(up,n,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	24(mp,n,8), %rax
	adc	%rdx, %r11
	mov	%rbx, 8(up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	-16(mp,i,8), %rax
	adc	%rdx, %r9
	mul	q0
	add	%r11, %rbp
	mov	$0, R32(%r11)
	mov	-16(up,i,8), %r10
	adc	$0, %r9
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -24(up,i,8)
L(e2):	add	%r9, %r10
	adc	$0, %r11
	mov	-8(up,i,8), %rbp
	mul	q0
	add	%rax, %rbp
	mov	$0, R32(%r9)
	mov	(mp,i,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(up,i,8)
	add	%r11, %rbp
	adc	$0, %r9
	mul	q0
	mov	(up,i,8), %r10
	mov	$0, R32(%r11)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	%rdx, %r11
	mov	%rbp, -8(up,i,8)
	add	%r9, %r10
	adc	$0, %r11
	mov	8(up,i,8), %rbp
	mov	%r10, (up,i,8)
	add	$4, i
	jnc	L(tp2)

L(ed2):	mul	q0
	add	%rax, %rbp
	adc	$0, %rdx
	add	%r11, %rbp
	adc	$0, %rdx
	mov	%rbp, I(-8(up),-24(up,i,8))
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

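C Basecase for n = 1.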
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

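C Basecase for n = 2.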
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

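C Special case for n = 3: keeps the outer loop, inner loop fully unrolled.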
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
EPILOGUE()
ASM_END()