redc_1.asm revision 1.1.1.1
1dnl  X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 ?
37C AMD K10	 ?
38C AMD bull	 ?
39C AMD pile	 ?
40C AMD steam	 ?
41C AMD bobcat	 ?
42C AMD jaguar	 ?
43C Intel P4	 ?
44C Intel core	 ?
45C Intel NHM	 ?
46C Intel SBR	 3.24
47C Intel IBR	 3.04
48C Intel HWL	 ?
49C Intel BWL	 ?
50C Intel atom	 ?
51C VIA nano	 ?
52
53C The inner loops of this code are the result of running a code generation and
54C optimisation tool suite written by David Harvey and Torbjörn Granlund.
55
56C TODO
57C  * Micro-optimise, none performed thus far.
58C  * Consider inlining mpn_add_n.
59C  * Single basecases out before the pushes.
60
61C When playing with pointers, set this to $2 to fall back to conservative
62C indexing in wind-down code.
63define(`I',`$1')
64
C Parameter registers.  The trailing comments give the Microsoft x64 (DOS64)
C locations; the defines themselves are the SysV (STD64) registers.
65define(`rp',          `%rdi')   C rcx
66define(`up',          `%rsi')   C rdx
67define(`mp_param',    `%rdx')   C r8
68define(`n',           `%rcx')   C r9
69define(`u0inv',       `%r8')    C stack
70
C Working registers (callee-saved, pushed in the prologue).
71define(`i',           `%r14')
72define(`j',           `%r15')
73define(`mp',          `%r12')
74define(`q0',          `%r13')
75
C Register roster, for bookkeeping while editing:
76C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
77
78ABI_SUPPORT(DOS64)
79ABI_SUPPORT(STD64)
80
C Alignment used for the hot inner-loop tops.
81define(`ALIGNx', `ALIGN(16)')
82
82ASM_START()
83	TEXT
84	ALIGN(32)
C mp_limb_t mpn_redc_1 (rp, up, mp, n, u0inv)
C Montgomery reduction of the 2n-limb number at up by the n-limb modulus at
C mp, with u0inv = -1/mp[0] mod B.  Result written to rp; return value is the
C carry out of the final mpn_add_n (see L(cj)).
85PROLOGUE(mpn_redc_1)
86	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
C Save all callee-saved GPRs used below.
88	push	%rbx
89	push	%rbp
90	push	%r12
91	push	%r13
92	push	%r14
93	push	%r15
94
95	mov	(up), q0
96	mov	n, j			C outer loop induction var
C Bias mp and up by 8(n+1) bytes, then negate n, so that -8(up,n,8)
C addresses up[0] and negative indices walk the operands upwards.
97	lea	8(mp_param,n,8), mp
98	lea	8(up,n,8), up
99	neg	n
100	imul	u0inv, q0		C first iteration q0
101
C Dispatch on n mod 4 (bits of the negated n) to one of four unrolled
C variants: bx0 -> {b0,b2}, bx1 -> {b1,b3}.
102	test	$1, R8(n)
103	jz	L(bx0)
105
C Odd n: distinguish n = 1 (mod 4) from n = 3 (mod 4).
106L(bx1):	test	$2, R8(n)
107	jz	L(b3)

C n = 1 (mod 4); n = 1 itself goes to the L(n1) basecase.
109L(b1):	cmp	$-1, R32(n)
110	jz	L(n1)

C Outer-loop head for the n = 1 (mod 4) case: start the q0*mp[] + up[]
C accumulation on the first limbs, then enter the 4-way loop at L(e1).
111L(otp1):lea	1(n), i
112	mov	-8(mp,n,8), %rax
113	mul	q0
114	mov	-8(up,n,8), %r10
115	mov	%rdx, %r11
116	add	%rax, %r10
117	mov	(mp,n,8), %rax
118	adc	$0, %r11
119	mul	q0
120	mov	%rdx, %r9
121	mov	(up,n,8), %rbx
122	add	%rax, %rbx
123	adc	$0, %r9
124	mov	(mp,i,8), %rax
125	mul	q0
126	mov	(up,i,8), %r10
127	add	%r11, %rbx
128	mov	%rbx, -8(up,i,8)	C next low remainder limb
129	adc	$0, %r9
130	imul	u0inv, %rbx		C next q limb
131	jmp	L(e1)
133
133	ALIGNx
C Inner loop, n = 1 (mod 4) alignment.  Adds q0*mp[] into up[], 4 limbs per
C iteration; r9/r11 carry the two interleaved carry chains, rbp/r10
C alternate as the current up[] limb.  i counts up by 4 until it crosses
C zero (jnc falls through on carry).
134L(tp1):	mul	q0
135	mov	-16(up,i,8), %r10
136	add	%r11, %rbp
137	mov	%rdx, %r11
138	adc	$0, %r9
139	mov	%rbp, -24(up,i,8)
140	add	%rax, %r10
141	mov	-8(mp,i,8), %rax
142	adc	$0, %r11
143	mul	q0
144	add	%r9, %r10
145	mov	%rdx, %r9
146	mov	-8(up,i,8), %rbp
147	adc	$0, %r11
148	mov	%r10, -16(up,i,8)
149	add	%rax, %rbp
150	adc	$0, %r9
151	mov	(mp,i,8), %rax
152	mul	q0
153	mov	(up,i,8), %r10
154	add	%r11, %rbp
155	mov	%rbp, -8(up,i,8)
156	adc	$0, %r9
157L(e1):	mov	%rdx, %r11
158	add	%rax, %r10
159	mov	8(mp,i,8), %rax
160	adc	$0, %r11
161	mul	q0
162	mov	8(up,i,8), %rbp
163	add	%r9, %r10
164	mov	%rdx, %r9
165	mov	%r10, (up,i,8)
166	adc	$0, %r11
167	add	%rax, %rbp
168	adc	$0, %r9
169	mov	16(mp,i,8), %rax
170	add	$4, i
171	jnc	L(tp1)

C Wind-down: fold the pending carries, store the top limb at up[0]'s slot,
C then advance the outer loop (one limb per outer iteration).
173L(ed1):	mul	q0
174	mov	I(-16(up),-16(up,i,8)), %r10
175	add	%r11, %rbp
176	adc	$0, %r9
177	mov	%rbp, I(-24(up),-24(up,i,8))
178	add	%rax, %r10
179	adc	$0, %rdx
180	add	%r9, %r10
181	adc	$0, %rdx
182	mov	%r10, I(-16(up),-16(up,i,8))
183	mov	%rdx, -8(up,n,8)	C up[0]
184	mov	%rbx, q0		C previously computed q limb -> q0
185	lea	8(up), up		C up++
186	dec	j
187	jnz	L(otp1)
188	jmp	L(cj)
190
C n = 3 (mod 4); n = 3 itself goes to the L(n3) basecase.
190L(b3):	cmp	$-3, R32(n)
191	jz	L(n3)

C Outer-loop head for the n = 3 (mod 4) case; enters the loop at L(e3).
193L(otp3):lea	3(n), i
194	mov	-8(mp,n,8), %rax
195	mul	q0
196	mov	-8(up,n,8), %r10
197	mov	%rdx, %r11
198	add	%rax, %r10
199	mov	(mp,n,8), %rax
200	adc	$0, %r11
201	mul	q0
202	mov	(up,n,8), %rbx
203	mov	%rdx, %r9
204	add	%rax, %rbx
205	adc	$0, %r9
206	mov	8(mp,n,8), %rax
207	mul	q0
208	mov	8(up,n,8), %r10
209	add	%r11, %rbx
210	mov	%rdx, %r11
211	adc	$0, %r9
212	mov	%rbx, (up,n,8)
213	imul	u0inv, %rbx		C next q limb
214	jmp	L(e3)
216
216	ALIGNx
C Inner loop, n = 3 (mod 4) alignment.  Same 4-way unrolled body as L(tp1),
C entered at L(e3); r9/r11 hold the interleaved carry chains.
217L(tp3):	mul	q0
218	mov	-16(up,i,8), %r10
219	add	%r11, %rbp
220	mov	%rdx, %r11
221	adc	$0, %r9
222	mov	%rbp, -24(up,i,8)
223L(e3):	add	%rax, %r10
224	mov	-8(mp,i,8), %rax
225	adc	$0, %r11
226	mul	q0
227	add	%r9, %r10
228	mov	%rdx, %r9
229	mov	-8(up,i,8), %rbp
230	adc	$0, %r11
231	mov	%r10, -16(up,i,8)
232	add	%rax, %rbp
233	adc	$0, %r9
234	mov	(mp,i,8), %rax
235	mul	q0
236	mov	(up,i,8), %r10
237	add	%r11, %rbp
238	mov	%rbp, -8(up,i,8)
239	adc	$0, %r9
240	mov	%rdx, %r11
241	add	%rax, %r10
242	mov	8(mp,i,8), %rax
243	adc	$0, %r11
244	mul	q0
245	mov	8(up,i,8), %rbp
246	add	%r9, %r10
247	mov	%rdx, %r9
248	mov	%r10, (up,i,8)
249	adc	$0, %r11
250	add	%rax, %rbp
251	adc	$0, %r9
252	mov	16(mp,i,8), %rax
253	add	$4, i
254	jnc	L(tp3)

C Wind-down for the n = 3 (mod 4) case; falls through into L(cj) when the
C outer loop is done.
256L(ed3):	mul	q0
257	mov	I(-16(up),-16(up,i,8)), %r10
258	add	%r11, %rbp
259	adc	$0, %r9
260	mov	%rbp, I(-24(up),-24(up,i,8))
261	add	%rax, %r10
262	adc	$0, %rdx
263	add	%r9, %r10
264	adc	$0, %rdx
265	mov	%r10, I(-16(up),-16(up,i,8))
266	mov	%rdx, -8(up,n,8)	C up[0]
267	mov	%rbx, q0		C previously computed q limb -> q0
268	lea	8(up), up		C up++
269	dec	j
270	jnz	L(otp3)
271C	jmp	L(cj)
273
C Common join: add the upper half of up[] to the accumulated lower half via
C mpn_add_n(rp, up, up-n, n); its carry becomes mpn_redc_1's return value.
C Parameter marshalling differs per ABI (IFSTD = SysV, IFDOS = Win64).
274L(cj):
IFSTD(`	lea	-8(up,n,8), up		C param 2: up
276	lea	(up,n,8), %rdx		C param 3: up - n
277	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	-8(up,n,8), %rdx	C param 2: up
280	lea	(%rdx,n,8), %r8		C param 3: up - n
281	neg	R32(n)
282	mov	n, %r9			C param 4: n
283	mov	rp, %rcx	')	C param 1: rp

C Keep %rsp 16-byte aligned at the call; Win64 additionally needs the
C 32-byte shadow space (hence $40).
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
287	ASSERT(nz, `test $15, %rsp')
288	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Restore callee-saved registers and return (carry from mpn_add_n in %rax).
292L(ret):	pop	%r15
293	pop	%r14
294	pop	%r13
295	pop	%r12
296	pop	%rbp
297	pop	%rbx
298	FUNC_EXIT()
299	ret
300
C Even n: distinguish n = 0 (mod 4) from n = 2 (mod 4).
301L(bx0):	test	$2, R8(n)
302	jnz	L(b2)

C Outer-loop head for the n = 0 (mod 4) case; enters the loop at L(e0).
304L(b0):
305L(otp0):lea	(n), i
306	mov	-8(mp,n,8), %rax
307	mul	q0
308	mov	%rdx, %r9
309	mov	-8(up,n,8), %rbp
310	add	%rax, %rbp
311	adc	$0, %r9
312	mov	(mp,n,8), %rax
313	mul	q0
314	mov	(up,n,8), %rbx
315	mov	%rdx, %r11
316	add	%rax, %rbx
317	mov	8(mp,n,8), %rax
318	adc	$0, %r11
319	mul	q0
320	mov	8(up,n,8), %rbp
321	add	%r9, %rbx
322	mov	%rdx, %r9
323	mov	%rbx, (up,n,8)
324	adc	$0, %r11
325	imul	u0inv, %rbx		C next q limb
326	jmp	L(e0)
327
328	ALIGNx
C Inner loop, n = 0 (mod 4) alignment.  Same 4-way unrolled body as L(tp1),
C entered at L(e0).
329L(tp0):	mul	q0
330	mov	-16(up,i,8), %r10
331	add	%r11, %rbp
332	mov	%rdx, %r11
333	adc	$0, %r9
334	mov	%rbp, -24(up,i,8)
335	add	%rax, %r10
336	mov	-8(mp,i,8), %rax
337	adc	$0, %r11
338	mul	q0
339	add	%r9, %r10
340	mov	%rdx, %r9
341	mov	-8(up,i,8), %rbp
342	adc	$0, %r11
343	mov	%r10, -16(up,i,8)
344	add	%rax, %rbp
345	adc	$0, %r9
346	mov	(mp,i,8), %rax
347	mul	q0
348	mov	(up,i,8), %r10
349	add	%r11, %rbp
350	mov	%rbp, -8(up,i,8)
351	adc	$0, %r9
352	mov	%rdx, %r11
353	add	%rax, %r10
354	mov	8(mp,i,8), %rax
355	adc	$0, %r11
356	mul	q0
357	mov	8(up,i,8), %rbp
358	add	%r9, %r10
359	mov	%rdx, %r9
360	mov	%r10, (up,i,8)
361	adc	$0, %r11
362L(e0):	add	%rax, %rbp
363	adc	$0, %r9
364	mov	16(mp,i,8), %rax
365	add	$4, i
366	jnc	L(tp0)

C Wind-down for the n = 0 (mod 4) case.
368L(ed0):	mul	q0
369	mov	I(-16(up),-16(up,i,8)), %r10
370	add	%r11, %rbp
371	adc	$0, %r9
372	mov	%rbp, I(-24(up),-24(up,i,8))
373	add	%rax, %r10
374	adc	$0, %rdx
375	add	%r9, %r10
376	adc	$0, %rdx
377	mov	%r10, I(-16(up),-16(up,i,8))
378	mov	%rdx, -8(up,n,8)	C up[0]
379	mov	%rbx, q0		C previously computed q limb -> q0
380	lea	8(up), up		C up++
381	dec	j
382	jnz	L(otp0)
383	jmp	L(cj)
384
C n = 2 (mod 4); n = 2 itself goes to the L(n2) basecase.
385L(b2):	cmp	$-2, R32(n)
386	jz	L(n2)

C Outer-loop head for the n = 2 (mod 4) case; enters the loop at L(e2).
388L(otp2):lea	2(n), i
389	mov	-8(mp,n,8), %rax
390	mul	q0
391	mov	-8(up,n,8), %rbp
392	mov	%rdx, %r9
393	add	%rax, %rbp
394	adc	$0, %r9
395	mov	(mp,n,8), %rax
396	mul	q0
397	mov	(up,n,8), %rbx
398	mov	%rdx, %r11
399	add	%rax, %rbx
400	mov	8(mp,n,8), %rax
401	adc	$0, %r11
402	mul	q0
403	add	%r9, %rbx
404	mov	%rdx, %r9
405	mov	8(up,n,8), %rbp
406	adc	$0, %r11
407	mov	%rbx, (up,n,8)
408	imul	u0inv, %rbx		C next q limb
409	jmp	L(e2)
410
411	ALIGNx
C Inner loop, n = 2 (mod 4) alignment.  Same 4-way unrolled body as L(tp1),
C entered at L(e2).
412L(tp2):	mul	q0
413	mov	-16(up,i,8), %r10
414	add	%r11, %rbp
415	mov	%rdx, %r11
416	adc	$0, %r9
417	mov	%rbp, -24(up,i,8)
418	add	%rax, %r10
419	mov	-8(mp,i,8), %rax
420	adc	$0, %r11
421	mul	q0
422	add	%r9, %r10
423	mov	%rdx, %r9
424	mov	-8(up,i,8), %rbp
425	adc	$0, %r11
426	mov	%r10, -16(up,i,8)
427L(e2):	add	%rax, %rbp
428	adc	$0, %r9
429	mov	(mp,i,8), %rax
430	mul	q0
431	mov	(up,i,8), %r10
432	add	%r11, %rbp
433	mov	%rbp, -8(up,i,8)
434	adc	$0, %r9
435	mov	%rdx, %r11
436	add	%rax, %r10
437	mov	8(mp,i,8), %rax
438	adc	$0, %r11
439	mul	q0
440	mov	8(up,i,8), %rbp
441	add	%r9, %r10
442	mov	%rdx, %r9
443	mov	%r10, (up,i,8)
444	adc	$0, %r11
445	add	%rax, %rbp
446	adc	$0, %r9
447	mov	16(mp,i,8), %rax
448	add	$4, i
449	jnc	L(tp2)

C Wind-down for the n = 2 (mod 4) case.
451L(ed2):	mul	q0
452	mov	I(-16(up),-16(up,i,8)), %r10
453	add	%r11, %rbp
454	adc	$0, %r9
455	mov	%rbp, I(-24(up),-24(up,i,8))
456	add	%rax, %r10
457	adc	$0, %rdx
458	add	%r9, %r10
459	adc	$0, %rdx
460	mov	%r10, I(-16(up),-16(up,i,8))
461	mov	%rdx, -8(up,n,8)	C up[0]
462	mov	%rbx, q0		C previously computed q limb -> q0
463	lea	8(up), up		C up++
464	dec	j
465	jnz	L(otp2)
466	jmp	L(cj)
467
C n = 1 basecase: single q0*mp[0], fold into the two up limbs, store the
C result limb and return the carry in %rax (no mpn_add_n call needed).
468L(n1):	mov	(mp_param), %rax
469	mul	q0
470	add	-16(up), %rax
471	adc	-8(up), %rdx
472	mov	%rdx, (rp)
473	mov	$0, R32(%rax)
474	adc	R32(%rax), R32(%rax)
475	jmp	L(ret)
476
C n = 2 basecase: two reduction steps done straight-line, second q0 computed
C mid-stream; stores both result limbs and returns the carry in %rax.
477L(n2):	mov	(mp_param), %rax
478	mov	-24(up), %rbp
479	mul	q0
480	add	%rax, %rbp
481	mov	%rdx, %r9
482	adc	$0, %r9
483	mov	-16(mp), %rax
484	mov	-16(up), %r10
485	mul	q0
486	add	%rax, %r10
487	mov	%rdx, %r11
488	adc	$0, %r11
489	add	%r9, %r10
490	adc	$0, %r11
491	mov	%r10, q0
492	imul	u0inv, q0		C next q0
493	mov	-24(mp), %rax
494	mul	q0
495	add	%rax, %r10
496	mov	%rdx, %r9
497	adc	$0, %r9
498	mov	-16(mp), %rax
499	mov	-8(up), %r14
500	mul	q0
501	add	%rax, %r14
502	adc	$0, %rdx
503	add	%r9, %r14
504	adc	$0, %rdx
505	xor	R32(%rax), R32(%rax)
506	add	%r11, %r14
507	adc	(up), %rdx
508	mov	%r14, (rp)
509	mov	%rdx, 8(rp)
510	adc	R32(%rax), R32(%rax)
511	jmp	L(ret)
512
513	ALIGNx
C n = 3 basecase: one reduction step per pass, looping j times (j = n on
C entry), then joins the common add at L(cj).
514L(n3):	mov	-32(mp), %rax
515	mov	-32(up), %r10
516	mul	q0
517	add	%rax, %r10
518	mov	-24(mp), %rax
519	mov	%rdx, %r11
520	adc	$0, %r11
521	mov	-24(up), %rbp
522	mul	q0
523	add	%rax, %rbp
524	mov	%rdx, %r9
525	adc	$0, %r9
526	mov	-16(mp), %rax
527	add	%r11, %rbp
528	mov	-16(up), %r10
529	adc	$0, %r9
530	mul	q0
531	mov	%rbp, q0
532	imul	u0inv, q0		C next q0
533	add	%rax, %r10
534	mov	%rdx, %r11
535	adc	$0, %r11
536	mov	%rbp, -24(up)
537	add	%r9, %r10
538	adc	$0, %r11
539	mov	%r10, -16(up)
540	mov	%r11, -32(up)		C up[0]
541	lea	8(up), up		C up++
542	dec	j
543	jnz	L(n3)
544	jmp	L(cj)
545EPILOGUE()
546ASM_END()
547