redc_1.asm revision 1.1.1.1
1dnl  X86-64 mpn_redc_1 optimised for Intel Atom.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 ?
37C AMD K10	 ?
38C AMD bull	 ?
39C AMD pile	 ?
40C AMD steam	 ?
41C AMD bobcat	 5.0
42C AMD jaguar	 ?
43C Intel P4	 ?
44C Intel core	 ?
45C Intel NHM	 ?
46C Intel SBR	 ?
47C Intel IBR	 ?
48C Intel HWL	 ?
49C Intel BWL	 ?
50C Intel atom	 ?
51C VIA nano	 ?
52
53C TODO
54C  * Micro-optimise, none performed thus far.
55C  * Consider inlining mpn_add_n.
56C  * Single basecases out before the pushes.
57C  * Make lead-in code for the inner loops be more similar.
58
59C When playing with pointers, set this to $2 to fall back to conservative
60C indexing in wind-down code.
C As defined here, I(a,b) expands to its first argument.  The wind-down code
C after each inner loop hands it a plain and an indexed form of one operand.
61define(`I',`$1')
62
C Incoming arguments in their STD64 registers.  The trailing comment on each
C line appears to give the location of the same argument under DOS64.
63define(`rp',          `%rdi')   C rcx
64define(`up',          `%rsi')   C rdx
65define(`mp_param',    `%rdx')   C r8
66define(`n',           `%rcx')   C r9
67define(`u0inv',       `%r8')    C stack
68
C Working registers.  i indexes the inner loops and j counts outer passes.
C mp is the modulus pointer once it has been moved past its n limbs, and q0
C is the current q limb.  w0-w3 name four scratch registers; the function
C body spells those registers out directly instead of using these names.
69define(`i',           `%r14')
70define(`j',           `%r15')
71define(`mp',          `%r12')
72define(`q0',          `%r13')
73define(`w0',          `%rbp')
74define(`w1',          `%r9')
75define(`w2',          `%r10')
76define(`w3',          `%r11')
77
78C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
79
80ABI_SUPPORT(DOS64)
81ABI_SUPPORT(STD64)
82
C ALIGNx is the alignment request placed ahead of the loop entry labels.
83define(`ALIGNx', `ALIGN(16)')
84
85ASM_START()
86	TEXT
87	ALIGN(32)
C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Runs n passes over the limbs at up.  Each pass takes q = low limb of
C (lowest current up limb * u0inv), adds q times the n modulus limbs into
C up, stores the carry limb of that addition in the slot of the lowest limb,
C and steps up forward by one limb.  Afterwards the high limbs and the stored
C carry limbs are added into rp: through mpn_add_n for most sizes, inline
C for n = 1, 2 and 3.  The inline paths return the carry of that addition in
C rax; on the mpn_add_n path rax is passed through untouched after the call.
88PROLOGUE(mpn_redc_1)
C NOTE(review): FUNC_ENTRY presumably moves the DOS64 register arguments
C into the STD64 registers used by the defines; confirm against its
C definition.  Under DOS64 the fifth argument, u0inv, is fetched from the
C stack into r8.
89	FUNC_ENTRY(4)
90IFDOS(`	mov	56(%rsp), %r8	')
C Save the six callee-saved registers that the body uses.
91	push	%rbx
92	push	%rbp
93	push	%r12
94	push	%r13
95	push	%r14
96	push	%r15
97
C Load the lowest limb, keep the pass count in j, move mp and up just past
C their first n limbs and negate n, so that (mp,n,8) and (up,n,8) address
C limb 0 and negative indices count up towards zero.
98	mov	(up), q0
99	mov	n, j			C outer loop induction var
100	lea	(mp_param,n,8), mp
101	lea	(up,n,8), up
102	neg	n
103	imul	u0inv, q0		C first iteration q0
104
C Dispatch on the low two bits of the negated n.  The four targets are the
C entry sequences for the 4-way unrolled loops: b1, b3, b0 and b2 serve
C n = 1, 3, 0 and 2 (mod 4) respectively.
105	test	$1, R8(n)
106	jz	L(bx0)
107
108L(bx1):	test	$2, R8(n)
109	jz	L(b3)
110
C Path for n = 1 (mod 4).  n = 1 itself is handled by the straight-line code
C at L(n1); larger sizes run the outer loop at L(otp1), one pass per limb.
111L(b1):	cmp	$-1, R32(n)
112	jz	L(n1)
113
C Outer loop head.  Starts the products of q0 with the four lowest modulus
C limbs.  The add of the lowest up limb is done for its carry only, since the
C sum register is overwritten by the following mov.  The second sum is
C stored back and multiplied by u0inv to give the q limb of the next pass.
114L(otp1):lea	1(n), i
115	mov	(mp,n,8), %rax
116	mul	q0
117	mov	%rax, %rbp
118	mov	8(mp,n,8), %rax
119	mov	%rdx, %r9
120	mul	q0
121	mov	%rax, %rbx
122	mov	16(mp,n,8), %rax
123	mov	%rdx, %r10
124	mul	q0
125	add	(up,n,8), %rbp
126	mov	%rax, %rbp
127	adc	%r9, %rbx
128	mov	24(mp,n,8), %rax
129	adc	$0, %r10
130	mov	%rdx, %r9
131	mul	q0
132	add	8(up,n,8), %rbx
133	mov	%rbx, 8(up,n,8)
134	mov	%rax, %r11
135	adc	%r10, %rbp
136	mov	32(mp,n,8), %rax
137	adc	$0, %r9
138	imul	u0inv, %rbx		C next q limb
139	jmp	L(e1)
140
C Inner loop, four limbs per iteration.  Each mul result is folded into up
C while the next modulus limb is loaded; the mov instructions between an add
C and its adc leave the carry flag alone.  This variant is entered at L(e1).
C i is a negative limb index stepped by 4; the loop runs while it is negative.
141	ALIGNx
142L(tp1):	mul	q0
143	add	%rbp, -24(up,i,8)
144	mov	%rax, %rbp
145	mov	(mp,i,8), %rax
146	adc	%r9, %r11
147	mov	%rdx, %r9
148	adc	$0, %r10
149	mul	q0
150	add	%r11, -16(up,i,8)
151	mov	%rax, %r11
152	mov	8(mp,i,8), %rax
153	adc	%r10, %rbp
154	mov	%rdx, %r10
155	adc	$0, %r9
156	mul	q0
157	add	%rbp, -8(up,i,8)
158	mov	%rax, %rbp
159	adc	%r9, %r11
160	mov	16(mp,i,8), %rax
161	adc	$0, %r10
162	mov	%rdx, %r9
163	mul	q0
164	add	%r11, (up,i,8)
165	mov	%rax, %r11
166	adc	%r10, %rbp
167	mov	24(mp,i,8), %rax
168	adc	$0, %r9
169L(e1):	add	$4, i
170	mov	%rdx, %r10
171	js	L(tp1)
172
C Wind-down.  i is zero once the loop falls through, so both operand forms
C given to I address the same limb.  After the last product and the final
C three limb updates, the carry limb is stored in the slot of the lowest
C limb, q0 takes the precomputed q limb, up steps one limb and j counts the
C pass.  When j reaches zero control goes to the shared tail at L(cj).
173L(ed1):	mul	q0
174	add	%rbp, I(-24(up),-24(up,i,8))
175	adc	%r9, %r11
176	adc	$0, %r10
177	add	%r11, I(-16(up),-16(up,i,8))
178	adc	%r10, %rax
179	adc	$0, %rdx
180	add	%rax, I(-8(up),-8(up,i,8))
181	adc	$0, %rdx
182	mov	%rdx, (up,n,8)		C up[0]
183	mov	%rbx, q0		C previously computed q limb -> q0
184	lea	8(up), up		C up++
185	dec	j
186	jnz	L(otp1)
187	jmp	L(cj)
188
C Path for n = 3 (mod 4).  n = 3 itself is handled at L(n3); larger sizes
C run the outer loop at L(otp3), one pass per limb.
189L(b3):	cmp	$-3, R32(n)
190	jz	L(n3)
191
C Outer loop head.  Starts the products of q0 with the four lowest modulus
C limbs.  The add of the lowest up limb is done for its carry only, since the
C sum register is overwritten by the following mov.  The second sum is
C stored back and multiplied by u0inv to give the q limb of the next pass.
192L(otp3):lea	3(n), i
193	mov	(mp,n,8), %rax
194	mul	q0
195	mov	%rax, %rbp
196	mov	8(mp,n,8), %rax
197	mov	%rdx, %r9
198	mul	q0
199	mov	%rax, %rbx
200	mov	16(mp,n,8), %rax
201	mov	%rdx, %r10
202	mul	q0
203	add	(up,n,8), %rbp
204	mov	%rax, %rbp
205	mov	24(mp,n,8), %rax
206	adc	%r9, %rbx
207	mov	%rdx, %r9
208	adc	$0, %r10
209	mul	q0
210	add	8(up,n,8), %rbx
211	mov	%rbx, 8(up,n,8)
212	mov	%rax, %r11
213	mov	32(mp,n,8), %rax
214	adc	%r10, %rbp
215	mov	%rdx, %r10
216	adc	$0, %r9
217	imul	u0inv, %rbx		C next q limb
218	jmp	L(e3)
219
C Inner loop, four limbs per iteration, entered at L(e3) half way through
C the unrolled body.  i is a negative limb index stepped by 4; the loop runs
C while it is negative.
220	ALIGNx
221L(tp3):	mul	q0
222	add	%rbp, -24(up,i,8)
223	mov	%rax, %rbp
224	mov	(mp,i,8), %rax
225	adc	%r9, %r11
226	mov	%rdx, %r9
227	adc	$0, %r10
228	mul	q0
229	add	%r11, -16(up,i,8)
230	mov	%rax, %r11
231	mov	8(mp,i,8), %rax
232	adc	%r10, %rbp
233	mov	%rdx, %r10
234	adc	$0, %r9
235L(e3):	mul	q0
236	add	%rbp, -8(up,i,8)
237	mov	%rax, %rbp
238	adc	%r9, %r11
239	mov	16(mp,i,8), %rax
240	adc	$0, %r10
241	mov	%rdx, %r9
242	mul	q0
243	add	%r11, (up,i,8)
244	mov	%rax, %r11
245	adc	%r10, %rbp
246	mov	24(mp,i,8), %rax
247	adc	$0, %r9
248	add	$4, i
249	mov	%rdx, %r10
250	js	L(tp3)
251
C Wind-down.  i is zero once the loop falls through, so both operand forms
C given to I address the same limb.  The carry limb of the pass is stored in
C the slot of the lowest limb, q0 takes the precomputed q limb, up steps one
C limb and j counts the pass.  The jump after the loop is commented out, so
C when j reaches zero control falls through into L(cj).
252L(ed3):	mul	q0
253	add	%rbp, I(-24(up),-24(up,i,8))
254	adc	%r9, %r11
255	adc	$0, %r10
256	add	%r11, I(-16(up),-16(up,i,8))
257	adc	%r10, %rax
258	adc	$0, %rdx
259	add	%rax, I(-8(up),-8(up,i,8))
260	adc	$0, %rdx
261	mov	%rdx, (up,n,8)		C up[0]
262	mov	%rbx, q0		C previously computed q limb -> q0
263	lea	8(up), up		C up++
264	dec	j
265	jnz	L(otp3)
266C	jmp	L(cj)
267
C Shared tail for the looped paths.  The passes have stepped up forward by n
C limbs and n is still negative here.  The argument registers of the active
C ABI are loaded with (rp, up, up - n, n) as labelled below; under STD64 rp
C is already in place because rdi is never written by this function.  Under
C DOS64 n is copied to r9 before rcx, which holds it, is reloaded with rp.
268L(cj):
269IFSTD(`	lea	(up,n,8), up		C param 2: up
270	lea	(up,n,8), %rdx		C param 3: up - n
271	neg	R32(n)		')	C param 4: n
272
273IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
274	lea	(%rdx,n,8), %r8		C param 3: up - n
275	neg	R32(n)
276	mov	n, %r9			C param 4: n
277	mov	rp, %rcx	')	C param 1: rp
278
C Stack adjustment around the call: 8 bytes under STD64, 40 under DOS64.
C NOTE(review): with six pushes after entry plus the 8-byte adjustment, rsp
C looks 16-byte aligned at this point under STD64, which would make the test
C below set ZF.  Confirm the intended sense of the nz condition against the
C definition of ASSERT, which is not visible in this file.
279IFSTD(`	sub	$8, %rsp	')
280IFDOS(`	sub	$40, %rsp	')
281	ASSERT(nz, `test $15, %rsp')
282	CALL(	mpn_add_n)
283IFSTD(`	add	$8, %rsp	')
284IFDOS(`	add	$40, %rsp	')
285
C Common exit, also used by the inline small-size paths.  Restores the saved
C registers in reverse push order; rax is not touched on the way out.
286L(ret):	pop	%r15
287	pop	%r14
288	pop	%r13
289	pop	%r12
290	pop	%rbp
291	pop	%rbx
292	FUNC_EXIT()
293	ret
294
C Even sizes.  Bit 1 of the negated n separates n = 2 (mod 4), handled at
C L(b2), from n = 0 (mod 4), handled here.
295L(bx0):	test	$2, R8(n)
296	jnz	L(b2)
297
C n = 4 itself is handled at L(n4); larger multiples of 4 run the outer loop
C at L(otp0), one pass per limb.
298L(b0):	cmp	$-4, R32(n)
299	jz	L(n4)
300
C Outer loop head.  Starts the products of q0 with the four lowest modulus
C limbs.  The add of the lowest up limb is done for its carry only, since the
C sum register is overwritten by the following mov.  The second sum is
C stored back and multiplied by u0inv to give the q limb of the next pass.
301L(otp0):lea	4(n), i
302	mov	(mp,n,8), %rax
303	mul	q0
304	mov	%rax, %r11
305	mov	8(mp,n,8), %rax
306	mov	%rdx, %r10
307	mul	q0
308	mov	%rax, %rbx
309	mov	16(mp,n,8), %rax
310	mov	%rdx, %r9
311	mul	q0
312	add	(up,n,8), %r11
313	mov	%rax, %r11
314	adc	%r10, %rbx
315	mov	24(mp,n,8), %rax
316	adc	$0, %r9
317	mov	%rdx, %r10
318	mul	q0
319	add	8(up,n,8), %rbx
320	mov	%rbx, 8(up,n,8)
321	mov	%rax, %rbp
322	mov	32(mp,n,8), %rax
323	adc	%r9, %r11
324	mov	%rdx, %r9
325	adc	$0, %r10
326	imul	u0inv, %rbx		C next q limb
327	jmp	L(e0)
328
C Inner loop, four limbs per iteration, entered at L(e0) after the first
C quarter of the unrolled body.  i is a negative limb index stepped by 4; the
C loop runs while it is negative.
329	ALIGNx
330L(tp0):	mul	q0
331	add	%rbp, -24(up,i,8)
332	mov	%rax, %rbp
333	mov	(mp,i,8), %rax
334	adc	%r9, %r11
335	mov	%rdx, %r9
336	adc	$0, %r10
337L(e0):	mul	q0
338	add	%r11, -16(up,i,8)
339	mov	%rax, %r11
340	mov	8(mp,i,8), %rax
341	adc	%r10, %rbp
342	mov	%rdx, %r10
343	adc	$0, %r9
344	mul	q0
345	add	%rbp, -8(up,i,8)
346	mov	%rax, %rbp
347	adc	%r9, %r11
348	mov	16(mp,i,8), %rax
349	adc	$0, %r10
350	mov	%rdx, %r9
351	mul	q0
352	add	%r11, (up,i,8)
353	mov	%rax, %r11
354	adc	%r10, %rbp
355	mov	24(mp,i,8), %rax
356	adc	$0, %r9
357	add	$4, i
358	mov	%rdx, %r10
359	js	L(tp0)
360
C Wind-down.  i is zero once the loop falls through, so both operand forms
C given to I address the same limb.  The carry limb of the pass is stored in
C the slot of the lowest limb, q0 takes the precomputed q limb, up steps one
C limb and j counts the pass.  When j reaches zero control goes to L(cj).
361L(ed0):	mul	q0
362	add	%rbp, I(-24(up),-24(up,i,8))
363	adc	%r9, %r11
364	adc	$0, %r10
365	add	%r11, I(-16(up),-16(up,i,8))
366	adc	%r10, %rax
367	adc	$0, %rdx
368	add	%rax, I(-8(up),-8(up,i,8))
369	adc	$0, %rdx
370	mov	%rdx, (up,n,8)		C up[0]
371	mov	%rbx, q0		C previously computed q limb -> q0
372	lea	8(up), up		C up++
373	dec	j
374	jnz	L(otp0)
375	jmp	L(cj)
376
C Path for n = 2 (mod 4).  n = 2 itself is handled by the straight-line code
C at L(n2); larger sizes run the outer loop at L(otp2), one pass per limb.
377L(b2):	cmp	$-2, R32(n)
378	jz	L(n2)
379
C Outer loop head.  Starts the products of q0 with the four lowest modulus
C limbs.  The add of the lowest up limb is done for its carry only, since the
C sum register is overwritten by the following mov.  The second sum is
C stored back and multiplied by u0inv to give the q limb of the next pass.
380L(otp2):lea	2(n), i
381	mov	(mp,n,8), %rax
382	mul	q0
383	mov	%rax, %r11
384	mov	8(mp,n,8), %rax
385	mov	%rdx, %r10
386	mul	q0
387	mov	%rax, %rbx
388	mov	16(mp,n,8), %rax
389	mov	%rdx, %r9
390	mul	q0
391	add	(up,n,8), %r11
392	mov	%rax, %r11
393	adc	%r10, %rbx
394	mov	24(mp,n,8), %rax
395	adc	$0, %r9
396	mov	%rdx, %r10
397	mul	q0
398	add	8(up,n,8), %rbx
399	mov	%rbx, 8(up,n,8)
400	mov	%rax, %rbp
401	mov	32(mp,n,8), %rax
402	adc	%r9, %r11
403	mov	%rdx, %r9
404	adc	$0, %r10
405	imul	u0inv, %rbx		C next q limb
406	jmp	L(e2)
407
C Inner loop, four limbs per iteration, entered at L(e2) for the last
C quarter of the unrolled body.  i is a negative limb index stepped by 4; the
C loop runs while it is negative.
408	ALIGNx
409L(tp2):	mul	q0
410	add	%rbp, -24(up,i,8)
411	mov	%rax, %rbp
412	mov	(mp,i,8), %rax
413	adc	%r9, %r11
414	mov	%rdx, %r9
415	adc	$0, %r10
416	mul	q0
417	add	%r11, -16(up,i,8)
418	mov	%rax, %r11
419	mov	8(mp,i,8), %rax
420	adc	%r10, %rbp
421	mov	%rdx, %r10
422	adc	$0, %r9
423	mul	q0
424	add	%rbp, -8(up,i,8)
425	mov	%rax, %rbp
426	adc	%r9, %r11
427	mov	16(mp,i,8), %rax
428	adc	$0, %r10
429	mov	%rdx, %r9
430L(e2):	mul	q0
431	add	%r11, (up,i,8)
432	mov	%rax, %r11
433	adc	%r10, %rbp
434	mov	24(mp,i,8), %rax
435	adc	$0, %r9
436	add	$4, i
437	mov	%rdx, %r10
438	js	L(tp2)
439
C Wind-down.  i is zero once the loop falls through, so both operand forms
C given to I address the same limb.  The carry limb of the pass is stored in
C the slot of the lowest limb, q0 takes the precomputed q limb, up steps one
C limb and j counts the pass.  When j reaches zero control goes to L(cj).
440L(ed2):	mul	q0
441	add	%rbp, I(-24(up),-24(up,i,8))
442	adc	%r9, %r11
443	adc	$0, %r10
444	add	%r11, I(-16(up),-16(up,i,8))
445	adc	%r10, %rax
446	adc	$0, %rdx
447	add	%rax, I(-8(up),-8(up,i,8))
448	adc	$0, %rdx
449	mov	%rdx, (up,n,8)		C up[0]
450	mov	%rbx, q0		C previously computed q limb -> q0
451	lea	8(up), up		C up++
452	dec	j
453	jnz	L(otp2)
454	jmp	L(cj)
455
C n = 1.  mp_param still holds the caller pointer on this path.  The single
C product q0 * mp[0] is added to the two limbs of up; the high limb is
C written to rp[0] and the carry out of it is returned in rax.  rax is
C zeroed with mov rather than xor so the carry flag survives to the adc.
456L(n1):	mov	(mp_param), %rax
457	mul	q0
458	add	-8(up), %rax
459	adc	(up), %rdx
460	mov	%rdx, (rp)
461	mov	$0, R32(%rax)
462	adc	R32(%rax), R32(%rax)
463	jmp	L(ret)
464
C n = 2.  Both passes are written out straight-line.  The first pass leaves
C its updated second limb in r10 and its carry limb in r11, and r10 * u0inv
C becomes the q limb of the second pass.  The second pass result, combined
C with the first carry limb and the top limb of up, is written to rp[0] and
C rp[1].  rax is cleared by the xor ahead of the last add, so the closing adc
C leaves just the final carry in rax as the return value.
465L(n2):	mov	(mp_param), %rax
466	mov	-16(up), %rbp
467	mul	q0
468	add	%rax, %rbp
469	mov	%rdx, %r9
470	adc	$0, %r9
471	mov	-8(mp), %rax
472	mov	-8(up), %r10
473	mul	q0
474	add	%rax, %r10
475	mov	%rdx, %r11
476	adc	$0, %r11
477	add	%r9, %r10
478	adc	$0, %r11
479	mov	%r10, q0
480	imul	u0inv, q0		C next q0
481	mov	-16(mp), %rax
482	mul	q0
483	add	%rax, %r10
484	mov	%rdx, %r9
485	adc	$0, %r9
486	mov	-8(mp), %rax
487	mov	(up), %r14
488	mul	q0
489	add	%rax, %r14
490	adc	$0, %rdx
491	add	%r9, %r14
492	adc	$0, %rdx
493	xor	R32(%rax), R32(%rax)
494	add	%r11, %r14
495	adc	8(up), %rdx
496	mov	%r14, (rp)
497	mov	%rdx, 8(rp)
498	adc	R32(%rax), R32(%rax)
499	jmp	L(ret)
500
C n = 3.  L(n3) is one complete pass and also the head of the outer loop,
C which runs j times.  Each pass writes its two updated limbs and its carry
C limb back to up, and takes the next q0 from the updated second limb times
C u0inv.  The first add into r10 is for its carry only; r10 is reloaded
C before it is used again.
501	ALIGNx
502L(n3):	mov	-24(mp), %rax
503	mov	-24(up), %r10
504	mul	q0
505	add	%rax, %r10
506	mov	-16(mp), %rax
507	mov	%rdx, %r11
508	adc	$0, %r11
509	mov	-16(up), %rbp
510	mul	q0
511	add	%rax, %rbp
512	mov	%rdx, %r9
513	adc	$0, %r9
514	mov	-8(mp), %rax
515	add	%r11, %rbp
516	mov	-8(up), %r10
517	adc	$0, %r9
518	mul	q0
519	mov	%rbp, q0
520	imul	u0inv, q0		C next q0
521	add	%rax, %r10
522	mov	%rdx, %r11
523	adc	$0, %r11
524	mov	%rbp, -16(up)
525	add	%r9, %r10
526	adc	$0, %r11
527	mov	%r10, -8(up)
528	mov	%r11, -24(up)		C up[0]
529	lea	8(up), up		C up++
530	dec	j
531	jnz	L(n3)
532
C Final addition done inline.  The carry limbs of the first two passes are
C read back from up, the third is still in r11, and rbp and r10 hold the two
C limbs updated by the last pass.  Three sums go to rp[0..2] and the carry
C out of the top one is returned in rax.
533	mov	-48(up), %rdx
534	mov	-40(up), %rbx
535	xor	R32(%rax), R32(%rax)
536	add	%rbp, %rdx
537	adc	%r10, %rbx
538	adc	-8(up), %r11
539	mov	%rdx, (rp)
540	mov	%rbx, 8(rp)
541	mov	%r11, 16(rp)
542	adc	R32(%rax), R32(%rax)
543	jmp	L(ret)
544
C n = 4.  L(n4) is one complete pass over the four limbs and is looped j
C times.  The add of the lowest up limb is done for its carry only, since
C r11 is overwritten by the following mov.  The q limb of the next pass is
C computed mid-pass from the updated second limb.  The lea between dec and
C jnz leaves the flags from dec alone.  After the last pass the shared tail
C at L(cj) performs the final addition.
545L(n4):	mov	-32(mp), %rax
546	mul	q0
547	mov	%rax, %r11
548	mov	-24(mp), %rax
549	mov	%rdx, %r10
550	mul	q0
551	mov	%rax, %rbx
552	mov	-16(mp), %rax
553	mov	%rdx, %r9
554	mul	q0
555	add	-32(up), %r11
556	mov	%rax, %r11
557	adc	%r10, %rbx
558	mov	-8(mp), %rax
559	adc	$0, %r9
560	mov	%rdx, %r10
561	mul	q0
562	add	-24(up), %rbx
563	mov	%rbx, -24(up)
564	adc	%r9, %r11
565	adc	$0, %r10
566	imul	u0inv, %rbx		C next q limb
567	add	%r11, -16(up)
568	adc	%r10, %rax
569	adc	$0, %rdx
570	add	%rax, -8(up)
571	adc	$0, %rdx
572	mov	%rdx, -32(up)		C up[0]
573	mov	%rbx, q0		C previously computed q limb -> q0
574	dec	j
575	lea	8(up), up		C up++
576	jnz	L(n4)
577	jmp	L(cj)
578EPILOGUE()
579ASM_END()
580