mod_34lsub1.asm revision 1.1.1.1
1dnl  IA-64 mpn_mod_34lsub1
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C           cycles/limb
25C Itanium:      ?
26C Itanium 2:    1
27
28
29C INPUT PARAMETERS
C up (r32) is the address register used by the post-incrementing ld8 limb loads.
C n  (r33) is the limb count; the code below counts it down as limbs are loaded.
30define(`up', `r32')
31define(`n',  `r33')
32
33C Some useful aliases for registers we use
C u0-u2: the limbs most recently loaded from [up]
C a0-a2: three accumulators that the loaded limbs are subtracted from in turn
C c0-c2: counters incremented when one of those subtractions borrows
34define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
35define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
36define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
37
38C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
39C with a more sophisticated implementation.  If we're really crazy, we could
40C super-unroll, storing carries just in predicate registers, then copy them to
41C a general register, and population count them from there.  That'd bring us
42C close to 3 insn/limb, for nearly 0.5 c/l.
43
44C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
45C We therefore use a plain while-style loop:
46C	add		n = -3, n
47C	cmp.le		p9, p0 = 3, n
48C  (p9)	br.cond		.Loop
49C Alternatively, we could table n/3 for, say, n < 256, and predicate the
50C 16-cycle code.
51
52C The summing-up code at the end was written quickly, and could surely be
53C vastly improved.
54
55ASM_START()
56PROLOGUE(mpn_mod_34lsub1)
C mpn_mod_34lsub1: fold the n limbs at up into a single value that is
C congruent to the whole operand modulo 2^48-1.  The result is not fully
C reduced.
C
C In:   up (r32) = address of the first limb, n (r33) = limb count.
C Out:  r8 holds the result when br.ret executes.
C
C Method: limbs are subtracted round-robin into the accumulators a0, a1, a2.
C A borrow out of a0 is counted in c1, out of a1 in c2, and out of a2 in c0;
C each count is applied one step later through the predicates p7, p8 and p6.
C L(com) then cuts the accumulators and counters into pieces of at most 48
C bits and combines them.
C
C The n = 0 case is not handled specially: it takes the L(gt1) path, which
C loads further limbs.  Presumably callers guarantee n >= 1 -- confirm.
C
C NOTE(review): .save states that ar.lc is saved in r2, but no visible
C instruction copies ar.lc to r2, and the only write to ar.lc sits in the
C disabled block below.  Confirm that the unwind annotation is intended.
57	.prologue
58	.save	ar.lc, r2
59	.body
C Only when HAVE_ABI_32 is defined: addp4 widens the 32-bit pointer in up
C and zxt4 zero-extends the 32-bit count in n.
60ifdef(`HAVE_ABI_32',`
61	addp4		up = 0, up		C			M I
62	zxt4		n = n			C			I
63	;;
64')
65
C Disabled code: the m4 test below compares 0 with 1, so this text is never
C emitted.  It would derive a loop count from n with the multiplier
C 0xAAAAAAAAAAAAAAAB and place it in ar.lc.
66ifelse(0,1,`
67	movl		r14 = 0xAAAAAAAAAAAAAAAB
68	;;
69	setf.sig	f6 = r14
70	setf.sig	f7 = r33
71	;;
72	xmpy.hu		f6 = f6, f7
73	;;
74	getf.sig	r8 = f6
75	;;
76	shr.u		r8 = r8, 1		C Loop count
77	;;
78	mov.i		ar.lc = r8
79')
80
C Load the first limb.  When n is 1, return the sum of its top 16 bits and
C its low 48 bits; otherwise continue at L(gt1).
81	ld8	u0 = [up], 8
82	cmp.ne	p9, p0 = 1, n
83  (p9)	br	L(gt1)
84	;;
85	shr.u	r8 = u0, 48
86	dep.z	r27 = u0, 0, 48
87	;;
88	add	r8 = r8, r27
89	br.ret.sptk.many b0
90
91
C n is not 1 here.  Clear the accumulators and counters, load the second limb
C into u1, and reduce n by 2 to account for the two limbs already loaded.
C p6 is cleared because the loop and the tails apply it as a pending borrow.
C Go straight to L(end) when fewer than 3 limbs remain to be loaded.
92L(gt1):
93.mmi;	nop.m	0
94	mov	a0 = 0
95	add	n = -2, n
96.mmi;	mov	c0 = 0
97	mov	c1 = 0
98	mov	c2 = 0
99	;;
100.mmi;	ld8	u1 = [up], 8
101	mov	a1 = 0
102	cmp.ltu	p6, p0 = r0, r0		C clear p6
103.mmb;	cmp.gt	p9, p0 = 3, n
104	mov	a2 = 0
105  (p9)	br.cond.dptk	L(end)
106	;;
107
C Main loop, three limbs per pass.  On entry u0 and u1 are loaded but not yet
C accumulated, and p6 holds the borrow of the previous a2 update.  Each pass
C loads u2 and then the next u0 and u1, so the same state holds on exit.
C Each cmp.ltu precedes the sub of the same accumulator in its instruction
C group, so it tests the value from before the subtraction; the predicate it
C sets is therefore the borrow of that subtraction.
C n drops by 3 per pass and the loop repeats while at least 3 limbs remain.
108	ALIGN(32)
109L(top):
110.mmi;	ld8	u2 = [up], 8
111  (p6)	add	c0 = 1, c0
112	cmp.ltu	p7, p0 = a0, u0
113.mmb;	sub	a0 = a0, u0
114	add	n = -3, n
115	nop.b	0
116	;;
117.mmi;	ld8	u0 = [up], 8
118  (p7)	add	c1 = 1, c1
119	cmp.ltu	p8, p0 = a1, u1
120.mmb;	sub	a1 = a1, u1
121	cmp.le	p9, p0 = 3, n
122	nop.b	0
123	;;
124.mmi;	ld8	u1 = [up], 8
125  (p8)	add	c2 = 1, c2
126	cmp.ltu	p6, p0 = a2, u2
127.mmb;	sub	a2 = a2, u2
128	nop.m	0
129dnl	br.cloop.dptk	L(top)
130  (p9)	br.cond.dptk	L(top)
131	;;
132
C Tail dispatch.  For n >= 2 on entry, n is now 0, 1 or 2: the number of
C limbs still to be loaded.  u0 and u1 are loaded but not yet accumulated.
C p10 selects the n = 0 tail and p11 the n = 1 tail.
133L(end):
134	cmp.eq	p10, p0 = 0, n
135	cmp.eq	p11, p0 = 1, n
136  (p10)	br	L(0)
137
C n is 1 or 2: load one more limb into u2 and accumulate u0.  Branch to L(1)
C when that was the last limb; otherwise load a final limb into u0 and
C accumulate u1, u2 and that final u0, applying each borrow as it appears.
138L(2):
139.mmi;	ld8	u2 = [up], 8
140  (p6)	add	c0 = 1, c0
141	cmp.ltu	p7, p0 = a0, u0
142.mmb;	sub	a0 = a0, u0
143	nop.m	0
144  (p11)	br	L(1)
145	;;
146	ld8	u0 = [up], 8
147  (p7)	add	c1 = 1, c1
148	cmp.ltu	p8, p0 = a1, u1
149	sub	a1 = a1, u1
150	;;
151  (p8)	add	c2 = 1, c2
152	cmp.ltu	p6, p0 = a2, u2
153	sub	a2 = a2, u2
154	;;
155  (p6)	add	c0 = 1, c0
156	cmp.ltu	p7, p0 = a0, u0
157	sub	a0 = a0, u0
158	;;
159  (p7)	add	c1 = 1, c1
160	br	L(com)
161
162
C n was 1: u0 has already been accumulated in L(2).  Apply its borrow, then
C accumulate u1 and u2 and apply their borrows.
163L(1):
164  (p7)	add	c1 = 1, c1
165	cmp.ltu	p8, p0 = a1, u1
166	sub	a1 = a1, u1
167	;;
168  (p8)	add	c2 = 1, c2
169	cmp.ltu	p6, p0 = a2, u2
170	sub	a2 = a2, u2
171	;;
172  (p6)	add	c0 = 1, c0
173	br	L(com)
174
175
C n was 0: apply the pending p6 borrow, accumulate u0 and u1, apply their
C borrows, and fall through into L(com).
176L(0):
177  (p6)	add	c0 = 1, c0
178	cmp.ltu	p7, p0 = a0, u0
179	sub	a0 = a0, u0
180	;;
181  (p7)	add	c1 = 1, c1
182	cmp.ltu	p8, p0 = a1, u1
183	sub	a1 = a1, u1
184	;;
185  (p8)	add	c2 = 1, c2
186
C Final fold.  The diagram below shows a2:a1:a0 cut at 48-bit boundaries.
C r24-r29 receive the pieces of a0, a1 and a2; r10, r11, r30, r31, r14 and r15
C receive the matching pieces of c0, c1 and c2.  The a pieces are summed
C into r24 and the c pieces into r10.
C Result: r8 = 0xffffffffffff0 - r24 + r10.  The constant is 16 * (2^48-1),
C a multiple of the modulus that exceeds the largest possible r24, so the
C subtraction does not wrap.  The a sum is subtracted because limbs were
C subtracted from the accumulators; the c sum is added because each count
C records a borrow.
187L(com):
188C |     a2    |     a1    |     a0    |
189C |        |        |        |        |
190	shr.u	r24 = a0, 48		C 16 bits
191	shr.u	r25 = a1, 32		C 32 bits
192	shr.u	r26 = a2, 16		C 48 bits
193	;;
194	shr.u	r10 = c0, 48		C 16 bits, always zero
195	shr.u	r11 = c1, 32		C 32 bits
196	shr.u	r30 = c2, 16		C 48 bits
197	;;
198	dep.z	r27 = a0,  0, 48	C 48 bits
199	dep.z	r28 = a1, 16, 32	C 48 bits
200	dep.z	r29 = a2, 32, 16	C 48 bits
201	dep.z	r31 = c0,  0, 48	C 48 bits
202	dep.z	r14 = c1, 16, 32	C 48 bits
203	dep.z	r15 = c2, 32, 16	C 48 bits
204	;;
205.mmi;	add	r24 = r24, r25
206	add	r26 = r26, r27
207	add	r28 = r28, r29
208.mmi;	add	r10 = r10, r11
209	add	r30 = r30, r31
210	add	r14 = r14, r15
211	;;
212	movl	r8 = 0xffffffffffff0
213	add	r24 = r24, r26
214	add	r10 = r10, r30
215	;;
216	add	r24 = r24, r28
217	add	r10 = r10, r14
218	;;
219	sub	r8 = r8, r24
220	;;
221	add	r8 = r8, r10
222	br.ret.sptk.many b0
223EPILOGUE()
224ASM_END()
225