1dnl  IA-64 mpn_mod_34lsub1
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C           cycles/limb
36C Itanium:      ?
37C Itanium 2:    1
38
39
40C INPUT PARAMETERS
C up is the address of the limb array (read with ld8 and post-incremented),
C n is the number of limbs.
41define(`up', `r32')
42define(`n',  `r33')
43
44C Some useful aliases for registers we use
C u0-u2  the most recently loaded limbs, one per position modulo 3
C a0-a2  running differences: each limb is subtracted from the accumulator
C        of its position modulo 3
C c0-c2  counters that collect the borrows out of those subtractions
45define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
46define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
47define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
48
49C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
50C with a more sophisticated implementation.  If we're really crazy, we could
51C super-unroll, storing carries just in predicate registers, then copy them to
52C a general register, and population count them from there.  That'd bring us
53C close to 3 insn/limb, for nearly 0.5 c/l.
54
55C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
56C We therefore use a plain while-style loop:
57C	add		n = -3, n
58C	cmp.le		p9, p0 = 3, n
59C  (p9)	br.cond		.Loop
60C Alternatively, we could table n/3 for, say, n < 256, and predicate the
61C 16-cycle code.
62
63C The summing-up code at the end was written quickly, and could surely be
64C vastly improved.
65
C mpn_mod_34lsub1
C
C Inputs:
C   up  address of the first 64-bit limb
C   n   number of limbs.  The first limb is loaded unconditionally and n = 0
C       is not special-cased, so the code assumes n >= 1.
C Result, in r8:
C   n == 1:  the high 16 bits of the limb plus its low 48 bits
C   n >  1:  0xffffffffffff0 - A + B, where A is the sum of the 48-bit pieces
C            of the accumulators a0-a2 and B is the sum of the 48-bit pieces
C            of the borrow counters c0-c2.
C Both forms are congruent to the operand modulo 2^48-1; the result is not
C reduced below 2^48.
C
C Only r8, r10, r11, r14-r22, r24-r31, the predicates p6-p11 and the input
C registers r32, r33 are written by the enabled code.
66ASM_START()
67PROLOGUE(mpn_mod_34lsub1)
C NOTE(review): the .save directive below records ar.lc as saved in r2, but
C the only write to ar.lc is inside the disabled ifelse block further down.
C It looks vestigial; confirm before relying on or removing it.
68	.prologue
69	.save	ar.lc, r2
70	.body
C With the 32-bit ABI, convert the pointer with addp4 and zero-extend n from
C its low 32 bits.
71ifdef(`HAVE_ABI_32',`
72	addp4		up = 0, up		C			M I
73	nop.m		0
74	zxt4		n = n			C			I
75	;;
76')
77
C Disabled code: the ifelse condition compares 0 with 1, so the block below
C is never emitted.  It would derive a loop count from the high half of
C n * 0xAAAAAAAAAAAAAAAB, shifted right by 1, and place it in ar.lc for the
C commented-out br.cloop in the main loop.
78ifelse(0,1,`
79	movl		r14 = 0xAAAAAAAAAAAAAAAB
80	;;
81	setf.sig	f6 = r14
82	setf.sig	f7 = r33
83	;;
84	xmpy.hu		f6 = f6, f7
85	;;
86	getf.sig	r8 = f6
87	;;
88	shr.u		r8 = r8, 1		C Loop count
89	;;
90	mov.i		ar.lc = r8
91')
92
C Load limb 0.  When n is 1, finish immediately: return the bits above bit 47
C plus the low 48 bits of that limb.
93	ld8	u0 = [up], 8
94	cmp.ne	p9, p0 = 1, n
95  (p9)	br	L(gt1)
96	;;
97	shr.u	r8 = u0, 48
98	dep.z	r27 = u0, 0, 48
99	;;
100	add	r8 = r8, r27
101	br.ret.sptk.many b0
102
103
C n > 1.  Clear the accumulators a0-a2 and the borrow counters c0-c2, load
C limb 1, and turn n into the number of limbs not yet loaded (two have been
C loaded so far).  p6 is cleared because the first loop step consumes it.
C If fewer than 3 limbs remain to be loaded, skip the main loop.
104L(gt1):
105 {.mmi;	nop.m	0
106	mov	a0 = 0
107	add	n = -2, n
108}{.mmi;	mov	c0 = 0
109	mov	c1 = 0
110	mov	c2 = 0
111	;;
112}{.mmi;	ld8	u1 = [up], 8
113	mov	a1 = 0
114	cmp.ltu	p6, p0 = r0, r0		C clear p6
115}{.mmb;	cmp.gt	p9, p0 = 3, n
116	mov	a2 = 0
117  (p9)	br.cond.dptk	L(end)
118	;;
119}
C Main loop, three limbs per pass, software pipelined.  Each pass loads
C u2, u0, u1 while subtracting the previously loaded u0, u1, u2 from
C a0, a1, a2.  Each unsigned compare sits in the same instruction group as
C the matching sub, so it tests the accumulator value from before the
C subtraction and thereby detects the borrow.  The borrows land in p7, p8, p6
C and the following step adds them to c1, c2, c0 respectively, that is, to
C the counter one position up, wrapping from position 2 to position 0.
C n drops by 3 per pass and the loop repeats while n >= 3.
C On exit u0 and u1 are loaded but not yet subtracted, and p6 is pending.
120	ALIGN(32)
121L(top):
122 {.mmi;	ld8	u2 = [up], 8
123  (p6)	add	c0 = 1, c0
124	cmp.ltu	p7, p0 = a0, u0
125}{.mmb;	sub	a0 = a0, u0
126	add	n = -3, n
127	nop.b	0
128	;;
129}{.mmi;	ld8	u0 = [up], 8
130  (p7)	add	c1 = 1, c1
131	cmp.ltu	p8, p0 = a1, u1
132}{.mmb;	sub	a1 = a1, u1
133	cmp.le	p9, p0 = 3, n
134	nop.b	0
135	;;
136}{.mmi;	ld8	u1 = [up], 8
137  (p8)	add	c2 = 1, c2
138	cmp.ltu	p6, p0 = a2, u2
139}{.mmb;	sub	a2 = a2, u2
140	nop.m	0
141dnl	br.cloop.dptk	L(top)
142  (p9)	br.cond.dptk	L(top)
143	;;
144}
C Wind down.  For any n >= 2 on entry, n is now the number of limbs still to
C be loaded: 0, 1 or 2.  p10 selects the 0 case, p11 the 1 case; anything
C else falls through to the 2 case.
145L(end):
146	cmp.eq	p10, p0 = 0, n
147	cmp.eq	p11, p0 = 1, n
148  (p10)	br	L(0)
149
C One or two limbs remain.  Load the next limb into u2 and subtract the
C pending u0.  With exactly one limb left, p11 branches to label 1.
C Otherwise load the last limb into u0 and subtract u1, u2 and u0 in turn,
C adding each borrow to the counter one position up.
150L(2):
151 {.mmi;	ld8	u2 = [up], 8
152  (p6)	add	c0 = 1, c0
153	cmp.ltu	p7, p0 = a0, u0
154}{.mmb;	sub	a0 = a0, u0
155	nop.m	0
156  (p11)	br	L(1)
157	;;
158}	ld8	u0 = [up], 8
159  (p7)	add	c1 = 1, c1
160	cmp.ltu	p8, p0 = a1, u1
161	sub	a1 = a1, u1
162	;;
163  (p8)	add	c2 = 1, c2
164	cmp.ltu	p6, p0 = a2, u2
165	sub	a2 = a2, u2
166	;;
167  (p6)	add	c0 = 1, c0
168	cmp.ltu	p7, p0 = a0, u0
169	sub	a0 = a0, u0
170	;;
171  (p7)	add	c1 = 1, c1
172	br	L(com)
173
174
C Exactly one limb remained.  u0 was already subtracted at label 2; finish
C with the pending u1 and the freshly loaded u2, then count the last borrow.
175L(1):
176  (p7)	add	c1 = 1, c1
177	cmp.ltu	p8, p0 = a1, u1
178	sub	a1 = a1, u1
179	;;
180  (p8)	add	c2 = 1, c2
181	cmp.ltu	p6, p0 = a2, u2
182	sub	a2 = a2, u2
183	;;
184  (p6)	add	c0 = 1, c0
185	br	L(com)
186
187
C No limbs remain to be loaded.  Consume the pending p6, then subtract the
C pending u0 and u1, and fall through to the common tail.
188L(0):
189  (p6)	add	c0 = 1, c0
190	cmp.ltu	p7, p0 = a0, u0
191	sub	a0 = a0, u0
192	;;
193  (p7)	add	c1 = 1, c1
194	cmp.ltu	p8, p0 = a1, u1
195	sub	a1 = a1, u1
196	;;
197  (p8)	add	c2 = 1, c2
198
C Common tail: fold the accumulators and counters into 48-bit pieces.
C Position k has weight 2^(64k).  Since 2^48 is congruent to 1 modulo 2^48-1,
C position 1 is weighted 2^16 and position 2 is weighted 2^32.  Each register
C is therefore split at the bit that reaches bit 48 after weighting: the part
C above is shifted down with shr.u, the part below is moved up with dep.z.
C a0-a2 hold the negated limb sums modulo 2^64 and c0-c2 the borrow counts,
C so the operand is congruent to the c pieces minus the a pieces.
199L(com):
200C |     a2    |     a1    |     a0    |
201C |        |        |        |        |
202	shr.u	r24 = a0, 48		C 16 bits
203	shr.u	r25 = a1, 32		C 32 bits
204	shr.u	r26 = a2, 16		C 48 bits
205	;;
206	shr.u	r10 = c0, 48		C 16 bits, always zero
207	shr.u	r11 = c1, 32		C 32 bits
208	shr.u	r30 = c2, 16		C 48 bits
209	;;
210	dep.z	r27 = a0,  0, 48	C 48 bits
211	dep.z	r28 = a1, 16, 32	C 48 bits
212	dep.z	r29 = a2, 32, 16	C 48 bits
213	dep.z	r31 = c0,  0, 48	C 48 bits
214	dep.z	r14 = c1, 16, 32	C 48 bits
215	dep.z	r15 = c2, 32, 16	C 48 bits
216	;;
C Sum the six a pieces into r24 and the six c pieces into r10 with two small
C adder trees.
217 {.mmi;	add	r24 = r24, r25
218	add	r26 = r26, r27
219	add	r28 = r28, r29
220}{.mmi;	add	r10 = r10, r11
221	add	r30 = r30, r31
222	add	r14 = r14, r15
223	;;
224}
C 0xffffffffffff0 is 16 * (2^48-1), a multiple of the modulus.  It exceeds
C the sum of six pieces that are each below 2^48, so subtracting the a pieces
C from it cannot go negative.  The return value is that difference plus the
C c pieces.
225	movl	r8 = 0xffffffffffff0
226	add	r24 = r24, r26
227	add	r10 = r10, r30
228	;;
229	add	r24 = r24, r28
230	add	r10 = r10, r14
231	;;
232	sub	r8 = r8, r24
233	;;
234	add	r8 = r8, r10
235	br.ret.sptk.many b0
236EPILOGUE()
237ASM_END()
238