lorrshift.asm revision 1.1.1.2
1dnl  IA-64 mpn_lshift/mpn_rshift.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
6dnl  Inc.
7
8dnl  This file is part of the GNU MP Library.
9
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25C           cycles/limb
26C Itanium:      2
27C Itanium 2:    1
28
29C This code is scheduled deeply since the plain shift instructions shr and shl
30C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
31C these instructions causes a 10 cycle replay trap on Itanium.
32
33C The ld8 scheduling should probably be decreased to make the function smaller.
34C Good lfetch  will make sure we never stall anyway.
35
36C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
37C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
38C in the prologue.
39
40
41C INPUT PARAMETERS
C rp   r32  destination pointer; every st8 in the function goes through it
C r33  up   source pointer; every ld8 in the function goes through it
C n    r34  operand size in limbs (8 bytes each); later reused to hold the
C           loop count that is copied into ar.lc
C cnt  r35  shift count in bits, used as the count of every FSH
42define(`rp', `r32')
43define(`up', `r33')
44define(`n',  `r34')
45define(`cnt',`r35')
46
C tnc is a scratch register set to 64 - cnt; it is the count of every BSH.
47define(`tnc',`r9')
48
C Direction dependent macros, selected by OPERATION_lshift or OPERATION_rshift.
C   FSH   shift instruction applied to a limb with count cnt
C   BSH   opposite shift, applied to the neighbouring limb with count tnc;
C         the two results are ORed together to form one result limb
C   UPD   post-increment in bytes for every ld8 through up and st8 through rp
C         (-8 for lshift, which walks the operands downwards; 8 for rshift)
C   POFF  byte offset from up at which the lfetch pointer r11 starts
C   PUPD  post-increment in bytes applied to r11 by each lfetch
C   func  name of the function being defined
49ifdef(`OPERATION_lshift',`
50	define(`FSH',`shl')
51	define(`BSH',`shr.u')
52	define(`UPD',`-8')
53	define(`POFF',`-512')
54	define(`PUPD',`-32')
55	define(`func',`mpn_lshift')
56')
57ifdef(`OPERATION_rshift',`
58	define(`FSH',`shr.u')
59	define(`BSH',`shl')
60	define(`UPD',`8')
61	define(`POFF',`512')
62	define(`PUPD',`32')
63	define(`func',`mpn_rshift')
64')
65
66MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
67
68ASM_START()
C func: mpn_lshift or mpn_rshift, depending on which OPERATION_* is defined.
C
C Reads n limbs through up, and stores n limbs through rp.  Each stored limb
C is (FSH of one source limb by cnt) OR (BSH of the next source limb by tnc);
C the last stored limb is just the FSH of the last source limb.  The value
C returned in r8 is BSH of the first source limb by tnc.
C
C Register use:
C   r2        copy of ar.lc on entry, written back before returning
C   r8        function return value
C   r9 (tnc)  64 - cnt
C   r10       first source limb loaded
C   r11       lfetch pointer, starts at up + POFF and steps by PUPD
C   r14, r15  alternately hold the result limb that is stored next
C             (before that, r14 = n mod 4 and r15 = n - 1)
C   r16-r19   source limbs loaded inside and ahead of the loop
C   r20-r27   shift results, ORed in the pairs (r21,r20) (r23,r22) (r25,r24)
C             (r27,r26)
C   p6,p7,p8  n mod 4 is 1, 2, 3 respectively
C   p14, p15  n > 4, and its complement n <= 4
C
C Structure: setup, a four way dispatch on n mod 4, feed-in code per residue
C (.Lb00 .Lb01 .Lb10 .Lb11 and the .grtN labels), the four limb main loop with
C entry points .Ltop .LL11 .LL10 .LL01, and a wind-down (.Lbot .Lr7 ... .Lr1)
C where the code from label .LrK down to the return stores K limbs.
69PROLOGUE(func)
C Unwind annotations.  The .save records r2 as the place where ar.lc is kept;
C the copy itself is the mov.i r2 = ar.lc further down.
70	.prologue
71	.save	ar.lc, r2
72	.body
C With the 32-bit ABI, convert the two pointers with addp4, sign extend n and
C zero extend cnt before using them.
73ifdef(`HAVE_ABI_32',
74`	addp4	rp = 0, rp		C			M I
75	addp4	up = 0, up		C		M I
76	sxt4	n = n			C		M I
77	zxt4	cnt = cnt		C		I
78	;;
79')
80
C Setup.  p14 = (4 < n), p15 = its complement.  r14 = n mod 4 for the dispatch.
C r15 = n - 1, tnc = 64 - cnt, r16 = n - 5.
81 {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I
82	and	r14 = 3, n		C		M I
83	mov.i	r2 = ar.lc		C		I0
84}{.mmi;	add	r15 = -1, n		C		M I
85	sub	tnc = 64, cnt		C		M I
86	add	r16 = -5, n
87	;;
C Set the dispatch predicates and compute the loop count (n - 5) >> 2 into n.
C The count is only consumed by br.cloop, and every br.cloop below is on a
C path taken when p14 is set (n > 4).
C For lshift only, up and rp are advanced by 8 * (n - 1) bytes so that the
C operands are walked from the last limb downwards (UPD is -8 there).
88}{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I
89	cmp.eq	p7, p0 = 2, r14		C		M I
90	shr.u	n = r16, 2		C		I0
91}{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I
92ifdef(`OPERATION_lshift',
93`	shladd	up = r15, 3, up		C		M I
94	shladd	rp = r15, 3, rp')	C		M I
95	;;
C r11 = prefetch pointer, r10 = first source limb, ar.lc = loop count.
C Then dispatch on n mod 4; residue 0 falls through to .Lb00.
96}{.mmi;	add	r11 = POFF, up		C		M I
97	ld8	r10 = [up], UPD		C		M01
98	mov.i	ar.lc = n		C		I0
99}{.bbb;
100   (p6)	br.dptk	.Lb01
101   (p7)	br.dptk	.Lb10
102   (p8)	br.dptk	.Lb11
103	;; }
104
C n mod 4 == 0.  Load three more limbs.  With p14 continue at .grt4; otherwise
C finish the four loaded limbs here and join the wind-down at .Lr4.
105.Lb00:	ld8	r19 = [up], UPD
106	;;
107	ld8	r16 = [up], UPD
108	;;
109	ld8	r17 = [up], UPD
110	BSH	r8 = r10, tnc		C function return value
111	;;
112	FSH	r24 = r10, cnt
113	BSH	r25 = r19, tnc
114  (p14)	br.cond.dptk	.grt4
115	;;
116	FSH	r26 = r19, cnt
117	BSH	r27 = r16, tnc
118	;;
119	FSH	r20 = r16, cnt
120	BSH	r21 = r17, tnc
121	;;
122	or	r14 = r25, r24
123	FSH	r22 = r17, cnt
C NOTE(review): r23 is not read anywhere between .Lr4 and the return, so the
C next shift looks redundant on this path -- confirm before removing.
124	BSH	r23 = r10, tnc
125	br	.Lr4
126
C More than four limbs with n mod 4 == 0.  Load four more limbs while shifting
C the earlier ones, then enter the loop at .Ltop if br.cloop is taken, or go
C to .Lbot if it falls through.
127.grt4:	ld8	r18 = [up], UPD
128	FSH	r26 = r19, cnt
129	BSH	r27 = r16, tnc
130	;;
131	ld8	r19 = [up], UPD
132	FSH	r20 = r16, cnt
133	BSH	r21 = r17, tnc
134	;;
135	ld8	r16 = [up], UPD
136	FSH	r22 = r17, cnt
137	BSH	r23 = r18, tnc
138	;;
139	or	r14 = r25, r24
140	ld8	r17 = [up], UPD
141	br.cloop.dpnt	.Ltop
142	br	.Lbot
143
C n mod 4 == 1.  With p15 (n <= 4, hence a single limb) compute the return
C value and the one result limb r22, and store it at .Lr1.
144.Lb01:
145  (p15)	BSH	r8 = r10, tnc		C function return value	I
146  (p15)	FSH	r22 = r10, cnt		C		I
147  (p15)	br.cond.dptk	.Lr1		C return	B
148
C n mod 4 == 1 and n > 4.  Load four more limbs.  If br.cloop is taken continue
C at .grt5; otherwise finish inline and join the wind-down at .Lr5.
149.grt1:	ld8	r18 = [up], UPD
150	;;
151	ld8	r19 = [up], UPD
152	BSH	r8 = r10, tnc		C function return value
153	;;
154	ld8	r16 = [up], UPD
155	FSH	r22 = r10, cnt
156	BSH	r23 = r18, tnc
157	;;
158	ld8	r17 = [up], UPD
159	FSH	r24 = r18, cnt
160	BSH	r25 = r19, tnc
161	br.cloop.dpnt	.grt5
162	;;
163	or	r15 = r23, r22
164	FSH	r26 = r19, cnt
165	BSH	r27 = r16, tnc
166	;;
167	FSH	r20 = r16, cnt
168	BSH	r21 = r17, tnc
169	br	.Lr5
170
C Feed-in for the loop entry point .LL01.
171.grt5:	ld8	r18 = [up], UPD
172	FSH	r26 = r19, cnt
173	BSH	r27 = r16, tnc
174	;;
175	ld8	r19 = [up], UPD
176	FSH	r20 = r16, cnt
177	BSH	r21 = r17, tnc
178	;;
179	or	r15 = r23, r22
180	ld8	r16 = [up], UPD
181	br	.LL01
182
183
C n mod 4 == 2.  Load the second limb.  With p14 continue at .grt2; otherwise
C finish the two limbs here and join the wind-down at .Lr2.
184.Lb10:	ld8	r17 = [up], UPD
185  (p14)	br.cond.dptk	.grt2
186
187	BSH	r8 = r10, tnc		C function return value
188	;;
189	FSH	r20 = r10, cnt
190	BSH	r21 = r17, tnc
191	;;
192	or	r14 = r21, r20
193	FSH	r22 = r17, cnt
194	br	.Lr2			C return
195
C n mod 4 == 2 and n > 4.  Load four more limbs.  If br.cloop is taken continue
C at .grt6; otherwise finish inline and join the wind-down at .Lr6.
196.grt2:	ld8	r18 = [up], UPD
197	BSH	r8 = r10, tnc		C function return value
198	;;
199	ld8	r19 = [up], UPD
200	FSH	r20 = r10, cnt
201	BSH	r21 = r17, tnc
202	;;
203	ld8	r16 = [up], UPD
204	FSH	r22 = r17, cnt
205	BSH	r23 = r18, tnc
206	;;
207 {.mmi;	ld8	r17 = [up], UPD
208	or	r14 = r21, r20
209	FSH	r24 = r18, cnt
210}{.mib;	nop	0
211	BSH	r25 = r19, tnc
212	br.cloop.dpnt	.grt6
213	;; }
214
215	FSH	r26 = r19, cnt
216	BSH	r27 = r16, tnc
217	br	.Lr6
218
C Feed-in for the loop entry point .LL10.
219.grt6:	ld8	r18 = [up], UPD
220	FSH	r26 = r19, cnt
221	BSH	r27 = r16, tnc
222	;;
223	ld8	r19 = [up], UPD
224	br	.LL10
225
226
C n mod 4 == 3.  Load two more limbs.  With p14 continue at .grt3; otherwise
C finish the three limbs here and join the wind-down at .Lr3.
227.Lb11:	ld8	r16 = [up], UPD
228	;;
229	ld8	r17 = [up], UPD
230	BSH	r8 = r10, tnc		C function return value
231  (p14)	br.cond.dptk	.grt3
232	;;
233
234	FSH	r26 = r10, cnt
235	BSH	r27 = r16, tnc
236	;;
237	FSH	r20 = r16, cnt
238	BSH	r21 = r17, tnc
239	;;
240	or	r15 = r27, r26
241	FSH	r22 = r17, cnt
242	br	.Lr3			C return
243
C n mod 4 == 3 and n > 4.  Load four more limbs.  If br.cloop is taken continue
C at .grt7; otherwise finish inline and join the wind-down at .Lr7.
244.grt3:	ld8	r18 = [up], UPD
245	FSH	r26 = r10, cnt
246	BSH	r27 = r16, tnc
247	;;
248	ld8	r19 = [up], UPD
249	FSH	r20 = r16, cnt
250	BSH	r21 = r17, tnc
251	;;
252	ld8	r16 = [up], UPD
253	FSH	r22 = r17, cnt
254	BSH	r23 = r18, tnc
255	;;
256	ld8	r17 = [up], UPD
257	br.cloop.dpnt	.grt7
258
259	or	r15 = r27, r26
260	FSH	r24 = r18, cnt
261	BSH	r25 = r19, tnc
262	br	.Lr7
263
C Feed-in for the loop entry point .LL11.
264.grt7:	or	r15 = r27, r26
265	FSH	r24 = r18, cnt
266	BSH	r25 = r19, tnc
267	ld8	r18 = [up], UPD
268	br	.LL11
269
270C *** MAIN LOOP START ***
C Four limbs per iteration.  Each of the four bundle pairs stores one finished
C limb (alternating r14 and r15), ORs the two halves of the following result
C limb, issues one FSH and one BSH on limbs loaded earlier, and loads one more
C source limb.  Only the first pair issues the lfetch through r11.
C .LL11, .LL10 and .LL01 are entered from .grt7, .grt6 and .grt5 respectively.
271	ALIGN(32)
272.Ltop:
273 {.mmi;	st8	[rp] = r14, UPD		C M2
274	or	r15 = r27, r26		C M3
275	FSH	r24 = r18, cnt		C I0
276}{.mmi;	ld8	r18 = [up], UPD		C M1
277	lfetch	[r11], PUPD
278	BSH	r25 = r19, tnc		C I1
279	;; }
280.LL11:
281 {.mmi;	st8	[rp] = r15, UPD
282	or	r14 = r21, r20
283	FSH	r26 = r19, cnt
284}{.mmi;	ld8	r19 = [up], UPD
285	nop.m	0
286	BSH	r27 = r16, tnc
287	;; }
288.LL10:
289 {.mmi;	st8	[rp] = r14, UPD
290	or	r15 = r23, r22
291	FSH	r20 = r16, cnt
292}{.mmi;	ld8	r16 = [up], UPD
293	nop.m	0
294	BSH	r21 = r17, tnc
295	;; }
296.LL01:
297 {.mmi;	st8	[rp] = r15, UPD
298	or	r14 = r25, r24
299	FSH	r22 = r17, cnt
300}{.mib;	ld8	r17 = [up], UPD
301	BSH	r23 = r18, tnc
302	br.cloop.dptk	.Ltop
303	;; }
304C *** MAIN LOOP END ***
305
C Wind-down.  Same store, or and shift pattern as the loop but with no further
C loads.  The code from .Lbot stores eight limbs, and the code from each .LrK
C down to the return stores K limbs.
306.Lbot:
307 {.mmi;	st8	[rp] = r14, UPD
308	or	r15 = r27, r26
309	FSH	r24 = r18, cnt
310}{.mib;	nop	0
311	BSH	r25 = r19, tnc
312	nop	0
313	;; }
314.Lr7:
315 {.mmi;	st8	[rp] = r15, UPD
316	or	r14 = r21, r20
317	FSH	r26 = r19, cnt
318}{.mib;	nop	0
319	BSH	r27 = r16, tnc
320	nop	0
321	;; }
322.Lr6:
323 {.mmi;	st8	[rp] = r14, UPD
324	or	r15 = r23, r22
325	FSH	r20 = r16, cnt
326}{.mib;	nop	0
327	BSH	r21 = r17, tnc
328	nop	0
329	;; }
330.Lr5:	st8	[rp] = r15, UPD
331	or	r14 = r25, r24
332	FSH	r22 = r17, cnt
333	;;
334.Lr4:	st8	[rp] = r14, UPD
335	or	r15 = r27, r26
336	;;
337.Lr3:	st8	[rp] = r15, UPD
338	or	r14 = r21, r20
339	;;
340.Lr2:	st8	[rp] = r14, UPD
341	;;
C Final limb: r22 is the FSH of the last source limb and is stored without
C being ORed with anything.  Then write the saved ar.lc back from r2 and
C return through b0 with the return value in r8.
342.Lr1:	st8	[rp] = r22, UPD		C		M23
343	mov	ar.lc = r2		C		I0
344	br.ret.sptk.many b0		C		B
345EPILOGUE(func)
346ASM_END()
347