1dnl  IA-64 mpn_lshiftc.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2000-2005, 2010 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C           cycles/limb
36C Itanium:      ?
37C Itanium 2:    1.25
38
39C This code is scheduled deeply since the plain shift instructions shr and shl
40C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
41C these instructions causes a 10 cycle replay trap on Itanium.
42
43C The ld8 scheduling should probably be decreased to make the function smaller.
44C Good lfetch will make sure we never stall anyway.
45
46C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
47C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
48C in the prologue.
49
50
51C INPUT PARAMETERS
C The four argument registers.  rp is the pointer stored through, up the
C pointer loaded from, n the number of limbs to process, cnt the shift count.
52define(`rp', `r32')
53define(`up', `r33')
54define(`n',  `r34')
55define(`cnt',`r35')
56
C tnc is set to 64 - cnt by the entry code and is the shift count used with BSH.
57define(`tnc',`r9')
58
C FSH is the shift applied with cnt, BSH the opposite, unsigned shift applied
C with tnc.  A result limb is the OR of one FSH value and one BSH value,
C complemented before it is stored; the last limb stored has no BSH part.
59define(`FSH',`shl')
60define(`BSH',`shr.u')
C UPD is the post-increment of every ld8 through up and every st8 through rp.
C Both pointers start at limb n-1, so the operands are walked downwards.
61define(`UPD',`-8')
C POFF is the offset from up to the first lfetch address, kept in r11.
C PUPD is the post-increment applied to r11 by the lfetch in the main loop.
62define(`POFF',`-512')
63define(`PUPD',`-32')
64define(`func',`mpn_lshiftc')
65
66ASM_START()
67PROLOGUE(mpn_lshiftc)
C Entry code.  Copies ar.lc to r2, computes tnc = 64 - cnt, advances up and rp
C by 8*(n-1) bytes so they address limb n-1, loads that limb into r10, sets
C ar.lc = (n-1) >> 2 and dispatches on n mod 4 to one of four feed-in
C sequences.  NOTE(review): n = 0 is not handled here; presumably callers
C guarantee n >= 1 - confirm.
68	.prologue
69	.save	ar.lc, r2
70	.body
C 32-bit ABI only: addp4 on both pointers, sign-extend n, zero-extend cnt.
71ifdef(`HAVE_ABI_32',
72`	addp4	rp = 0, rp		C				M I
73	addp4	up = 0, up		C				M I
74	sxt4	n = n			C				M I
75	nop.m		0
76	nop.m		0
77	zxt4	cnt = cnt		C				I
78	;;
79')
80
C r14 = n mod 4, r2 = saved ar.lc, r15 = n - 1, tnc = 64 - cnt.
81 {.mmi;	nop	0			C				M I
82	and	r14 = 3, n		C				M I
83	mov.i	r2 = ar.lc		C				I0
84}{.mmi;	add	r15 = -1, n		C				M I
85	sub	tnc = 64, cnt		C				M I
86	nop	0
87	;;
C p6, p7, p8 are set for n mod 4 = 1, 2, 3.  n is replaced by (n-1) >> 2,
C and up and rp are moved to limb n-1.
88}{.mmi;	cmp.eq	p6, p0 = 1, r14		C				M I
89	cmp.eq	p7, p0 = 2, r14		C				M I
90	shr.u	n = r15, 2		C				I0
91}{.mmi;	cmp.eq	p8, p0 = 3, r14		C				M I
92	shladd	up = r15, 3, up		C				M I
93	shladd	rp = r15, 3, rp		C				M I
94	;;
C r11 = lfetch address, r10 = limb n-1 (up then steps down), ar.lc = loop count.
95}{.mmi;	add	r11 = POFF, up		C				M I
96	ld8	r10 = [up], UPD		C				M01
97	mov.i	ar.lc = n		C				I0
C Dispatch; n mod 4 = 0 takes none of the branches and falls into .Lb00.
98}{.bbb;
99   (p6)	br.dptk	.Lb01
100   (p7)	br.dptk	.Lb10
101   (p8)	br.dptk	.Lb11
102	;; }
103
104.Lb00:
C Feed-in for n mod 4 = 0.  Three more limbs are loaded, so r10, r19, r16, r17
C hold limbs n-1 down to n-4.  r8 = r10 >> tnc holds the bits shifted out of
C the top limb; r8 is not written again before the return.
105	ld8	r19 = [up], UPD
106	;;
107	ld8	r16 = [up], UPD
108	;;
109	ld8	r17 = [up], UPD
110	BSH	r8 = r10, tnc
111	br.cloop.dptk	L(gt4)
112	;;
C ar.lc was zero, so n = 4.  Form the shift pairs for all four limbs without
C further loads and join the wind-down code at .Lr4, with the complement of
C the first OR result (-1 - r14) already in r31.
113	FSH	r24 = r10, cnt
114	BSH	r25 = r19, tnc
115	;;
116	FSH	r26 = r19, cnt
117	BSH	r27 = r16, tnc
118	;;
119	FSH	r20 = r16, cnt
120	BSH	r21 = r17, tnc
121	;;
122	or	r14 = r25, r24
123	FSH	r22 = r17, cnt
124	;;
125	or	r15 = r27, r26
126	sub	r31 = -1, r14
127	br	.Lr4
128
129L(gt4):
C More than 4 limbs.  Same shifts as above, each paired with the load of one
C of the next four limbs into r18, r19, r16, r17.  A register is read by its
C shift in the same instruction group in which the ld8 overwrites it.
130 {.mmi;	nop	0
131	nop	0
132	FSH	r24 = r10, cnt
133}{.mmi;	ld8	r18 = [up], UPD
134	nop	0
135	BSH	r25 = r19, tnc
136	;; }
137 {.mmi;	nop	0
138	nop	0
139	FSH	r26 = r19, cnt
140}{.mmi;	ld8	r19 = [up], UPD
141	nop	0
142	BSH	r27 = r16, tnc
143	;; }
144 {.mmi;	nop	0
145	nop	0
146	FSH	r20 = r16, cnt
147}{.mmi;	ld8	r16 = [up], UPD
148	nop	0
149	BSH	r21 = r17, tnc
150	;; }
151 {.mmi;	nop	0
152	or	r14 = r25, r24
153	FSH	r22 = r17, cnt
154}{.mib;	ld8	r17 = [up], UPD
155	BSH	r23 = r18, tnc
156	br.cloop.dptk	L(gt8)
157	;; }
C ar.lc is exhausted, so n = 8: no more loads, continue in the wind-down code
C at .Lr8.
158 {.mmi;	nop	0
159	or	r15 = r27, r26
160	FSH	r24 = r18, cnt
161}{.mib;	sub	r31 = -1, r14
162	BSH	r25 = r19, tnc
163	br	.Lr8 }
164
165L(gt8):
C More than 8 limbs: the same step as the n = 8 case plus one more load, then
C enter the main loop at .LL00.
166	or	r15 = r27, r26
167	FSH	r24 = r18, cnt
168	ld8	r18 = [up], UPD
169	sub	r31 = -1, r14
170	BSH	r25 = r19, tnc
171	br	.LL00
172
173.Lb01:
C Feed-in for n mod 4 = 1.  The branch is taken when ar.lc is nonzero, that
C is when n > 1.
174	br.cloop.dptk	L(gt1)
175	;;
C n = 1: r8 = r10 >> tnc, and the only result limb is the complement of
C r10 << cnt, stored at .Lr1.
176	BSH	r8 = r10, tnc
177	FSH	r22 = r10, cnt
178	;;
179	sub	r31 = -1, r22
180	br	.Lr1
181	;;
182L(gt1):
C n >= 5.  r10 is limb n-1; the next four limbs are loaded into r18, r19,
C r16, r17 in descending address order.  r8 is set here and is not written
C again before the return.
183	ld8	r18 = [up], UPD
184	BSH	r8 = r10, tnc
185	FSH	r22 = r10, cnt
186	;;
187	ld8	r19 = [up], UPD
188	;;
189	ld8	r16 = [up], UPD
190	;;
191	ld8	r17 = [up], UPD
192	BSH	r23 = r18, tnc
193	br.cloop.dptk	L(gt5)
194	;;
C ar.lc is exhausted, so n = 5: finish the shifts without further loads and
C join the wind-down code at .Lr5.  r15 is formed from the old r22 before r22
C is rewritten with r17 << cnt.
195	nop	0
196	FSH	r24 = r18, cnt
197	BSH	r25 = r19, tnc
198	;;
199	nop	0
200	FSH	r26 = r19, cnt
201	BSH	r27 = r16, tnc
202	;;
203	or	r15 = r23, r22
204	FSH	r20 = r16, cnt
205	BSH	r21 = r17, tnc
206	;;
207	or	r14 = r25, r24
208	FSH	r22 = r17, cnt
209	sub	r31 = -1, r15
210	br	.Lr5
211
212L(gt5):
C More than 5 limbs: same shifts, each paired with the load of one of the
C next four limbs.  Control then goes to L(end), the bottom of the main loop,
C where br.cloop decides whether another iteration runs.
213 {.mmi;	nop	0
214	nop	0
215	FSH	r24 = r18, cnt
216}{.mmi;	ld8	r18 = [up], UPD
217	nop	0
218	BSH	r25 = r19, tnc
219	;; }
220 {.mmi;	nop	0
221	nop	0
222	FSH	r26 = r19, cnt
223}{.mmi;	ld8	r19 = [up], UPD
224	nop	0
225	BSH	r27 = r16, tnc
226	;; }
227 {.mmi;	nop	0
228	or	r15 = r23, r22
229	FSH	r20 = r16, cnt
230}{.mmi;	ld8	r16 = [up], UPD
231	nop	0
232	BSH	r21 = r17, tnc
233	;; }
234 {.mmi;	or	r14 = r25, r24
235	sub	r31 = -1, r15
236	FSH	r22 = r17, cnt
237}{.mib;	ld8	r17 = [up], UPD
238	BSH	r23 = r18, tnc
239	br	L(end)
240	;; }
241
242.Lb10:
C Feed-in for n mod 4 = 2.  r10 is limb n-1 and r17 receives limb n-2.  The
C branch is taken when ar.lc is nonzero, that is when n > 2.
243	ld8	r17 = [up], UPD
244	br.cloop.dptk	L(gt2)
245	;;
C n = 2: r8 = r10 >> tnc.  The upper result is the complement of
C (r10 << cnt) | (r17 >> tnc); the lower one is the complement of r22, formed
C at .Lr2.
246	BSH	r8 = r10, tnc
247	FSH	r20 = r10, cnt
248	;;
249	BSH	r21 = r17, tnc
250	FSH	r22 = r17, cnt
251	;;
252	or	r14 = r21, r20
253	;;
254	sub	r31 = -1, r14
255	br	.Lr2
256	;;
257L(gt2):
C n >= 6.  Load four more limbs into r18, r19, r16, r17.  r17 is shifted into
C r21 and r22 in the group before the ld8 that overwrites it.
258	ld8	r18 = [up], UPD
259	BSH	r8 = r10, tnc
260	FSH	r20 = r10, cnt
261	;;
262	ld8	r19 = [up], UPD
263	;;
264	ld8	r16 = [up], UPD
265	BSH	r21 = r17, tnc
266	FSH	r22 = r17, cnt
267	;;
268	ld8	r17 = [up], UPD
269	BSH	r23 = r18, tnc
270	br.cloop.dptk	L(gt6)
271	;;
C ar.lc is exhausted, so n = 6: finish without further loads and join the
C wind-down code at .Lr6.
272	nop	0
273	FSH	r24 = r18, cnt
274	BSH	r25 = r19, tnc
275	;;
276	or	r14 = r21, r20
277	FSH	r26 = r19, cnt
278	BSH	r27 = r16, tnc
279	;;
280 {.mmi;	nop	0
281	or	r15 = r23, r22
282	FSH	r20 = r16, cnt
283}{.mib;	sub	r31 = -1, r14
284	BSH	r21 = r17, tnc
285	br	.Lr6
286	;; }
287L(gt6):
C More than 6 limbs: same steps, each paired with the load of one of the next
C three limbs, then enter the main loop at .LL10.
288 {.mmi;	nop	0
289	nop	0
290	FSH	r24 = r18, cnt
291}{.mmi;	ld8	r18 = [up], UPD
292	nop	0
293	BSH	r25 = r19, tnc
294	;; }
295 {.mmi; nop   0
296	or	r14 = r21, r20
297	FSH	r26 = r19, cnt
298}{.mmi;	ld8	r19 = [up], UPD
299	nop	0
300	BSH	r27 = r16, tnc
301	;; }
302 {.mmi;	or	r15 = r23, r22
303	sub	r31 = -1, r14
304	FSH	r20 = r16, cnt
305}{.mib;	ld8	r16 = [up], UPD
306	BSH	r21 = r17, tnc
307	br	.LL10
308}
309
310.Lb11:
C Feed-in for n mod 4 = 3.  r10 is limb n-1; r16 and r17 receive limbs n-2
C and n-3.  r8 = r10 >> tnc is not written again before the return.  The
C branch is taken when ar.lc is nonzero, that is when n > 3.
311	ld8	r16 = [up], UPD
312	;;
313	ld8	r17 = [up], UPD
314	BSH	r8 = r10, tnc
315	FSH	r26 = r10, cnt
316	br.cloop.dptk	L(gt3)
317	;;
C n = 3: form the two OR results in r15 and r14 plus r22 = r17 << cnt, then
C join the wind-down code at .Lr3 with the complement of r15 in r31.
318	BSH	r27 = r16, tnc
319	;;
320	FSH	r20 = r16, cnt
321	BSH	r21 = r17, tnc
322	;;
323	FSH	r22 = r17, cnt
324	;;
325	or	r15 = r27, r26
326	;;
327	or	r14 = r21, r20
328	sub	r31 = -1, r15
329	br	.Lr3
330	;;
331L(gt3):
C n >= 7.  Load four more limbs into r18, r19, r16, r17; r16 and r17 are
C shifted in the same group in which the ld8 overwrites them.
332	ld8	r18 = [up], UPD
333	;;
334	ld8	r19 = [up], UPD
335	BSH	r27 = r16, tnc
336	;;
337 {.mmi;	nop	0
338	nop	0
339	FSH	r20 = r16, cnt
340}{.mmi;	ld8	r16 = [up], UPD
341	nop	0
342	BSH	r21 = r17, tnc
343	;;
344}{.mmi;	nop	0
345	nop	0
346	FSH	r22 = r17, cnt
347}{.mib;	ld8	r17 = [up], UPD
348	BSH	r23 = r18, tnc
349	br.cloop.dptk	L(gt7)
350	;; }
C ar.lc is exhausted, so n = 7: finish without further loads and join the
C wind-down code at .Lr7.
351	or	r15 = r27, r26
352	FSH	r24 = r18, cnt
353	BSH	r25 = r19, tnc
354	;;
355 {.mmi;	nop	0
356	or	r14 = r21, r20
357	FSH	r26 = r19, cnt
358}{.mib;	sub	r31 = -1, r15
359	BSH	r27 = r16, tnc
360	br	.Lr7
361}
362L(gt7):
C More than 7 limbs: same two steps, each paired with the load of one more
C limb, then enter the main loop at .LL11.
363 {.mmi;	nop	0
364	or	r15 = r27, r26
365	FSH	r24 = r18, cnt
366}{.mmi;	ld8	r18 = [up], UPD
367	nop	0
368	BSH	r25 = r19, tnc
369	;; }
370 {.mmi;	or	r14 = r21, r20
371	sub	r31 = -1, r15
372	FSH	r26 = r19, cnt
373}{.mib;	ld8	r19 = [up], UPD
374	BSH	r27 = r16, tnc
375	br	.LL11
376}
377
378C *** MAIN LOOP START ***
C Four limbs per iteration, one per two-bundle step.  Each step stores the
C finished limb held in r31, ORs one FSH/BSH pair into r14 or r15, puts the
C complement (-1 - x) of the previously formed OR result into r31, loads one
C new limb, and issues one FSH and one BSH whose results are ORed in later
C steps.  r14 and r15 alternate, and the shift results rotate through
C r20-r27.  The feed-in code joins at .LL00, .LL11, .LL10 and L(end).
379	ALIGN(32)
380L(top):
381.LL01:
382 {.mmi;	st8	[rp] = r31, UPD		C M2
383	or	r15 = r27, r26		C M3
384	FSH	r24 = r18, cnt		C I0
385}{.mmi;	ld8	r18 = [up], UPD		C M0
386	sub	r31 = -1, r14		C M1
387	BSH	r25 = r19, tnc		C I1
388	;; }
389.LL00:
390 {.mmi;	st8	[rp] = r31, UPD
391	or	r14 = r21, r20
392	FSH	r26 = r19, cnt
393}{.mmi;	ld8	r19 = [up], UPD
394	sub	r31 = -1, r15
395	BSH	r27 = r16, tnc
396	;; }
397.LL11:
398 {.mmi;	st8	[rp] = r31, UPD
399	or	r15 = r23, r22
400	FSH	r20 = r16, cnt
401}{.mmi;	ld8	r16 = [up], UPD
402	sub	r31 = -1, r14
403	BSH	r21 = r17, tnc
404	;; }
405.LL10:
406 {.mmi;	st8	[rp] = r31, UPD
407	or	r14 = r25, r24
408	FSH	r22 = r17, cnt
409}{.mmi;	ld8	r17 = [up], UPD
410	sub	r31 = -1, r15
411	BSH	r23 = r18, tnc
412	;; }
C Prefetch through r11, stepping it by PUPD, then loop while ar.lc is nonzero.
413L(end):	lfetch		[r11], PUPD
414	br.cloop.dptk	L(top)
415C *** MAIN LOOP END ***
416
C Wind-down code: the main loop steps repeated with the loads removed, then
C with the shifts removed as well once no loaded limb remains unshifted.  The
C first step below is reached by falling out of the loop.  Each label .LrK is
C entered with K limbs still to be stored and the next one already
C complemented in r31.
417 {.mmi;	st8	[rp] = r31, UPD
418	or	r15 = r27, r26
419	FSH	r24 = r18, cnt
420}{.mib;	sub	r31 = -1, r14
421	BSH	r25 = r19, tnc
422	nop	0
423	;; }
424.Lr8:
425 {.mmi;	st8	[rp] = r31, UPD
426	or	r14 = r21, r20
427	FSH	r26 = r19, cnt
428}{.mib;	sub	r31 = -1, r15
429	BSH	r27 = r16, tnc
430	nop	0
431	;; }
432.Lr7:
433 {.mmi;	st8	[rp] = r31, UPD
434	or	r15 = r23, r22
435	FSH	r20 = r16, cnt
436}{.mib;	sub	r31 = -1, r14
437	BSH	r21 = r17, tnc
438	nop	0
439	;; }
C Last shift: r22 = r17 << cnt, which becomes the final limb at .Lr2.
440.Lr6:	st8	[rp] = r31, UPD
441	or	r14 = r25, r24
442	FSH	r22 = r17, cnt
443	sub	r31 = -1, r15
444	;;
445.Lr5:	st8	[rp] = r31, UPD
446	or	r15 = r27, r26
447	sub	r31 = -1, r14
448	;;
449.Lr4:	st8	[rp] = r31, UPD
450	or	r14 = r21, r20
451	sub	r31 = -1, r15
452	;;
453.Lr3:	st8	[rp] = r31, UPD
454	sub	r31 = -1, r14
455	;;
C The final limb has no BSH part: it is the complement of r22 alone.
456.Lr2:	st8	[rp] = r31, UPD
457	sub	r31 = -1, r22
458	;;
C Store the final limb, restore ar.lc from r2 and return.
459.Lr1:	st8	[rp] = r31, UPD		C				M23
460	mov	ar.lc = r2		C				I0
461	br.ret.sptk.many b0		C				B
462EPILOGUE(func)
463ASM_END()
464