1dnl  IA-64 mpn_copyd -- copy limb vector, decrementing.
2
3dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C         cycles/limb
23C Itanium:    1
24C Itanium 2:  0.5
25
26C INPUT PARAMETERS
27C rp = r32
28C sp = r33
29C n = r34
30
31ASM_START()
C mpn_copyd: copy the n limbs at sp to rp, starting with the highest-addressed
C limb and working downwards.  A limb is 8 bytes (see the shladd scaling by 8
C and the ld8/st8 accesses below).
C
C Register use in this routine:
C   r32, r20	store pointers, each stepping by -16 bytes (every other limb)
C   r33, r21	load pointers, each stepping by -16 bytes (every other limb)
C   r16-r19	the four limbs in flight between their load and their store
C   r34		n on entry, then n - 4
C   r14		n mod 4, selects which entry sequence is used
C   r15		(n - 4) >> 2, the value written to ar.lc
C   r2		saved copy of ar.lc, written back at .Ls00
C   p14		set when n <= 3, in which case the main loop is never entered
32PROLOGUE(mpn_copyd)
33	.prologue
34	.save ar.lc, r2
35	.body
C With the 32-bit ABI, widen both pointers (addp4) and sign-extend n (sxt4)
C to 64 bits before they are used.
36ifdef(`HAVE_ABI_32',
37`	addp4		r32 = 0, r32
38	addp4		r33 = 0, r33
39	sxt4		r34 = r34
40	;;
41')
C Advance both pointers by 8*n bytes so that they point just past the end of
C each vector, and save ar.lc in r2 since the loop below overwrites it.
42{.mmi
43	shladd		r32 = r34, 3, r32
44	shladd		r33 = r34, 3, r33
45	mov.i		r2 = ar.lc
46}
C r14 = n mod 4, p14 = (n <= 3), r34 = n - 4.  p15 receives the complement of
C p14 and is not read anywhere in this routine.
47{.mmi
48	and		r14 = 3, r34
49	cmp.ge		p14, p15 = 3, r34
50	add		r34 = -4, r34
51	;;
52}
C Dispatch on n mod 4.  Residues 1, 2 and 3 branch to their own setup code;
C residue 0 falls through to .Lb00.
53{.mmi
54	cmp.eq		p8, p0 = 1, r14
55	cmp.eq		p10, p0 = 2, r14
56	cmp.eq		p12, p0 = 3, r14
57}
58{.bbb
59  (p8)	br.dptk		.Lb01
60  (p10)	br.dptk		.Lb10
61  (p12)	br.dptk		.Lb11
62}

C Residue 0: step both pointers back onto the highest limb.  If p14 is set
C (n <= 3, which with residue 0 means n = 0 assuming n is non-negative) there
C is nothing to copy, so go straight to the exit.  Otherwise preload four
C limbs into r16-r19, set ar.lc, and enter the loop at its top.  When ar.lc is
C zero (n = 4) br.cloop falls through and .Lend stores the four limbs.
64.Lb00:	C  n = 0, 4, 8, 12, ...
65	add		r32 = -8, r32
66	add		r33 = -8, r33
67  (p14)	br.dptk		.Ls00
68	;;
69	add		r21 = -8, r33
70	ld8		r16 = [r33], -16
71	shr		r15 = r34, 2
72	;;
73	ld8		r17 = [r21], -16
74	mov.i		ar.lc = r15
75	ld8		r18 = [r33], -16
76	add		r20 = -8, r32
77	;;
78	ld8		r19 = [r21], -16
79	br.cloop.dptk	.Loop
80	;;
81	br.sptk		.Lend
82	;;

C Residue 1: r21/r20 address the highest limb, r33/r32 the one below it.
C The highest limb is loaded into r19.  For n = 1 (p14) only that limb is
C stored, at .Ls01.  Otherwise r16-r18 are preloaded as well and the loop is
C entered at .Li01, whose bundle stores r19.
84.Lb01:	C  n = 1, 5, 9, 13, ...
85	add		r21 = -8, r33
86	add		r20 = -8, r32
87	add		r33 = -16, r33
88	add		r32 = -16, r32
89	;;
90	ld8		r19 = [r21], -16
91	shr		r15 = r34, 2
92  (p14)	br.dptk		.Ls01
93	;;
94	ld8		r16 = [r33], -16
95	mov.i		ar.lc = r15
96	;;
97	ld8		r17 = [r21], -16
98	ld8		r18 = [r33], -16
99	br.sptk		.Li01
100	;;

C Residue 2: r33/r32 address the highest limb, r21/r20 the one below it.
C The top two limbs are loaded into r18 and r19.  For n = 2 (p14) they are
C stored at .Ls10.  Otherwise r16 and r17 are preloaded and the loop is
C entered at .Li10.  ar.lc is written before the short-case branch here; it
C is restored from r2 at .Ls00 either way.
102.Lb10:	C  n = 2, 6, 10, 14, ...
103	add		r21 = -16, r33
104	shr		r15 = r34, 2
105	add		r20 = -16, r32
106	add		r32 = -8, r32
107	add		r33 = -8, r33
108	;;
109	ld8		r18 = [r33], -16
110	ld8		r19 = [r21], -16
111	mov.i		ar.lc = r15
112  (p14)	br.dptk		.Ls10
113	;;
114	ld8		r16 = [r33], -16
115	ld8		r17 = [r21], -16
116	br.sptk		.Li10
117	;;

C Residue 3: r21/r20 address the highest limb, r33/r32 the one below it.
C The top three limbs are loaded into r17, r18 and r19.  For n = 3 (p14) they
C are stored at .Ls11.  Otherwise r16 is preloaded and the loop is entered at
C .Li11.
119.Lb11:	C  n = 3, 7, 11, 15, ...
120	add		r21 = -8, r33
121	add		r20 = -8, r32
122	add		r33 = -16, r33
123	add		r32 = -16, r32
124	;;
125	ld8		r17 = [r21], -16
126	shr		r15 = r34, 2
127	;;
128	ld8		r18 = [r33], -16
129	mov.i		ar.lc = r15
130	ld8		r19 = [r21], -16
131  (p14)	br.dptk		.Ls11
132	;;
133	ld8		r16 = [r33], -16
134	br.sptk		.Li11
135	;;

C Main loop, four limbs per pass.  Each bundle stores one limb that was
C loaded earlier and reloads the same register with a limb further down.
C The r32/r33 and r20/r21 pointer pairs alternate, each moving 16 bytes per
C access.  .Li11, .Li10 and .Li01 are the mid-loop entry points used by the
C setup code above; .Li00 labels the same address as .Loop.  br.cloop
C branches back to .Loop while ar.lc is nonzero, decrementing it.
137	ALIGN(32)
138.Loop:
139.Li00:
140{.mmb
141	st8		[r32] = r16, -16
142	ld8		r16 = [r33], -16
143	nop.b		0
144}
145.Li11:
146{.mmb
147	st8		[r20] = r17, -16
148	ld8		r17 = [r21], -16
149	nop.b		0
150	;;
151}
152.Li10:
153{.mmb
154	st8		[r32] = r18, -16
155	ld8		r18 = [r33], -16
156	nop.b		0
157}
158.Li01:
159{.mmb
160	st8		[r20] = r19, -16
161	ld8		r19 = [r21], -16
162	br.cloop.dptk	.Loop
163	;;
164}
C Drain: store the limbs still held in r16-r19.  .Ls11, .Ls10 and .Ls01 are
C entered directly by the n = 3, 2 and 1 cases, where only the last three,
C two or one of these stores are pending.  .Ls00 puts the saved value back
C into ar.lc and returns.
165.Lend:	st8		[r32] = r16, -16
166.Ls11:	st8		[r20] = r17, -16
167	;;
168.Ls10:	st8		[r32] = r18, -16
169.Ls01:	st8		[r20] = r19, -16
170.Ls00:	mov.i		ar.lc = r2
171	br.ret.sptk.many b0
172EPILOGUE()
173ASM_END()
174