dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                16-byte coaligned      unaligned
C                   cycles/limb        cycles/limb
C 7400,7410 (G4):       0.5                0.64
C 744x,745x (G4+):      0.75               0.82
C 970 (G5):             0.78               1.02		(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
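
C With 32-bit limbs these expand to GMP_LIMB_BYTES = 4, LIMBS_PER_VR = 4, and
C LIMBS_PER_2VR = 8; with 64-bit limbs to 8, 2, and 4.  A VMX register holds
C 16 bytes.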


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')
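
C LIMB32(`x') expands its argument only when limbs are 32 bits, and
C LIMB64(`x') only when they are 64 bits; the unused macro expands to nothing,
C so each LIMB32/LIMB64 pair below assembles exactly one instruction.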
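
C The function copies n limbs from up to rp in decreasing order, so it is safe
C for overlapping operands with rp >= up.  A minimal C reference sketch of the
C semantics (ignoring the VMX fast path):
C
C	void
C	mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)
C	    rp[i] = up[i];
C	}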


ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi.	r0, n, 2	')	C r0 = n in bytes; cr0 set for n = 0 test
LIMB64(`sldi.	r0, n, 3	')	C r0 = n in bytes; cr0 set for n = 0 test
	add	rp, rp, r0		C point rp just past the operand end
	add	up, up, r0		C point up just past the operand end

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)		C use the VMX code for large n

	beqlr	cr0			C return if n = 0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi	rp, rp, -16
	addi	up, up, -16
	mfspr	r12, 256		C save VRSAVE
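
C The small-size loop above is the limb-at-a-time equivalent of
C
C	do { *--rp = *--up; } while (--n != 0);
C
C with both pointers starting just past the last limb.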
	oris	r0, r12, 0xf800		C Set VRSAVE bits 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

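C VRSAVE (SPR 256) tells the system which vector registers hold live data; the
C previous value is kept in r12 and restored at L(ret) before returning.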
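
C The loop above copies high limbs one at a time until rp is 16-byte aligned,
C so that the stvx stores below hit their intended addresses (lvx/stvx ignore
C the low four address bits).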
LIMB64(`srdi	r7, n, 2	')	C loop count (n / 4)
LIMB32(`srwi	r7, n, 3	')	C loop count (n / 8)
	mtctr	r7			C copy loop count into count register

	li	r10, -16

	beq	L(up_aligned)

	lvsl	us, 0, up

	addi	up, up, 16
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	addi	up, up, -32
	addi	rp, rp, -16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, -16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, -32
	vperm	v3, v2, v0, us
	stvx	v3, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpu)

	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0,   up
	stvx	v0, 0,   rp
	addi	up, up, -16
	addi	rp, rp, -16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0,   up
	lvx	v1, r10, up
	addi	up, up, -32
	nop
	stvx	v0, 0,   rp
	stvx	v1, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpa)

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 12		')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

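
C In the unaligned loop, us is the permute mask computed once by lvsl from the
C source misalignment; v2 carries the aligned quadword fetched on the previous
C step, and each vperm splices it with the newly fetched lower quadword into
C one properly shifted quadword for an aligned store.  In AltiVec intrinsics
C the splice for a single unaligned source read would look roughly like this
C (a hypothetical illustration with made-up names, assuming up is not 16-byte
C aligned; it is not the software-pipelined loop above):
C
C	unsigned char *s = (unsigned char *) up, *d = (unsigned char *) rp;
C	vector unsigned char m = vec_lvsl (0, s);	/* shift-left mask    */
C	vector unsigned char a = vec_ld (0, s);		/* block containing s */
C	vector unsigned char b = vec_ld (16, s);	/* next aligned block */
C	vec_st (vec_perm (a, b, m), 0, d);		/* 16 bytes at s -> d */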
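
C With rp and up coaligned, each iteration above moves 32 bytes (LIMBS_PER_2VR
C limbs) with two plain lvx/stvx pairs; the nop is presumably there to keep
C the loop body nicely aligned for instruction dispatch.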
L(ret):	mtspr	256, r12		C restore VRSAVE
	blr
EPILOGUE()
