dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                16-byte coaligned      unaligned
C                   cycles/limb        cycles/limb
C 7400,7410 (G4):       0.5                0.64
C 744x,745x (G4+):      0.75               0.82
C 970 (G5):             0.78               1.02		(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize the unaligned case.  Some basic tests with 2-way and 4-way
C    unrolling indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for
C    745x, and 0.80 c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
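C With 32-bit limbs these evaluate to GMP_LIMB_BYTES = 4, LIMBS_PER_VR = 4,
C LIMBS_PER_2VR = 8; with 64-bit limbs to 8, 2, and 4 respectively.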


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')

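C For reference, mpn_copyi copies n limbs from up to rp in increasing address
C order.  A rough C equivalent (illustration only, not part of the build):
C
C	void mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}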

ASM_START()
PROLOGUE(mpn_copyi)

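C Use a simple limb loop for small operands; for n >= 11 (32-bit limbs)
C resp. n >= 5 (64-bit limbs), take the VMX path at L(big).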
LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	or.	r0, n, n
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C Set VRSAVE bits 0-4
	mtspr	256, r0
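C VRSAVE bits 0-4 mark v0-v4 as live so the OS knows to preserve them across
C context switches; the previous VRSAVE value is restored at L(ret).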

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subfic	r7, r7, LIMBS_PER_VR
	subf	n, r7, n
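C Copy limbs one at a time until rp is 16-byte aligned; r7 holds the number
C of limbs needed to get there.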
L(top0):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy loop count into ctr

	li	r10, 16
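C r10 holds a constant 16-byte offset, used to address the second vector in
C each unrolled iteration.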

	beq	L(up_aligned)

	lvsl	us, 0, up

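C us now holds the lvsl-generated permute control that, combined with vperm,
C realigns data read from the unaligned up.  If an odd number of 16-byte
C vectors remains, peel one off here; either way, prime v2 with source data
C for the software-pipelined loop below.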
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	addi	up, up, 32
	addi	rp, rp, 16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, 16
	b	L(lpu)

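C Main unaligned loop: each iteration reads two source vectors, merges
C adjacent reads into two aligned vectors with vperm, and stores 32 bytes.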
	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, 32
	vperm	v3, v0, v2, us
	stvx	v3, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpu)

	addi	up, up, -16
	b	L(tail)

L(up_aligned):

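C Coaligned case: peel off one vector if an odd number of 16-byte vectors
C remains, so the main loop below can move 32 bytes per iteration.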
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0,   up
	stvx	v0, 0,   rp
	addi	up, up, 16
	addi	rp, rp, 16
	b	L(lpa)

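C Main coaligned loop: two aligned lvx/stvx pairs move 32 bytes per iteration.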
	ALIGN(32)
L(lpa):	lvx	v0, 0,   up
	lvx	v1, r10, up
	addi	up, up, 32
	nop
	stvx	v0, 0,   rp
	stvx	v1, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpa)

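C Copy any remaining limbs (less than one full vector) with scalar operations.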
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 0		')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()