dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                16-byte coaligned      unaligned
C                   cycles/limb        cycles/limb
C 7400,7410 (G4):       0.5                0.64
C 744x,745x (G4+):      0.75               0.82
C 970 (G5):             0.78               1.02		(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.
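
C For reference, mpn_copyd behaves like the following C loop: n limbs are
C copied from up to rp in decreasing address order, which makes overlapping
C operands safe when rp > up.  (A minimal sketch for exposition only, not
C part of the build; mp_limb_t and mp_size_t are the usual GMP types.)
C
C	void
C	mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)	/* highest limb first */
C	    rp[i] = up[i];
C	}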

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
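
C With 16-byte vector registers, LIMBS_PER_VR is 4 for 32-bit limbs and 2
C for 64-bit limbs; the main loops below move LIMBS_PER_2VR limbs, i.e.,
C two vector registers' worth, per iteration.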


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')
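
C LIMB32(x) expands to x only when limbs are 32 bits, and LIMB64(x) only
C when limbs are 64 bits, letting the two limb sizes share one source file.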

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi.	r0, n, 2	')
LIMB64(`sldi.	r0, n, 3	')
	add	rp, rp, r0
	add	up, up, r0
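C rp and up now point just past the last limb of their operands; the copy
C proceeds downwards from here.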

LIMB32(`cmpi	cr7, n, 11	')	C cutoff for using VMX code
LIMB64(`cmpdi	cr7, n, 5	')	C cutoff for using VMX code
	bge	cr7, L(big)

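C cr0 is still set from the record-form shift above; return if n = 0.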
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi	rp, rp, -16
	addi	up, up, -16
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C set VRSAVE bits 0-4
	mtspr	256, r0
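C VRSAVE (SPR 256) tells the operating system which vector registers hold
C live data and must be preserved across context switches; v0-v4 are
C clobbered below.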

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subf	n, r7, n
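C Copy limbs one at a time until rp is 16-byte aligned, as required by the
C stvx stores in the loops below.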
L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count, n / LIMBS_PER_2VR
LIMB32(`srwi	r7, n, 3	')	C loop count, n / LIMBS_PER_2VR
	mtctr	r7			C copy loop count into ctr

	li	r10, -16		C index for the second vector of each pair

	beq	L(up_aligned)

	lvsl	us, 0, up
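C us is a permute-control vector derived from the low 4 bits of up: for the
C aligned vectors vA (lower address) and vB (next higher), vperm v3,vA,vB,us
C extracts the 16 unaligned source bytes that form one aligned destination
C vector.  E.g., with up % 16 = 4, us holds byte indices 4..19, selecting
C bytes 4..15 of vA and bytes 0..3 of vB.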

	addi	up, up, 16
LIMB32(`andi.	r0, n, 0x4	')	C odd number of vectors to copy?
LIMB64(`andi.	r0, n, 0x2	')	C odd number of vectors to copy?
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	addi	up, up, -32
	addi	rp, rp, -16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, -16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, -32
	vperm	v3, v2, v0, us
	stvx	v3, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpu)

	b	L(tail)

L(up_aligned):

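C Both rp and up are now 16-byte aligned, so plain lvx/stvx suffice.  If an
C odd number of vectors remains, copy one vector first so that the unrolled
C loop can move two per iteration.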
LIMB32(`andi.	r0, n, 0x4	')	C odd number of vectors to copy?
LIMB64(`andi.	r0, n, 0x2	')	C odd number of vectors to copy?
	beq	L(lpa)
	lvx	v0, 0,   up
	stvx	v0, 0,   rp
	addi	up, up, -16
	addi	rp, rp, -16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0,   up
	lvx	v1, r10, up
	addi	up, up, -32
	nop
	stvx	v0, 0,   rp
	stvx	v1, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpa)

L(tail):
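C Copy the final n mod LIMBS_PER_VR limbs with scalar loads and stores.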
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 12		')	C offset of highest remaining limb
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12	C restore caller's VRSAVE
	blr
EPILOGUE()
