1dnl  Intel P5 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 1.75 cycles/limb.
35
36
37C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38C                       unsigned shift);
39C
40C Shift src,size right by shift many bits and store the result in dst,size.
41C Zeros are shifted in at the left.  Return the bits shifted out at the
42C right.
43C
44C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
45C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
46C
47C Full speed depends on source and destination being aligned.  Unaligned mmx
48C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
49C setups and finish-ups are done to ensure alignment for the loop.
50C
51C MMX shifts work out a bit faster even for the simple loop.
52
C Argument frame definitions, listed from last parameter to first.  The
C offsets 4, 8, 12, 16 follow the prototype order dst, src, size, shift.
C defframe and deflit are macros not defined in this file (presumably they
C come in through the config.m4 include -- confirm).
53defframe(PARAM_SHIFT,16)
54defframe(PARAM_SIZE, 12)
55defframe(PARAM_SRC,  8)
56defframe(PARAM_DST,  4)
C FRAME starts at 0 here and is redefined to 8 after the two pushl's in the
C function prologue, so it appears to track the bytes pushed since entry.
57deflit(`FRAME',0)
58
59dnl  Minimum 5, because the unrolled loop can't handle less.
C The entry code compares size against this value (unsigned) to choose
C between the simple loop and the unrolled loop.
60deflit(UNROLL_THRESHOLD, 5)
61
62	TEXT
63	ALIGN(8)
64
C mpn_rshift entry point.
C
C Inputs are read through PARAM_DST, PARAM_SRC, PARAM_SIZE and PARAM_SHIFT.
C The limb count selects one of three code paths:
C
C   size == 1                      integer shrdl/shrl, no mmx used
C   1 < size < UNROLL_THRESHOLD    L(simple), one limb per iteration
C   size >= UNROLL_THRESHOLD       L(unroll), four limbs per iteration
C
C The return value is left in eax.  ebx and edi are pushed on entry and
C popped again before every ret.  Both mmx paths execute emms before ret.
C
C NOTE(review): size == 0 is not tested for, it would drop into L(simple)
C with eax == -1.  Presumably callers must pass size >= 1 -- confirm against
C the mpn documentation.
65PROLOGUE(mpn_rshift)
66
67	pushl	%ebx
68	pushl	%edi
69deflit(`FRAME',8)
70
71	movl	PARAM_SIZE, %eax
72	movl	PARAM_DST, %edx
73
74	movl	PARAM_SRC, %ebx
75	movl	PARAM_SHIFT, %ecx
76
77	cmp	$UNROLL_THRESHOLD, %eax
78	jae	L(unroll)
79
	C Here size < UNROLL_THRESHOLD.  The jnz below tests the flags left
	C by decl, the movl in between does not change them, so the branch
	C is taken whenever size-1 is non-zero.
80	decl	%eax
81	movl	(%ebx), %edi		C src low limb
82
83	jnz	L(simple)
84
	C size == 1.  With eax zero the double shift leaves only the bits
	C shifted out of edi, which is the return value.  shrdl() is a macro
	C defined outside this file, presumably emitting shrd by %cl.
85	shrdl(	%cl, %edi, %eax)	C eax was decremented to zero
86
87	shrl	%cl, %edi
88
89	movl	%edi, (%edx)		C dst low limb
90	popl	%edi			C risk of data cache bank clash
91
92	popl	%ebx
93
94	ret
95
96
97C -----------------------------------------------------------------------------
C Simple loop, reached for sizes from 2 to UNROLL_THRESHOLD-1.
98	ALIGN(8)
99L(simple):
100	C eax	size-1
101	C ebx	src
102	C ecx	shift
103	C edx	dst
104	C esi
105	C edi
106	C ebp
107deflit(`FRAME',8)
108
	C src[0] is moved to the high half of mm5 so that a later right
	C shift by mm6 leaves the shifted-out bits in the low 32 bits.
	C ebx and edx are biased to the end of the operands so that eax can
	C count up from -(size-1) to zero.
109	movd	(%ebx), %mm5		C src[0]
110	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
111
112	movd	%ecx, %mm6		C rshift
113	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]
114
115	psllq	$32, %mm5
116	negl	%eax
117
118
119C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
120C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
121C cycles and would be 8 in a simple loop.  Using mmx helps the return value
122C and last limb calculations too.
123
124L(simple_top):
125	C eax	counter, limbs, negative
126	C ebx	&src[size-1]
127	C ecx	shift (not used in the loop, the count is in mm6)
128	C edx	&dst[size-2]
129	C
130	C mm0	scratch
131	C mm5	return value
132	C mm6	shift
133
	C Each pass loads two adjacent src limbs as a qword, shifts, and
	C stores the low 32 bits as one dst limb.  The store comes after
	C the incl, which is why edx points one limb lower than ebx.  The
	C jnz uses the flags from incl.
134	movq	(%ebx,%eax,4), %mm0
135	incl	%eax
136
137	psrlq	%mm6, %mm0
138
139	movd	%mm0, (%edx,%eax,4)
140	jnz	L(simple_top)
141
142
	C Most significant limb: the movd load clears the high half of mm0,
	C so zeros are shifted in at the top.
143	movd	(%ebx), %mm0
144	psrlq	%mm6, %mm5		C return value
145
146	psrlq	%mm6, %mm0
147	popl	%edi
148
149	movd	%mm5, %eax
150	popl	%ebx
151
152	movd	%mm0, 4(%edx)
153
154	emms
155
156	ret
157
158
159C -----------------------------------------------------------------------------
C Unrolled code, reached for size >= UNROLL_THRESHOLD.
160	ALIGN(8)
161L(unroll):
162	C eax	size
163	C ebx	src
164	C ecx	shift
165	C edx	dst
166	C esi
167	C edi
168	C ebp
169deflit(`FRAME',8)
170
	C edi holds the constant 4, used as the mask for the two alignment
	C tests (bit 2 of the src and dst addresses).
171	movd	(%ebx), %mm5		C src[0]
172	movl	$4, %edi
173
174	movd	%ecx, %mm6		C rshift
175	testl	%edi, %ebx
176
177	psllq	$32, %mm5
178	jz	L(start_src_aligned)
179
180
181	C src isn't aligned, process low limb separately (marked xxx) and
182	C step src and dst by one limb, making src aligned.
183	C
184	C source                  ebx
185	C --+-------+-------+-------+
186	C           |          xxx  |
187	C --+-------+-------+-------+
188	C         4mod8   0mod8   4mod8
189	C
190	C         dest            edx
191	C         --+-------+-------+
192	C           |       |  xxx  |
193	C         --+-------+-------+
194
195	movq	(%ebx), %mm0		C unaligned load
196
197	psrlq	%mm6, %mm0
198	addl	$4, %ebx
199
200	decl	%eax
201
202	movd	%mm0, (%edx)
203	addl	$4, %edx
204L(start_src_aligned):
205
206
	C The return value is finished here, while mm6 still holds the
	C caller's shift.  mm6 may be replaced by shift+32 just below.
207	movq	(%ebx), %mm1
208	testl	%edi, %edx
209
210	psrlq	%mm6, %mm5		C retval
211	jz	L(start_dst_aligned)
212
213	C dst isn't aligned, add 4 to make it so, and pretend the shift is
214	C 32 bits extra.  Low limb of dst (marked xxx) handled here
215	C separately.
216	C
217	C          source          ebx
218	C          --+-------+-------+
219	C            |      mm1      |
220	C          --+-------+-------+
221	C                  4mod8   0mod8
222	C
223	C  dest                    edx
224	C  --+-------+-------+-------+
225	C                    |  xxx  |
226	C  --+-------+-------+-------+
227	C          4mod8   0mod8   4mod8
228
	C Note eax is not decremented on this path, only edx and the shift
	C counts change.
229	movq	%mm1, %mm0
230	addl	$32, %ecx		C new shift
231
232	psrlq	%mm6, %mm0
233
234	movd	%ecx, %mm6
235
236	movd	%mm0, (%edx)
237	addl	$4, %edx
238L(start_dst_aligned):
239
240
	C Set up for the loop: mm7 gets 64 minus the right shift count (ecx
	C is negated and 64 added), the first dst qword is formed in mm3,
	C ebx and edx are biased towards the end of the operands, and eax
	C becomes the negative counter -(size-7).
241	movq	8(%ebx), %mm3
242	negl	%ecx
243
244	movq	%mm3, %mm2		C mm2 src qword
245	addl	$64, %ecx
246
247	movd	%ecx, %mm7
248	psrlq	%mm6, %mm1
249
250	leal	-12(%ebx,%eax,4), %ebx
251	leal	-20(%edx,%eax,4), %edx
252
253	psllq	%mm7, %mm3
254	subl	$7, %eax		C size-7
255
256	por	%mm1, %mm3		C mm3 ready to store
257	negl	%eax			C -(size-7)
258
	C The flags tested are those from negl.  The loop is skipped when
	C the negated count is not negative.
259	jns	L(finish)
260
261
262	C This loop is the important bit, the rest is just support.  Careful
263	C instruction scheduling achieves the claimed 1.75 c/l.  The
264	C relevant parts of the pairing rules are:
265	C
266	C - mmx loads and stores execute only in the U pipe
267	C - only one mmx shift in a pair
268	C - wait one cycle before storing an mmx register result
269	C - the usual address generation interlock
270	C
271	C Two qword calculations are slightly interleaved.  The instructions
272	C marked "C" belong to the second qword, and the "C prev" one is for
273	C the second qword from the previous iteration.
274
275	ALIGN(8)
276L(unroll_loop):
277	C eax	counter, limbs, negative
278	C ebx	src + 4*size - 12 bytes, from the leal before the loop
279	C ecx
280	C edx	dst + 4*size - 20 bytes, from the leal before the loop
281	C esi
282	C edi
283	C
284	C mm0
285	C mm1
286	C mm2	src qword from -8(%ebx,%eax,4)
287	C mm3	dst qword ready to store to -8(%edx,%eax,4)
288	C
289	C mm5	return value
290	C mm6	rshift
291	C mm7	lshift
292
	C Each dst qword is (src qword >> mm6) | (next src qword << mm7).
	C One pass loads two src qwords and stores two dst qwords.  The js
	C at the bottom uses the flags from addl.
293	movq	(%ebx,%eax,4), %mm0
294	psrlq	%mm6, %mm2
295
296	movq	%mm0, %mm1
297	psllq	%mm7, %mm0
298
299	movq	%mm3, -8(%edx,%eax,4)	C prev
300	por	%mm2, %mm0
301
302	movq	8(%ebx,%eax,4), %mm3	C
303	psrlq	%mm6, %mm1		C
304
305	movq	%mm0, (%edx,%eax,4)
306	movq	%mm3, %mm2		C
307
308	psllq	%mm7, %mm3		C
309	addl	$4, %eax
310
311	por	%mm1, %mm3		C
312	js	L(unroll_loop)
313
314
315L(finish):
316	C eax	0 to 3 representing respectively 3 to 0 limbs remaining
317
	C Bit 1 of eax clear means at least two limbs remain, so one more
	C qword step is done here, without loading a further src qword.
318	testb	$2, %al
319
320	jnz	L(finish_no_two)
321
322	movq	(%ebx,%eax,4), %mm0
323	psrlq	%mm6, %mm2
324
325	movq	%mm0, %mm1
326	psllq	%mm7, %mm0
327
328	movq	%mm3, -8(%edx,%eax,4)	C prev
329	por	%mm2, %mm0
330
331	movq	%mm1, %mm2
332	movq	%mm0, %mm3
333
334	addl	$2, %eax
335L(finish_no_two):
336
337
338	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
339	C
340	C mm2	src prev qword, from -8(%ebx,%eax,4)
341	C mm3	dst qword, for -8(%edx,%eax,4)
342
	C The jnz below uses the flags from testb, the popl and movd in
	C between leave them unchanged.  eax is overwritten with the return
	C value at this point, the counter is not needed any more.
343	testb	$1, %al
344	popl	%edi
345
346	movd	%mm5, %eax	C retval
347	jnz	L(finish_zero)
348
349
350	C One extra limb, destination was aligned.
351	C
352	C source                ebx
353	C +-------+---------------+--
354	C |       |      mm2      |
355	C +-------+---------------+--
356	C
357	C dest                                  edx
358	C +-------+---------------+---------------+--
359	C |       |               |      mm3      |
360	C +-------+---------------+---------------+--
361	C
362	C mm6 = shift
363	C mm7 = ecx = 64-shift
364
365
366	C One extra limb, destination was unaligned.
367	C
368	C source                ebx
369	C +-------+---------------+--
370	C |       |      mm2      |
371	C +-------+---------------+--
372	C
373	C dest                          edx
374	C +---------------+---------------+--
375	C |               |      mm3      |
376	C +---------------+---------------+--
377	C
378	C mm6 = shift+32
379	C mm7 = ecx = 64-(shift+32)
380
381
382	C In both cases there's one extra limb of src to fetch and combine
383	C with mm2 to make a qword at 8(%edx), and in the aligned case
384	C there's a further extra limb of dst to be formed.
385
386
	C The movd load clears the high half of mm0, so zeros enter at the
	C top.  Bit 5 of ecx tells the two cases apart: it is set for
	C 64-shift and clear for 64-(shift+32), assuming 1 <= shift <= 31
	C (TODO confirm the permitted shift range).  The jz uses the flags
	C from andl, popl does not change them.
387	movd	8(%ebx), %mm0
388	psrlq	%mm6, %mm2
389
390	movq	%mm0, %mm1
391	psllq	%mm7, %mm0
392
393	movq	%mm3, (%edx)
394	por	%mm2, %mm0
395
396	psrlq	%mm6, %mm1
397	andl	$32, %ecx
398
399	popl	%ebx
400	jz	L(finish_one_unaligned)
401
402	C dst was aligned, must store one extra limb
403	movd	%mm1, 16(%edx)
404L(finish_one_unaligned):
405
406	movq	%mm0, 8(%edx)
407
408	emms
409
410	ret
411
412
413L(finish_zero):
414
415	C No extra limbs, destination was aligned.
416	C
417	C source        ebx
418	C +---------------+--
419	C |      mm2      |
420	C +---------------+--
421	C
422	C dest                        edx+4
423	C +---------------+---------------+--
424	C |               |      mm3      |
425	C +---------------+---------------+--
426	C
427	C mm6 = shift
428	C mm7 = ecx = 64-shift
429
430
431	C No extra limbs, destination was unaligned.
432	C
433	C source        ebx
434	C +---------------+--
435	C |      mm2      |
436	C +---------------+--
437	C
438	C dest                edx+4
439	C +-------+---------------+--
440	C |       |      mm3      |
441	C +-------+---------------+--
442	C
443	C mm6 = shift+32
444	C mm7 = 64-(shift+32)
445
446
447	C The movd for the unaligned case is clearly the same data as the
448	C movq for the aligned case, it's just a choice between whether one
449	C or two limbs should be written.
450
451
	C The one limb movd is always done.  When bit 5 of ecx is set (the
	C aligned case above) the same location is then rewritten with the
	C full qword.  The jz uses the flags from andl.
452	movq	%mm3, 4(%edx)
453	psrlq	%mm6, %mm2
454
455	movd	%mm2, 12(%edx)
456	andl	$32, %ecx
457
458	popl	%ebx
459	jz	L(finish_zero_unaligned)
460
461	movq	%mm2, 12(%edx)
462L(finish_zero_unaligned):
463
464	emms
465
466	ret
467
468EPILOGUE()
469