1dnl  AMD K7 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 1999-2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K7: 1.21 cycles/limb (at 16 limbs/loop).
35
36
37
38dnl  K7: UNROLL_COUNT cycles/limb
39dnl           4           1.51
40dnl           8           1.26
41dnl          16           1.21
42dnl          32           1.2
43dnl  Maximum possible with the current code is 64.
44
dnl  Chosen from the timing table above: 16 limbs unrolled per loop pass.
deflit(UNROLL_COUNT, 16)
46
47
48C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
49C                       unsigned shift);
50C
51C Shift src,size right by shift many bits and store the result in dst,size.
52C Zeros are shifted in at the left.  The bits shifted out at the right are
53C the return value.
54C
55C This code uses 64-bit MMX operations, which makes it possible to handle
56C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
57C code, on the other hand, suffers from shrd being a vector path decode and
58C running at 3 cycles back-to-back.
59C
60C Full speed depends on source and destination being aligned, and some hairy
61C setups and finish-ups are done to arrange this for the loop.
62
dnl  Size (in limbs) at or above which the unrolled loop is used; below it
dnl  the simple loop runs.  The PIC and non-PIC values happen to coincide
dnl  at present, but the split is kept so they can be tuned separately.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

C Incoming parameter offsets from %esp on entry (FRAME=0), standard
C 32-bit stack calling convention: return address at 0, then dst, src,
C size, shift.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

C Save slots for the callee-preserved registers used (edi always; esi and
C ebx only on the unrolled path), allocated by subtracting SAVE_SIZE.
defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)
78
79	TEXT
80	ALIGN(32)
81
PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	C Fetch parameters and open the register-save area on the stack.
	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	C Size==1: plain integer code, avoiding the MMX setup and emms cost.

	movl	(%edx), %edx		C src limb

	C eax is zero here, so shrdl shifts the low bits of the src limb
	C into eax, leaving exactly the bits shifted out at the right,
	C which is the return value.
	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero

	shrl	%cl, %edx

	movl	%edx, (%edi)		C dst limb
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6	C rshift
	movd	(%edx), %mm5		C src low limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	leal	(%edx,%eax,4), %edx	C &src[size-1]
	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]

	movd	(%edx), %mm4		C src high limb
	negl	%eax


L(simple_top):
	C Simple loop for small sizes: each pass reads a 64-bit qword
	C src[i],src[i+1], shifts it right, and stores the low 32 bits as
	C dst[i].  eax counts up from -(size-1) towards zero, indexing both
	C operands relative to their high ends.
	C
	C eax	loop counter, limbs, negative
	C ebx
	C ecx	shift
	C esi
	C edx	&src[size-1]
	C edi	&dst[size-2]
	C ebp
	C
	C mm0	scratch
	C mm4	src high limb
	C mm5	src low limb
	C mm6	shift

	movq	(%edx,%eax,4), %mm0
	incl	%eax

	psrlq	%mm6, %mm0

	movd	%mm0, (%edi,%eax,4)
	jnz	L(simple_top)


	C Finish up: mm4 shifted right gives the high dst limb (zeros come
	C in at the left); mm5 moved to the high dword and shifted back
	C leaves the bits shifted out of the low limb, i.e. the return value.
	psllq	$32, %mm5
	psrlq	%mm6, %mm4

	psrlq	%mm6, %mm5
	movd	%mm4, 4(%edi)		C dst high limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src low limb
	C mm6	rshift

	C Test src alignment: bit 2 of the address distinguishes 4mod8
	C from 0mod8.
	testb	$4, %dl
	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX

	psllq	$32, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process low limb separately (marked xxx) and
	C step src and dst by one limb, making src aligned.
	C
	C source                  edx
	C --+-------+-------+-------+
	C           |          xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8
	C
	C         dest            edi
	C         --+-------+-------+
	C           |       |  xxx  |
	C         --+-------+-------+

	movq	(%edx), %mm0		C src low two limbs
	addl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	addl	$4, %edi
	decl	%eax			C size-2 is new size-1

	psrlq	%mm6, %mm0
	movl	%edi, PARAM_DST		C new dst

	movd	%mm0, -4(%edi)
L(start_src_aligned):


	movq	(%edx), %mm1		C src low two limbs
	decl	%eax			C size-2, two last limbs handled at end
	testl	$4, %edi

	psrlq	%mm6, %mm5
	jz	L(start_dst_aligned)


	C dst isn't aligned, add 4 to make it so, and pretend the shift is
	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
	C
	C          source          edx
	C          --+-------+-------+
	C            |      mm1      |
	C          --+-------+-------+
	C                  4mod8   0mod8
	C
	C  dest                    edi
	C  --+-------+-------+-------+
	C                    |  xxx  |
	C  --+-------+-------+-------+
	C          4mod8   0mod8   4mod8

	movq	%mm1, %mm0
	psrlq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	movd	%mm1, (%edi)
	movq	%mm0, %mm1
	addl	$4, %edi		C new dst

	movd	%ecx, %mm6
L(start_dst_aligned):


	C Prepare the unrolled loop: round the limb count down to even
	C (qwords), compute the loop counter and the computed-jump target
	C for the remainder, and form the complementary left shift in mm7.

	movq	%mm1, %mm2		C copy of src low two limbs
	negl	%ecx
	andl	$-2, %eax		C round size down to even

	movl	%eax, %ebx
	negl	%eax
	addl	$64, %ecx

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax			C 2*eax; entry uses 10 bytes per limb, 10*eax below

	movd	%ecx, %mm7		C lshift = 64-rshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
	negl	%eax
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end

	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C PIC form of the computed jump: the return address on the stack
	C (the address of L(here)) gives a runtime base to add the
	C entry-point offset to.
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	negl	%eax

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(64)
L(top):
	C eax	size, for use at end
	C ebx	loop counter
	C ecx	lshift
	C edx	src
	C esi	was computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating)
	C mm2	/
	C mm6	rshift
	C mm7	lshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.
	C
	C Zdisp forces a zero displacement byte to be encoded where the
	C assembler would otherwise omit it, keeping every limb at exactly
	C 10 code bytes so the computed jump lands on the right chunk.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psrlq	%mm6, %mm1

	movq	%mm0, %mm2
	psllq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	addl	$UNROLL_BYTES, %edx
	addl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	C eax holds the size as saved in PARAM_SIZE (adjusted if src was
	C unaligned); its low bit selects the odd or even finish-up.
	testb	$1, %al
	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
	movl	SAVE_ESI, %esi

	movd	%mm5, %eax		C return value

	movl	SAVE_EBX, %ebx
	jz	L(end_even)


	C Size odd, destination was aligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest                  edi
	C +-------+---------------+---------------+--
	C |       |               |    written    |
	C +-------+---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest          edi
	C +---------------+---------------+--
	C |               |    written    |
	C +---------------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword to store, and in the aligned case there's
	C a further extra limb of dst to be formed.


	movd	disp0(%edx), %mm0
	movq	%mm0, %mm1

	psllq	%mm7, %mm0
	testb	$32, %cl		C bit 32 of lshift distinguishes the two cases

	por	%mm2, %mm0
	psrlq	%mm6, %mm1

	movq	%mm0, disp0(%edi)
	jz	L(finish_odd_unaligned)

	movd	%mm1, disp1(%edi)
L(finish_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest          edi
	C +---------------+---------------+--
	C |               |      mm2      |
	C +---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest  edi
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = 64-(shift+32)


	C The movd for the unaligned case is the same data as the movq for
	C the aligned case, it's just a choice between whether one or two
	C limbs should be written.


	testb	$32, %cl
	movd	%mm2, disp0(%edi)

	jz	L(end_even_unaligned)

	movq	%mm2, disp0(%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()

481