dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.
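C
C For orientation, the whole operation is roughly the following C loop (an
C illustrative sketch only, assuming 32-bit limbs and 1 <= shift <= 31 as
C mpn_rshift requires; ref_rshift is just a name for this sketch, not a GMP
C entrypoint):
C
C	#include <gmp.h>
C
C	mp_limb_t
C	ref_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C	{
C	  mp_limb_t  retval = src[0] << (32 - shift);   /* bits shifted out */
C	  mp_size_t  i;
C	  for (i = 0; i < size - 1; i++)
C	    dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
C	  dst[size-1] = src[size-1] >> shift;           /* zeros come in here */
C	  return retval;
C	}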

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	movl	(%edx), %edx		C src limb

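	C with %eax zero from the decl above, shrd leaves just the bits
	C shifted out of the src limb, left justified, which is exactly the
	C return value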
	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero

	shrl	%cl, %edx

	movl	%edx, (%edi)		C dst limb
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6	C rshift
	movd	(%edx), %mm5		C src low limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	leal	(%edx,%eax,4), %edx	C &src[size-1]
	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]

	movd	(%edx), %mm4		C src high limb
	negl	%eax


L(simple_top):
	C eax	loop counter, limbs, negative
	C ebx
	C ecx	shift
	C edx	&src[size-1]
	C esi
	C edi	&dst[size-2]
	C ebp
	C
	C mm0	scratch
	C mm4	src high limb
	C mm5	src low limb
	C mm6	shift
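	C
	C Each iteration reads the qword src[i],src[i+1], shifts it right,
	C and stores the low 32 bits of the result as dst[i].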

	movq	(%edx,%eax,4), %mm0
	incl	%eax

	psrlq	%mm6, %mm0

	movd	%mm0, (%edi,%eax,4)
	jnz	L(simple_top)


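	C The return value is the bits shifted out of the low limb, left
	C justified, ie. src[0] << (32-shift), formed here by shifting mm5
	C up by 32 and then right by the shift count.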
	psllq	$32, %mm5
	psrlq	%mm6, %mm4

	psrlq	%mm6, %mm5
	movd	%mm4, 4(%edi)		C dst high limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src low limb
	C mm6	rshift

	testb	$4, %dl
	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX

	psllq	$32, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process low limb separately (marked xxx) and
	C step src and dst by one limb, making src aligned.
	C
	C source                  edx
	C --+-------+-------+-------+
	C           |          xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8
	C
	C         dest            edi
	C         --+-------+-------+
	C           |       |  xxx  |
	C         --+-------+-------+

	movq	(%edx), %mm0		C src low two limbs
	addl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	addl	$4, %edi
	decl	%eax			C size-2 is new size-1

	psrlq	%mm6, %mm0
	movl	%edi, PARAM_DST		C new dst

	movd	%mm0, -4(%edi)
L(start_src_aligned):


	movq	(%edx), %mm1		C src low two limbs
	decl	%eax			C size-2, two last limbs handled at end
	testl	$4, %edi

	psrlq	%mm6, %mm5
	jz	L(start_dst_aligned)


	C dst isn't aligned, add 4 to make it so, and pretend the shift is
	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
	C
	C          source          edx
	C          --+-------+-------+
	C            |      mm1      |
	C          --+-------+-------+
	C                  4mod8   0mod8
	C
	C  dest                    edi
	C  --+-------+-------+-------+
	C                    |  xxx  |
	C  --+-------+-------+-------+
	C          4mod8   0mod8   4mod8
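	C
	C Treating the shift as shift+32 accounts for dst now being offset
	C from src by one limb: every qword stored below must come down by
	C an extra 32 bits.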

	movq	%mm1, %mm0
	psrlq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	movd	%mm1, (%edi)
	movq	%mm0, %mm1
	addl	$4, %edi		C new dst

	movd	%ecx, %mm6
L(start_dst_aligned):


	movq	%mm1, %mm2		C copy of src low two limbs
	negl	%ecx
	andl	$-2, %eax		C round size down to even

	movl	%eax, %ebx
	negl	%eax
	addl	$64, %ecx

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax
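	C %eax is now twice the number of limbs to be skipped on the first,
	C partial, pass through the unrolled loop; at 10 code bytes per limb
	C the entry point below works out to L(entry) + %eax*5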

	movd	%ecx, %mm7		C lshift = 64-rshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
	negl	%eax
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end

	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	negl	%eax

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(64)
L(top):
	C eax	size, for use at end
	C ebx	loop counter
	C ecx	lshift
	C edx	src
	C esi	was computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating)
	C mm2	/
	C mm6	rshift
	C mm7	lshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.
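	C
	C Each qword stored is the previously fetched src qword shifted
	C right, combined with the newly fetched src qword shifted left by
	C lshift = 64-rshift.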

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psrlq	%mm6, %mm1

	movq	%mm0, %mm2
	psllq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	addl	$UNROLL_BYTES, %edx
	addl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	testb	$1, %al
	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
	movl	SAVE_ESI, %esi

	movd	%mm5, %eax		C return value

	movl	SAVE_EBX, %ebx
	jz	L(end_even)


	C Size odd, destination was aligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest                  edi
	C +-------+---------------+---------------+--
	C |       |               |    written    |
	C +-------+---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest          edi
	C +---------------+---------------+--
	C |               |    written    |
	C +---------------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword to store, and in the aligned case there's
	C a further extra limb of dst to be formed.
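	C
	C The two cases are told apart by testing bit 5 of %cl: %ecx is
	C 64-shift (bit 5 set) when dst was aligned, or 32-shift (bit 5
	C clear) when it wasn't.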


	movd	disp0(%edx), %mm0
	movq	%mm0, %mm1

	psllq	%mm7, %mm0
	testb	$32, %cl

	por	%mm2, %mm0
	psrlq	%mm6, %mm1

	movq	%mm0, disp0(%edi)
	jz	L(finish_odd_unaligned)

	movd	%mm1, disp1(%edi)
L(finish_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest          edi
	C +---------------+---------------+--
	C |               |      mm2      |
	C +---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest  edi
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = 64-(shift+32)


	C The movd for the unaligned case is the same data as the movq for
	C the aligned case, it's just a choice between whether one or two
	C limbs should be written.


	testb	$32, %cl
	movd	%mm2, disp0(%edi)

	jz	L(end_even_unaligned)

	movq	%mm2, disp0(%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()
470