dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.
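C
C As a point of reference only, the operation amounts to roughly the
C following C (an illustrative sketch, not the code used anywhere in GMP;
C ref_lshift is a hypothetical name, limbs assumed 32 bits, 1<=shift<=31):
C
C	mp_limb_t
C	ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C	{
C	  mp_limb_t  retval = src[size-1] >> (32 - shift);
C	  mp_size_t  i;
C	  for (i = size-1; i > 0; i--)
C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
C	  dst[0] = src[0] << shift;
C	  return retval;
C	}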

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	movl	(%edx), %edx

	shldl(	%cl, %edx, %eax)	C eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	C src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		C src low limb

	addl	$32, %ecx

	movd	%ecx, %mm7

L(simple_top):
	C eax	loop counter, limbs
	C ebx
	C ecx
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm4	src low limb
	C mm5	src high limb
	C mm6	shift
	C mm7	32-shift

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		C dst low limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

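	C The simple loop above forms each dst limb above dst[0] from an
	C unaligned 8-byte window of src, using a single right shift by
	C 32-shift instead of two shifts and an OR.  As an illustrative
	C sketch only (limbs assumed 32 bits):
	C
	C	for (i = size-1; i > 0; i--)
	C	  {
	C	    uint64_t  w = ((uint64_t) src[i] << 32) | src[i-1];
	C	    dst[i] = (mp_limb_t) (w >> (32 - shift));
	C	  }
	C
	C dst[0] and the return value are then made from the saved low and
	C high src limbs held in mm4 and mm5.
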

C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	(saved)
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src high limb, for return value
	C mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx   C &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		C src high qword

	jz	L(start_src_aligned)


	C src isn't aligned, process high limb (marked xxx) separately to
	C make it so
	C
	C  source    -4(edx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |  xxx          |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest      -4(edi,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

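	C Roughly, in C terms, the fixup below is the following illustrative
	C sketch (the alignment test is on the address of src[size-2], limbs
	C assumed 32 bits):
	C
	C	if (((unsigned long) &src[size-2]) & 4)
	C	  {
	C	    dst[size-1] = (src[size-1] << shift) | (src[size-2] >> (32-shift));
	C	    size--;     /* remaining qword reads of src now 8-byte aligned */
	C	  }
	C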
	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	psrlq	$32, %mm1
	decl	%eax			C size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		C new src high qword
L(start_src_aligned):


	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		C return value

	jz	L(start_dst_aligned)


	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
	C here separately.
	C
	C  source       %edx
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C                0mod8   4mod8
	C
	C  dest         %edi
	C  +-------+-------+-------+--
	C  |  xxx  |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

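	C Roughly why the extra 32 bits works: stepping the qword stores
	C down one limb is compensated by moving the data up one limb inside
	C each qword, since for a 32-bit limb x and 0 < shift < 32
	C
	C	(uint32_t) (((uint64_t) x << (shift+32)) >> 32)  ==  (uint32_t) (x << shift)
	C
	C so the same limb values land at the same addresses, just split
	C across different qword stores.
	C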
	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		C new lshift
L(start_dst_aligned):

	decl	%eax			C size-2, two last limbs handled at end
	movq	%mm1, %mm2		C copy of src high qword
	negl	%ecx

	andl	$-2, %eax		C round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C rshift = 64-lshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end
	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	size (for use at end)
	C ebx	loop counter
	C ecx	rshift
	C edx	src
	C esi	computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating, mm2 first)
	C mm2	/
	C mm6	lshift
	C mm7	rshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 holds the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.
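	C
	C As an illustrative sketch only, each limb-pair step below amounts
	C to the following, with lshift+rshift == 64, "carry" being the
	C previous source qword (kept alternately in mm1 and mm2), and
	C fetch_qword/store_qword hypothetical helpers, addresses stepping
	C downwards:
	C
	C	s = fetch_qword (src);
	C	store_qword (dst, (s >> rshift) | (carry << lshift));
	C	carry = s;
	C	src -= 2;  dst -= 2;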

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)



define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	C wanted left shifted in all cases below

	movd	%mm5, %eax

	movl	SAVE_ESI, %esi
	jz	L(end_even)


L(end_odd):

	C Size odd, destination was aligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest                            edi
	C --+---------------+---------------+-------+
	C   |   written     |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C         dest                            edi
	C         --+---------------+---------------+
	C           |   written     |               |
	C         --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at (%edi), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.

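	C As an illustrative sketch only, with s the extra src limb fetched
	C at disp(4)(%edx) and mm2 already holding carry<<lshift from above:
	C
	C	qword at disp(0)(%edi)   = (((uint64_t) s << 32) >> rshift) | mm2;
	C	if (dst was aligned)
	C	  limb at disp(-4)(%edi) = low 32 bits of (s << lshift);
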
	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C                 source        edx+8
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C dest                            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C               source          edx+8
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C         dest                  edi+4
	C         --+---------------+-------+
	C           |    written    |       |
	C         --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movq for the aligned case overwrites the movd for the
	C unaligned case.

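	C As an illustrative sketch only (mm2 here is already carry<<lshift):
	C
	C	limb at disp(4)(%edi) = high 32 bits of mm2;    /* always */
	C	if (dst was aligned)
	C	  qword at disp(0)(%edi) = mm2;     /* overwrites that limb */
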
	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()
