dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.
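C
C The operation is equivalent to the following portable C loop (an
C illustrative sketch only, assuming the 32-bit limbs this code targets
C and 1 <= shift <= 31):
C
C	mp_limb_t  retval = src[size-1] >> (32-shift);
C	mp_size_t  i;
C	for (i = size-1; i > 0; i--)
C	  dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
C	dst[0] = src[0] << shift;
C	return retval;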

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)
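
	C The single-limb case: with %eax already decremented to zero the
	C shldl produces the return value directly (a sketch, 32-bit limbs):
	C
	C	eax = src[0] >> (32-shift);	/* shldl %cl into a zero eax */
	C	dst[0] = src[0] << shift;	/* shll, movl */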

	movl	(%edx), %edx

	shldl(	%cl, %edx, %eax)	C eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	C src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		C src low limb

	addl	$32, %ecx

	movd	%ecx, %mm7

L(simple_top):
	C eax	loop counter, limbs
	C ebx
	C ecx
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm4	src low limb
	C mm5	src high limb
	C mm6	shift
	C mm7	32-shift
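	C
	C An illustrative C equivalent of one pass below (a sketch only,
	C assuming 32-bit limbs; t is an unsigned 64-bit temporary):
	C
	C	t = ((uint64_t) src[i] << 32) | src[i-1];   /* movq  -4(%edx,%eax,4) */
	C	dst[i] = (uint32_t) (t >> (32-shift));      /* psrlq %mm7, then movd */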

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		C dst low limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	(saved)
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src high limb, for return value
	C mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx   C &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		C src high qword

	jz	L(start_src_aligned)


	C src isn't aligned, process high limb (marked xxx) separately to
	C make it so
	C
	C  source    -4(edx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |  xxx          |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest      -4(edi,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--
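	C
	C The peeled-off high limb is produced as (a sketch, 32-bit limbs):
	C
	C	dst[size-1] = (src[size-1] << shift) | (src[size-2] >> (32-shift));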

	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	psrlq	$32, %mm1
	decl	%eax			C size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		C new src high qword
L(start_src_aligned):


	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		C return value

	jz	L(start_dst_aligned)


	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
	C here separately.
	C
	C  source       %edx
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C                0mod8   4mod8
	C
	C  dest         %edi
	C  +-------+-------+-------+--
	C  |  xxx  |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		C new lshift
L(start_dst_aligned):

	decl	%eax			C size-2, two last limbs handled at end
	movq	%mm1, %mm2		C copy of src high qword
	negl	%ecx

	andl	$-2, %eax		C round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C rshift = 64-lshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end
	jmp	*%esi
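
	C Entering part-way through the unrolled body handles a limb count
	C that isn't a multiple of UNROLL_COUNT.  The setup above amounts to
	C the following (a sketch; n is the even limb count the loop will
	C process):
	C
	C	r   = (-n) & UNROLL_MASK;	/* limbs short of a full pass */
	C	esi = L(entry) + 10*r;		/* 10 code bytes per limb */
	C	ebx = (n-1) >> UNROLL_LOG2;	/* loop runs ebx+1 times */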


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	size (for use at end)
	C ebx	loop counter
	C ecx	rshift
	C edx	src
	C esi	computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating, mm2 first)
	C mm2	/
	C mm6	lshift
	C mm7	rshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 holds the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.
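	C
	C Each store below combines the previous and current 64-bit src
	C reads as (a sketch; lshift is mm6, rshift = 64-lshift is mm7):
	C
	C	*dstp = (prev << lshift) | (cur >> rshift);
	C	prev = cur;
	C
	C with prev held alternately in mm2 and mm1.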

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)



define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	C wanted left shifted in all cases below

	movd	%mm5, %eax

	movl	SAVE_ESI, %esi
	jz	L(end_even)


L(end_odd):

	C Size odd, destination was aligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest                            edi
	C --+---------------+---------------+-------+
	C   |   written     |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C         dest                            edi
	C         --+---------------+---------------+
	C           |   written     |               |
	C         --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at (%edi), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.

	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C                 source        edx+8
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C dest                            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C               source          edx+8
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C         dest                  edi+4
	C         --+---------------+-------+
	C           |    written    |       |
	C         --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movq for the aligned case overwrites the movd for the
	C unaligned case.
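	C
	C In the aligned case the two lowest dst limbs are simply the lowest
	C src qword left shifted (a sketch, with s0 and s1 the two lowest
	C src limbs):
	C
	C	dst[1] = (s1 << shift) | (s0 >> (32-shift));
	C	dst[0] =  s0 << shift;
	C
	C In the unaligned case only dst[0] remains and gets s0 << shift.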

	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()