1/* Intel Pentium-4 mpn_rshift -- right shift.
2 *
3 * Copyright 2001, 2002 Free Software Foundation, Inc.
4 *
5 * This file is part of Libgcrypt.
6 *
7 * Libgcrypt is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 2.1 of
10 * the License, or (at your option) any later version.
11 *
12 * Libgcrypt is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
20 *
21 * Note: This code is heavily based on the GNU MP Library.
22 *	 Actually it's the same code with only minor changes in the
23 *	 way the data is stored; this is to support the abstraction
24 *	 of an optional secure memory allocation which may be used
25 *	 to avoid revealing of sensitive data due to paging etc.
26 */
27
28
29#include "sysdep.h"
30#include "asm-syntax.h"
31
32
33/*******************
34 * mpi_limb_t
35 * _gcry_mpih_rshift( mpi_ptr_t wp,	(sp + 4)
36 *		   mpi_ptr_t up,	(sp + 8)
37 *		   mpi_size_t usize,	(sp + 12)
38 *		   unsigned cnt)	(sp + 16)
39 *
40 * P4 Willamette, Northwood: 1.75 cycles/limb
41 * P4 Prescott:		     2.0 cycles/limb
42 */
43
/*
 * _gcry_mpih_rshift (wp, up, usize, cnt)
 *
 * Shift the limb vector up[0..usize-1] right by cnt bits (1 <= cnt < 32,
 * per the mpn convention inherited from GNU MP) and store it at wp.
 * Returns the bits shifted out of the low end, i.e. up[0] << (32-cnt).
 *
 * Register roles after the prologue (two pushes, so the C arguments sit
 * at 12(%esp)..24(%esp)):
 *   %eax  usize; reused as loop counter and finally as the return value
 *   %edx  wp   destination limb pointer
 *   %ebx  up   source limb pointer
 *   %ecx  cnt  shift count
 *   %mm6  cnt  as an MMX shift operand (bumped by +32 once in the
 *              unrolled path to absorb a destination-alignment limb)
 *   %mm7  64-cnt, the complementary left-shift count (unrolled path)
 *   %mm5  up[0] << 32; after psrlq by cnt its low dword is the
 *              return value up[0] << (32-cnt)
 */
.text
	ALIGN (3)
	.globl C_SYMBOL_NAME(_gcry_mpih_rshift)
C_SYMBOL_NAME(_gcry_mpih_rshift:)
	pushl	%ebx
	pushl	%edi


	movl	20(%esp), %eax		/* eax = usize */
	movl	12(%esp), %edx		/* edx = wp */

	movl	16(%esp), %ebx		/* ebx = up */
	movl	24(%esp), %ecx		/* ecx = cnt */

	cmp	$5, %eax
	jae	.Lunroll		/* usize >= 5: unrolled MMX loop */

	decl	%eax
	movl	(%ebx), %edi		/* edi = up[0] */

	jnz	.Lsimple		/* usize 2..4 */

	/* usize == 1: plain integer code, no MMX.  eax is 0 here, so
	 * shrdl leaves eax = up[0] << (32-cnt) -- the return value. */
	shrdl	%cl, %edi, %eax

	shrl	%cl, %edi

	movl	%edi, (%edx)		/* wp[0] = up[0] >> cnt */
	popl	%edi

	popl	%ebx

	ret




	.align	8, 0x90
.Lsimple:
	/* usize 2..4: one 64-bit (two-limb) load per output limb,
	 * shifted down with psrlq; the low dword of each result is the
	 * finished limb.  On entry eax = usize-1. */

	movd	(%ebx), %mm5		/* mm5 = up[0] */
	leal	(%ebx,%eax,4), %ebx	/* ebx = &up[usize-1] */

	movd	%ecx, %mm6		/* mm6 = cnt */
	leal	-4(%edx,%eax,4), %edx	/* edx = &wp[usize-2]; -4 bias pairs
					 * with the incl before each store */

	psllq	$32, %mm5		/* mm5 = up[0] << 32 (retval source) */
	negl	%eax			/* eax = -(usize-1), counts up to 0 */


.Lsimple_top:
	/* wp[i] = low32((up[i+1]:up[i]) >> cnt) for i = 0 .. usize-2 */

	movq	(%ebx,%eax,4), %mm0	/* mm0 = up[i+1]:up[i] */
	incl	%eax

	psrlq	%mm6, %mm0

	movd	%mm0, (%edx,%eax,4)
	jnz	.Lsimple_top

	/* top limb and return value */
	movd	(%ebx), %mm0		/* mm0 = up[usize-1] */
	psrlq	%mm6, %mm5		/* mm5 low dword = up[0] << (32-cnt) */

	psrlq	%mm6, %mm0
	popl	%edi

	movd	%mm5, %eax		/* return the shifted-out bits */
	popl	%ebx

	movd	%mm0, 4(%edx)		/* wp[usize-1] = up[usize-1] >> cnt */

	emms

	ret




	.align	8, 0x90
.Lunroll:
	/* usize >= 5: two limbs per iteration via aligned movq.  First
	 * peel limbs until the source, then the destination, is 8-byte
	 * aligned. */

	movd	(%ebx), %mm5		/* mm5 = up[0] */
	movl	$4, %edi

	movd	%ecx, %mm6		/* mm6 = cnt */
	testl	%edi, %ebx		/* src 8-byte aligned? */

	psllq	$32, %mm5		/* mm5 = up[0] << 32 (retval source) */
	jz	.Lstart_src_aligned


	/* src misaligned: produce one limb, advance both pointers */
	movq	(%ebx), %mm0		/* mm0 = up[1]:up[0] (unaligned movq) */

	psrlq	%mm6, %mm0
	addl	$4, %ebx

	decl	%eax

	movd	%mm0, (%edx)		/* wp[0] */
	addl	$4, %edx
.Lstart_src_aligned:


	movq	(%ebx), %mm1		/* mm1 = first aligned source qword */
	testl	%edi, %edx		/* dst 8-byte aligned? */

	psrlq	%mm6, %mm5		/* mm5 low dword = retval */
	jz	.Lstart_dst_aligned


	/* dst misaligned: emit one limb and compensate by shifting 32
	 * bits further from here on (cnt += 32, reloaded into mm6) */
	movq	%mm1, %mm0
	addl	$32, %ecx		/* cnt += 32 */

	psrlq	%mm6, %mm0

	movd	%ecx, %mm6

	movd	%mm0, (%edx)
	addl	$4, %edx
.Lstart_dst_aligned:


	movq	8(%ebx), %mm3		/* mm3 = next source qword */
	negl	%ecx

	movq	%mm3, %mm2		/* mm2 = carried-over qword for loop */
	addl	$64, %ecx		/* ecx = 64 - cnt */

	movd	%ecx, %mm7		/* mm7 = 64 - cnt */
	psrlq	%mm6, %mm1

	leal	-12(%ebx,%eax,4), %ebx	/* bias pointers so the loop can run */
	leal	-20(%edx,%eax,4), %edx	/* a negative index up toward zero  */

	psllq	%mm7, %mm3
	subl	$7, %eax		/* loop-count bias: 2 limbs/iter with */

	por	%mm1, %mm3		/* tail cases handled after .Lfinish  */
	negl	%eax

	jns	.Lfinish		/* no full iterations needed */




	.align	8, 0x90
.Lunroll_loop:
	/* Two limbs per iteration.  Each output qword is
	 * (this_qword >> cnt) | (next_qword << (64-cnt)); mm2/mm3 carry
	 * the previous/combined qwords across iterations. */

	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)	/* store qword combined last time */
	por	%mm2, %mm0

	movq	8(%ebx,%eax,4), %mm3
	psrlq	%mm6, %mm1

	movq	%mm0, (%edx,%eax,4)
	movq	%mm3, %mm2

	psllq	%mm7, %mm3
	addl	$4, %eax

	por	%mm1, %mm3
	js	.Lunroll_loop


.Lfinish:
	/* Tail: the low bits of eax (from the -7 bias above) select how
	 * many leftover limbs remain. */

	testb	$2, %al

	jnz	.Lfinish_no_two

	/* one more pair of limbs: same body as half an iteration */
	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	addl	$2, %eax
.Lfinish_no_two:


	testb	$1, %al
	popl	%edi

	movd	%mm5, %eax		/* return value = up[0] << (32-cnt) */
	jnz	.Lfinish_zero


	/* one whole source limb remains beyond the carried qwords */
	movd	8(%ebx), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, (%edx)
	por	%mm2, %mm0

	psrlq	%mm6, %mm1
	andl	$32, %ecx		/* bit 5 of 64-cnt distinguishes whether
					 * the dst-alignment fixup added 32 */

	popl	%ebx
	jz	.Lfinish_one_unaligned


	movd	%mm1, 16(%edx)
.Lfinish_one_unaligned:

	movq	%mm0, 8(%edx)

	emms

	ret


.Lfinish_zero:
	/* no whole source limb left: flush the carried bits */

	movq	%mm3, 4(%edx)
	psrlq	%mm6, %mm2

	movd	%mm2, 12(%edx)
	andl	$32, %ecx		/* bit 5 of 64-cnt distinguishes whether
					 * the dst-alignment fixup added 32 */

	popl	%ebx
	jz	.Lfinish_zero_unaligned

	movq	%mm2, 12(%edx)	/* overwrite with full qword in this case */
.Lfinish_zero_unaligned:

	emms

	ret
454