/* Provenance: only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/libgcrypt-1.5.0/mpi/pentium4/mmx/ */
/* Intel Pentium-4 mpn_lshift -- left shift.
 *
 * Copyright 2001, 2002 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 * Note: This code is heavily based on the GNU MP Library.
 *	 Actually it's the same code with only minor changes in the
 *	 way the data is stored; this is to support the abstraction
 *	 of an optional secure memory allocation which may be used
 *	 to avoid revealing of sensitive data due to paging etc.
 */


#include "sysdep.h"
#include "asm-syntax.h"


/*******************
 * mpi_limb_t
 * _gcry_mpih_lshift( mpi_ptr_t wp,	(sp + 4)
 *		   mpi_ptr_t up,	(sp + 8)
 *		   mpi_size_t usize,	(sp + 12)
 *		   unsigned cnt)	(sp + 16)
 *
 * P4 Willamette, Northwood: 1.75 cycles/limb
 * P4 Prescott:		     2.0 cycles/limb
 *
 * Syntax/target: GNU as, AT&T operand order, 32-bit x86 with MMX.
 * All four arguments are read from the stack at the offsets shown above.
 *
 * Shifts the usize 32-bit limbs at up left by cnt bits and stores the
 * result at wp, working from the most significant limb downwards.
 *
 * Out:	    %eax = the bits shifted out of the most significant limb.
 * Saved:   %ebx, %edi (pushed on entry, popped before every ret).
 * Clobber: %ecx, %edx, EFLAGS, and on the MMX paths %mm0-%mm3 and
 *	    %mm5-%mm7.	The MMX paths execute emms before returning.
 *
 * Three code paths, selected on usize:
 *   usize == 1	     integer shld/shl only, no MMX
 *   usize == 2..4   .Lsimple, one limb per iteration
 *   usize >= 5	     .Lunroll, four limbs per iteration with alignment
 *		     fix-ups at the top and a 0..3 limb tail
 *
 * NOTE(review): nothing here checks usize or cnt.  The code reads
 * up[usize-1] unconditionally and its shift arithmetic (32-cnt, 64-cnt,
 * the "andl $32" tests) appears to rely on usize >= 1 and 1 <= cnt <= 31
 * -- confirm against the callers.  The "andl $4" alignment tests
 * presumably also expect wp and up to be 4-byte aligned.
 */

.text
	ALIGN (3)
	.globl C_SYMBOL_NAME(_gcry_mpih_lshift)
C_SYMBOL_NAME(_gcry_mpih_lshift:)


	pushl	%ebx
	pushl	%edi

	/* The two pushes moved every argument 8 bytes further from %esp.  */
	movl	20(%esp), %eax		/* eax = usize */
	movl	12(%esp), %edx		/* edx = wp */

	movl	16(%esp), %ebx		/* ebx = up */
	movl	24(%esp), %ecx		/* ecx = cnt */

	cmp	$5, %eax
	jae	.Lunroll		/* unsigned usize >= 5 */

	movl	-4(%ebx,%eax,4), %edi	/* edi = up[usize-1], the high limb */
	decl	%eax

	jnz	.Lsimple		/* taken unless usize was 1 */

	/* usize == 1.	eax is zero after the decl, so shld leaves in it
	   exactly the cnt bits that fall off the top of edi.  */
	shldl	%cl, %edi, %eax

	shll	%cl, %edi

	movl	%edi, (%edx)		/* wp[0] */
	popl	%edi

	popl	%ebx

	ret


/* ------------------------------------------------------------------ */
/* usize 2..4: one limb per iteration.				      */

.Lsimple:
	/* eax = usize-1
	   ebx = up
	   ecx = cnt
	   edx = wp  */

	movd	(%ebx,%eax,4), %mm5	/* high limb, zero-extended to 64 bits */

	movd	%ecx, %mm6		/* mm6 = cnt */
	negl	%ecx

	psllq	%mm6, %mm5
	addl	$32, %ecx		/* ecx = 32 - cnt */

	movd	%ecx, %mm7		/* mm7 = 32 - cnt */
	psrlq	$32, %mm5		/* mm5 = bits shifted out (return value) */


.Lsimple_top:
	/* eax = index i of the limb to produce, counting down to 1
	   ebx = up
	   edx = wp
	   mm5 = return value
	   mm6 = cnt
	   mm7 = 32 - cnt  */

	movq	-4(%ebx,%eax,4), %mm0	/* mm0 = up[i]:up[i-1] */
	decl	%eax			/* sets ZF for the jnz below */

	psrlq	%mm7, %mm0		/* low dword = up[i]<<cnt | up[i-1]>>(32-cnt) */

	/* MMX instructions do not modify EFLAGS, so ZF is still from decl.  */
	movd	%mm0, 4(%edx,%eax,4)	/* wp[i]; eax is already i-1 */
	jnz	.Lsimple_top


	movd	(%ebx), %mm0		/* up[0] */

	movd	%mm5, %eax		/* return value */
	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)		/* wp[0] = up[0] << cnt */

	emms

	ret


/* ------------------------------------------------------------------ */
/* usize >= 5: 64-bit loads and stores, four limbs per iteration.     */

	.align	8, 0x90
.Lunroll:
	/* eax = usize
	   ebx = up
	   ecx = cnt
	   edx = wp  */

	movd	-4(%ebx,%eax,4), %mm5	/* high limb */
	leal	(%ebx,%eax,4), %edi	/* edi = address just past the high limb */

	movd	%ecx, %mm6		/* mm6 = cnt */
	andl	$4, %edi		/* ZF set if bit 2 of that address is clear */

	psllq	%mm6, %mm5
	jz	.Lstart_src_aligned


	/* Bit 2 of the source end address is set: produce the top
	   destination limb on its own from the top two source limbs, so
	   that the qword loads which follow start 4 bytes lower.  */

	movq	-8(%ebx,%eax,4), %mm0	/* up[usize-1]:up[usize-2] */

	psllq	%mm6, %mm0
	decl	%eax			/* one limb fewer remains */

	psrlq	$32, %mm0		/* high dword of the shifted pair */

	movd	%mm0, (%edx,%eax,4)	/* wp[usize-1] */
.Lstart_src_aligned:
	/* eax = number of limbs still to produce (n below) */

	movq	-8(%ebx,%eax,4), %mm1	/* source high qword */
	leal	(%edx,%eax,4), %edi	/* edi = wp + 4*n */

	andl	$4, %edi		/* ZF set if bit 2 of that address is clear */
	psrlq	$32, %mm5		/* mm5 = return value */

	movq	-16(%ebx,%eax,4), %mm3	/* source second highest qword */
	jz	.Lstart_dst_aligned


	/* Bit 2 of the destination end address is set: store the top
	   destination limb separately, then lower edx by one limb and
	   use cnt+32 as the shift from here on.  */

	movq	%mm1, %mm0
	addl	$32, %ecx		/* ecx = cnt + 32 */

	psllq	%mm6, %mm0		/* mm6 is still cnt here */

	movd	%ecx, %mm6		/* mm6 = cnt + 32 */
	psrlq	$32, %mm0

	movd	%mm0, -4(%edx,%eax,4)	/* wp[n-1] */
	subl	$4, %edx
.Lstart_dst_aligned:
	/* ecx, mm6 = shift: cnt, or cnt+32 if edx was lowered above */

	psllq	%mm6, %mm1
	negl	%ecx

	addl	$64, %ecx		/* ecx = 64 - shift */
	movq	%mm3, %mm2		/* unshifted copy, needed for the next qword */

	movd	%ecx, %mm7		/* mm7 = 64 - shift */
	subl	$8, %eax		/* CF set if fewer than 8 limbs remained */

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		/* mm3 = destination qword ready to store */
	jc	.Lfinish		/* CF still from subl; MMX leaves EFLAGS alone */



	.align	8, 0x90
.Lunroll_loop:
	/* eax = limb counter, decremented by 4 per pass; loop ends on borrow
	   ebx = up
	   edx = wp (possibly lowered by 4)
	   mm2 = unshifted source qword just above the ones loaded below
	   mm3 = destination qword waiting to be stored
	   mm6 = shift
	   mm7 = 64 - shift

	   Each pass loads two source qwords and stores two destination
	   qwords.  */

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	/* pending qword from the previous step */
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3
	psllq	%mm6, %mm1

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2

	psrlq	%mm7, %mm3
	subl	$4, %eax

	por	%mm1, %mm3		/* mm3 = destination qword ready to store */
	jnc	.Lunroll_loop



.Lfinish:
	/* eax = -4 .. -1, meaning 0 .. 3 limbs remain (bit 1 of al is set
	   only for -2 and -1, i.e. when at least two limbs remain).  */

	testb	$2, %al

	jz	.Lfinish_no_two

	/* One more source qword: same step as the first half of the loop.  */
	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	/* pending qword */
	por	%mm2, %mm0

	movq	%mm1, %mm2		/* unshifted copy for the tail */
	movq	%mm0, %mm3		/* new pending qword */

	subl	$2, %eax
.Lfinish_no_two:
	/* eax = -4 or -3, meaning 0 or 1 limbs remain
	   mm2 = lowest source qword loaded so far, unshifted
	   mm3 = destination qword waiting to be stored  */

	testb	$1, %al
	movd	%mm5, %eax		/* return value; movd leaves ZF intact */

	popl	%edi
	jz	.Lfinish_zero


	/* One limb remains.  up[0] is placed in the high half of a qword
	   with zero below it and run through the same shift/merge step.
	   The pending qword goes to 12(%edx) and the merged one to
	   4(%edx).  The lowest dword at (%edx) is written only when edx
	   was not lowered, because otherwise (%edx) lies below wp.  */

	movd	(%ebx), %mm0		/* up[0] */
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)		/* pending qword */
	psllq	$32, %mm0		/* up[0] into the high dword */

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	/* ecx = 64 - shift.  Bit 5 is set for shift = cnt and clear for
	   shift = cnt+32 (given 1 <= cnt <= 31).  */
	andl	$32, %ecx
	popl	%ebx

	jz	.Lfinish_one_unaligned

	movd	%mm1, (%edx)		/* wp[0] */
.Lfinish_one_unaligned:

	emms

	ret




.Lfinish_zero:
	/* No limbs remain to be loaded.  The pending qword goes to
	   8(%edx); the lowest source qword shifted left fills the bottom.
	   If edx was not lowered the whole qword is stored at (%edx);
	   in either case its high dword is stored at 4(%edx).	*/

	movq	%mm3, 8(%edx)		/* pending qword */
	andl	$32, %ecx		/* same bit-5 test as in the one-limb tail */

	psllq	%mm6, %mm2
	jz	.Lfinish_zero_unaligned

	movq	%mm2, (%edx)
.Lfinish_zero_unaligned:

	psrlq	$32, %mm2
	popl	%ebx

	movd	%mm5, %eax		/* return value */

	movd	%mm2, 4(%edx)

	emms

	ret
458