• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/libgcrypt-1.5.0/mpi/pentium4/sse2/
1/* Intel Pentium-4 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 * the result to a second limb vector.
3 *
4 * Copyright 2001, 2002, 2004, 2005 Free Software Foundation, Inc.
5 *
6 * This file is part of Libgcrypt.
7 *
8 * Libgcrypt is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as
10 * published by the Free Software Foundation; either version 2.1 of
11 * the License, or (at your option) any later version.
12 *
13 * Libgcrypt is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
21 *
22 * Note: This code is heavily based on the GNU MP Library.
23 *	 Actually it's the same code with only minor changes in the
24 *	 way the data is stored; this is to support the abstraction
25 *	 of an optional secure memory allocation which may be used
26 *	 to avoid revealing of sensitive data due to paging etc.
27 */
28
29
30#include "sysdep.h"
31#include "asm-syntax.h"
32
33
34/*******************
35 * mpi_limb_t
36 * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr,      (sp + 4)
37 *		     mpi_ptr_t s1_ptr,	     (sp + 8)
38 *		     mpi_size_t s1_size,     (sp + 12)
39 *		     mpi_limb_t s2_limb)     (sp + 16)
40 *
41 * P3 model 9  (Banias)          ?.?
42 * P3 model 13 (Dothan)          5.8
43 * P4 model 0  (Willamette)      5.5
44 * P4 model 1  (?)               5.5
45 * P4 model 2  (Northwood)       5.5
46 * P4 model 3  (Prescott)        6.0
47 * P4 model 4  (Nocona)
48 *
49 * Only the carry limb propagation is on the dependent chain, but some other
50 * Pentium4 pipeline magic brings down performance to 6 cycles/l from the
51 * ideal 4 cycles/l.
52 */
53
54
/*
 * _gcry_mpih_addmul_1: for each of the s1_size 32-bit limbs, compute
 *	res_ptr[i] = low32 (res_ptr[i] + s1_ptr[i] * s2_limb + carry)
 * and keep the high 32 bits as the carry into the next limb.  The last
 * carry is returned in %eax.
 *
 * In:     4(%esp) res_ptr, 8(%esp) s1_ptr, 12(%esp) s1_size,
 *	   16(%esp) s2_limb
 * Out:    %eax = final carry limb
 * Writes: %eax, %ecx, %edx, %mm0-%mm4, %mm7, flags; %esp is not adjusted.
 *
 * Every 64-bit intermediate fits without overflow:
 *	(2^32-1)^2 + (2^32-1) + (2^32-1) = 2^64 - 1.
 *
 * NOTE(review): s1_ptr[0] is loaded before s1_size is examined, and a
 * size of 0 would fall into .Lloop with the counter at -1, so s1_size
 * is presumably required to be >= 1 -- confirm against callers.
 */
55	TEXT
56	ALIGN (4)
57	GLOBL	C_SYMBOL_NAME(_gcry_mpih_addmul_1)
58C_SYMBOL_NAME(_gcry_mpih_addmul_1:)
59
	/* Carry-in is zero for this entry point. */
60	pxor	%mm4, %mm4
	/* NOTE(review): nothing in the code shown here branches to
	   .Lstart_1c; it looks like a leftover entry for a variant that
	   supplies its own carry in %mm4 -- confirm before removing. */
61.Lstart_1c:
	/* Fetch the arguments: eax = s1_ptr, ecx = s1_size,
	   edx = res_ptr, mm7 = s2_limb. */
62	movl	8(%esp), %eax
63	movl	12(%esp), %ecx
64	movl	4(%esp), %edx
65	movd	16(%esp), %mm7
66
67/*
68	C eax	src, incrementing ; 5B
69	C ecx	loop counter, decrementing
70	C edx	dst, incrementing
71	C
72	C mm4	carry, low 32-bits
73	C mm7	multiplier
74*/
75
	/* mm2 = src[0] * multiplier (unsigned 32x32 -> 64-bit product). */
76	movd		(%eax), %mm2
77	pmuludq		%mm7, %mm2
78
	/* ecx = size / 2 (number of limb pairs); CF = size & 1.
	   An even size goes straight to the pairwise code. */
79	shrl	$1, %ecx
80	jnc	.Leven
81
	/* Odd size: retire one limb on its own first.
	   mm4 = dst[0] + product + carry; store the low 32 bits back to
	   dst[0] and keep the high 32 bits as the new carry. */
82	leal		4(%eax), %eax
83	movd		(%edx), %mm1
84	paddq		%mm2, %mm1
85	paddq		%mm1, %mm4
86	movd		%mm4, (%edx)
87	psrlq		$32, %mm4
88
	/* No pairs left (size was 1): return the carry. */
89	testl	%ecx, %ecx
90	jz	.Lrtn
91	leal	4(%edx), %edx
92
	/* Recompute mm2 for the first limb of the first pair; eax was
	   already advanced above. */
93	movd		(%eax), %mm2
94	pmuludq		%mm7, %mm2
	/* Here: mm2 = src[0] * multiplier, ecx = pairs remaining. */
95.Leven:
	/* Finish priming the pipeline: mm0 = src[1] * multiplier,
	   mm1 = dst[0]. */
96	movd		4(%eax), %mm0
97	movd		(%edx), %mm1
98	pmuludq		%mm7, %mm0
99
	/* Exactly one pair left: skip the loop, which would preload a
	   further pair. */
100	subl	$1, %ecx
101	jz	.Lend
	/* Each iteration retires the pair whose products are already in
	   mm2/mm0 (with dst[0] in mm1) and, interleaved with that work,
	   loads and multiplies the next pair. */
102.Lloop:
	/* First limb of the pair: dst[0] += mm2 + carry.  Meanwhile
	   start the product for src[2] and fetch dst[1]. */
103	paddq		%mm2, %mm1
104	movd		8(%eax), %mm2
105	paddq		%mm1, %mm4
106	movd		4(%edx), %mm3
107	pmuludq		%mm7, %mm2
108	movd		%mm4, (%edx)
109	psrlq		$32, %mm4
110
	/* Second limb of the pair: dst[1] += mm0 + carry.  Meanwhile
	   start the product for src[3] and fetch dst[2]. */
111	paddq		%mm0, %mm3
112	movd		12(%eax), %mm0
113	paddq		%mm3, %mm4
114	movd		8(%edx), %mm1
115	pmuludq		%mm7, %mm0
116	movd		%mm4, 4(%edx)
117	psrlq		$32, %mm4
118
	/* Step both pointers past the retired pair (two 4-byte limbs)
	   and count it off. */
119	leal	8(%eax), %eax
120	leal	8(%edx), %edx
121	subl	$1, %ecx
122	jnz	.Lloop
	/* Final pair: products are already in mm2/mm0 and dst[0] in mm1;
	   nothing further is read from src. */
123.Lend:
124	paddq		%mm2, %mm1
125	paddq		%mm1, %mm4
126	movd		4(%edx), %mm3
127	movd		%mm4, (%edx)
128	psrlq		$32, %mm4
129	paddq		%mm0, %mm3
130	paddq		%mm3, %mm4
131	movd		%mm4, 4(%edx)
132	psrlq		$32, %mm4
133.Lrtn:
	/* Return the remaining carry limb; emms empties the MMX state
	   before returning to the caller. */
134	movd	%mm4, %eax
135	emms
136	ret
137