1dnl  x86 __gmpn_addmul_1/__gmpn_submul_1 (for 386 and 486) -- Multiply a limb
2dnl  vector by a limb and add it to, or subtract it from, a second limb vector.
3
4dnl  Copyright 1992, 1994, 1997, 1999-2002, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C			    cycles/limb
35C P5				14.75
36C P6 model 0-8,10-12		 7.5
37C P6 model 9  (Banias)		 6.7
38C P6 model 13 (Dothan)		 6.75
39C P4 model 0  (Willamette)	24.0
40C P4 model 1  (?)		24.0
41C P4 model 2  (Northwood)	24.0
42C P4 model 3  (Prescott)
43C P4 model 4  (Nocona)
44C Intel Atom
45C AMD K6			12.5
46C AMD K7			 5.25
47C AMD K8
48C AMD K10
49
50
dnl  Build-time selection of the operation.  M4_inst becomes the instruction
dnl  applied to each destination limb (addl or subl) and M4_function_1 the
dnl  name of the function defined below.  OPERATION_addmul_1 is tested first,
dnl  then OPERATION_submul_1; if neither is defined m4_error is invoked.
51ifdef(`OPERATION_addmul_1',`
52      define(M4_inst,        addl)
53      define(M4_function_1,  mpn_addmul_1)
54
55',`ifdef(`OPERATION_submul_1',`
56      define(M4_inst,        subl)
57      define(M4_function_1,  mpn_submul_1)
58
59',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
60')')')
61
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
dnl  it declares which functions this source can provide -- confirm.
62MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
63
64
65C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
66C                          mp_limb_t mult);
C
C The PARAM_* macros name the stack slots of the four arguments, addressed
C relative to %esp.  FRAME is the number of bytes pushed since function
C entry (0 at entry, 16 after the register saves), so each macro keeps
C referring to the same argument; dst is at 4(%esp) on entry and the other
C arguments follow at 4-byte intervals.
67
68define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
69define(PARAM_SIZE,       `FRAME+12(%esp)')
70define(PARAM_SRC,        `FRAME+8(%esp)')
71define(PARAM_DST,        `FRAME+4(%esp)')
72
C M4_function_1 -- multiply each limb of the size-limb vector at src by
C mult and combine the product with the vector at dst using M4_inst (addl
C or subl), carrying the high product limb into the next position.
C
C Arguments are read from the stack through the PARAM_* macros.
C Returns in %eax the carry limb left over after the last limb.
C %edi, %esi, %ebx and %ebp are saved on entry and restored before ret.
C
C Register use:
C   %esi        source pointer
C   %edi        destination pointer
C   %ecx        loop counter
C   %ebx, %ebp  running carry limb (the two alternate in the unrolled loop)
C   %edx:%eax   64-bit product delivered by mull
C
C The first size mod 4 limbs are processed one at a time in L(oop0), the
C remaining size/4 groups of four limbs in the unrolled L(oop).  With size
C zero neither loop runs and zero is returned.
C
C The carry limb is cleared with movl $0 rather than xorl because movl
C leaves CF untouched, so the carry of the preceding add is still available
C to the following adcl.
73	TEXT
74	ALIGN(8)
75
76PROLOGUE(M4_function_1)
77deflit(`FRAME',0)
78
79	pushl	%edi		C save the four registers used below
80	pushl	%esi
81	pushl	%ebx
82	pushl	%ebp		C 16 bytes pushed, matched by FRAME below
83deflit(`FRAME',16)
84
85	movl	PARAM_DST,%edi	C edi = dst
86	movl	PARAM_SRC,%esi	C esi = src
87	movl	PARAM_SIZE,%ecx	C ecx = size
88
89	xorl	%ebx,%ebx	C carry limb starts at zero
90	andl	$3,%ecx		C ecx = size mod 4, limbs for the simple loop
91	jz	L(end0)		C none, go straight to the unrolled loop
92
C Simple loop, one limb per iteration.
93L(oop0):
94	movl	(%esi),%eax	C eax = next source limb
95	mull	PARAM_MULTIPLIER	C edx:eax = source limb * mult
96	leal	4(%esi),%esi	C advance src by one limb
97	addl	%ebx,%eax	C low product + incoming carry limb
98	movl	$0,%ebx		C clear ebx, CF from the addl is kept
99	adcl	%ebx,%edx	C high product += CF
100	M4_inst	%eax,(%edi)	C add to or subtract from the dst limb
101	adcl	%edx,%ebx	C new carry limb = high product + CF of dst update
102
103	leal	4(%edi),%edi	C advance dst by one limb
104	decl	%ecx
105	jnz	L(oop0)
106
107L(end0):
108	movl	PARAM_SIZE,%ecx	C reload size, ecx was consumed above
109	shrl	$2,%ecx		C ecx = size / 4, iterations of the unrolled loop
110	jz	L(end)		C no group of four limbs left
111
C Unrolled loop, four limbs per iteration.  Each mull is issued before the
C previous result is applied to dst, and the carry limb alternates between
C ebx and ebp.
112	ALIGN(8)
113L(oop):	movl	(%esi),%eax
114	mull	PARAM_MULTIPLIER	C edx:eax = src[0] * mult
115	addl	%eax,%ebx	C ebx = carry limb + low product
116	movl	$0,%ebp		C clear ebp, CF is kept
117	adcl	%edx,%ebp	C ebp = high product + CF
118
119	movl	4(%esi),%eax
120	mull	PARAM_MULTIPLIER	C edx:eax = src[1] * mult
121	M4_inst	%ebx,(%edi)	C apply result limb 0 to dst[0]
122	adcl	%eax,%ebp	C ebp = carry limb + new low product + CF of dst update
123	movl	$0,%ebx		C clear ebx, CF is kept
124	adcl	%edx,%ebx	C ebx = high product + CF
125
126	movl	8(%esi),%eax
127	mull	PARAM_MULTIPLIER	C edx:eax = src[2] * mult
128	M4_inst	%ebp,4(%edi)	C apply result limb 1 to dst[1]
129	adcl	%eax,%ebx	C ebx = carry limb + new low product + CF of dst update
130	movl	$0,%ebp		C clear ebp, CF is kept
131	adcl	%edx,%ebp	C ebp = high product + CF
132
133	movl	12(%esi),%eax
134	mull	PARAM_MULTIPLIER	C edx:eax = src[3] * mult
135	M4_inst	%ebx,8(%edi)	C apply result limb 2 to dst[2]
136	adcl	%eax,%ebp	C ebp = carry limb + new low product + CF of dst update
137	movl	$0,%ebx		C clear ebx, CF is kept
138	adcl	%edx,%ebx	C ebx = high product + CF
139
140	M4_inst	%ebp,12(%edi)	C apply result limb 3 to dst[3]
141	adcl	$0,%ebx		C fold CF of the dst update into the carry limb
142
143	leal	16(%esi),%esi	C advance src by four limbs
144	leal	16(%edi),%edi	C advance dst by four limbs
145	decl	%ecx
146	jnz	L(oop)
147
148L(end):	movl	%ebx,%eax	C return value = final carry limb
149
150	popl	%ebp		C restore in reverse order of the pushes
151	popl	%ebx
152	popl	%esi
153	popl	%edi
154	ret
155
156EPILOGUE()
157