dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                           cycles/limb
C P5:
C P6 model 0-8,10-12:
C P6 model 9  (Banias):
C P6 model 13 (Dothan):
C P4 model 0  (Willamette):
C P4 model 1  (?):
C P4 model 2  (Northwood):
C P4 model 3  (Prescott):
C P4 model 4  (Nocona):
C K6:
C K7:                            3.75
C K8:

C TODO
C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
C    but lose by 2x for n == 1.

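C For reference, the operation can be sketched in C roughly as below.  This
C is a sketch only, assuming 32-bit limbs as on x86 and a 64-bit type for
C the double-limb product; GMP itself uses umul_ppmm from longlong.h.
C mpn_submul_1 is the same with the add into rp[i] replaced by a subtract,
C returning the accumulated borrow instead of the carry.
C
C	mp_limb_t
C	mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C	{
C	  mp_limb_t cy = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      unsigned long long p = (unsigned long long) up[i] * vl;
C	      mp_limb_t lo = (mp_limb_t) p + cy;
C	      cy = (mp_limb_t) (p >> 32) + (lo < (mp_limb_t) p);
C	      rp[i] += lo;
C	      cy += (rp[i] < lo);	/* carry out of the add into rp[i] */
C	    }
C	  return cy;			/* carry limb, returned in %eax here */
C	}
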
ifdef(`OPERATION_addmul_1',`
	define(`ADDSUB',	`add')
	define(`func',	`mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
	define(`ADDSUB',	`sub')
	define(`func',	`mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

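C A single body serves both entry points: at build time m4 expands ADDSUB
C to `add' or `sub' and func to the matching mpn name, depending on which
C OPERATION_* symbol is defined.
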
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	add	$-16, %esp
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

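C With 16 bytes reserved and the return address below them, the incoming
C arguments sit at 20(%esp) (rp), 24(%esp) (up), 28(%esp) (n) and
C 32(%esp) (vl).  They are loaded as %edi = rp, %esi = up, %ecx = vl;
C the n slot at 28(%esp) is then reused to hold n/4 as the loop counter,
C while n mod 4 in %ebx selects the feed-in path.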
	mov	20(%esp), %edi
	mov	24(%esp), %esi
	mov	28(%esp), %eax
	mov	32(%esp), %ecx
	mov	%eax, %ebx
	shr	$2, %eax
	mov	%eax, 28(%esp)
	mov	(%esi), %eax
	and	$3, %ebx
	jz	L(b0)
	cmp	$2, %ebx
	jz	L(b2)
	jg	L(b3)

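C Feed-in: dispatch on n mod 4.  Each of L(b0)..L(b3) performs the first
C multiply and biases %esi/%edi so the unrolled loop below can use fixed
C displacements; L(b1) and L(b2) branch straight to the wind-down when no
C full group of four limbs remains, and L(b3) bumps the group count since
C it enters the loop just before the counter decrement.
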
L(b1):	lea	-4(%esi), %esi
	lea	-4(%edi), %edi
	mul	%ecx
	mov	%eax, %ebx
	mov	%edx, %ebp
	cmpl	$0, 28(%esp)
	jz	L(cj1)
	mov	8(%esi), %eax
	jmp	L(1)

L(b2):	mul	%ecx
	mov	%eax, %ebp
	mov	4(%esi), %eax
	mov	%edx, %ebx
	cmpl	$0, 28(%esp)
	jne	L(2)
	jmp	L(cj2)

L(b3):	lea	-12(%esi), %esi
	lea	-12(%edi), %edi
	mul	%ecx
	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	16(%esi), %eax
	incl	28(%esp)
	jmp	L(3)

L(b0):	lea	-8(%esi), %esi
	lea	-8(%edi), %edi
	mul	%ecx
	mov	%eax, %ebp
	mov	12(%esi), %eax
	mov	%edx, %ebx
	jmp	L(0)

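C Main loop, 4-way unrolled: each pass multiplies four limbs of up[] by
C %ecx and ADDSUBs the products into four limbs of rp[], with %ebx and
C %ebp alternating as the carry limb between steps; 28(%esp) counts the
C remaining groups of four limbs.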
	ALIGN(16)
L(top):	lea	16(%edi), %edi
L(2):	mul	%ecx
	ADDSUB	%ebp, 0(%edi)
	mov	$0, %ebp
	adc	%eax, %ebx
	mov	8(%esi), %eax
	adc	%edx, %ebp
L(1):	mul	%ecx
	ADDSUB	%ebx, 4(%edi)
	mov	$0, %ebx
	adc	%eax, %ebp
	mov	12(%esi), %eax
	adc	%edx, %ebx
L(0):	mul	%ecx
	ADDSUB	%ebp, 8(%edi)
	mov	$0, %ebp
	adc	%eax, %ebx
	adc	%edx, %ebp
	mov	16(%esi), %eax
L(3):	mul	%ecx
	ADDSUB	%ebx, 12(%edi)
	adc	%eax, %ebp
	mov	20(%esi), %eax
	lea	16(%esi), %esi
	mov	$0, %ebx
	adc	%edx, %ebx
	decl	28(%esp)
	jnz	L(top)

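C Wind-down: flush the last pending products into rp[], restore the
C callee-saved registers, and return the final carry (borrow for
C mpn_submul_1) in %eax.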
L(end):	lea	16(%edi), %edi
L(cj2):	mul	%ecx
	ADDSUB	%ebp, (%edi)
	adc	%eax, %ebx
	adc	$0, %edx
L(cj1):	ADDSUB	%ebx, 4(%edi)
	adc	$0, %edx
	mov	%edx, %eax
	mov	(%esp), %ebp
	mov	4(%esp), %ebx
	mov	8(%esp), %esi
	mov	12(%esp), %edi
	add	$16, %esp
	ret
EPILOGUE()
ASM_END()
