1284345Ssjgdnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
2284345Ssjg
3284345Ssjgdnl  Copyright 2008 Free Software Foundation, Inc.
4284345Ssjg
5284345Ssjgdnl  This file is part of the GNU MP Library.
6284345Ssjgdnl
7284345Ssjgdnl  The GNU MP Library is free software; you can redistribute it and/or modify
8284345Ssjgdnl  it under the terms of either:
9284345Ssjgdnl
10284345Ssjgdnl    * the GNU Lesser General Public License as published by the Free
11284345Ssjgdnl      Software Foundation; either version 3 of the License, or (at your
12284345Ssjgdnl      option) any later version.
13284345Ssjgdnl
14284345Ssjgdnl  or
15284345Ssjgdnl
16284345Ssjgdnl    * the GNU General Public License as published by the Free Software
17284345Ssjgdnl      Foundation; either version 2 of the License, or (at your option) any
18284345Ssjgdnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9	 2.167
35C AMD K10	 2.167
36C Intel P4	12.0
37C Intel core2	 4.0
38C Intel corei	 ?
39C Intel atom	 ?
40C VIA nano	 ?
41
42C TODO
43C  * Perhaps handle various n mod 3 sizes better.  The code now is too large.
44
45C INPUT PARAMETERS
C Register names for the six arguments of mpn_addaddmul_1msb0, taken in the
C order of the System V AMD64 integer argument registers.
C   rp        result limbs are stored through this pointer
C   ap        first source operand, its limbs are multiplied by %r8
C   bp_param  second source operand as passed in, its limbs are multiplied
C             by %r9
C   n         limb count
C   u0, v0    the two single-limb multipliers.  The function body names
C             %r8 and %r9 directly instead of using these two macros.
46define(`rp',	`%rdi')
47define(`ap',	`%rsi')
48define(`bp_param', `%rdx')
49define(`n',	`%rcx')
50define(`u0',	`%r8')
51define(`v0',	`%r9')
52
53
C Every mul overwrites %rdx with the high half of its product, so the second
C source pointer is moved from bp_param into %rbp before the first multiply.
54define(`bp', `%rbp')
55
C mpn_addaddmul_1msb0 -- R = A*u0 + B*v0 over n limbs, for u0, v0 < 2^63
C (see the file header).
C
C In:   rp = %rdi, ap = %rsi, bp_param = %rdx, n = %rcx, u0 = %r8, v0 = %r9.
C Out:  n limbs stored at rp, the most significant (carry-out) limb in %rax.
C Regs: %r12 and %rbp are saved and restored.  %rax, %rdx, %r10, %r11, n and
C       the three pointer registers are overwritten.
C
C Scheme: %r12, %r10 and %r11 are used in rotation.  One of them holds the
C result limb currently being completed and the next one collects the high
C halves and carries that belong to the following limb.  Each result limb
C gets the low halves of a[i]*u0 and b[i]*v0 plus the high part left over
C from limb i-1.  A carry out of the high accumulator is never handled.
C With u0, v0 < 2^63 each high product half is at most 2^63-2, so two of
C them plus two carry bits still fit in 64 bits.
C
C Each new accumulator is zeroed with mov rather than xor because the zeroing
C sits between an add and the adc that consumes its carry, and mov does not
C modify the flags.  R32 is a macro defined outside this file.  From its use
C here it presumably yields the 32-bit name of the register, whose write also
C clears the upper 32 bits.
C
C NOTE(review): the first limb is loaded unconditionally, so n = 0 is not
C handled.  Presumably callers guarantee n >= 1 -- confirm.
56ASM_START()
57	TEXT
58	ALIGN(16)
59PROLOGUE(mpn_addaddmul_1msb0)
60	push	%r12			C callee-saved, used as an accumulator
61	push	%rbp			C callee-saved, used as bp
62
C Point ap, bp and rp just past their last limb and negate n, so that a
C ptr,n,8 address walks upward through the operand as n counts toward zero.
63	lea	(ap,n,8), ap
64	lea	(bp_param,n,8), bp
65	lea	(rp,n,8), rp
66	neg	n
67
C Start limb 0: %r10:%r12 = a[0] * u0, and %rax = b[0] ready for the v0 multiply.
68	mov	(ap,n,8), %rax
69	mul	%r8
70	mov	%rax, %r12
71	mov	(bp,n,8), %rax
72	mov	%rdx, %r10
73	add	$3, n
74	jns	L(end)			C three or fewer limbs: tail code only
75
C Main loop, three limbs per pass.  On entry, with i = n-3 relative to the end
C pointers:  %r12 = limb i in progress (low half of a[i]*u0 plus carry-in),
C %r10 = the high part that goes with it, %rax = b[i].
76	ALIGN(16)
77L(top):	mul	%r9			C b[i] * v0
78	add	%rax, %r12
79	mov	-16(ap,n,8), %rax	C a[i+1]
80	adc	%rdx, %r10
81	mov	%r12, -24(rp,n,8)	C store limb i
82	mul	%r8			C a[i+1] * u0
83	add	%rax, %r10
84	mov	-16(bp,n,8), %rax	C b[i+1]
85	mov	$0, R32(%r11)		C must keep the carry from the add above
86	adc	%rdx, %r11
87	mul	%r9			C b[i+1] * v0
88	add	%rax, %r10
89	mov	-8(ap,n,8), %rax	C a[i+2]
90	adc	%rdx, %r11
91	mov	%r10, -16(rp,n,8)	C store limb i+1
92	mul	%r8			C a[i+2] * u0
93	add	%rax, %r11
94	mov	-8(bp,n,8), %rax	C b[i+2]
95	mov	$0, R32(%r12)		C must keep the carry from the add above
96	adc	%rdx, %r12
97	mul	%r9			C b[i+2] * v0
98	add	%rax, %r11
99	adc	%rdx, %r12
100	mov	(ap,n,8), %rax		C a[i+3], n is still negative here
101	mul	%r8			C a[i+3] * u0, begins the next pass
102	add	%rax, %r12
103	mov	%r11, -8(rp,n,8)	C store limb i+2
104	mov	(bp,n,8), %rax		C b[i+3]
105	mov	$0, R32(%r10)		C must keep the carry from the add above
106	adc	%rdx, %r10
107	add	$3, n
108	js	L(top)
109
C Tail dispatch.  For n >= 1 on entry, n is now 0, 1 or 2, which means 3, 2 or
C 1 limbs are still unfinished.  The first of them already has its a*u0
C product in %r10:%r12 and its b limb in %rax.
110L(end):	cmp	$1, R32(n)
111	ja	2f			C n = 2: one limb left
112	jz	1f			C n = 1: two limbs left
113
C n = 0: three limbs left.  Same sequence as one loop pass but with fixed
C offsets and without starting a fourth limb.  Returns %r12.
114	mul	%r9
115	add	%rax, %r12
116	mov	-16(ap), %rax
117	adc	%rdx, %r10
118	mov	%r12, -24(rp)
119	mul	%r8
120	add	%rax, %r10
121	mov	-16(bp), %rax
122	mov	$0, R32(%r11)
123	adc	%rdx, %r11
124	mul	%r9
125	add	%rax, %r10
126	mov	-8(ap), %rax
127	adc	%rdx, %r11
128	mov	%r10, -16(rp)
129	mul	%r8
130	add	%rax, %r11
131	mov	-8(bp), %rax
132	mov	$0, R32(%r12)
133	adc	%rdx, %r12
134	mul	%r9
135	add	%rax, %r11
136	adc	%rdx, %r12
137	mov	%r11, -8(rp)
138	mov	%r12, %rax		C return the carry-out limb
139	pop	%rbp
140	pop	%r12
141	ret
142
C Two limbs left.  Returns %r11.
1431:	mul	%r9
144	add	%rax, %r12
145	mov	-8(ap), %rax
146	adc	%rdx, %r10
147	mov	%r12, -16(rp)
148	mul	%r8
149	add	%rax, %r10
150	mov	-8(bp), %rax
151	mov	$0, R32(%r11)
152	adc	%rdx, %r11
153	mul	%r9
154	add	%rax, %r10
155	adc	%rdx, %r11
156	mov	%r10, -8(rp)
157	mov	%r11, %rax		C return the carry-out limb
158	pop	%rbp
159	pop	%r12
160	ret
161
C One limb left: only the pending b*v0 product remains.  Returns %r10.
1622:	mul	%r9
163	add	%rax, %r12
164	mov	%r12, -8(rp)
165	adc	%rdx, %r10
166	mov	%r10, %rax		C return the carry-out limb
167	pop	%rbp
168	pop	%r12
169	ret
170EPILOGUE()
171