dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')
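
C The "msb0" condition u0, v0 < 2^63 means each product ap[i]*u0 and
C bp[i]*v0 has its high limb below 2^63, i.e. with the top bit clear.
C Two such high limbs plus the carries from the low limbs therefore
C always fit in a single limb, so a pair of products can be accumulated
C with plain add/adc and no extra carry register.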

C	     cycles/limb
C AMD K8,K9	 2.167
C AMD K10	 2.167
C Intel P4	12.0
C Intel core2	 4.0
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C TODO
C  * Perhaps handle various n mod 3 sizes better.  The code is now too large.
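
C In C terms, this computes {rp,n} = {ap,n}*u0 + {bp,n}*v0 and returns
C the carry-out limb.  A minimal sketch of the same computation, using
C the umul_ppmm and add_ssaaaa macros from GMP's longlong.h (shown for
C reference only; it is not the code assembled here):
C
C	mp_limb_t
C	mpn_addaddmul_1msb0 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp,
C			     mp_size_t n, mp_limb_t u0, mp_limb_t v0)
C	{
C	  mp_limb_t cy = 0, hi, lo, p1, p0;
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      umul_ppmm (p1, p0, ap[i], u0);	/* 2-limb product ap[i]*u0 */
C	      add_ssaaaa (hi, lo, p1, p0, 0, cy);	/* add carry-in limb */
C	      umul_ppmm (p1, p0, bp[i], v0);	/* 2-limb product bp[i]*v0 */
C	      add_ssaaaa (hi, lo, hi, lo, p1, p0);	/* cannot overflow: msb0 */
C	      rp[i] = lo;
C	      cy = hi;
C	    }
C	  return cy;
C	}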

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`ap',	`%rsi')
define(`bp_param', `%rdx')
define(`n',	`%rcx')
define(`u0',	`%r8')
define(`v0',	`%r9')


define(`bp', `%rbp')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addaddmul_1msb0)
	push	%r12
	push	%rbp

	lea	(ap,n,8), ap
	lea	(bp_param,n,8), bp
	lea	(rp,n,8), rp
	neg	n
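C ap, bp and rp now point just past the last limb of their operands,
C and n counts up from -n towards zero, so (ap,n,8) is the next limb.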

	mov	(ap,n,8), %rax
	mul	%r8
	mov	%rax, %r12
	mov	(bp,n,8), %rax
	mov	%rdx, %r10
	add	$3, n
	jns	L(end)

	ALIGN(16)
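C Main loop, 3-way unrolled.  A two-limb accumulator rotates through
C r12, r10 and r11; since u0 and v0 have their top bit clear, the adc
C into the high word never carries out, so no third limb is needed.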
L(top):	mul	%r9
	add	%rax, %r12
	mov	-16(ap,n,8), %rax
	adc	%rdx, %r10
	mov	%r12, -24(rp,n,8)
	mul	%r8
	add	%rax, %r10
	mov	-16(bp,n,8), %rax
	mov	$0, R32(%r11)
	adc	%rdx, %r11
	mul	%r9
	add	%rax, %r10
	mov	-8(ap,n,8), %rax
	adc	%rdx, %r11
	mov	%r10, -16(rp,n,8)
	mul	%r8
	add	%rax, %r11
	mov	-8(bp,n,8), %rax
	mov	$0, R32(%r12)
	adc	%rdx, %r12
	mul	%r9
	add	%rax, %r11
	adc	%rdx, %r12
	mov	(ap,n,8), %rax
	mul	%r8
	add	%rax, %r12
	mov	%r11, -8(rp,n,8)
	mov	(bp,n,8), %rax
	mov	$0, R32(%r10)
	adc	%rdx, %r10
	add	$3, n
	js	L(top)
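
C Wind down.  Here n is 0, 1 or 2; respectively 3, 2 or 1 result limbs
C remain to be computed and stored.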
L(end):	cmp	$1, R32(n)
	ja	2f
	jz	1f

	mul	%r9
	add	%rax, %r12
	mov	-16(ap), %rax
	adc	%rdx, %r10
	mov	%r12, -24(rp)
	mul	%r8
	add	%rax, %r10
	mov	-16(bp), %rax
	mov	$0, R32(%r11)
	adc	%rdx, %r11
	mul	%r9
	add	%rax, %r10
	mov	-8(ap), %rax
	adc	%rdx, %r11
	mov	%r10, -16(rp)
	mul	%r8
	add	%rax, %r11
	mov	-8(bp), %rax
	mov	$0, R32(%r12)
	adc	%rdx, %r12
	mul	%r9
	add	%rax, %r11
	adc	%rdx, %r12
	mov	%r11, -8(rp)
	mov	%r12, %rax
	pop	%rbp
	pop	%r12
	ret
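
C n == 1: store the last two result limbs.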
1:	mul	%r9
	add	%rax, %r12
	mov	-8(ap), %rax
	adc	%rdx, %r10
	mov	%r12, -16(rp)
	mul	%r8
	add	%rax, %r10
	mov	-8(bp), %rax
	mov	$0, R32(%r11)
	adc	%rdx, %r11
	mul	%r9
	add	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, -8(rp)
	mov	%r11, %rax
	pop	%rbp
	pop	%r12
	ret
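
C n == 2: store the last result limb.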
2:	mul	%r9
	add	%rax, %r12
	mov	%r12, -8(rp)
	adc	%rdx, %r10
	mov	%r10, %rax
	pop	%rbp
	pop	%r12
	ret
EPILOGUE()