dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8:		 2.167
C P4:		12.0
C P6-15:	 4.0

C TODO
C  * Perhaps handle various n mod 3 sizes better.  The code now is too large.

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`ap',	`%rsi')
define(`bp_param', `%rdx')
define(`n',	`%rcx')
define(`u0',	`%r8')
define(`v0',	`%r9')


define(`bp', `%rbp')
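
C What the routine computes, as a C-level reference sketch (illustrative
C only: the function name is hypothetical, and the sketch assumes 64-bit
C limbs and a compiler providing unsigned __int128).  The msb0 condition
C u0,v0 < 2^63 keeps the top bit of each 128-bit product clear, so two
C product high halves plus the carries from the low halves still fit in
C one limb; that is what lets the loop below accumulate both products
C through a single carry register.
C
C	#include <gmp.h>
C
C	mp_limb_t
C	ref_addaddmul_1msb0 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp,
C			     mp_size_t n, mp_limb_t u0, mp_limb_t v0)
C	{
C	  unsigned __int128 acc = 0;	/* two-limb accumulator */
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      acc += (unsigned __int128) ap[i] * u0;
C	      acc += (unsigned __int128) bp[i] * v0;
C	      rp[i] = (mp_limb_t) acc;	/* low limb of the sum */
C	      acc >>= 64;		/* propagate the carry */
C	    }
C	  return (mp_limb_t) acc;	/* most significant limb */
C	}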

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addaddmul_1msb0)
	push	%r12
	push	%rbp

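C Point ap, bp and rp just past their last limbs and negate n, so that a
C single register serves as both the loop counter (running up toward
C zero) and the limb index.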
	lea	(ap,n,8), ap
	lea	(bp_param,n,8), bp
	lea	(rp,n,8), rp
	neg	n

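C Compute the first u0 product ahead of the loop and bias n by 3: the
C main loop retires three limbs per iteration (two muls per limb) and
C exits with n in {0,1,2}, leaving 3-n limbs for the tail code.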
	mov	(ap,n,8), %rax
	mul	%r8
	mov	%rax, %r12
	mov	(bp,n,8), %rax
	mov	%rdx, %r10
	add	$3, n
	jns	L(end)

	ALIGN(16)
L(top):	mul	%r9
	add	%rax, %r12
	mov	-16(ap,n,8), %rax
	adc	%rdx, %r10
	mov	%r12, -24(rp,n,8)
	mul	%r8
	add	%rax, %r10
	mov	-16(bp,n,8), %rax
	mov	$0, %r11d
	adc	%rdx, %r11
	mul	%r9
	add	%rax, %r10
	mov	-8(ap,n,8), %rax
	adc	%rdx, %r11
	mov	%r10, -16(rp,n,8)
	mul	%r8
	add	%rax, %r11
	mov	-8(bp,n,8), %rax
	mov	$0, %r12d
	adc	%rdx, %r12
	mul	%r9
	add	%rax, %r11
	adc	%rdx, %r12
	mov	(ap,n,8), %rax
	mul	%r8
	add	%rax, %r12
	mov	%r11, -8(rp,n,8)
	mov	(bp,n,8), %rax
	mov	$0, %r10d
	adc	%rdx, %r10
	add	$3, n
	js	L(top)

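C n is now 0, 1 or 2, meaning 3, 2 or 1 limbs remain: fall through for
C three, branch to 1f for two, and to 2f for one.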
L(end):	cmp	$1, R32(n)
	ja	2f
	jz	1f

	mul	%r9
	add	%rax, %r12
	mov	-16(ap), %rax
	adc	%rdx, %r10
	mov	%r12, -24(rp)
	mul	%r8
	add	%rax, %r10
	mov	-16(bp), %rax
	mov	$0, %r11d
	adc	%rdx, %r11
	mul	%r9
	add	%rax, %r10
	mov	-8(ap), %rax
	adc	%rdx, %r11
	mov	%r10, -16(rp)
	mul	%r8
	add	%rax, %r11
	mov	-8(bp), %rax
	mov	$0, %r12d
	adc	%rdx, %r12
	mul	%r9
	add	%rax, %r11
	adc	%rdx, %r12
	mov	%r11, -8(rp)
	mov	%r12, %rax
	pop	%rbp
	pop	%r12
	ret

1:	mul	%r9
	add	%rax, %r12
	mov	-8(ap), %rax
	adc	%rdx, %r10
	mov	%r12, -16(rp)
	mul	%r8
	add	%rax, %r10
	mov	-8(bp), %rax
	mov	$0, %r11d
	adc	%rdx, %r11
	mul	%r9
	add	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, -8(rp)
	mov	%r11, %rax
	pop	%rbp
	pop	%r12
	ret

2:	mul	%r9
	add	%rax, %r12
	mov	%r12, -8(rp)
	adc	%rdx, %r10
	mov	%r10, %rax
	pop	%rbp
	pop	%r12
	ret
EPILOGUE()