dnl  AMD64 mpn_addmul_1 and mpn_submul_1.

dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.5
C K10:		 2.5
C P4:		14.9
C P6 core2:	 5.09
C P6 corei7:
C P6 atom:	21.3

C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO:
C  * The inner loop is great, but the prologue and epilogue code was
C    quickly written.  Tune it!

C INPUT PARAMETERS
define(`rp',	 `%rdi')
define(`up',	 `%rsi')
define(`n_param',`%rdx')
define(`vl',	 `%rcx')

define(`n',	`%r11')

ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
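
C For orientation, a plain C sketch of the addmul_1 case follows.  It is
C illustrative only (the function name is made up for this comment), and the
C use of unsigned __int128 for the 64x64 -> 128 bit product is an assumption
C about the compiler, not something this file or the portable GMP code
C relies on.
C
C	mp_limb_t
C	ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C	{
C	  mp_limb_t cy = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
C	      mp_limb_t lo = (mp_limb_t) p;
C	      mp_limb_t hi = (mp_limb_t) (p >> 64);
C	      mp_limb_t r = rp[i] + lo;
C	      cy = hi + (r < lo);	/* carry out of the limb add */
C	      rp[i] = r;
C	    }
C	  return cy;			/* high limb returned to the caller */
C	}
C
C mpn_submul_1 differs only in that lo is subtracted from rp[i] and the
C borrow out of that subtraction is added to cy instead of the carry.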

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	mov	(up), %rax		C read first u limb early
	push	%rbx
	mov	n_param, %rbx		C move n out of rdx, mul clobbers it
	mul	vl
	mov	%rbx, %r11

	and	$3, R32(%rbx)		C n mod 4 selects the feed-in point
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

L(b1):	dec	n
	jne	L(gt1)
	ADDSUB	%rax, (rp)		C n = 1, just one product
	jmp	L(ret)
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9
	mov	(up,n,8), %rax
	mov	%rdx, %r8
	jmp	L(L1)

L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%rdx, %r9
	jmp	L(L2)

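C The loop below handles four limbs per iteration.  Partial products and
C carries rotate through %rbx, %r8, %r9 and %r10 while the next 64x64 bit
C product is formed in %rdx:%rax; the feed-in blocks above enter the loop
C at L(L0)..L(L3) according to n mod 4 (n = 1 is dealt with separately).
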
	ALIGN(16)
L(top):	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, %r10d
L(L1):	mul	vl
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	ADDSUB	%r8, 16(rp,n,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	ADDSUB	%rbx, 24(rp,n,8)
	mov	$0, %r8d		C zero
	mov	%r8, %rbx		C zero
	adc	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)

	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	adc	%r8, %rdx
	ADDSUB	%r9, 8(rp,n,8)
L(ret):	adc	$0, %rdx
	mov	%rdx, %rax

	pop	%rbx
	ret
EPILOGUE()
