1dnl  AMD64 mpn_mul_1.
2
3dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C	     cycles/limb
23C K8,K9:	 2.5
24C K10:		 2.5
25C P4:		 12.3
26C P6 core2:	 4.0
27C P6 corei7:	 3.8
28C Atom:		19.8
29
30C The inner loop of this code is the result of running a code generation and
31C optimization tool suite written by David Harvey and Torbjorn Granlund.
32
33C TODO:
34C  * The inner loop is great, but the prologue and epilogue code was
35C    quickly written.  Tune it!
36
# Register aliases for the two entry points that follow.
#   rp      = %rdi  base of the result area; result limbs are stored through it
#   up      = %rsi  base of the source limbs; multiplicands are loaded from it
#   n_param = %rdx  limb count on entry; copied away early because mul writes rdx
#   vl      = %rcx  the single multiplier limb, operand of every mul
37C INPUT PARAMETERS
38define(`rp',	 `%rdi')
39define(`up',	 `%rsi')
40define(`n_param',`%rdx')
41define(`vl',	 `%rcx')
42
# Working copy of the limb count; it is later negated and used as the
# loop index.
43define(`n',	`%r11')
44
# NOTE(review): ASM_START, TEXT and ALIGN are not defined in this file;
# presumably they come from the included config.m4 -- confirm there.
45ASM_START()
46	TEXT
47	ALIGN(16)
# mpn_mul_1c -- entry point that takes one extra input limb in %r8.
# It saves %rbx (restored by the pop on the shared exit path), copies %r8
# into %r10 and joins mpn_mul_1 at L(common), where %r10 is added into the
# first product.
# NOTE(review): %r8 is presumably the fifth C argument (a carry-in limb);
# the prototype is not visible here -- confirm.
48PROLOGUE(mpn_mul_1c)
49	push	%rbx
50	mov	%r8, %r10		# r10 = incoming limb
51	jmp	L(common)		# continue inside mpn_mul_1
52EPILOGUE()
53
# mpn_mul_1 -- multiply the n limbs at up by the single limb vl, store the
# low n limbs of the result at rp and return the most significant limb in
# %rax.  Limbs are 8 bytes wide (every index is scaled by 8).
#
# In:    rp = %rdi, up = %rsi, n_param = %rdx, vl = %rcx
# Out:   %rax = high limb left over after the last product
# Uses:  %rax, %rdx, %r8-%r11, flags, and %rbx (pushed and popped here);
#        rp and up are modified in place.  One 8-byte stack slot is used.
# NOTE(review): the code assumes n >= 1, since the first limb is loaded
# before n is examined -- confirm that callers guarantee this.
#
# Structure: the first product is formed up front, then n mod 4 selects
# one of four stubs (b0..b3).  Apart from the n = 1 shortcut in b1, each
# stub biases rp and up, negates n so that it counts up towards zero,
# seeds the accumulators and jumps to the matching point (L0..L3) of the
# 4-way unrolled loop.  %r10, %r9, %r8 and %rbx take turns holding the
# result limb that is stored next.
54PROLOGUE(mpn_mul_1)
55	push	%rbx			# rbx serves as an accumulator below
56	xor	%r10, %r10		# no incoming limb on this entry
57L(common):
58	mov	(up), %rax		C read first u limb early
59	mov	n_param, %rbx		C move away n from rdx, mul uses it
60	mul	vl			# rdx:rax = first source limb * vl
61	mov	%rbx, %r11		# keep the full count in n (%r11)
62
63	add	%r10, %rax		# fold the incoming limb into the low half
64	adc	$0, %rdx		# and carry into the high half
65
# Dispatch on n mod 4.
# NOTE(review): R32 is defined elsewhere; presumably it yields the 32-bit
# name of the register -- confirm.
66	and	$3, R32(%rbx)
67	jz	L(b0)			# n mod 4 == 0
68	cmp	$2, R32(%rbx)
69	jz	L(b2)			# n mod 4 == 2
70	jg	L(b3)			# n mod 4 == 3
71
# n mod 4 == 1.  If n is exactly 1 the first product is the whole result.
72L(b1):	dec	n
73	jne	L(gt1)
74	mov	%rax, (rp)		# store the only result limb
75	jmp	L(ret)			# rdx (high half) becomes the return value
# n = 5, 9, ...: r9 = low half of the first product, r8 = its high half,
# rax = second source limb, r10 = rbx = 0; enter the loop at L1.
76L(gt1):	lea	8(up,n,8), up
77	lea	-8(rp,n,8), rp
78	neg	n
79	xor	%r10, %r10
80	xor	R32(%rbx), R32(%rbx)
81	mov	%rax, %r9
82	mov	(up,n,8), %rax
83	mov	%rdx, %r8
84	jmp	L(L1)
85
# n mod 4 == 0: r8 = low half of the first product, rbx = its high half,
# r10 = 0; L0 loads the next source limb itself.
86L(b0):	lea	(up,n,8), up
87	lea	-16(rp,n,8), rp
88	neg	n
89	xor	%r10, %r10
90	mov	%rax, %r8
91	mov	%rdx, %rbx
92	jmp	 L(L0)
93
# n mod 4 == 3: rbx = low half of the first product, r10 = its high half;
# r8 and r9 are initialised inside L3 before they are read.
94L(b3):	lea	-8(up,n,8), up
95	lea	-24(rp,n,8), rp
96	neg	n
97	mov	%rax, %rbx
98	mov	%rdx, %r10
99	jmp	L(L3)
100
# n mod 4 == 2: r10 = low half of the first product, r9 = its high half,
# r8 = rbx = 0, rax = second source limb; enter the loop at L2.
101L(b2):	lea	-16(up,n,8), up
102	lea	-32(rp,n,8), rp
103	neg	n
104	xor	%r8, %r8
105	xor	R32(%rbx), R32(%rbx)
106	mov	%rax, %r10
107	mov	24(up,n,8), %rax
108	mov	%rdx, %r9
109	jmp	L(L2)
110
# Main loop, four limbs per pass.  After each mul the low half (rax) is
# added into one accumulator and the high half (rdx) plus the carry into
# the next one; the completed accumulator is stored one step later.
111	ALIGN(16)
112L(top):	mov	%r10, (rp,n,8)
113	add	%rax, %r9
114	mov	(up,n,8), %rax
115	adc	%rdx, %r8
116	mov	$0, %r10d
117L(L1):	mul	vl
118	mov	%r9, 8(rp,n,8)
119	add	%rax, %r8
120	adc	%rdx, %rbx
121L(L0):	mov	8(up,n,8), %rax
122	mul	vl
123	mov	%r8, 16(rp,n,8)
124	add	%rax, %rbx
125	adc	%rdx, %r10
# The mov instructions between the add and the adc below leave the carry
# flag untouched, so the carry out of the add still reaches the adc.
126L(L3):	mov	16(up,n,8), %rax
127	mul	vl
128	mov	%rbx, 24(rp,n,8)
129	mov	$0, %r8d		# zero
130	mov	%r8, %rbx		# zero
131	add	%rax, %r10
132	mov	24(up,n,8), %rax
133	mov	%r8, %r9		# zero
134	adc	%rdx, %r9
135L(L2):	mul	vl
136	add	$4, n			# advance the negative index by one pass
137	js	 L(top)			# loop while the index is still negative
138
# Wind-down: store the last two result limbs and form the return value.
# r8 is zero on every path that reaches this point.
139	mov	%r10, (rp,n,8)
140	add	%rax, %r9
141	adc	%r8, %rdx		# adds only the carry, since r8 is zero
142	mov	%r9, 8(rp,n,8)
# NOTE(review): with r8 zero this add leaves rdx unchanged; it looks
# redundant -- confirm before removing.
143	add	%r8, %rdx
144L(ret):	mov	%rdx, %rax		# return the high limb
145
146	pop	%rbx
147	ret
148EPILOGUE()
149