dnl  mul_2.asm revision 1.1.1.1
dnl  AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Syntax: AT&T operand order (source, destination), preprocessed with m4.
C Lines starting with C or dnl are comments and never reach the assembler.

C	     cycles/limb	best
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR	 2.57		 2.52 using 4-way code
C Intel IBR	 2.35		 2.32 using 4-way code
C Intel HWL	 2.02		 1.86
C Intel BWL
C Intel atom
C VIA nano

C This code is the result of running a code generation and optimisation tool
C suite written by David Harvey and Torbjorn Granlund.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C I(a,b) expands to its first argument as defined here; the wind-down code
C passes an index-free operand as a and the n-indexed form as b.
define(`I',`$1')

C Incoming arguments.  The register named in the trailing comment is
C presumably where the DOS64 ABI passes the same argument -- TODO confirm.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vp',      `%rcx')   C r9

C Working registers.  n shares %rcx with vp, so both limbs at vp must be
C loaded before n is first written.  v0 and v1 occupy the callee-saved
C registers rbx and rbp.
define(`n',	  `%rcx')
define(`v0',      `%rbx')
define(`v1',      `%rbp')

C Partial-sum registers used by the multiply loop.
define(`w0',	`%r8')
define(`w1',	`%r9')
define(`w2',	`%r10')
define(`w3',	`%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mpn_mul_2 (rp, up, n_param, vp)
C
C Multiply the n_param-limb operand at up by the two limbs at vp.  The
C n_param+1 least significant limbs of the product are stored at rp and the
C most significant limb is returned in rax.
C
C Register use once the entry sequence has run:
C   up, rp   advanced by n_param limbs so they can be indexed with n
C   n        even, non-positive limb index that counts up to zero in steps
C            of 2 (rcx, reused after v0 and v1 have been fetched through vp)
C   v0, v1   multiplier limbs vp[0] and vp[1] (rbx, rbp; saved and restored)
C   w0-w3    rotating partial-sum registers (r8-r11)
C   rax,rdx  mul operands and results
C
C NOTE(review): the code appears to require n_param >= 2.  With n_param of
C 0 or 1 the first "add $2, n" produces 2 without a carry, so the jnc goes
C back into the loop instead of leaving it.  Confirm that callers guarantee
C this.
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)			C defined outside this file; presumably maps DOS64 args -- TODO confirm
	push	%rbx			C rbx will hold v0
	push	%rbp			C rbp will hold v1

	mov	(vp), v0		C v0 = vp[0]
	mov	8(vp), v1		C v1 = vp[1]; rcx is free for n after this

	mov	(up), %rax		C rax = up[0]
	lea	(up,n_param,8), up	C up += n_param limbs
	lea	(rp,n_param,8), rp	C rp += n_param limbs

	test	$1, R8(n_param)		C odd limb count?
	jnz	L(b1)

C Even n_param: n = -n_param, join the loop at lo0.
L(b0):	mov	$0, R32(n)
	sub	n_param, n		C n = -n_param
	xor	w0, w0
	mul	v0			C rdx:rax = up[0] * v0
	mov	%rax, w2
	mov	%rdx, w1
	mov	(up,n,8), %rax		C reload up[0] for the v1 product
	jmp	L(lo0)

C Odd n_param: n = 1 - n_param, join the loop at lo1.
L(b1):	mov	$1, R32(n)
	sub	n_param, n		C n = 1 - n_param
	xor	w2, w2
	mul	v0			C rdx:rax = up[0] * v0
	mov	%rax, w0
	mov	%rdx, w3
	mov	-8(up,n,8), %rax	C reload up[0]
	mul	v1			C rdx:rax = up[0] * v1
	jmp	L(lo1)

C Main loop.  Each pass multiplies two limbs of up by both v0 and v1 (four
C mul instructions) and stores two result limbs.  It is left when
C "add $2, n" carries, which happens only when n steps from -2 to 0.
C The numeric tags in the trailing comments presumably name the relative
C result limb an instruction contributes to -- TODO confirm.
	ALIGN(32)
L(top):	mul	v0
	add	%rax, w0		C 1
	mov	%rdx, w3		C 2
	adc	$0, w3			C 2
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0			C 1
	adc	$0, w3			C 2
L(lo1):	add	%rax, w2		C 2
	mov	w0, -8(rp,n,8)		C 1
	mov	%rdx, w0		C 3
	adc	$0, w0			C 3
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w2		C 2
	mov	%rdx, w1		C 3
	adc	$0, w1			C 3
	add	w3, w2			C 2
	mov	(up,n,8), %rax
	adc	$0, w1			C 1
L(lo0):	mul	v1
	mov	w2, (rp,n,8)		C 2
	add	%rax, w0		C 3
	mov	%rdx, w2		C 4
	mov	8(up,n,8), %rax
	adc	$0, w2			C 4
	add	$2, n
	jnc	L(top)

C Wind-down.  n is zero here, so the index-free operands chosen by I()
C address the same limbs as the n-indexed forms.  rax already holds the last
C limb of up, loaded at the bottom of the loop.
L(end):	mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax	C reload the last limb of up
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))	C store result limb n_param-1
	adc	$0, %rdx
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))	C store result limb n_param
	adc	$0, %rdx
	mov	%rdx, %rax		C return the most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()