mul_1.asm revision 1.1.1.1
1dnl  AMD64 mpn_mul_1 optimised for Intel Atom.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb	best
34C AMD K8,K9
35C AMD K10
36C AMD bd1
37C AMD bd2
38C AMD bobcat
39C AMD jaguar
40C Intel P4
41C Intel PNR
42C Intel NHM
43C Intel SBR
44C Intel IBR
45C Intel HWL
46C Intel BWL
47C Intel atom	17.3		this
48C VIA nano
49
50C The loop of this code is the result of running a code generation and
51C optimisation tool suite written by David Harvey and Torbjorn Granlund.
52
C Symbolic names for the four register arguments.  The register inside each
C define is the one the code below actually uses.  The trailing remark on
C each line names the register that carries the same argument position under
C the DOS64 ABI, followed by how the body uses the value.
53define(`rp',      `%rdi')   C rcx  base address of the limbs that get stored
54define(`up',      `%rsi')   C rdx  base address of the limbs that get loaded
55define(`n_param', `%rdx')   C r8   limb count as passed in
56define(`v0',      `%rcx')   C r9   multiplier operand given to every mul
57
C Working index: set to a small constant minus n_param, then stepped by 4.
58define(`n',       `%r11')
59
C Both ABIs are declared as supported by this file.
60ABI_SUPPORT(DOS64)
61ABI_SUPPORT(STD64)
62
63ASM_START()
64	TEXT
65	ALIGN(16)
C mpn_mul_1 -- multiply the n_param limbs at up by v0, store the low limbs
C of the running result at rp, and leave the final carry limb in rax.
C
C Register roles inside the body:
C   rax, rdx   low and high word of the current product; between multiplies
C              rax holds the next source limb
C   r8, r9     alternating accumulators: one collects the limb being
C              finished while the other collects the carry into the next
C   n          negative limb index, advanced by 4 per loop pass
C
C L(com) expects the incoming carry limb in r8.  mpn_mul_1 clears it;
C mpn_mul_1c jumps to L(com) with its own value.
C
C The loop processes four limbs per pass.  The low two bits of n_param pick
C which of the four stages (lo0, lo1, lo2, lo3) is entered first, so there
C is no separate remainder loop.
C
C R8() and R32() are defined outside this file; the remarks below assume
C they name the 8-bit and 32-bit views of a register -- TODO confirm.
C
C NOTE: the instruction order in the loop is tool-generated (see the remark
C near the top of the file); do not reorder it casually.
66PROLOGUE(mpn_mul_1)
67	FUNC_ENTRY(4)
68	xor	%r8, %r8		C carry-in = 0 for the plain entry point
69L(com):	mov	(up), %rax		C rax = first source limb
70	lea	-16(up,n_param,8), up	C up += 8*n_param - 16
71	lea	-8(rp,n_param,8), rp	C rp += 8*n_param - 8, the slot of the final store
72	test	$1, R8(n_param)
73	jnz	L(bx1)			C taken when the count is odd
74
75L(bx0):	mov	%r8, %r9		C even count: the first add targets r9, so the carry-in goes there
76	test	$2, R8(n_param)
77	jnz	L(b10)
78
79L(b00):	mov	$2, R32(n)		C count mod 4 == 0
80	sub	n_param, n		C n = 2 - n_param
81	jmp	L(lo0)
82
83L(bx1):	test	$2, R8(n_param)
84	jnz	L(b11)
85
86L(b01):	mov	$3, R32(n)		C count mod 4 == 1
87	sub	n_param, n		C n = 3 - n_param
88	mul	v0			C first product is formed here; lo1 sits after the mul of its stage
89	cmp	$2, n			C n == 2 exactly when n_param == 1
90	jnz	L(lo1)
91	jmp	L(cj1)			C single limb: only the final add and store remain
92
93L(b11):	mov	$1, R32(n)		C count mod 4 == 3
94	sub	n_param, n		C n = 1 - n_param
95	jmp	L(lo3)
96
97L(b10):	xor	R32(n), R32(n)		C count mod 4 == 2
98	sub	n_param, n		C n = -n_param
99	jmp	L(lo2)
100
C Main loop.  Every stage follows the same pattern: clear the accumulator
C that will receive the carry, add the low product word into the current
C accumulator, fetch the next source limb, add the high product word plus
C carry flag into the cleared accumulator, and store the finished limb.
101L(top):	mul	v0
102	mov	%r9, -24(rp,n,8)	C store the limb accumulated in r9 by stage lo2 of the previous pass
103L(lo1):	xor	%r9d, %r9d		C r9 = 0; a 32-bit write clears the whole 64-bit register
104	add	%rax, %r8		C r8 += low product word
105	mov	(up,n,8), %rax		C next source limb
106	adc	%rdx, %r9		C r9 = high product word + carry
107	mov	%r8, -16(rp,n,8)	C store finished limb
108L(lo0):	xor	%r8d, %r8d		C r8 = 0
109	mul	v0
110	add	%rax, %r9
111	mov	8(up,n,8), %rax
112	adc	%rdx, %r8
113	mov	%r9, -8(rp,n,8)
114L(lo3):	xor	%r9d, %r9d		C r9 = 0
115	mul	v0
116	add	%rax, %r8
117	mov	16(up,n,8), %rax
118	adc	%rdx, %r9
119	mov	%r8, (rp,n,8)
120L(lo2):	xor	%r8d, %r8d		C r8 = 0
121	mul	v0
122	add	%rax, %r9
123	mov	24(up,n,8), %rax	C limb for the next pass, or the last limb when the loop exits
124	adc	%rdx, %r8
125	add	$4, n			C four limbs consumed
126	js	L(top)			C keep looping while the index is still negative
127
C Wind-down.  rax already holds the last source limb, loaded in stage lo2.
128L(end):	mul	v0
129	mov	%r9, -8(rp)		C next-to-last result limb
130L(cj1):	add	%rax, %r8		C last result limb = low product word + pending carry limb
131	mov	$0, R32(%rax)		C mov instead of xor so the carry flag from the add survives
132	adc	%rdx, %rax		C rax = high product word + carry flag, the value handed back
133	mov	%r8, (rp)		C store the last result limb
134	FUNC_EXIT()
135	ret
136EPILOGUE()
137
C mpn_mul_1c -- entry point that supplies its own carry-in limb.  It only
C makes sure that limb is in r8 and then joins the shared body at L(com),
C which is defined inside mpn_mul_1 earlier in this file.
C
C On STD64 no load is emitted here, so r8 is used exactly as it arrives;
C presumably it is the fifth register argument -- verify against the caller.
C NOTE(review): the 56 byte offset on DOS64 has to match whatever stack
C layout FUNC_ENTRY(4) leaves behind; that macro is defined outside this
C file, so confirm the offset if FUNC_ENTRY ever changes.
138PROLOGUE(mpn_mul_1c)
139	FUNC_ENTRY(4)
140IFDOS(`	mov	56(%rsp), %r8	')	C DOS64 only: fetch the carry-in limb from the stack into r8
141	jmp	L(com)			C continue in the common body with r8 = carry-in
142EPILOGUE()
143ASM_END()
144