1dnl  AMD64 mpn_mul_1 optimised for Intel Atom.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9      3.03
35C AMD K10        3.03
36C AMD bull       4.74
37C AMD pile       4.56
38C AMD steam
39C AMD excavator
40C AMD bobcat     5.56    6.04
41C AMD jaguar     5.55    5.84
42C Intel P4      13.05
43C Intel core2    4.03
44C Intel NHM      3.80
45C Intel SBR      2.75
46C Intel IBR      2.69
47C Intel HWL      2.50
48C Intel BWL      2.55
49C Intel SKL      2.57
50C Intel atom    17.3
51C Intel SLM     14.7
52C VIA nano
53
54C The loop of this code is the result of running a code generation and
55C optimisation tool suite written by David Harvey and Torbjorn Granlund.
56
C Symbolic operand names used by the code below.  The register given in
C the definition is the one the instructions actually use; the register
C in the trailing comment is where the DOS64 convention passes the same
C argument.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

C Working limb index.  n_param itself cannot be kept as the counter
C because mul overwrites %rdx.
define(`n',       `%r11')

C Both entry points are assembled for the DOS64 and the STD64 ABI.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C mpn_mul_1 -- multiply the n_param-limb operand at up by the single limb
C v0, store the n_param low limbs of the product at rp, and leave the
C most significant limb of the product in %rax.
C
C Inputs:   rp       destination limb pointer
C           up       source limb pointer
C           n_param  limb count (assumes n_param >= 1; the first source
C                    limb is loaded unconditionally)
C           v0       multiplier limb
C Output:   %rax     high limb of the product
C Scratch:  %rax, %rdx, %r8, %r9, %r11, flags; rp and up are modified
C
C L(com) is also the entry used by mpn_mul_1c, which arrives with an
C initial carry limb already in %r8.
C
C Register roles inside the loop:
C   %rax        next source limb before each mul, low product word after
C   %rdx        high product word after each mul
C   %r8, %r9    alternate between "limb being completed" and "carry into
C               the following limb"
C   n           negative index that advances by 4 towards zero
C
C The loop is unrolled four ways.  The two low bits of n_param select
C which of L(lo0)..L(lo3) is used as the entry point, and n is biased
C per entry so the displacements in the loop address the right limbs.
PROLOGUE(mpn_mul_1)
	FUNC_ENTRY(4)		C macro defined outside this file; presumably maps the 4 DOS64 argument registers onto the names above -- confirm
	xor	%r8, %r8	C plain mul_1: no incoming carry
L(com):	mov	(up), %rax	C preload the first source limb
	lea	-16(up,n_param,8), up	C up = &up[n_param-2]
	lea	-8(rp,n_param,8), rp	C rp = &rp[n_param-1]
	test	$1, R8(n_param)
	jnz	L(bx1)		C n_param odd

C n_param even: the first limb is completed in %r9, so move the incoming
C carry there.
L(bx0):	mov	%r8, %r9
	test	$2, R8(n_param)
	jnz	L(b10)

C n_param = 0 mod 4
L(b00):	mov	$2, R32(n)
	sub	n_param, n	C n = 2 - n_param
	jmp	L(lo0)

L(bx1):	test	$2, R8(n_param)
	jnz	L(b11)

C n_param = 1 mod 4.  The first product is formed here, before entering
C the loop at L(lo1).
L(b01):	mov	$3, R32(n)
	sub	n_param, n	C n = 3 - n_param
	mul	v0
	cmp	$2, n		C n == 2 exactly when n_param == 1
	jnz	L(lo1)
	jmp	L(cj1)		C single limb: go straight to the final store

C n_param = 3 mod 4
L(b11):	mov	$1, R32(n)
	sub	n_param, n	C n = 1 - n_param
	jmp	L(lo3)

C n_param = 2 mod 4
L(b10):	xor	R32(n), R32(n)
	sub	n_param, n	C n = -n_param
	jmp	L(lo2)

C Main loop.  Each of the four stages does the same work with %r8 and %r9
C swapped: clear the register that will take the next carry, add the low
C product word into the current limb, fetch the next source limb, fold
C the high product word plus carry flag into the next carry, and store
C the completed limb.  The mul for a stage is issued before the store of
C the previous stage.
L(top):	mul	v0
	mov	%r9, -24(rp,n,8)
L(lo1):	xor	%r9d, %r9d
	add	%rax, %r8
	mov	(up,n,8), %rax
	adc	%rdx, %r9
	mov	%r8, -16(rp,n,8)
L(lo0):	xor	%r8d, %r8d
	mul	v0
	add	%rax, %r9
	mov	8(up,n,8), %rax
	adc	%rdx, %r8
	mov	%r9, -8(rp,n,8)
L(lo3):	xor	%r9d, %r9d
	mul	v0
	add	%rax, %r8
	mov	16(up,n,8), %rax
	adc	%rdx, %r9
	mov	%r8, (rp,n,8)
L(lo2):	xor	%r8d, %r8d
	mul	v0
	add	%rax, %r9
	mov	24(up,n,8), %rax
	adc	%rdx, %r8
	add	$4, n
	js	L(top)		C keep looping while the index is still negative

C Wind-down: %rax holds the last source limb, %r9 the completed
C second-to-last result limb, %r8 the carry into the last limb.
L(end):	mul	v0
	mov	%r9, -8(rp)
L(cj1):	add	%rax, %r8
	mov	$0, R32(%rax)	C mov, not xor: the carry flag from the add must reach the adc
	adc	%rdx, %rax	C %rax = high product word + carry
	mov	%r8, (rp)	C last result limb
	FUNC_EXIT()
	ret
EPILOGUE()
141
C mpn_mul_1c -- same as mpn_mul_1, but with an initial carry limb that is
C added into the least significant end of the product.  The carry limb
C must be in %r8 when control reaches L(com); entering there skips the
C instruction in mpn_mul_1 that zeroes %r8.
C
C Under STD64 the fifth integer argument already arrives in %r8, so
C nothing needs to be done.  Under DOS64 it is fetched from the stack;
C the 56(%rsp) offset presumably accounts for whatever FUNC_ENTRY pushes
C in addition to the return address and caller-reserved space -- verify
C against the FUNC_ENTRY definition.
PROLOGUE(mpn_mul_1c)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	jmp	L(com)		C shared body; the ret is in mpn_mul_1
EPILOGUE()
ASM_END()
148