dnl  AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.

dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 -
C AMD K10	 -
C AMD bd1	 -
C AMD bd2	 -
C AMD bd3	 -
C AMD bd4	 4.3
C AMD zen	 2
C AMD bt1	 -
C AMD bt2	 -
C Intel P4	 -
C Intel PNR	 -
C Intel NHM	 -
C Intel SBR	 -
C Intel IBR	 -
C Intel HWL	 ?
C Intel BWL	 ?
C Intel SKL	 ?
C Intel atom	 -
C Intel SLM	 -
C VIA nano	 -

C Incoming arguments, named by the register they arrive in.  The trailing
C comment on each line is the matching Microsoft x64 argument register
C (presumably what FUNC_ENTRY remaps from in DOS64 builds -- confirm).
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C Working names for the same two values after the function has swapped
C rcx and rdx: the multiplier has to sit in rdx because mulx reads one
C operand implicitly from there, which leaves rcx for the limb counter.
define(`n',       `%rcx')
define(`v0',      `%rdx')

C Pick the flavour being built.  ADDSUB is the flag-setting operation that
C starts a chain into rp, ADCSBB the carry/borrow-consuming one that
C continues it, and func is the name handed to PROLOGUE.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`ADCSBB',        `adc')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`ADCSBB',        `sbb')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Names both entry points this one source can be built into (presumably
C consumed by the build machinery -- confirm).
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

ASM_START()
	TEXT
	ALIGN(16)

C func (rp, up, n, v0)
C
C Built as mpn_addmul_1 or mpn_submul_1 depending on which OPERATION_*
C symbol is defined.  Multiplies the n limbs at up by the single limb v0 and
C adds the product to (addmul_1) or subtracts it from (submul_1) the n limbs
C at rp, in place.  The limb carried or borrowed out of the top position is
C returned in rax.
C
C n is presumably required to be at least 1 by callers: up[0] is loaded
C before n is examined.  TODO confirm against the mpn interface docs.
C
C Register use:
C   rdx		   v0, the implicit multiplicand of every mulx
C   rcx		   n, negated, stepping up towards zero
C   r9 r11 r13 rbx limbs waiting to be applied to rp (low product halves,
C		   with the neighbouring high half folded in)
C   r8 r10 r12 rax high product halves; rax also carries the top limb from
C		   one group of four to the next and is the return value
C   rbx r12 r13	   callee-saved, hence the push/pop pairs
C
C mulx leaves the flags alone, so it can be placed between the links of an
C add/adc or sub/sbb chain without breaking the chain.
C
C The mulx forms with a memory operand are hand-assembled as .byte
C sequences; the comment on each such line gives the instruction.
C NOTE(review): presumably because the mulx() macro accepts register
C operands only -- confirm in the m4 definitions.
PROLOGUE(func)
	FUNC_ENTRY(4)
	mov	(up), %r8		C r8 = up[0], multiplied by every bK path

	push	%rbx
	push	%r12
	push	%r13

C Bias the pointers so that a negative index reaches the operands: up points
C just past its last limb, rp 32 bytes (four limbs) short of that.
	lea	(up,n_param,8), up
	lea	-32(rp,n_param,8), rp
	mov	R32(n_param), R32(%rax)	C copy of n for the mod 4 dispatch
C Swap rcx and rdx: v0 moves into rdx for mulx, n moves into rcx.
	xchg	v0_param, v0		C FIXME: is this insn fast?

	neg	n

C Dispatch on n mod 4.  Each bK path forms the first products and then
C either drops straight into the tail (n is exactly K) or joins the
C unrolled loop at the matching loK entry.
	and	$3, R8(%rax)
	jz	L(b0)
	cmp	$2, R8(%rax)
	jz	L(b2)
	jg	L(b3)

C n = 1 mod 4.
C The bK paths advance n with sub of a negative immediate: when the result
C is zero the subtraction produced no borrow, so CF is already clear for the
C ADCSBB that the wdK tail starts with.
L(b1):	mulx(	%r8, %rbx, %rax)
	sub	$-1, n
	jz	L(wd1)
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	test	R32(%rax), R32(%rax)		C clear cy
	jmp	L(lo1)

C n = 0 mod 4.  Nothing is pending for rp yet, so rax (carry limb) and CF
C are both zeroed by the xor and the loop is joined after its store phase.
L(b0):	mulx(	%r8, %r9, %r8)
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	xor	R32(%rax), R32(%rax)
	jmp	L(lo0)

C n = 3 mod 4.  Three products, high halves folded into the next low half.
L(b3):	mulx(	%r8, %r11, %r10)
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x08	C mulx 8(up,n,8), %r13, %r12
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10	C mulx 16(up,n,8), %rbx, %rax
	add	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	sub	$-3, n
	jz	L(wd3)
	test	R32(%rax), R32(%rax)		C clear cy
	jmp	L(lo3)

C n = 2 mod 4.
L(b2):	mulx(	%r8, %r13, %r12)
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08	C mulx 8(up,n,8), %rbx, %rax
	add	%r12, %rbx
	adc	$0, %rax
	sub	$-2, n
	jz	L(wd2)
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	test	R32(%rax), R32(%rax)		C clear cy
	jmp	L(lo2)

C Main loop, four limbs per pass.  The ADDSUB/ADCSBB lines apply the four
C limbs prepared earlier to rp while the interleaved mulx lines fetch the
C next four products.  The adc run that follows folds the carry limb and
C each high half into the next low half; it also absorbs the carry or
C borrow flag left by the last ADCSBB, so that flag ends up in rax instead
C of having to survive the loop counter update.
L(top):	ADDSUB	%r9, (rp,n,8)
L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	ADCSBB	%r11, 8(rp,n,8)
L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	ADCSBB	%r13, 16(rp,n,8)
L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	ADCSBB	%rbx, 24(rp,n,8)
	adc	%rax, %r9
L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax		C rax = carry limb
C The flags written by this add are used only for the sign test: both
C L(top) and L(end) restart with the carry-less ADDSUB.
	add	$4, n
	js	L(top)

C Tail: apply the last pending limbs.  L(end) is reached by falling out of
C the loop; wd3, wd2 and wd1 are also entered directly, with CF clear, when
C n is exactly 3, 2 or 1.
L(end):	ADDSUB	%r9, (rp)
L(wd3):	ADCSBB	%r11, 8(rp)
L(wd2):	ADCSBB	%r13, 16(rp)
L(wd1):	ADCSBB	%rbx, 24(rp)
C n is zero on every path to this point, so this only adds the final carry
C or borrow flag into the returned limb.
	adc	n, %rax
	pop	%r13
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()