dnl  AMD64 mpn_sqr_basecase optimised for AMD Zen.

dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Do overlapped software pipelining.  This should close the remaining gap to
C    mul_basecase.
C
C  * Update un just once in the outer loop.
C
C  * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from
C    loads and stores.  At least in some cases, the non-scaled form is faster.
C
C  * Optimise xit3 code, e.g., using shrx and sarx like in the main loop.
C
C  * The mul_1 feed-in code has gotten little attention and could probably be
C    improved.  Perhaps even expand it to 4 separate loops to allow straight
C    fall-through into the 4 addmul_1 loops.
C
C  * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks.

define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')

define(`un',      `%rbp')
define(`n',       `%rcx')

C these are used just for the small-operand code
define(`w0',	`%r8')
define(`w1',	`%r9')
define(`w2',	`%r10')
define(`w3',	`%r11')


ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$2, R32(un_param)
	jae	L(gt1)

	mov	(up), %rdx
	mulx(	%rdx, %rax, %rdx)
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

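C The un = 2 code below computes, with B = 2^64 and operand {v1,v0},
C
C   v0^2 + 2*v0*v1*B + v1^2*B^2
C
C from three mulx products, doubling the cross product with an add/adc/adc
C chain before folding it into the squares (the W columns in the comments
C give the limb positions).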
L(gt1):	jne	L(gt2)

	mov	(up), %rdx
	mov	8(up), %rcx
	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
	mov	%rcx, %rdx
	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
	add	%r9, %r9		C		W 1
	adc	%r10, %r10		C		W 2
	adc	$0, %rdx		C		W 3
	add	%r9, %r8		C W 1
	adc	%r11, %r10		C W 2
	adc	$0, %rdx		C W 3
	mov	%rax, (rp)
	mov	%r8, 8(rp)
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret

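C The un = 3 code below uses both carry chains: the dummy test clears CF and
C OF, adcx then doubles the four cross-product limbs via the CF chain, and
C adox folds them into the diagonal squares via the OF chain.  The final
C adox/adcx pair with a zero register collects both leftover carries into
C the top limb.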
L(gt2):	cmp	$4, R32(un_param)
	jae	L(gt3)

	push	%rbx
	mov	(up), %rdx
	mulx(	8,(up), w2, w3)
	mulx(	16,(up), w0, w1)
	add	w3, w0
	mov	8(up), %rdx
	mulx(	16,(up), %rax, w3)
	adc	%rax, w1
	adc	$0, w3
	test	R32(%rbx), R32(%rbx)
	mov	(up), %rdx
	mulx(	%rdx, %rbx, %rcx)
	mov	%rbx, (rp)
	mov	8(up), %rdx
	mulx(	%rdx, %rax, %rbx)
	mov	16(up), %rdx
	mulx(	%rdx, %rsi, %rdx)
	adcx(	w2, w2)
	adcx(	w0, w0)
	adcx(	w1, w1)
	adcx(	w3, w3)
	adox(	w2, %rcx)
	adox(	w0, %rax)
	adox(	w1, %rbx)
	adox(	w3, %rsi)
	mov	$0, R32(%r8)
	adox(	%r8, %rdx)
	adcx(	%r8, %rdx)
	mov	%rcx, 8(rp)
	mov	%rax, 16(rp)
	mov	%rbx, 24(rp)
	mov	%rsi, 32(rp)
	mov	%rdx, 40(rp)
	pop	%rbx
	FUNC_EXIT()
	ret

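C General case, un >= 4.  The first pass stores up[0]^2 at rp[0] and runs a
C 4-way unrolled mul_1 loop computing 2*up[0] times up[1..un-1], seeded with
C the high half of up[0]^2.  Each later pass adds one row with an
C addmul_1-style loop whose multiplier is doubled on the fly (see the sketch
C before the L(f0)..L(f3) blocks), and a straight-line wind-down finishes
C the last limbs.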
L(gt3):	push	%r15
C	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx
	mov	R32(un_param), R32(un)

	mov	(up), %rdx		C up[0]
	mov	8(up), %r9		C up[1]

	mulx(	%rdx, %rax, %r15)	C up[0]^2
	mov	%rax, (rp)
	shl	%rdx

	lea	(up,un,8), up
	lea	-32(rp,un,8), rp

	neg	un
	lea	4(un), n
	and	$-4, n

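C Dispatch on the low two bits of un to one of four feed-in blocks
C (L(mb0)..L(mb3)), so that the 4-way unrolled mul_1 loop can be entered at
C the matching point (L(mlo0)..L(mlo3)).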
	test	$1, R8(un)
	jnz	L(mx0)
L(mx1):	test	$2, R8(un)
	jz	L(mb3)

L(mb1):	mulx(	%r9, %rbx, %rax)
	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %r9, %r8
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18	C mulx 24(up,un,8), %r11, %r10
	add	%r15, %rbx
	jmp	L(mlo1)

L(mb3):	mulx(	%r9, %r11, %r10)
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x10	C mulx 16(up,un,8), %r13, %r12
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18	C mulx 24(up,un,8), %rbx, %rax
	add	%r15, %r11
	jrcxz	L(n4)
	jmp	L(mlo3)
L(n4):	mov	%r11, 8(rp)
	adc	%r10, %r13
	adc	%r12, %rbx
	jmp	L(m)

L(mx0):	test	$2, R8(un)
	jnz	L(mb0)

L(mb2):	mulx(	%r9, %r13, %r12)
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %rbx, %rax
	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18	C mulx 24(up,un,8), %r9, %r8
	add	%r15, %r13
	jmp	L(mlo2)

L(mb0):	mulx(	%r9, %r9, %r8)
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10	C mulx 16(up,un,8), %r11, %r10
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x18	C mulx 24(up,un,8), %r13, %r12
	add	%r15, %r9
	jmp	L(mlo0)

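C 4-way unrolled mul_1 loop: forms four products of the doubled up[0] per
C iteration, chaining the carries in registers and storing the finished
C limbs; jrcxz exits when n reaches zero.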
	ALIGN(16)
L(mtop):jrcxz	L(mend)
	adc	%r8, %r11
	mov	%r9, (rp,n,8)
L(mlo3):.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r10, %r13
	mov	%r11, 8(rp,n,8)
L(mlo2):.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r12, %rbx
	mov	%r13, 16(rp,n,8)
L(mlo1):.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rax, %r9
	mov	%rbx, 24(rp,n,8)
L(mlo0):.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	lea	4(n), n
	jmp	L(mtop)

L(mend):mov	%r9, (rp)
	adc	%r8, %r11
	mov	%r11, 8(rp)
	adc	%r10, %r13
	mov	%r13, 16(rp)
	adc	%r12, %rbx
	adc	$0, %rax
	mov	%rbx, 24(rp)
	mov	%rax, 32(rp)

	lea	2(un), un

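C Each feed-in block L(f0)..L(f3) below computes what the comments call "ci"
C and "u0".  A rough C sketch, assuming 64-bit limbs (illustrative only, not
C the exact reference code):
C
C	mp_limb_t u    = up[i];			/* current diagonal limb */
C	mp_limb_t prev = up[i-1];
C	mp_limb_t ci   = u & (mp_limb_t) ((mp_limb_signed_t) prev >> 63);
C	mp_limb_t u0   = 2 * u + (prev >> 63);	/* doubled multiplier */
C	/* up[i]^2 plus ci is accumulated at the diagonal position, and u0
C	   times up[i+1..] is added in by the unrolled addmul loop.  */
C
C %r15 stays at 63 so sarx/shrx can extract the sign mask and the top bit of
C the previous limb.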
	mov	$63, R32(%r15)			C keep at 63 for shrx/sarx.
	test	$1, R8(un)
	jz	L(x0)
L(x1):	test	$2, R8(un)
	jz	L(f3)
	jmp	L(f1)
L(x0):	test	$2, R8(un)
	jz	L(f0)
C	jmp	L(f2)

L(f2):	mov	-8(up,un,8), %rdx		C up[0]
	lea	2(un), n
	lea	8(rp), rp
	.byte	0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0	C sarx %r15, -16(up,un,8), %r11
	.byte	0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0	C shrx %r15, -16(up,un,8), %r13
	and	%rdx, %r11			C "ci" in C code
	mulx(	%rdx, %rax, %r10)		C up[0]^2
	lea	(%r13,%rdx,2), %rdx		C "u0" arg in C code
	add	%rax, %r11

	.byte	0xc4,0x62,0x93,0xf6,0x24,0xee		C mulx (up,un,8), %r13, %r12
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %rbx, %rax
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	jmp	L(b2)

	ALIGN(16)
L(top2):add	%r9, (rp,n,8)
L(b2):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r11, 8(rp,n,8)
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r13, 16(rp,n,8)
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rbx, 24(rp,n,8)
	adc	%rax, %r9
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	add	$4, n
	jnz	L(top2)

	inc	un
	add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	%r13, 16(rp)
	adc	%rbx, 24(rp)
	adc	$0, %rax
	mov	%rax, 32(rp)

L(f1):	mov	-8(up,un,8), %rdx		C up[0]
	lea	1(un), n
	lea	8(rp), rp
	.byte	0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0	C sarx	%r15, -16(up,un,8), %r13
	.byte	0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0	C shrx	%r15, -16(up,un,8), %rbx
	and	%rdx, %r13			C "ci" in C code
	mulx(	%rdx, %rax, %r12)		C up[0]^2
	lea	(%rbx,%rdx,2), %rdx		C "u0" arg in C code
	add	%rax, %r13

	.byte	0xc4,0xe2,0xe3,0xf6,0x04,0xee		C mulx (up,un,8), %rbx, %rax
	adc	%r12, %rbx
	adc	$0, %rax
	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %r9, %r8
	jmp	L(b1)

	ALIGN(16)
L(top1):add	%r9, (rp,n,8)
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r11, 8(rp,n,8)
L(b1):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r13, 16(rp,n,8)
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rbx, 24(rp,n,8)
	adc	%rax, %r9
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	add	$4, n
	jnz	L(top1)

	inc	un
	add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	%r13, 16(rp)
	adc	%rbx, 24(rp)
	adc	$0, %rax
	mov	%rax, 32(rp)

L(f0):	mov	-8(up,un,8), %rdx		C up[0]
	lea	(un), n
	lea	8(rp), rp
	.byte	0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0	C sarx	%r15, -16(up,un,8), %rbx
	.byte	0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0	C shrx	%r15, -16(up,un,8), %r9
	and	%rdx, %rbx			C "ci" in C code
	mulx(	%rdx, %r10, %rax)		C up[0]^2
	lea	(%r9,%rdx,2), %rdx		C "u0" arg in C code
	add	%r10, %rbx
	adc	$0, %rax			C "cin" in C code

	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,un,8), %r9, %r8
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08	C mulx 8(up,un,8), %r11, %r10
	jmp	L(b0)

	ALIGN(16)
L(top0):add	%r9, (rp,n,8)
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r11, 8(rp,n,8)
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r13, 16(rp,n,8)
L(b0):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rbx, 24(rp,n,8)
	adc	%rax, %r9
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	add	$4, n
	jnz	L(top0)

	inc	un
	add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	%r13, 16(rp)
	adc	%rbx, 24(rp)
	adc	$0, %rax
	mov	%rax, 32(rp)

L(f3):	mov	-8(up,un,8), %rdx		C up[0]
	lea	3(un), n
	lea	8(rp), rp
	.byte	0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0	C sarx %r15, -16(up,un,8), %r9
	.byte	0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0	C shrx %r15, -16(up,un,8), %r11
	and	%rdx, %r9			C "ci" in C code
	mulx(	%rdx, %rax, %r8)		C up[0]^2
	lea	(%r11,%rdx,2), %rdx		C "u0" arg in C code
	add	%rax, %r9

	.byte	0xc4,0x62,0xa3,0xf6,0x14,0xee		C mulx (up,un,8), %r11, %r10
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x08	C mulx 8(up,un,8), %r13, %r12
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	jrcxz	L(xit3)
	jmp	L(top3)			C FIXME perhaps fall through

	ALIGN(16)
L(top3):add	%r9, (rp,n,8)
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r11, 8(rp,n,8)
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r13, 16(rp,n,8)
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rbx, 24(rp,n,8)
	adc	%rax, %r9
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	add	$4, n
	jnz	L(top3)

	inc	un
	add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	%r13, 16(rp)
	adc	%rbx, 24(rp)
	adc	$0, %rax
	mov	%rax, 32(rp)
	jmp	L(f2)


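C Wind-down: with only a few limbs left, finish without looping.  The
C pending row is added into rp, then the remaining diagonal squares and
C doubled cross products of the last limbs are accumulated straight-line.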
L(xit3):add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	16(rp), %r13
	adc	24(rp), %rbx
L(m):	adc	$0, %rax
	mov	%rax, 32(rp)
	mov	-24(up), %rdx		C FIXME: CSE
	mov	-32(up), %r9		C FIXME: CSE
	sar	$63, %r9
	and	%rdx, %r9
	add	%r13, %r9
	mulx(	%rdx, %rax, %r10)
	mov	-16(up), %r8		C FIXME: CSE
	adc	$0, %r10
	add	%rax, %r9
	adc	$0, %r10
	mov	%r9, 16(rp)
	mov	-32(up), %rax
	shl	%rax
	adc	%rdx, %rdx
	mulx(	%r8, %r13, %r12)
	mulx(	-8,(up), %r11, %rax)	C FIXME: CSE
	add	%r10, %r13
	adc	%r12, %r11
	adc	$0, %rax
	add	%rbx, %r13
	mov	%r13, 24(rp)
	adc	32(rp), %r11
	adc	$0, %rax
	mov	-16(up), %rdx		C FIXME: CSE
	mov	-8(up), %r8		C FIXME: CSE
	mov	-24(up), %r9
	sar	$63, %r9
	and	%rdx, %r9
	add	%r11, %r9
	mulx(	%rdx, %rbp, %r10)
	adc	$0, %r10
	add	%rbp, %r9
	adc	$0, %r10
	mov	%r9, 32(rp)
	mov	-24(up), %rbp
	shl	%rbp
	adc	%rdx, %rdx
	mulx(	%r8, %rbx, %rbp)
	add	%r10, %rbx
	adc	$0, %rbp
	adc	%rbx, %rax
	mov	%rax, 40(rp)
	adc	$0, %rbp
	mov	-8(up), %rdx		C FIXME: CSE
	mov	-16(up), %r9		C FIXME: CSE
	sar	$63, %r9
	and	%rdx, %r9
	add	%rbp, %r9
	mulx(	%rdx, %rbp, %r10)
	adc	$0, %r10
	add	%rbp, %r9
	adc	$0, %r10
	mov	%r9, 48(rp)
	mov	%r10, 56(rp)

	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
C	pop	%r14
	pop	%r15

	FUNC_EXIT()
	ret
EPILOGUE()
