dnl  AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
C AMD K8,K9	 ?		 ?			 ?
C AMD K10	 ?		 ?			 ?
C AMD bull	 ?		 ?			 ?
C AMD pile	 ?		 ?			 ?
C AMD steam	 ?		 ?			 ?
C AMD bobcat	 ?		 ?			 ?
C AMD jaguar	 ?		 ?			 ?
C Intel P4	 ?		 ?			 ?
C Intel core	 ?		 ?			 ?
C Intel NHM	 ?		 ?			 ?
C Intel SBR	 2.57		 2.93			 3.0
C Intel IBR	 2.35		 2.66			 3.0
C Intel HWL	 2.02		 2.5			 2.5
C Intel BWL	 ?		 ?			 ?
C Intel atom	 ?		 ?			 ?
C VIA nano	 ?		 ?			 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
C that the sqr_diag_addlsh1 loop was manually written.
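
C Schematically, these loops split the squaring into two stages: the
C off-diagonal product triangle is accumulated into rp (mul_2 handles rows 0-1,
C addmul_2 the later rows), and a final pass doubles that triangle and adds the
C diagonal squares.  A rough C model of the arithmetic, for illustration only
C (ad hoc names; assumes 64-bit limbs, B = 2^64, and a compiler providing
C unsigned __int128):
C
C	typedef unsigned long long limb;
C	typedef unsigned __int128 dlimb;
C
C	/* r[0..2n-1] = square of u[0..n-1], n >= 1 */
C	void sqr_model (limb *r, const limb *u, int n)
C	{
C	  int i, j;
C	  for (i = 0; i < 2*n; i++) r[i] = 0;
C
C	  /* Triangle T = sum_{i<j} u[i]*u[j]*B^(i+j), built row by row.  */
C	  for (i = 0; i < n; i++) {
C	    limb cy = 0;
C	    for (j = i + 1; j < n; j++) {
C	      dlimb t = (dlimb) u[i] * u[j] + r[i+j] + cy;
C	      r[i+j] = (limb) t;
C	      cy = (limb) (t >> 64);
C	    }
C	    r[i+n] = cy;
C	  }
C
C	  /* sqr_diag_addlsh1 stage: r = 2*T + sum_i u[i]^2 * B^(2i).  */
C	  limb cs = 0, ca = 0;
C	  for (i = 0; i < n; i++) {
C	    dlimb sq = (dlimb) u[i] * u[i];
C	    limb lo = r[2*i], hi = r[2*i+1];
C	    limb dlo = (lo << 1) | cs, dhi = (hi << 1) | (lo >> 63);
C	    cs = hi >> 63;
C	    dlimb s = (dlimb) dlo + (limb) sq + ca;
C	    dlimb t = (dlimb) dhi + (limb) (sq >> 64) + (limb) (s >> 64);
C	    r[2*i] = (limb) s;
C	    r[2*i+1] = (limb) t;
C	    ca = (limb) (t >> 64);
C	  }
C	}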

C TODO
C  * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
C  * Streamline pointer updates.
C  * Perhaps suppress a few more xor insns in feed-in code.
C  * Make sure we write no dead registers in feed-in code.
C  * We might use 32-bit size ops, since n >= 2^32 is non-terminating.  Watch
C    out for negative sizes being zero-extended, though.
C  * The straight-line code for n <= 3 comes from the K8 code, and might be
C    quite sub-optimal here.  Write specific code, and add code for n = 4.
C  * The mul_2 loop has a 10-insn common sequence in the loop start and the
C    wind-down code.  Try re-rolling it.
C  * This file has been the subject of only basic micro-optimisation.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
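C The I(fast,safe) macro selects its first argument, so e.g.
C I(-8(rp),-8(rp,n,8)) normally expands to the plain -8(rp) form; defining I
C as `$2' instead keeps the conservative indexed form -8(rp,n,8).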

define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`un_param',`%rdx')


ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$2, un_param
	jae	L(gt1)

	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1):	jne	L(gt2)

	mov	(up), %rax
	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

L(gt2):	cmp	$4, un_param
	jae	L(gt3)
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w2', `%r11')

	mov	(up), %rax
	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret

L(gt3):

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w1', `%r11')
define(`w2', `%rbx')
define(`w3', `%rbp')
define(`un', `%r12')
define(`n',  `%rcx')

define(`X0', `%r13')
define(`X1', `%r14')

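C mul_2 phase: form the first two rows of the off-diagonal product triangle,
C i.e. the products of v0 = u[0] and v1 = u[1] with the limbs above them,
C written to rp starting at rp[1].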
L(do_mul_2):
	mov	(up), v0
	push	%rbx
	lea	(rp,un_param,8), rp	C point rp at R[un]
	mov	8(up), %rax
	push	%rbp
	lea	(up,un_param,8), up	C point up right after U's end
	mov	%rax, v1
	push	%r12
	mov	$1, R32(un)		C free up rdx
	push	%r13
	sub	un_param, un
	push	%r14
	push	un
	mul	v0
	mov	%rax, (rp,un,8)
	mov	8(up,un,8), %rax
	test	$1, R8(un)
	jnz	L(m2b1)

L(m2b0):lea	2(un), n
	xor	R32(w1), R32(w1)	C FIXME
	xor	R32(w2), R32(w2)	C FIXME
	mov	%rdx, w0
	jmp	L(m2l0)

L(m2b1):lea	1(un), n
	xor	R32(w3), R32(w3)	C FIXME
	xor	R32(w0), R32(w0)	C FIXME
	mov	%rdx, w2
	jmp	L(m2l1)

	ALIGN(32)
L(m2tp):
L(m2l0):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, -8(rp,n,8)
	mov	%rdx, w0
	adc	$0, w0
	mov	(up,n,8), %rax
L(m2l1):mul	v0
	add	%rax, w2
	mov	%rdx, w1
	adc	$0, w1
	add	w3, w2
	mov	(up,n,8), %rax
	adc	$0, w1
	mul	v1
	mov	w2, (rp,n,8)
	add	%rax, w0
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	adc	$0, w2
	add	$2, n
	jnc	L(m2tp)

L(m2ed):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))
	adc	$0, %rdx
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))
	adc	$0, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	add	$2, un			C decrease |un|

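C addmul_2 phase: for each further limb pair {v0,v1}, accumulate its products
C with the limbs above it into rp, two triangle rows per outer iteration,
C until only a small corner remains.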
L(do_addmul_2):
L(outer):
	lea	16(rp), rp
	cmp	$-2, R32(un)		C jump if un in {-1,0}  FIXME jump if un in {-2,1}
	jge	L(corner)		C FIXME: move to before the lea above

	mov	-8(up,un,8), v0
	mov	(up,un,8), %rax
	mov	%rax, v1
	mul	v0
	test	$1, R8(un)
	jnz	L(a1x1)

L(a1x0):mov	(rp,un,8), X0
	xor	w0, w0
	mov	8(rp,un,8), X1
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	xor	w2, w2
	mov	X0, (rp,un,8)
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jnz	L(a110)

L(a100):lea	2(un), n		C un = 4, 8, 12, ...
	jmp	L(lo0)

L(a110):lea	(un), n			C un = 2, 6, 10, ...
	jmp	L(lo2)

L(a1x1):mov	(rp,un,8), X1
	xor	w2, w2
	mov	8(rp,un,8), X0
	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	xor	w0, w0
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jz	L(a111)

L(a101):lea	3(un), n		C un = 1, 5, 9, ...
	jmp	L(lo1)

L(a111):lea	1(un), n		C un = 3, 7, 11, ...
	jmp	L(lo3)

	ALIGN(32)
L(top):	mul	v1
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	adc	$0, w3
	add	w2, X0
	adc	$0, w0
	mov	-16(up,n,8), %rax
L(lo1):	mul	v0
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	mov	-16(up,n,8), %rax
	mul	v1
	mov	X1, -24(rp,n,8)
	mov	-8(rp,n,8), X1
	add	w3, X0
	adc	$0, w1
	mov	%rdx, w2
	mov	X0, -16(rp,n,8)
	add	%rax, X1
	adc	$0, w2
	mov	-8(up,n,8), %rax
	add	w0, X1
	adc	$0, w2
L(lo0):	mul	v0
	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, X1
	mov	(rp,n,8), X0
	adc	$0, w3
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	mov	(up,n,8), %rax
L(lo3):	mul	v0
	add	w2, X0
	mov	X1, -8(rp,n,8)
	mov	%rdx, w1
	adc	$0, w0
	add	%rax, X0
	adc	$0, w1
	mov	(up,n,8), %rax
	add	w3, X0
	adc	$0, w1
	mul	v1
	mov	8(rp,n,8), X1
	add	%rax, X1
	mov	%rdx, w2
	adc	$0, w2
	mov	8(up,n,8), %rax
	mov	X0, (rp,n,8)
L(lo2):	mul	v0
	add	w0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	8(up,n,8), %rax
	mov	16(rp,n,8), X0
	adc	$0, w3
	add	$4, n
	jnc	L(top)

L(end):	mul	v1
	add	w1, X1
	adc	$0, w3
	add	w2, %rax
	adc	$0, %rdx
	mov	X1, I(-8(rp),-24(rp,n,8))
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, I((rp),-16(rp,n,8))
	mov	%rdx, I(8(rp),-8(rp,n,8))

	add	$2, un			C decrease |un|
	jmp	L(outer)		C loop until a small corner remains

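C Corner: the last one or two rows of the triangle, involving only the top
C few limbs of U, are finished with straight-line code rather than another
C addmul_2 pass.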
L(corner):
	pop	n
	jg	L(small_corner)

	lea	8(rp), rp
	mov	-24(up), v0
	mov	-16(up), %rax
	mov	%rax, v1
	mul	v0
	mov	-24(rp), X0
	mov	-16(rp), X1
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	xor	w2, w2
	mov	X0, -24(rp)
	mov	-8(up), %rax
	mul	v0
	add	$0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	-8(up), %rax
	adc	$0, w3
	mul	v1
	add	w1, X1
	adc	$0, w3
	add	w2, %rax
	adc	$0, %rdx
	mov	X1, -16(rp)
	jmp	L(com)

L(small_corner):
	mov	-8(rp), w3
	mov	-16(up), v0
	mov	-8(up), %rax
	mul	v0
L(com):	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, -8(rp)
	mov	%rdx, (rp)

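C Final pass: combine the doubled off-diagonal triangle with the diagonal
C squares, i.e. rp = 2*rp + sum u[i]^2*B^(2i), fusing the left shift and the
C additions into one sweep over rp; %rbx holds the shifted-out carry bit.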
L(sqr_diag_addlsh1):
	mov	-8(up,n,8), %rax
	shl	n
	mul	%rax
	mov	%rax, (rp,n,8)

	xor	R32(%rbx), R32(%rbx)
	mov	8(rp,n,8), %r8
	mov	16(rp,n,8), %r9
	jmp	L(dm)

	ALIGN(32)
L(dtop):add	%r8, %r10
	adc	%r9, %rax
	mov	8(rp,n,8), %r8
	mov	16(rp,n,8), %r9
	mov	%r10, -8(rp,n,8)
	mov	%rax, (rp,n,8)
L(dm):	adc	%r8, %r8
	adc	%r9, %r9
	mov	(up,n,4), %rax
	lea	(%rdx,%rbx), %r10
	setc	R8(%rbx)
	mul	%rax
	add	$2, n
	js	L(dtop)

L(dend):add	%r8, %r10
	adc	%r9, %rax
	mov	%r10, I(-8(rp),-8(rp,n,8))
	mov	%rax, I((rp),(rp,n,8))
	adc	%rbx, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()