1dnl  IA-64 mpn_sqr_diagonal.  Helper for sqr_basecase.
2
3dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C         cycles/limb
23C Itanium:    4
24C Itanium 2:  2
25
26C TODO
27C  * Perhaps avoid ctop loop.  Unfortunately, a cloop loop running at 1 c/l
28C    would need prohibitive 8-way unrolling.
29C  * Instead of messing too much with this, write a nifty mpn_sqr_basecase.
30
31C INPUT PARAMETERS
32C rp = r32
33C sp = r33
34C n = r34
35
C mpn_sqr_diagonal -- square every source limb into a two-limb product.
C
C Register arguments are the ones listed under INPUT PARAMETERS: rp = r32,
C sp = r33, n = r34.  Each limb loaded through r33 is multiplied by itself.
C The low half of the product is stored through r32 and the high half
C through r18 = r32 + 8, both pointers advancing 16 bytes per limb, so the
C two halves end up interleaved in the destination.
C
C The loop is a rotating-register br.ctop loop with three predicated stages:
C   p16  load one limb into f32
C   p19  multiply, reading the limb as f35, writing f36 (low) and f40 (high)
C   p21  store, reading the products as f38 (low) and f42 (high)
C
C ar.lc, ar.ec and pr are copied to r2, r14 and r15 on entry and written
C back before the return.
C
C NOTE(review): n = 0 would put -1 into ar.lc.  Presumably callers always
C pass n >= 1 -- confirm against the callers.
36ASM_START()
37PROLOGUE(mpn_sqr_diagonal)
38	.prologue
39	.save	ar.lc, r2
40	.save	pr, r15
41	.body
C With HAVE_ABI_32 defined, both pointers go through addp4 and n through
C zxt4 before they are used.
42ifdef(`HAVE_ABI_32',
43`	addp4	r32 = 0, r32
44	addp4	r33 = 0, r33
45	zxt4	r34 = r34
46	;;
47')
48	ldf8		f32 = [r33], 8		C M	load sp[0] early
49	mov		r2 = ar.lc		C I0	save ar.lc
50	mov		r14 = ar.ec		C I0	save ar.ec
51	mov		r15 = pr		C I0	save predicates
52	add		r19 = -1, r34		C M I	decr n
53	add		r18 = 8, r32		C M I	rp for high limb
54	;;
55	mov		ar.lc = r19		C I0	loop count = n - 1
56	mov		ar.ec = 5		C I0	epilogue count
57	mov		pr.rot = 1<<16		C I0	only p16 set among rotating predicates
58	;;
C The limb loaded above sits in f32 with no loop iteration behind it yet.
C NOTE(review): this br.cexit presumably serves as the rotation step for that
C first load, and is hinted not taken (spnt) -- confirm against the IA-64
C branch semantics.
59	br.cexit.spnt	.Ldone			C B
60	;;
61	ALIGN(32)
C Pipelined loop body: one load, two multiplies and two stores per pass,
C each enabled by its own stage predicate.
62.Loop:
63  (p16)	ldf8		f32 = [r33], 8		C M	next source limb
64  (p19)	xma.l		f36 = f35, f35, f0	C F	low half of limb squared
65  (p21)	stf8		[r32] = f38, 16		C M2 M3	store low half
66  (p19)	xma.hu		f40 = f35, f35, f0	C F	high half of limb squared
67  (p21)	stf8		[r18] = f42, 16		C M2 M3	store high half
68	br.ctop.dptk	.Loop			C B
69	;;
C Unpredicated stores of f38 and f42 without post-increment.
C NOTE(review): presumably these are the products of the last limb, which the
C loop leaves in the rotating registers when it exits -- confirm.
70.Ldone:
71	stf8		[r32] = f38		C M2 M3
72	stf8		[r18] = f42		C M2 M3
73	mov		ar.ec = r14		C I0	restore ar.ec
74	;;
75	mov		pr = r15, 0x1ffff	C I0	restore predicates
76	mov		ar.lc = r2		C I0	restore ar.lc
77	br.ret.sptk.many b0			C B
78EPILOGUE(mpn_sqr_diagonal)
79ASM_END()
80