sqr_diag_addlsh1.asm revision 1.1.1.1
1dnl  IA-64 mpn_sqr_diag_addlsh1
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C           cycles/limb
25C Itanium:      ?
26C Itanium 2:    2	Unrolling could bring it to 1.5 + epsilon
27
28C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
29C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
30C few cycles better since we can mitigate the many I0 instructions.
31C
32C 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
33C -  20  22  24  26  28  30  32  34  36  38  40  42  44  46  48  50  52  54  56 Needs updating
34C -  13  16  17  18  20  21  23  25  26  30  31  31  33  34  36  38  39  42  43
35
36C We should keep in mind that this code takes linear time in an O(n^2) context
37C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
38C around 60.  Keeping overhead down for smallish operands (< 10) is more
39C important than optimal cycle counts.
40
41C TODO
42C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
43C    p-registers.
44C  * Optimise by doing first two loop iterations in function header.
45
46C INPUT PARAMETERS
47define(`rp_param', `r32')  define(`rp', `r14')		C size: 2n
48define(`tp_param', `r33')  define(`tp', `r15')		C size: 2n - 2
49define(`up_param', `r34')  define(`up', `r31')		C size: n
50define(`n',  `r35')
51
52
53ASM_START()
54PROLOGUE(mpn_sqr_diag_addlsh1)
55
56	.prologue
57	.save	ar.pfs, r2
58	.save	ar.lc, r3
59	.body
60
61.mmi;		alloc	r2 = ar.pfs, 4,24,0,24	C			M
62		nop	4711
63		mov	r3 = ar.lc		C			I0
64.mmi;		mov	tp = tp_param		C			M I
65		mov	up = up_param		C			M I
66		mov	rp = rp_param		C			M I
67	;;
68.mmi;		ld8	r36 = [tp], 8		C			M
69		add	r20 = -2, n		C			M I
70		mov	r9 = ar.ec		C			I0
71	;;
72.mmi;		ld8	r32 = [tp], 8		C			M
73		mov	r16 = 0			C			M I
74		mov	ar.ec = 7		C			I0
75	;;
76.mmi;		nop	4711
77		mov	r44 = 0			C			M I
78		mov	ar.lc = r20		C			I0
79	;;
80.mii;		mov	r33 = 0
81		mov	r10 = pr		C			I0
82		mov	pr.rot = 0x30000	C			I0
83	;;
84		br.cexit.spnt.few.clr	L(end)
85
86dnl *** MAIN LOOP START ***
87	ALIGN(32)
88L(top):
89.mfi;	(p18)	ldf8	f33 = [up], 8		C			M
90	(p20)	xma.l	f36 = f35, f35, f42	C			F
91	(p41)	cmpequc	p50, p0 = -1, r44	C			M I
92.mfi;		setfsig	f40 = r16		C			M23
93	(p20)	xma.hu	f38 = f35, f35, f42	C			F
94	(p23)	add	r50 = r41, r49		C			M I
95	;;
96.mmi;	(p16)	ld8	r36 = [tp], 8		C			M
97	(p23)	cmpltu	p40, p0 = r50, r41	C cyout hi		M I
98	(p19)	shrp	r45 = r38, r35, 63	C non-critical		I0
99.mmi;	(p21)	getfsig	r39 = f39		C hi			M2
100	(p24)	st8	[rp] = r51, 8		C hi			M23
101	(p41)	add	r44 = 1, r44		C			M I
102	;;
103.mmi;	(p16)	ld8	r32 = [tp], 8		C			M
104	(p50)	cmpeqor	p40, p0 = -1, r50	C cyout hi		M I
105	(p17)	shrp	r16 = r33, r37, 63	C critical		I0
106.mmi;	(p21)	getfsig	r42 = f37		C lo			M2
107	(p23)	st8	[rp] = r44, 8		C lo			M23
108	(p50)	add	r50 = 1, r50		C			M I
109	;;
110		br.ctop.sptk.few.clr L(top)	C			B
111dnl *** MAIN LOOP END ***
112	;;
113L(end):
114.mmi;		nop	4711
115	(p41)	add	r44 = 1, r44		C			M I
116		shr.u	r48 = r39, 63		C			I0
117	;;
118.mmi;		st8	[rp] = r51, 8		C			M23
119	(p41)	cmpequc	p6, p0 = 0, r44		C			M I
120		add	r50 = r41, r48		C			M I
121	;;
122.mmi;		st8	[rp] = r44, 8		C			M23
123	(p6)	add	r50 = 1, r50		C			M I
124		mov	ar.lc = r3		C			I0
125	;;
126.mii;		st8	[rp] = r50		C			M23
127		mov	ar.ec = r9		C			I0
128		mov	pr = r10		C			I0
129	;;
130.mib;		nop	4711
131		mov	ar.pfs = r2		C			I0
132		br.ret.sptk.many b0		C			B
133EPILOGUE()
134