dnl  S/390-32 mpn_add_n and mpn_sub_n.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl
include(`../config.m4')

C            cycles/limb
C z900		 ?
C z990	      2.75-3		(fast for even n, slow for odd n)
C z9		 ?
C z10		 ?
C z196		 ?

C TODO
C  * Optimise for small n.
C  * Use r0 and save/restore one less register.
C  * Using the v1 inner-loop operand order of logops_n makes the loop about
C    20% faster, at the expense of highly alignment-dependent performance.

C INPUT PARAMETERS
C  Symbolic names for the four argument registers as they are on entry:
C    rp  destination pointer, advanced as limbs are stored
C    up  first source pointer, advanced as limbs are loaded
C    vp  second source pointer, the memory operand of the add/subtract
C    n   limb count; note that %r5 is reused for limb data once the loop
C        count has been derived from it
define(`rp',	`%r2')
define(`up',	`%r3')
define(`vp',	`%r4')
define(`n',	`%r5')
C
C Addition variant, selected when OPERATION_add_n is defined.
C   ADSB    add logical; used for the lowest limb, which has no carry in
C   ADSBC   add logical with carry; used for every higher limb
C   ADSBCR  register form of add with carry (appears unused in this file)
C   RETVAL  leaves the carry out of the top limb in %r2: lhi clears %r2
C           without touching the condition code, then alcr adds
C           0 + 0 + carry
C   func    name given to PROLOGUE; func_nc appears unused in this file
ifdef(`OPERATION_add_n', `
  define(ADSB,		al)
  define(ADSBCR,	alcr)
  define(ADSBC,		alc)
  define(RETVAL,`dnl
	lhi	%r2, 0
	alcr	%r2, %r2')
  define(func,		mpn_add_n)
  define(func_nc,	mpn_add_nc)')
C Subtraction variant, selected when OPERATION_sub_n is defined.
C   ADSB    subtract logical; used for the lowest limb, which has no
C           borrow in
C   ADSBC   subtract logical with borrow; used for every higher limb
C   ADSBCR  register form of subtract with borrow (appears unused in this
C           file)
C   RETVAL  leaves the borrow out of the top limb in %r2: slbr of %r2 with
C           itself gives 0 or -1 depending on the borrow, and lcr negates
C           that to 0 or 1
C   func    name given to PROLOGUE; func_nc appears unused in this file
ifdef(`OPERATION_sub_n', `
  define(ADSB,		sl)
  define(ADSBCR,	slbr)
  define(ADSBC,		slb)
  define(RETVAL,`dnl
	slbr	%r2, %r2
	lcr	%r2, %r2')
  define(func,		mpn_sub_n)
  define(func_nc,	mpn_sub_nc)')
C
C The two entry points this one source can produce.  Which of them is
C actually assembled depends on whether OPERATION_add_n or OPERATION_sub_n
C is defined when the file is processed.
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)

ASM_START()
C func -- limb-wise add (OPERATION_add_n) or subtract (OPERATION_sub_n).
C
C On entry:  rp (%r2) destination, up (%r3) first source, vp (%r4) second
C            source, n (%r5) number of 32-bit limbs.
C On exit:   %r2 = carry (add) or borrow (sub) out of the top limb, as
C            produced by RETVAL.
C Registers: %r1 = remaining pass count, %r5-%r8 = limb data (n is dead
C            once %r1 and %r7 have been derived from it).  %r6-%r8 are
C            saved at 24(%r15) and reloaded before returning via %r14.
C
C Method: (n + 3) / 4 passes are made in total.  The first pass handles
C n mod 4 limbs (a full four when n is a multiple of 4) and starts the
C carry chain with ADSB.  Every later pass is one trip through L(top),
C four limbs each, continuing the chain with ADSBC.  Between the arithmetic
C instructions only lm, stm, l, st, la, j and brct are used, none of which
C changes the condition code, so the carry survives from limb to limb and
C is still valid when RETVAL runs.
C
C NOTE: n = 0 is not supported.  It would take the L(b0) path, process
C four limbs, and brct would then wrap %r1 instead of ending the loop.
PROLOGUE(func)
	stm	%r6, %r8, 24(%r15)	C save the registers used for limb data

	ahi	n, 3
	lhi	%r7, 3
	lr	%r1, n
	srl	%r1, 2			C r1 = (n + 3) / 4, number of passes
	nr	%r7, n			C r7 = (n + 3) mod 4
	je	L(b1)			C r7 = 0, i.e. n = 1 (mod 4)
	chi	%r7, 2
	jl	L(b2)			C r7 = 1, i.e. n = 2 (mod 4)
	jne	L(b0)			C r7 = 3, i.e. n = 0 (mod 4)
					C r7 = 2, i.e. n = 3 (mod 4): fall through

C Lead-in for n = 3 (mod 4): three limbs, then the main loop if any remain.
L(b3):	lm	%r5, %r7, 0(up)
	la	up, 12(up)
	ADSB	%r5, 0(vp)
	ADSBC	%r6, 4(vp)
	ADSBC	%r7, 8(vp)
	la	vp, 12(vp)
	stm	%r5, %r7, 0(rp)
	la	rp, 12(rp)
	brct	%r1, L(top)
	j	L(end)

C Lead-in for n = 0 (mod 4): the first two limbs are done here, then the
C code joins the main loop halfway through at L(m0).
L(b0):	lm	%r5, %r8, 0(up)		C These extra insns are no mistake;
	la	up, 16(up)		C they are needed to make the main loop
	ADSB	%r5, 0(vp)		C run fast for n = 0 (mod 4).
	ADSBC	%r6, 4(vp)
	j	L(m0)

C Lead-in for n = 1 (mod 4): one limb.
L(b1):	l	%r5, 0(up)
	la	up, 4(up)
	ADSB	%r5, 0(vp)
	la	vp, 4(vp)
	st	%r5, 0(rp)
	la	rp, 4(rp)
	brct	%r1, L(top)
	j	L(end)

C Lead-in for n = 2 (mod 4): two limbs.
L(b2):	lm	%r5, %r6, 0(up)
	la	up, 8(up)
	ADSB	%r5, 0(vp)
	ADSBC	%r6, 4(vp)
	la	vp, 8(vp)
	stm	%r5, %r6, 0(rp)
	la	rp, 8(rp)
	brct	%r1, L(top)
	j	L(end)

C Main loop: four limbs per pass, carry chained in from the previous pass.
L(top):	lm	%r5, %r8, 0(up)
	la	up, 16(up)
	ADSBC	%r5, 0(vp)
	ADSBC	%r6, 4(vp)
L(m0):	ADSBC	%r7, 8(vp)
	ADSBC	%r8, 12(vp)
	la	vp, 16(vp)
	stm	%r5, %r8, 0(rp)
	la	rp, 16(rp)
	brct	%r1, L(top)		C decrement pass count, loop while nonzero

L(end):	RETVAL				C carry or borrow out goes to r2
	lm	%r6, %r8, 24(%r15)	C restore the saved registers
	br	%r14
EPILOGUE()