1dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
2
3dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C		norm	frac
23C ev4
24C ev5		70	70
25C ev6		29	29
26
C TODO
C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
C    any registers (thus save ~10 cycles per call).
C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
C    or two.
C  * Check cluster delays (for ev6).  We very likely could save some cycles.
C  * Use branch-free code for computing di.

C INPUT PARAMETERS
define(`qp',		`r16')
define(`fn',		`r17')
define(`up_param',	`r18')
define(`un_param',	`r19')
define(`dp',		`r20')

C REGISTER USE (after the setup code)
C  r9	low limb of the running 2-limb remainder (n1 in the loop comments)
C  r11	high limb of the running 2-limb remainder (n2 in the loop comments)
C  r10	d0, low divisor limb, loaded from 0(dp)
C  r12	d1, high divisor limb, loaded from 8(dp)
C  r13	pointer into the dividend, stepped down 8 bytes per limb consumed
C  r15	fn
C  r16	quotient store pointer, stepped down 8 bytes per iteration
C  r19	loop counter, runs from fn + un - 3 down to 0
C  r0	di, the value returned by mpn_invert_limb for d1, then adjusted
C
C STACK FRAME (80 bytes)
C   0	r26 (return address)
C   8	r9
C  16	r10
C  24	r11
C  32	r12
C  40	r13
C  48	loop counter (r19), spilled across the mpn_invert_limb call
C  56	r15
C  64	qp
C  72	most significant quotient limb; loaded into r0 as the return value
C
C The two remainder limbs are written back to the dividend area before
C returning (see L10 below).

ASM_START()
PROLOGUE(mpn_divrem_2)
	ldgp	r29, 0(r27)
	lda	r30, -80(r30)
	stq	r26, 0(r30)
	stq	r9, 8(r30)
	stq	r10, 16(r30)
	stq	r11, 24(r30)
	stq	r12, 32(r30)
	stq	r13, 40(r30)
	stq	r15, 56(r30)
	.prologue	1
	stq	r16, 64(r30)		C keep qp, r16 is reused as call argument
	bis	r31, r17, r15		C r15 = fn
	s8addq	r19, r18, r13		C r13 = up + 8*un
	lda	r13, -24(r13)		C r13 = address of up[un-3]
	ldq	r12, 8(r20)		C d1
	ldq	r10, 0(r20)		C d0
	ldq	r11, 16(r13)		C n2 = up[un-1]
	ldq	r9, 8(r13)		C n1 = up[un-2]

C Compare the top two dividend limbs with the divisor.  If they are not
C smaller, subtract the divisor once and record a most significant
C quotient limb of 1.
	bis	r31, r31, r3		C most_significant_q_limb = 0
	cmpult	r11, r12, r1
	bne	r1, L(L8)		C n2 < d1, nothing to subtract
	cmpule	r11, r12, r1
	cmpult	r9, r10, r2		C r2 = borrow for: n1 -= d0 (below)
	and	r1, r2, r1
	bne	r1, L(L8)		C n2 <= d1 and n1 < d0, nothing to subtract
	subq	r11, r12, r11		C n2 -= d1
	subq	r11, r2, r11		C n2 -= borrow
	subq	r9, r10, r9		C n1 -= d0
	lda	r3, 1(r31)		C most_significant_q_limb = 1
L(L8):	stq	r3, 72(r30)

	addq	r15, r19, r19
	lda	r19, -3(r19)		C r19 = fn + un - 3
	blt	r19, L(L10)		C no quotient limbs to develop
C r19 is an argument register and need not survive a call, so the loop
C counter is parked in the frame instead of trusting the callee to leave
C it alone.
	stq	r19, 48(r30)
	bis	r31, r12, r16		C argument: d1
	jsr	r26, mpn_invert_limb
	ldgp	r29, 0(r26)
	ldq	r19, 48(r30)		C restore loop counter
C Adjust di, the 1-limb inverse returned in r0, so that it also accounts
C for d0.
	mulq	r0, r12, r4		C t0 = LO(di * d1)
	umulh	r0, r10, r2		C s1 = HI(di * d0)
	addq	r4, r10, r4		C t0 += d0
	cmpule	r10, r4, r7		C 1 if t0 += d0 did NOT carry (d0 <= t0)
	addq	r4, r2, r4		C t0 += s1
	cmpult	r4, r2, r1		C cy for: t0 += s1
	subq	r1, r7, r7		C t1 = cy - nocarry (-1, 0, or 1)
	blt	r7, L(L42)
L(L22):
	lda	r0, -1(r0)		C di--
	cmpult	r4, r12, r1		C cy for: t0 -= d1 (below)
	subq	r7, r1, r7		C t1 -= cy
	subq	r4, r12, r4		C t0 -= d1
	bge	r7, L(L22)
L(L42):
	ldq	r16, 64(r30)		C reload qp
	s8addq	r19, r16, r16		C r16 = qp + 8*(fn + un - 3)
	ALIGN(16)
L(loop):
	mulq	r11, r0, r5		C q0 (early)
	umulh	r11, r0, r6		C q  (early)
	addq	r5, r9, r8		C q0 += n1
	addq	r6, r11, r6		C q  += n2
	cmpult	r8, r5, r1		C cy for: q0 += n1
	addq	r6, r1, r6		C q  += cy
	unop
	mulq	r12, r6, r1		C LO(d1 * q)
	umulh	r10, r6, r7		C t1 = HI(d0 * q)
	subq	r9, r1, r9		C n1 -= LO(d1 * q)
	mulq	r10, r6, r4		C t0 = LO(d0 * q)
	unop
C While r19 >= fn a limb is fetched from the dividend.  Otherwise the
C zero compare result left in r5 serves as n0.
	cmple	r15, r19, r5		C condition and n0...
	beq	r5, L(L31)
	ldq	r5, 0(r13)		C n0 = next dividend limb
	lda	r13, -8(r13)
L(L31):	subq	r9, r12, r9		C n1 -= d1
	cmpult	r5, r10, r1		C borrow for: n0 -= d0
	subq	r9, r1, r9		C n1 -= borrow
	subq	r5, r10, r5		C n0 -= d0
	subq	r9, r7, r9		C n1 -= t1
	cmpult	r5, r4, r1		C borrow for: n0 -= t0
	subq	r9, r1, r2		C n1 -= borrow (result in r2)
	subq	r5, r4, r5		C n0 -= t0
	cmpult	r2, r8, r1		C (n1 < q0)
	addq	r6, r1, r6		C q += cond
	lda	r1, -1(r1)		C mask = -(n1 >= q0)
	and	r1, r10, r4		C mask & d0
	addq	r5, r4, r9		C n0 += mask & d0
	and	r1, r12, r1		C mask & d1
	cmpult	r9, r5, r11		C cy for: n0 += mask & d0
	addq	r2, r1, r1		C n1 += mask & d1
	addq	r1, r11, r11		C n1 += cy
	cmpult	r11, r12, r1		C is the new high limb below d1?
	beq	r1, L(fix)		C if not, check for one more subtraction
L(bck):	stq	r6, 0(r16)		C store quotient limb
	lda	r16, -8(r16)
	lda	r19, -1(r19)
	bge	r19, L(loop)

C Write the two remainder limbs back and return the most significant
C quotient limb saved at 72(r30).
L(L10):	stq	r9, 8(r13)
	stq	r11, 16(r13)
	ldq	r0, 72(r30)
	ldq	r26, 0(r30)
	ldq	r9, 8(r30)
	ldq	r10, 16(r30)
	ldq	r11, 24(r30)
	ldq	r12, 32(r30)
	ldq	r13, 40(r30)
	ldq	r15, 56(r30)
	lda	r30, 80(r30)
	ret	r31, (r26), 1

C Reached when the high remainder limb is not below d1.  Unless the
C remainder is still smaller than the divisor (high limb <= d1 and low
C limb < d0), subtract the divisor once more and increment q.
L(fix):	cmpule	r11, r12, r1
	cmpult	r9, r10, r2		C r2 = borrow for the low limb subtraction
	and	r1, r2, r1
	bne	r1, L(bck)
	subq	r11, r12, r11
	subq	r11, r2, r11
	subq	r9, r10, r9
	lda	r6, 1(r6)		C q++
	br	L(bck)
EPILOGUE()
168