div_qr_2n_pi1.asm revision 1.1.1.2
1156952Sumednl  x86-64 mpn_div_qr_2n_pi1
2156952Sumednl  -- Divide an mpn number by a normalized 2-limb number,
3156952Sumednl     using a single-limb inverse.
4156952Sume
5156952Sumednl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
6156952Sume
7156952Sumednl  This file is part of the GNU MP Library.
8156952Sumednl
9156952Sumednl  The GNU MP Library is free software; you can redistribute it and/or modify
10156952Sumednl  it under the terms of either:
11156952Sumednl
12156952Sumednl    * the GNU Lesser General Public License as published by the Free
13156952Sumednl      Software Foundation; either version 3 of the License, or (at your
14156952Sumednl      option) any later version.
15156952Sumednl
16156952Sumednl  or
17156952Sumednl
18156952Sumednl    * the GNU General Public License as published by the Free Software
19156952Sumednl      Foundation; either version 2 of the License, or (at your option) any
20156952Sumednl      later version.
21170242Sumednl
22156952Sumednl  or both in parallel, as here.
23156952Sumednl
24156952Sumednl  The GNU MP Library is distributed in the hope that it will be useful, but
25156952Sumednl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26156952Sumednl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27156952Sumednl  for more details.
28156952Sumednl
29156952Sumednl  You should have received copies of the GNU General Public License and the
30156952Sumednl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31156952Sumednl  see https://www.gnu.org/licenses/.
32156952Sume
33156952Sumeinclude(`../config.m4')
34156952Sume
35156952Sume
36156952SumeC		c/l
37156952SumeC INPUT PARAMETERS
C qp, rp, up_param, un, d1 and d0 name registers; di_param names a stack
C slot addressed relative to %rsp, so it must be read before anything is
C pushed.
C   qp        quotient limbs are stored through this pointer
C   rp        the two remainder limbs are stored at (rp) and 8(rp)
C   up_param  dividend pointer, copied to up on entry
C   un        dividend limb count, reused as the loop index
C   d1, d0    high and low limb of the divisor
C   di_param  stack slot holding the inverse, copied to di on entry
38156952Sumedefine(`qp',		`%rdi')
39156952Sumedefine(`rp',		`%rsi')
40156952Sumedefine(`up_param',	`%rdx')
41156952Sumedefine(`un',		`%rcx')
42156952Sumedefine(`d1',		`%r8')
43156952Sumedefine(`d0',		`%r9')
44156952Sumedefine(`di_param',	`8(%rsp)')
45156952Sume
C Working registers.  u2, u1, t1, t0 and md1 use registers that the function
C pushes on entry and pops again before returning; di and up do not.
C   di   inverse, loaded from di_param
C   up   dividend pointer, copied from up_param
C   u2   high limb of the running two-limb remainder
C   u1   low limb of the running two-limb remainder
C   t1   scratch; holds the quotient limb candidate q inside the loop
C   t0   scratch; holds q0 inside the loop
C   md1  -d1, computed once before the loop
46156952Sumedefine(`di',		`%r10')
47156952Sumedefine(`up',		`%r11')
48156952Sumedefine(`u2',		`%rbx')
49156952Sumedefine(`u1',		`%r12')
50156952Sumedefine(`t1',		`%r13')
51156952Sumedefine(`t0',		`%r14')
52156952Sumedefine(`md1',		`%r15')
53156952Sume
54156952SumeC TODO
55156952SumeC * Store qh in the same stack slot as di_param, instead of pushing
56156952SumeC   it. (we could put it in register %rbp, but then we would need to
57156952SumeC   save and restore that instead, which doesn't seem like a win).
58156952Sume
C mpn_div_qr_2n_pi1 (qp, rp, up, un, d1, d0, di)
C
C Divides the un-limb number at up by the two-limb divisor d1:d0, using the
C precomputed inverse di.
C   * un-2 quotient limbs are stored at qp[un-3] down to qp[0];
C   * the two-limb remainder is stored at rp[0] (low) and rp[1] (high);
C   * the top quotient limb qh, which is 0 or 1, is returned in %rax.
C
C No argument checking is done.  up[un-1] and up[un-2] are read
C unconditionally, so un must be at least 2.
C NOTE(review): the file header calls the divisor normalized and di a
C single-limb inverse; presumably d1 has its top bit set and di is the
C matching 3/2 inverse -- confirm against the callers.
C
C In the code visible here %rax, %rdx, %rcx (un), %r10 (di), %r11 (up) and
C the flags are modified and not restored; %rbx and %r12-%r15 are saved and
C restored.
C
C ABI_SUPPORT, FUNC_ENTRY, FUNC_EXIT and IFDOS are defined outside this
C file.  NOTE(review): presumably they provide the DOS64/STD64 calling
C convention glue -- confirm in the included m4 definitions.
59156952SumeABI_SUPPORT(DOS64)
60156952SumeABI_SUPPORT(STD64)
61156952Sume
62156952SumeASM_START()
63156952Sume	TEXT
64156952Sume	ALIGN(16)
65156952SumePROLOGUE(mpn_div_qr_2n_pi1)
66156952Sume	FUNC_ENTRY(4)
	C IFDOS variants only: load d1 and d0 from the stack and move the
	C di_param slot to 72(%rsp).
67156952SumeIFDOS(`	mov	56(%rsp), %r8	')
68156952SumeIFDOS(`	mov	64(%rsp), %r9	')
69156952SumeIFDOS(`define(`di_param', `72(%rsp)')')
	C di_param is addressed relative to %rsp, so di is fetched before the
	C pushes below change %rsp.
70156952Sume	mov	di_param, di
71156952Sume	mov	up_param, up
	C Save the registers that back md1, t0, t1, u1 and u2.
72156952Sume	push	%r15
73156952Sume	push	%r14
74156952Sume	push	%r13
75156952Sume	push	%r12
76156952Sume	push	%rbx
77156952Sume
	C Start with the two most significant dividend limbs as u2:u1.
78156952Sume	mov	-16(up, un, 8), u1
79156952Sume	mov	-8(up, un, 8), u2
80156952Sume
	C t1:t0 = u2:u1 - d1:d0.  When the subtraction does not borrow, the
	C difference replaces u2:u1; otherwise u2:u1 is left unchanged.
	C cmovnc does not alter the flags, so the borrow from sbb is still in
	C CF when qh is computed below.
81156952Sume	mov	u1, t0
82156952Sume	mov	u2, t1
83156952Sume	sub	d0, t0
84156952Sume	sbb	d1, t1
85156952Sume	cmovnc  t0, u1
86156952Sume	cmovnc	t1, u2
87156952Sume	C push qh which is !carry
	C sbb yields %rax = -CF, and inc turns that into 1 - CF.
88156952Sume	sbb	%rax, %rax
89156952Sume	inc	%rax
90156952Sume	push	%rax
	C un -= 2: the top two limbs are already held in u2:u1.
91156952Sume	lea	-2(un), un
	C md1 = -d1, so that the imul in the loop yields -q*d1 directly.
92156952Sume	mov	d1, md1
93156952Sume	neg	md1
94156952Sume
	C Enter at the loop test, so un == 2 produces no quotient limbs.
95156952Sume	jmp	L(next)
96156952Sume
97156952Sume	ALIGN(16)
98156952SumeL(loop):
99156952Sume	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
100156952Sume	C Based on the optimized divrem_2.asm code.
101156952Sume
	C %rdx:%rax = di * u2
102156952Sume	mov	di, %rax
103156952Sume	mul	u2
	C t1:t0 = %rdx:%rax + u2:u1, giving the candidate q and the limb q0.
104156952Sume	mov	u1, t0
105156952Sume	add	%rax, t0	C q0 in t0
106156952Sume	adc	u2, %rdx
107156952Sume	mov	%rdx, t1	C q in t1
	C u2 = u1 - q*d1 modulo 2^64, formed as u1 + q*md1.  The load of d0
	C into %rax sets up the following mul.
108156952Sume	imul	md1, %rdx
109156952Sume	mov	d0, %rax
110156952Sume	lea	(%rdx, u1), u2
	C %rdx:%rax = q * d0
111156952Sume	mul	t1
	C Bring in the next lower dividend limb as the new u1.
112156952Sume	mov	(up, un, 8), u1
	C u2:u1 -= d1:d0, then u2:u1 -= q*d0.
113156952Sume	sub	d0, u1
114156952Sume	sbb	d1, u2
115156952Sume	sub	%rax, u1
116156952Sume	sbb	%rdx, u2
	C Adjustment step.  cmp sets CF = (u2 < q0).
	C   CF set:   q += 1 and zero is added to u2:u1.
	C   CF clear: q is kept and d1:d0 is added back to u2:u1.
	C The xors zero the add-back value before the cmp; cmovnc does not
	C alter the flags, so adc still sees CF from the cmp.
117156952Sume	xor	R32(%rax), R32(%rax)
118156952Sume	xor	R32(%rdx), R32(%rdx)
119156952Sume	cmp	t0, u2
120156952Sume	cmovnc	d0, %rax
121156952Sume	cmovnc	d1, %rdx
122156952Sume	adc	$0, t1
	C NOTE(review): presumably padding for alignment or scheduling --
	C confirm before removing.
123156952Sume	nop
124156952Sume	add	%rax, u1
125156952Sume	adc	%rdx, u2
	C u2 >= d1 means u2:u1 may still be >= d1:d0; that case is handled
	C out of line at L(fix), which jumps back to L(bck).
126156952Sume	cmp	d1, u2
127156952Sume	jae	L(fix)
128156952SumeL(bck):
	C Store the quotient limb for the current position.
129156952Sume	mov	t1, (qp, un, 8)
130156952SumeL(next):
	C Step un down by one.  jnc loops while the subtraction did not
	C borrow, that is while un was nonzero before it.
131156952Sume	sub	$1, un
132156952Sume	jnc	L(loop)
133156952SumeL(end):
	C Store the final two-limb remainder.
134156952Sume	mov	u2, 8(rp)
135156952Sume	mov	u1, (rp)
136156952Sume
137156952Sume	C qh on stack
138156952Sume	pop	%rax
139156952Sume
	C Restore the saved registers in reverse push order.
140156952Sume	pop	%rbx
141156952Sume	pop	%r12
142156952Sume	pop	%r13
143156952Sume	pop	%r14
144156952Sume	pop	%r15
145156952Sume	FUNC_EXIT()
146156952Sume	ret
147156952Sume
148156952SumeL(fix):	C Unlikely update. u2 >= d1
	C Entered with the flags from cmp d1, u2.
	C   %dl = (u2 > d1), %al = (u1 >= d0)
	C Because u2 >= d1 here, u2:u1 >= d1:d0 exactly when %dl or %al is set.
149156952Sume	seta	%dl
150156952Sume	cmp	d0, u1
151156952Sume	setae	%al
152156952Sume	orb	%dl, %al		C "orb" form to placate Sun tools
	C ZF from orb means both bytes were zero: no correction is needed.
153156952Sume	je	L(bck)
	C Otherwise apply one more correction: q += 1, u2:u1 -= d1:d0.
154156952Sume	inc	t1
155156952Sume	sub	d0, u1
156156952Sume	sbb	d1, u2
157156952Sume	jmp	L(bck)
158156952SumeEPILOGUE()
159156952Sume