dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb	best
C AMD K8,K9	18
C AMD K10	18
C AMD bull
C AMD pile
C AMD bobcat
C AMD jaguar
C Intel P4	68
C Intel core	34
C Intel NHM	30.25
C Intel SBR	21.3
C Intel IBR	21.4
C Intel HWL	20.6
C Intel BWL
C Intel atom	73
C VIA nano	33


C INPUT PARAMETERS
C The five arguments, in the System V (STD64) integer argument registers.
C These names are documentation only: the function body below refers to the
C registers directly.
define(`qp',		`%rdi')
define(`fn',		`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`dp',		`%r8')

C Both ABIs are handled; the ABI-specific lines are wrapped in IFDOS/IFSTD.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C -----------------------------------------------------------------------------
C mpn_divrem_2 -- divide the un_param-limb number at up_param by the 2-limb
C divisor at dp, developing fn additional low quotient limbs from zero
C numerator limbs.  The file header states the divisor must be normalized.
C
C Arguments (STD64 registers):
C   qp       %rdi  quotient area; limbs qp[un+fn-3] down to qp[0] are written
C   fn       %rsi  number of low quotient limbs formed from zero limbs
C   up_param %rdx  numerator, read from up[un-1] downwards; on return up[1]
C                  and up[0] hold the high and low remainder limbs
C   un_param %rcx  numerator limb count
C   dp       %r8   divisor; dp[0] = d0 (low limb), dp[1] = d1 (high limb)
C
C Returns in %rax the most significant quotient limb (msl): 1 if the divisor
C was subtracted once from the two top numerator limbs before the loop,
C otherwise 0.
C
C Calls mpn_invert_limb once unless there are no quotient limbs to store.
C Saves and restores rbx, rbp, r12-r15.
C -----------------------------------------------------------------------------
PROLOGUE(mpn_divrem_2)
	FUNC_ENTRY(4)
C DOS64 build only: dp is loaded from the stack instead of arriving in r8.
IFDOS(`	mov	56(%rsp), %r8	')
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	lea	-24(%rdx,%rcx,8), %r12	C r12 = &up[un-3]; 16(%r12) is up[un-1]
	mov	%rsi, %r13		C r13 = fn
	push	%rbp
	mov	%rdi, %rbp		C rbp = qp
	push	%rbx
	mov	8(%r8), %r11		C d1
	mov	16(%r12), %rbx		C high numerator limb, up[un-1]
	mov	(%r8), %r8		C d0
	mov	8(%r12), %r10		C next numerator limb, up[un-2]

C Decide the most significant quotient limb.  If the two top numerator limbs
C <rbx,r10> are >= <d1,d0>, subtract the divisor once and set r15 = 1.
	xor	R32(%r15), R32(%r15)
	cmp	%rbx, %r11
	ja	L(2)			C d1 > high limb: nothing to subtract
	setb	%dl			C dl = (d1 < high limb)
	cmp	%r10, %r8
	setbe	%al			C al = (d0 <= low limb)
	orb	%al, %dl		C "orb" form to placate Sun tools
	je	L(2)			C high limbs equal and d0 > low limb
	inc	R32(%r15)
	sub	%r8, %r10
	sbb	%r11, %rbx
L(2):
	lea	-3(%rcx,%r13), %r14	C un + fn - 3
	test	%r14, %r14
	js	L(end)			C negative index: no quotient limbs to store

C Call mpn_invert_limb with d1 as its argument.  r8, r10 and r11 are volatile
C in both ABIs, so they are saved around the call; rbx, rbp and r12-r15 are
C callee-saved and survive it.  The 32 bytes reserved in the DOS64 build are
C the shadow space that ABI requires at every call.
C NOTE(review): counting the six pushes above plus these three, rsp looks
C 16-byte aligned here in the STD64 build, which would make the test below set
C ZF.  Confirm the sense of the nz condition against the ASSERT macro.
	push	%r8
	push	%r10
	push	%r11
IFSTD(`	mov	%r11, %rdi	')
IFDOS(`	mov	%r11, %rcx	')
IFDOS(`	sub	$32, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	pop	%r11
	pop	%r10
	pop	%r8

C Adjust the value v returned in rax so that it also accounts for d0.
C rcx:r9 is set to low(d1*v) + d0 + high(d0*v) - 2^64; while that quantity is
C non-negative, v is decremented and d1 is subtracted from it.
C NOTE(review): presumably this is the 3/2 inverse of Moller and Granlund;
C confirm against invert_pi1 in gmp-impl.h.
	mov	%r11, %rdx
	mov	%rax, %rdi		C rdi = v
	imul	%rax, %rdx		C rdx = low(d1 * v)
	mov	%rdx, %r9
	mul	%r8			C rdx:rax = v * d0
	xor	R32(%rcx), R32(%rcx)
	add	%r8, %r9
	adc	$-1, %rcx		C rcx = carry - 1
	add	%rdx, %r9
	adc	$0, %rcx
	js	2f
1:	dec	%rdi
	sub	%r11, %r9
	sbb	$0, %rcx
	jns	1b
2:

	lea	(%rbp,%r14,8), %rbp	C rbp = &qp[un+fn-3], first limb stored
	mov	%r11, %rsi
	neg	%rsi			C -d1

C Register roles inside the loop:
C   rdi  dinv, the adjusted inverse	rsi  -d1
C   r8   d0				r11  d1
C   rbx  high remainder limb (n2)	r10  low remainder limb
C   r9   quotient limb being formed	rcx  low limb of the quotient estimate
C   rbp  quotient store pointer		r12  numerator read pointer
C   r13  fn				r14  index of the current quotient limb
C   r15  msl, the return value		rax, rdx  scratch
C
C Each iteration forms <r9,rcx> = dinv * rbx + <rbx,r10>, takes r9 as the
C quotient estimate q, and computes the candidate remainder
C <rbx,r10> = <old r10, new limb> - q * <d1,d0> - <d1,d0>.  If the new rbx is
C below rcx, q is incremented; otherwise the divisor is added back.  A
C remainder still >= the divisor is corrected at L(fix).
C The trailing numbers in the comments are the original scheduling notes.

	ALIGN(16)
L(top):	mov	%rdi, %rax		C di		ncp
	mul	%rbx			C		0, 17
	mov	%r10, %rcx		C
	add	%rax, %rcx		C		4
	adc	%rbx, %rdx		C		5
	mov	%rdx, %r9		C q		6
	imul	%rsi, %rdx		C		6
	mov	%r8, %rax		C		ncp
	lea	(%rdx, %r10), %rbx	C n1 -= ...	10
	xor	R32(%r10), R32(%r10)	C zero limb, used for the fn low limbs
	mul	%r9			C		7
	cmp	%r14, %r13		C fn > index?
	jg	L(19)			C yes: keep the zero limb
	mov	(%r12), %r10		C no: fetch the next numerator limb
	sub	$8, %r12		C
L(19):	sub	%r8, %r10		C		ncp
	sbb	%r11, %rbx		C		11
	sub	%rax, %r10		C		11
	sbb	%rdx, %rbx		C		12
	xor	R32(%rax), R32(%rax)	C
	xor	R32(%rdx), R32(%rdx)	C
	cmp	%rcx, %rbx		C		13
	cmovnc	%r8, %rax		C		14
	cmovnc	%r11, %rdx		C		14
	adc	$0, %r9			C adjust q	14
	nop
	add	%rax, %r10		C		15
	adc	%rdx, %rbx		C		16
	cmp	%r11, %rbx		C high remainder limb >= d1?
	jae	L(fix)			C rare: one more correction may be needed
L(bck):	mov	%r9, (%rbp)		C store the quotient limb
	sub	$8, %rbp		C
	dec	%r14
	jns	L(top)

C r12 has stepped down to &up[-1] by now (it is also there when the loop is
C skipped), so these two stores write the remainder to up[0] and up[1].
L(end):	mov	%r10, 8(%r12)
	mov	%rbx, 16(%r12)
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	mov	%r15, %rax		C return msl
	pop	%r15
	FUNC_EXIT()
	ret

C Entered with the flags from comparing rbx against d1, and rbx >= d1.  If the
C remainder <rbx,r10> is >= <d1,d0>, bump the quotient limb and subtract the
C divisor once more.
L(fix):	seta	%dl			C dl = (rbx > d1)
	cmp	%r8, %r10
	setae	%al			C al = (r10 >= d0)
	orb	%dl, %al		C "orb" form to placate Sun tools
	je	L(bck)			C rbx == d1 and r10 < d0: already reduced
	inc	%r9
	sub	%r8, %r10
	sbb	%r11, %rbx
	jmp	L(bck)
EPILOGUE()
