dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	43

C A quick adaptation of the 32-bit K7 code.


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

C ABI variants this file is declared to support.  The macros come from
C config.m4; NOTE(review): their exact expansion is not visible here.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)			C alignment of the function entry that follows
C mpn_divexact_1 -- divide the n-limb operand at up by a single limb and
C store n quotient limbs at rp, working from the least significant limb up.
C
C Register arguments as used below (after FUNC_ENTRY):
C   rdi  rp       destination, n limbs written
C   rsi  up       source, n limbs read
C   rdx  n        limb count
C   rcx  divisor
C
C Method, as implemented here:
C   1. If the divisor is even, bsf counts its trailing zero bits into rcx
C      and they are shifted out, leaving an odd d.  The same shift is later
C      applied to the source limbs (shrd / shr by cl).
C   2. inv = d^(-1) mod 2^64 is built from an 8-bit lookup in
C      binvert_limb_table followed by three steps inv = 2*inv - inv*inv*d,
C      giving 16, 32 and then 64 correct bits.
C   3. Each quotient limb is q = (s - carry bit - carry limb) * inv mod 2^64,
C      where s is the shifted source limb and the carry limb is the high
C      half of the previous q * d.
C
C Preconditions that are NOT checked:
C   - n >= 1: up[0] is loaded unconditionally and the n == 1 test is an
C     inc/jz on the negated count.
C   - divisor != 0: the result of bsf is undefined for a zero source.
C   - NOTE(review): as the name says, the division is presumably required
C     to be exact; no remainder is produced or tested here.
C
C rbx is saved and restored with push/pop; the other registers written are
C rax, rcx, rdx, rsi, rdi, r8, r9, r10 and r11.
C FUNC_ENTRY/FUNC_EXIT come from config.m4; NOTE(review): presumably they
C adapt the DOS64 calling convention to the register layout above -- confirm
C against their definition.

PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx			C rbx is scratch below, restored before ret

	mov	%rcx, %rax		C rax = divisor
	xor	R32(%rcx), R32(%rcx)	C shift count, stays 0 for an odd divisor
	mov	%rdx, %r8		C r8 = n, frees rdx for scratch use

	bt	$0, R32(%rax)		C CF = low bit of divisor
	jnc	L(evn)			C skip bsf unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = d (odd at this point)
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits, used as table index

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos, multiplier in the loop

	C Three refinement steps, each doubling the number of valid low bits.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	C Point rsi just past the source and rdi at the last destination limb,
	C then index both with a negative counter that runs up to zero.
	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end (address of rp[n-1])
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8			C r8 = -(n-1)
	jz	L(one)			C n == 1, no neighbouring limb to shift in

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax	C rax = low limb of (up[1]:up[0]) >> shift

	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
	jmp	L(ent)			C first limb has no carry limb either

L(evn):	bsf	%rax, %rcx		C rcx = number of trailing zero bits
	shr	R8(%rcx), %rax		C make the divisor odd
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q, the quotient limb just stored
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx	carry limb once the mul below has executed
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C current source limb
	mov	(%rsi,%r8,8), %r9	C next higher source limb
	shrd	R8(%rcx), %r9, %rax	C shifted source limb
	nop				C
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C borrow out of that subtraction
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C add borrow of the second sub	6
L(ent):	imul	%r10, %rax		C q = limb * inv	6
	mov	%rax, (%rdi,%r8,8)	C store quotient limb
	inc	%r8			C
	jnz	L(top)

	C Most significant limb: nothing above it, so a plain shr replaces shrd.
	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)		C rp[n-1]
	pop	%rbx
	FUNC_EXIT()
	ret

	C Single-limb operand: rax holds up[0], rdi already addresses rp[0].
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()