dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 1.0
C AMD K10	 1.12
C Intel P4	 3.25
C Intel core2	 1.5
C Intel corei	 1.5
C Intel atom	 2.5
C VIA nano	 1.75


C INPUT PARAMETERS
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)

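C The return value is a limb congruent to {ap,n} modulo 2^48-1, but not
C fully reduced.  The C sketch below is illustrative only; it is not the
C code GMP ships, and the name mod_34lsub1_ref is made up.  It assumes
C 64-bit limbs and the standard uint64_t and size_t types.  Since
C 2^64 == 2^16, 2^128 == 2^32 and 2^192 == 1 (mod 2^48-1), the limbs are
C summed in three rotating phases and the phase sums are then split and
C folded back together with the matching shifts.  The phases and separate
C carry counters mirror the three accumulators and three counters used in
C the loop below.
C
C	uint64_t
C	mod_34lsub1_ref (const uint64_t *ap, size_t n)	/* n >= 1 */
C	{
C	  uint64_t a0 = 0, a1 = 0, a2 = 0;	/* phase sums     */
C	  uint64_t c0 = 0, c1 = 0, c2 = 0;	/* their carries  */
C	  size_t i = 0;
C
C	  for (; i + 3 <= n; i += 3)
C	    {
C	      a0 += ap[i+0];  c0 += a0 < ap[i+0];
C	      a1 += ap[i+1];  c1 += a1 < ap[i+1];
C	      a2 += ap[i+2];  c2 += a2 < ap[i+2];
C	    }
C	  if (i < n) { a0 += ap[i]; c0 += a0 < ap[i]; i++; }
C	  if (i < n) { a1 += ap[i]; c1 += a1 < ap[i]; }
C
C	  /* a0, a1, a2 have weights 1, 2^16, 2^32; a carry out of
C	     a0, a1, a2 has weight 2^16, 2^32, 1 respectively.  */
C	  return (a0 & 0xffffffffffff) + (a0 >> 48)
C	    + ((a1 & 0xffffffff) << 16) + (a1 >> 32)
C	    + ((a2 & 0xffff) << 32)     + (a2 >> 16)
C	    + (c0 << 16) + (c1 << 32) + c2;
C	}
C
C Note the sketch folds the carry counters only at the end, which is fine
C for any realistic n; the assembly instead merges them with adc chains.
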
C TODO
C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
C    sbb to placate Pentium4.
C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
C    without the dual loop exits.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11

	sub	$2, %rsi
	ja	L(gt2)

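C Here n is 1 or 2; CF from the sub above is still set iff n = 1.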
	mov	(ap), %rax
	nop
	jb	L(1)

	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high

	and	%r11, %rdx		C src[0] low
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax

L(1):	FUNC_EXIT()
	ret


	ALIGN(16)
L(gt2):	xor	R32(%rax), R32(%rax)
	xor	R32(%rcx), R32(%rcx)
	xor	R32(%rdx), R32(%rdx)
	xor	%r8, %r8
	xor	%r9, %r9
	xor	%r10, %r10

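C The loop sums limbs 0,3,6,... into %rax, limbs 1,4,7,... into %rcx and
C limbs 2,5,8,... into %rdx, counting the carries out of each sum in %r10,
C %r8 and %r9 respectively.  It is unrolled to consume six limbs per
C iteration, with a second exit after the first three.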
L(top):	add	(ap), %rax
	adc	$0, %r10
	add	8(ap), %rcx
	adc	$0, %r8
	add	16(ap), %rdx
	adc	$0, %r9

	sub	$3, %rsi
	jng	L(end)

	add	24(ap), %rax
	adc	$0, %r10
	add	32(ap), %rcx
	adc	$0, %r8
	add	40(ap), %rdx
	lea	48(ap), ap
	adc	$0, %r9

	sub	$3, %rsi
	jg	L(top)


	add	$-24, ap
L(end):	add	%r9, %rax
	adc	%r10, %rcx
	adc	%r8, %rdx

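C At this point 0, 1 or 2 limbs remain (%rsi is -2, -1 or 0), located at
C 24(ap) and 32(ap).  Fold any stragglers into %rax and %rcx, and leave in
C %r10 the modular weight of the carry out of the last adc: 1 for the adc
C into %rdx above, 2^16 for the adc of 24(ap), 2^32 for the adc of 32(ap).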
	inc	%rsi
	mov	$0x1, R32(%r10)
	js	L(combine)

	mov	$0x10000, R32(%r10)
	adc	24(ap), %rax
	dec	%rsi
	js	L(combine)

	adc	32(ap), %rcx
	mov	$0x100000000, %r10

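C Fold the three accumulators into a single word.  Modulo 2^48-1 we have
C 2^64 == 2^16, 2^128 == 2^32 and 2^192 == 1, so %rax, %rcx, %rdx carry
C weights 1, 2^16, 2^32.  Each is split so that the part reaching bit 48
C of the weighted value wraps around to weight 1: %rax at bit 48, %rcx at
C bit 32, %rdx at bit 16.  The sbb materializes the still-pending carry
C flag, which then gets its weight from the mask left in %r10.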
L(combine):
	sbb	%rsi, %rsi		C carry
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r10, %rsi		C carry masked
	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	add	%rsi, %rax		C apply carry
	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()