dnl  AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
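C As a point of reference, the limb recurrence computed below corresponds to
C the following C-level sketch.  It is only an illustration for this comment
C (ref_rshift and its argument names are made up here); it assumes 64-bit
C limbs and 1 <= cnt <= 63.  The bits shifted out at the low end come back in
C the most significant bits of the return value, the rest being zero:
C
C	mp_limb_t
C	ref_rshift (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C	{
C	  mp_limb_t retval = ap[0] << (64 - cnt);	/* bits shifted out */
C	  mp_size_t i;
C	  for (i = 0; i < n - 1; i++)
C	    rp[i] = (ap[i] >> cnt) | (ap[i+1] << (64 - cnt));
C	  rp[n-1] = ap[n-1] >> cnt;
C	  return retval;
C	}
C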
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
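C Keep cnt in xmm4 and 64-cnt in xmm5, so that each result limb can be formed
C as the OR of a limb shifted right by cnt (psrlq) and its higher neighbour
C shifted left by 64-cnt (psllq).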
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

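C Compute the return value: ap[0] shifted left by 64-cnt (shl masks the
C negated count to its low 6 bits), i.e. the bits shifted out at the low end.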
	neg	R32(%rcx)
	mov	(ap), %rax
	shl	R8(%rcx), %rax

	cmp	$3, n
	jle	L(bc)

	test	$8, R8(rp)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	(ap), %xmm0
	movq	8(ap), %xmm1
	psrlq	%xmm4, %xmm0
	psllq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	lea	8(ap), ap
	lea	8(rp), rp
	dec	n

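C Point ap and rp just past the operands and negate n so that it counts up
C towards zero, then enter the 8-limb unrolled loop at the spot selected by
C n mod 8.  A final odd limb, if any, is handled after L(end).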
L(rp_aligned):
	lea	1(n), %r8d
	lea	(ap,n,8), ap
	lea	(rp,n,8), rp
	neg	n

	and	$6, R32(%r8)
	jz	L(bu0)
	cmp	$4, R32(%r8)
	jz	L(bu4)
	jc	L(bu2)
L(bu6):	add	$4, n
	jmp	L(i56)
L(bu0):	add	$6, n
	jmp	L(i70)
L(bu4):	add	$2, n
	jmp	L(i34)
L(bu2):	add	$8, n
	jge	L(end)

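C Main loop: each iteration shifts 8 limbs, using unaligned 16-byte loads
C (movdqu) and aligned 16-byte stores (movdqa).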
	ALIGN(16)
L(top):	movdqu	-64(ap,n,8), %xmm1
	movdqu	-56(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -64(rp,n,8)
L(i70):
	movdqu	-48(ap,n,8), %xmm1
	movdqu	-40(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -48(rp,n,8)
L(i56):
	movdqu	-32(ap,n,8), %xmm1
	movdqu	-24(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -32(rp,n,8)
L(i34):
	movdqu	-16(ap,n,8), %xmm1
	movdqu	-8(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp,n,8)
	add	$8, n
	jl	L(top)

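C Wind-down: one final limb remains if n is odd, otherwise the top two result
C limbs are finished with a single 16-byte store.  The most significant result
C limb is simply the top source limb shifted right.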
L(end):	test	$1, R8(n)
	jnz	L(e1)

	movdqu	-16(ap), %xmm1
	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp)
	FUNC_EXIT()
	ret

L(e1):	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, -8(rp)
	FUNC_EXIT()
	ret

C Basecase
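C Handle n = 1, 2 or 3 without the main loop, using 8-byte SSE operations.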
	ALIGN(16)
L(bc):	dec	R32(n)
	jnz	1f
	movq	(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret

1:	movq	(ap), %xmm1
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	dec	R32(n)
	jnz	1f
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 8(rp)
	FUNC_EXIT()
	ret

1:	movq	8(ap), %xmm1
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()