dnl  Intel P6 mpn_lshsub_n -- mpn papillon (FFT butterfly) support.
2
3dnl  Copyright 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
34
35C (1) The loop is not scheduled in any way, and scheduling attempts have not
36C     improved speed on P6/13.  Presumably, the K7 will want scheduling, if it
37C     at all wants to use MMX.
38C (2) We could save a register by not alternatingly using eax and edx in the
39C     loop.
40
C Register roles.  The three pointers are biased in the function prologue
C (see the lea instructions there) so that every operand displacement used
C in the unrolled loop is non-zero.
define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
define(`n',	`%ecx')
define(`cnt',	`%mm7')

ASM_START()

	TEXT
	ALIGN(16)

C mpn_lshsub_n -- subtract with borrow, then shift the difference left.
C
C Stack arguments, in order (read after three pushes, hence offset 16):
C   16(%esp)  destination pointer
C   20(%esp)  minuend pointer
C   24(%esp)  subtrahend pointer
C   28(%esp)  limb count
C   32(%esp)  shift count
C Presumably this corresponds to the C prototype
C   mp_limb_t mpn_lshsub_n (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, unsigned)
C -- TODO confirm against the callers.
C
C With d[i] the 32-bit limbs of (minuend - subtrahend), borrow propagated
C from limb 0 upwards and d[-1] taken as 0, the function stores
C   dst[i] = (d[i] << count) | (d[i-1] >> (32 - count))
C and returns in eax
C   (final borrow << count) | (d[last] >> (32 - count)).
C
C Saved and restored: edi, esi, ebx.
C Clobbered: eax, ecx, edx, mm0, mm1, mm7, flags.  emms is executed before
C returning.
C
C A limb count of 0 is not handled: the computed jump bypasses the jecxz
C test and a full group of eight limbs would be processed.
C
C INVARIANT: the loop is entered by a computed jump to L(ent) + 22*x, so
C each of the eight unrolled blocks must assemble to exactly 22 bytes.
C The assembler normally drops a zero displacement, which would shorten a
C block using 0(reg,n,4) to 19 bytes and send the jump into the middle of
C an instruction.  Therefore the pointers are set one limb low and the
C blocks use displacements 4, 8, ..., 32, all of which need a disp8 byte.

PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt			C mm7 = 32 - shift count

	C Point one limb below the end of each operand.  Together with the
	C negative index in n and displacements 4..32 this addresses the same
	C limbs as end-pointers with displacements 0..28 would.
	lea	-4(up,n,4), up
	lea	-4(vp,n,4), vp
	lea	-4(rp,n,4), rp

	neg	n
	mov	n, %eax
	and	$-8, n				C n = negated count, rounded to a group of 8
	and	$7, %eax			C x = index of the block to enter at
	shl	%eax				C eax = 2x, and leaves carry clear
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = L(ent) + 22x
')

	C MMX instructions do not touch the flags, so carry stays clear.
	pxor	%mm1, %mm1			C previous limb is 0 for the first limb
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C Returns eax = L(ent) + 22x, using the return address as L(here).
	lea	(%eax,%edx,2), %eax
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	C edx still holds the last difference limb, written by the final block.
	sbb	%eax, %eax			C eax = -borrow
	neg	%eax				C eax = borrow
	mov	32(%esp), %ecx			C reload shift count, n is finished with
	shld	%cl, %edx, %eax

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	C Main loop, eight limbs per pass.  The borrow lives in the carry flag
	C across the whole loop: lea, jmp, jecxz and the MMX instructions all
	C leave it alone.  Blocks alternate eax/mm0 and edx/mm1 so that the
	C other MMX register still holds the previous difference limb.
	ALIGN(16)
L(top):	jecxz	L(end)
L(ent):	mov	   4(up,n,4), %eax
	sbb	   4(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1			C mm1 = current:previous
	psrlq	   %mm7, %mm1			C low half = shifted result limb
	movd	   %mm1, 4(rp,n,4)

	mov	   8(up,n,4), %edx
	sbb	   8(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 8(rp,n,4)

	mov	   12(up,n,4), %eax
	sbb	   12(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 12(rp,n,4)

	mov	   16(up,n,4), %edx
	sbb	   16(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 16(rp,n,4)

	mov	   20(up,n,4), %eax
	sbb	   20(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 20(rp,n,4)

	mov	   24(up,n,4), %edx
	sbb	   24(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 24(rp,n,4)

	mov	   28(up,n,4), %eax
	sbb	   28(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 28(rp,n,4)

	mov	   32(up,n,4), %edx
	sbb	   32(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 32(rp,n,4)

	lea	   8(n), n			C advance without disturbing carry
	jmp	   L(top)

EPILOGUE()
170