/* $Id: VIScsumcopy.S,v 1.1.1.1 2008/10/15 03:26:19 james26_jang Exp $
 * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
 *            copying utilizing the UltraSparc Visual Instruction Set.
 *
 * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 *
 * Based on older sparc32/sparc64 checksum.S, which is:
 *
 *      Copyright(C) 1995 Linus Torvalds
 *      Copyright(C) 1995 Miguel de Icaza
 *      Copyright(C) 1996,1997 David S. Miller
 *    derived from:
 *	  Linux/Alpha checksum c-code
 *        Linux/ix86 inline checksum assembly
 *        RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *	  David Mosberger-Tang for optimized reference c-code
 *	  BSD4.4 portable checksum routine
 */

/* STACKOFF: offset from %sp of the 8-byte scratch slot that
 * END_THE_TRICK2 uses to move a 64-bit value from an FP register to an
 * integer register (std ... [%sp + STACKOFF] followed by ldx).
 * NOTE(review): the 0x7ff term on V9 is presumably the 64-bit ABI stack
 * bias (2047) -- confirm against the ABI document.
 */
#ifdef __sparc_v9__
#define STACKOFF	0x7ff+128
#else
#define STACKOFF	64
#endif

/* ASI and FPRS constants.
 *
 * In the kernel build the values come from the asm/ headers and only the
 * derived masks are defined here.  In a stand-alone (user space) build
 * the few constants that are needed are spelled out locally.
 *
 * How the derived masks are used further down:
 *   - kernel:  %g2 (the %asi value read with "rd %asi") is first ORed
 *              with ASI_BLK_OR, then "wr %g2, ASI_BLK_XOR, %asi" writes
 *              %g2 XOR 0, i.e. the ORed value itself.
 *   - user:    no OR is done; "wr %g2, ASI_BLK_XOR, %asi" writes
 *              %g2 XOR (ASI_BLK_P ^ ASI_P), which turns ASI_P into
 *              ASI_BLK_P.
 * ASI_BLK_XOR1 is not referenced in the part of the file visible here.
 *
 * FPRS_FEF is the correctly spelled name of the FPRS "FPU enable" bit;
 * FRPS_FEF is the historical misspelling and is kept as an alias so that
 * any existing user keeps building.
 */
#ifdef __KERNEL__
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/page.h>
#include <asm/visasm.h>
#define ASI_BLK_XOR	0
#define ASI_BLK_XOR1	(ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
#define ASI_BLK_OR	(ASI_BLK_P & ~ASI_P)
#else
#define ASI_P		0x80
#define ASI_BLK_P	0xf0
#define FPRS_FEF	0x04
#define FRPS_FEF	FPRS_FEF
#define FPRS_DU		0x02
#define FPRS_DL		0x01
#define ASI_BLK_XOR	(ASI_BLK_P ^ ASI_P)
#endif

/* Symbolic names for the integer registers used throughout this file.
 *
 *   src, dst, len, sum  - working source pointer, destination pointer,
 *                         remaining length and running checksum
 *                         (%o0..%o3; presumably the incoming arguments
 *                         -- confirm against the callers).
 *   x1 .. x8            - receive the per-register fcmpgt32 result masks
 *                         in the TRICK macros and are then folded into
 *                         %sum.
 *
 * Note that the entry code also refers to some of these registers by
 * their raw names (%g1, %g2, %g3, %g5, %g7, %o4, %o5).
 */
#define src		o0
#define dst		o1
#define	len		o2
#define sum		o3
#define x1		g1
#define x2		g2
#define x3		o4
#define x4		g4
#define x5		g5
#define x6		g7
#define x7		g3
#define x8		o5

/* Good night, SunSoft engineers. Sleep sweetly.
 * (Originally in Czech: "Dobrou noc ... Spete sladce.")
 * This has a couple of tricks in it, and those
 * tricks are UltraLinux trade secrets :))
 * Once AGAIN, the SunSoft engineers are caught
 * asleep at the keyboard :)).
 * The main loop takes about 20 superscalar cycles
 * per 64 bytes checksummed/copied.
 */

/* Small building blocks passed as arguments to DO_THE_TRICK.
 *
 * LDBLK(O0)   - load from [%src] through the current %asi into the FP
 *               registers starting at %O0.  The visNs entry points set
 *               %asi to a block ASI before the loops run, so inside the
 *               loops this is a block load.
 * STBLK       - store the FP registers starting at %f48 to [%dst] using
 *               ASI_BLK_P.
 * ST(fx,off)  - store the single double register %fx to [%dst + off].
 * SYNC        - membar #Sync; used in the tail variants in the slot
 *               where the loop variants issue LDBLK.
 */
#define LDBLK(O0)									\
	ldda		[%src] %asi, %O0 	/*  Load	Group		*/

#define STBLK										\
	stda		%f48, [%dst] ASI_BLK_P	/*  Store			*/

#define ST(fx,off)									\
	std		%fx, [%dst + off]	/*  Store			*/

#define SYNC										\
	membar		#Sync


/* DO_THE_TRICK: one 64-byte step of the software-pipelined
 * checksum-and-copy loop.
 *
 * Arguments
 *   f0..f14     eight double registers holding the running per-lane
 *               sums coming into this step.
 *   F0..F14     eight double registers holding the freshly loaded 64
 *               bytes.  Each is first used as faligndata input and is
 *               then replaced by F(k) + f(k) (fpadd32), so on exit this
 *               bank holds the running sums for the next step.
 *   A0..A14     destination registers of the eight faligndata results
 *               (the realigned data that gets stored).  A14 is both an
 *               input (the last raw doubleword carried over from the
 *               previous step) and an output.
 *   B14         receives a copy of the raw F14 before it is summed; in
 *               the loop variants the callers pass the same register as
 *               A14 so it feeds the next step's first faligndata.
 *   LOAD        instruction issued first: LDBLK(...) in the loop, SYNC
 *               in the tail variants.
 *   STORE1..8   optional store slots interleaved with the computation.
 *               Callers fill exactly the slots they need (STBLK and, in
 *               the tails, ST(...)); the others are left empty.
 *   BRANCH      the control transfer that ends the step.  The final
 *               fcmpgt32 of the macro sits in its delay slot.
 *   DUMMY1..3   unused; they absorb the empty argument produced by the
 *               leading comma on each continuation line of a call.
 *
 * How carries are collected
 *   After each fpadd32, "fcmpgt32 old, new, xN" leaves a 2-bit mask in
 *   xN (one bit per 32-bit lane in which old > new).  The pair
 *   "inc xN; srl xN, 1, xN" maps that mask 0,1,2,3 -> 0,1,1,2, i.e. the
 *   number of bits set, which is then added to %sum.  The masks are
 *   consumed one step later than they are produced, so x1..x8 must
 *   already hold masks (or zero) on entry.  %x4 is the exception: it is
 *   added at the top without inc/srl because the previous step already
 *   did both at its end.
 *   NOTE(review): fcmpgt32 is a signed compare, so these counts are not
 *   plain unsigned carries; the "fcmpgt32 %fz, ..." / "sub %sum" pairs
 *   in END_THE_TRICK / END_THE_TRICK2 look like the matching sign
 *   correction -- confirm before changing either side.
 *
 * Side effects: %src += 64, %dst += 64, %len -= 64 with the condition
 * codes set by that subtraction (subcc) for BRANCH to test.
 *
 * The trailing column comments are the original author's UltraSPARC
 * issue-slot / grouping annotations; do not reorder instructions.
 */
#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)	\
	LOAD					/*  Load	(Group)		*/;	\
	faligndata	%A14, %F0, %A14		/*  FPA		Group		*/;	\
	inc		%x5			/*  IEU0			*/;	\
	STORE1					/*  Store (optional)		*/;	\
	faligndata	%F0, %F2, %A0		/*  FPA		Group		*/;	\
	srl		%x5, 1, %x5		/*  IEU0			*/;	\
	add		%sum, %x4, %sum		/*  IEU1			*/;	\
	fpadd32		%F0, %f0, %F0		/*  FPA		Group		*/;	\
	inc		%x6			/*  IEU0			*/;	\
	STORE2					/*  Store (optional)		*/;	\
	faligndata	%F2, %F4, %A2		/*  FPA		Group		*/;	\
	srl		%x6, 1, %x6		/*  IEU0			*/;	\
	add		%sum, %x5, %sum		/*  IEU1			*/;	\
	fpadd32		%F2, %f2, %F2		/*  FPA		Group		*/;	\
	add		%src, 64, %src		/*  IEU0			*/;	\
	fcmpgt32	%f0, %F0, %x1		/*  FPM				*/;	\
	add		%dst, 64, %dst		/*  IEU1	Group		*/;	\
	inc		%x7			/*  IEU0			*/;	\
	STORE3					/*  Store (optional)		*/;	\
	faligndata	%F4, %F6, %A4		/*  FPA				*/;	\
	fpadd32		%F4, %f4, %F4		/*  FPA		Group		*/;	\
	add		%sum, %x6, %sum		/*  IEU1			*/;	\
	fcmpgt32	%f2, %F2, %x2		/*  FPM				*/;	\
	srl		%x7, 1, %x7		/*  IEU0	Group		*/;	\
	inc		%x8			/*  IEU1			*/;	\
	STORE4					/*  Store (optional)		*/;	\
	faligndata	%F6, %F8, %A6		/*  FPA				*/;	\
	fpadd32		%F6, %f6, %F6		/*  FPA		Group		*/;	\
	srl		%x8, 1, %x8		/*  IEU0			*/;	\
	fcmpgt32	%f4, %F4, %x3		/*  FPM				*/;	\
	add		%sum, %x7, %sum		/*  IEU0	Group		*/;	\
	inc		%x1			/*  IEU1			*/;	\
	STORE5					/*  Store (optional)		*/;	\
	faligndata	%F8, %F10, %A8		/*  FPA				*/;	\
	fpadd32		%F8, %f8, %F8		/*  FPA		Group		*/;	\
	srl		%x1, 1, %x1		/*  IEU0			*/;	\
	fcmpgt32	%f6, %F6, %x4		/*  FPM				*/;	\
	add		%sum, %x8, %sum		/*  IEU0	Group		*/;	\
	inc		%x2			/*  IEU1			*/;	\
	STORE6					/*  Store (optional)		*/;	\
	faligndata	%F10, %F12, %A10	/*  FPA				*/;	\
	fpadd32		%F10, %f10, %F10	/*  FPA		Group		*/;	\
	srl		%x2, 1, %x2		/*  IEU0			*/;	\
	fcmpgt32	%f8, %F8, %x5		/*  FPM				*/;	\
	add		%sum, %x1, %sum		/*  IEU0	Group		*/;	\
	inc		%x3			/*  IEU1			*/;	\
	STORE7					/*  Store (optional)		*/;	\
	faligndata	%F12, %F14, %A12	/*  FPA				*/;	\
	fpadd32		%F12, %f12, %F12	/*  FPA		Group		*/;	\
	srl		%x3, 1, %x3		/*  IEU0			*/;	\
	fcmpgt32	%f10, %F10, %x6		/*  FPM				*/;	\
	add		%sum, %x2, %sum		/*  IEU0	Group		*/;	\
	inc		%x4			/*  IEU1			*/;	\
	STORE8					/*  Store (optional)		*/;	\
	fmovd		%F14, %B14		/*  FPA				*/;	\
	fpadd32		%F14, %f14, %F14	/*  FPA		Group		*/;	\
	srl		%x4, 1, %x4		/*  IEU0			*/;	\
	fcmpgt32	%f12, %F12, %x7		/*  FPM				*/;	\
	add		%sum, %x3, %sum		/*  IEU0	Group		*/;	\
	subcc		%len, 64, %len		/*  IEU1			*/;	\
	BRANCH					/*  CTI				*/;	\
	fcmpgt32	%f14, %F14, %x8		/*  FPM		Group		*/;

/* END_THE_TRICK: first half of the final reduction.
 *
 * Arguments
 *   f0..f14   the eight running-sum double registers left by the last
 *             DO_THE_TRICK step.
 *   FA, FB    "fmovd %FA, %FB" is issued near the end; the caller picks
 *             which register is copied where.
 *   S0..S3    pairwise sums:   S0 = f2+f0,  S1 = f6+f4,
 *                              S2 = f10+f8, S3 = f14+f12
 *   T0, T1    second level:    T0 = S0+S1,  T1 = S2+S3
 *   U0        third level:     U0 = T0+T1
 *   fz        register cleared with fzero and used as the zero operand
 *             of the "fcmpgt32 %fz, ..." compares.
 *
 * While the tree of fpadd32 operations runs, the masks still pending in
 * x1..x8 from the last loop step are converted with inc/srl and added to
 * %sum, and new masks are produced both by "fcmpgt32 old, new" after each
 * addition and by "fcmpgt32 %fz, fN" on f2, f6, f10 and f14.  Those new
 * masks are left in x1..x8 for the code at "ett".
 *
 * The macro ends by branching to the label "ett", which is not defined in
 * the part of the file visible here; the final "inc %x7" is in the delay
 * slot of that branch.
 */
#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
	inc		%x5			/*  IEU0	Group		*/;	\
	fpadd32		%f2, %f0, %S0		/*  FPA				*/;	\
	add		%sum, %x4, %sum		/*  IEU1			*/;	\
	srl		%x5, 1, %x5		/*  IEU0	Group		*/;	\
	fpadd32		%f6, %f4, %S1		/*  FPA				*/;	\
	inc		%x6			/*  IEU1			*/;	\
	fpadd32		%f10, %f8, %S2		/*  FPA		Group		*/;	\
	add		%sum, %x5, %sum		/*  IEU0			*/;	\
	fcmpgt32	%f0, %S0, %x1		/*  FPM				*/;	\
	fpadd32		%f14, %f12, %S3		/*  FPA		Group		*/;	\
	srl		%x6, 1, %x6		/*  IEU0			*/;	\
	fcmpgt32	%f4, %S1, %x2		/*  FPM				*/;	\
	add		%sum, %x6, %sum		/*  IEU0	Group		*/;	\
	fzero		%fz			/*  FPA				*/;	\
	fcmpgt32	%f8, %S2, %x3		/*  FPM				*/;	\
	inc		%x7			/*  IEU0	Group		*/;	\
	inc		%x8			/*  IEU1			*/;	\
	srl		%x7, 1, %x7		/*  IEU0	Group		*/;	\
	inc		%x1			/*  IEU1			*/;	\
	fpadd32		%S0, %S1, %T0		/*  FPA				*/;	\
	fpadd32		%S2, %S3, %T1		/*  FPA		Group		*/;	\
	add		%sum, %x7, %sum		/*  IEU0			*/;	\
	fcmpgt32	%f12, %S3, %x4		/*  FPM				*/;	\
	srl		%x8, 1, %x8		/*  IEU0	Group		*/;	\
	inc		%x2			/*  IEU1			*/;	\
	srl		%x1, 1, %x1		/*  IEU0	Group		*/;	\
	add		%sum, %x8, %sum		/*  IEU1			*/;	\
	add		%sum, %x1, %sum		/*  IEU0	Group		*/;	\
	fcmpgt32	%S0, %T0, %x5		/*  FPM				*/;	\
	srl		%x2, 1, %x2		/*  IEU0	Group		*/;	\
	fcmpgt32	%S2, %T1, %x6		/*  FPM				*/;	\
	inc		%x3			/*  IEU0	Group		*/;	\
	add		%sum, %x2, %sum		/*  IEU1			*/;	\
	srl		%x3, 1, %x3		/*  IEU0	Group		*/;	\
	inc		%x4			/*  IEU1			*/;	\
	fpadd32		%T0, %T1, %U0		/*  FPA		Group		*/;	\
	add		%sum, %x3, %sum		/*  IEU0			*/;	\
	fcmpgt32	%fz, %f2, %x7		/*  FPM				*/;	\
	srl		%x4, 1, %x4		/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %f6, %x8		/*  FPM				*/;	\
	inc		%x5			/*  IEU0	Group		*/;	\
	add		%sum, %x4, %sum		/*  IEU1			*/;	\
	srl		%x5, 1, %x5		/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %f10, %x1		/*  FPM				*/;	\
	inc		%x6			/*  IEU0	Group		*/;	\
	add		%sum, %x5, %sum		/*  IEU1			*/;	\
	fmovd		%FA, %FB		/*  FPA		Group		*/;	\
	fcmpgt32	%fz, %f14, %x2		/*  FPM				*/;	\
	srl		%x6, 1, %x6		/*  IEU0	Group		*/;	\
	ba,pt		%xcc, ett		/*  CTI				*/;	\
	 inc		%x7			/*  IEU1			*/;

/* END_THE_TRICK1: END_THE_TRICK with the temporaries fixed to the
 * %f48..%f62 bank: S0..S3 = f48,f50,f52,f54; T0,T1 = f56,f58; U0 = f60;
 * zero register fz = f62.  The caller supplies only the eight sum
 * registers and the FA/FB pair for the fmovd.
 */
#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) 				\
	END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)

/* END_THE_TRICK2: second half of the final reduction.
 *
 * Arguments (all FP double registers)
 *   S0..S3, T0, T1, U0   the intermediate sums produced by END_THE_TRICK.
 *   U1                   one more partial sum that is added to U0.
 *   V0                   receives U0 + U1.
 *   fz                   zero register for the "fcmpgt32 %fz, ..."
 *                        compares.
 *   S0 and S2 are accepted for symmetry but not referenced in the body.
 *
 * What it does
 *   - V0 = U0 + U1 is spilled to the scratch slot at [%sp + STACKOFF]
 *     and read back with ldx into %x8 as one 64-bit integer.
 *   - The masks still pending in x1..x8, plus the masks produced here,
 *     are converted with inc/srl.  Masks from "fcmpgt32 old, new" are
 *     added to %sum; masks from "fcmpgt32 %fz, reg" are subtracted.
 *     NOTE(review): the subtraction looks like the correction for
 *     fcmpgt32 being a signed compare -- confirm before modifying.
 *   - Finally %sum += %x8; if that addition carries out of 64 bits
 *     (%xcc), 1 is added back.  The annulled delay slot executes only
 *     when the branch is taken.
 *
 * The macro defines the local label 33, so it can be expanded several
 * times in one file without clashes only because 33f always binds to the
 * nearest following definition.
 */
#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) 					\
	fpadd32		%U0, %U1, %V0		/*  FPA		Group		*/;	\
	srl		%x7, 1, %x7		/*  IEU0			*/;	\
	add		%sum, %x6, %sum		/*  IEU1			*/;	\
	std		%V0, [%sp + STACKOFF]	/*  Store	Group		*/;	\
	inc		%x8			/*  IEU0			*/;	\
	sub		%sum, %x7, %sum		/*  IEU1			*/;	\
	srl		%x8, 1, %x8		/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %S1, %x3		/*  FPM				*/;	\
	inc		%x1			/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %S3, %x4		/*  FPM				*/;	\
	srl		%x1, 1, %x1		/*  IEU0	Group		*/;	\
	sub		%sum, %x8, %sum		/*  IEU1			*/;	\
	ldx		[%sp + STACKOFF], %x8	/*  Load	Group		*/;	\
	inc		%x2			/*  IEU0			*/;	\
	sub		%sum, %x1, %sum		/*  IEU1			*/;	\
	srl		%x2, 1, %x2		/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %T1, %x5		/*  FPM				*/;	\
	inc		%x3			/*  IEU0	Group		*/;	\
	fcmpgt32	%T0, %U0, %x6		/*  FPM				*/;	\
	srl		%x3, 1, %x3		/*  IEU0	Group		*/;	\
	sub		%sum, %x2, %sum		/*  IEU1			*/;	\
	inc		%x4			/*  IEU0	Group		*/;	\
	sub		%sum, %x3, %sum		/*  IEU1			*/;	\
	srl		%x4, 1, %x4		/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %U1, %x7		/*  FPM				*/;	\
	inc		%x5			/*  IEU0	Group		*/;	\
	fcmpgt32	%U0, %V0, %x1		/*  FPM				*/;	\
	srl		%x5, 1, %x5		/*  IEU0	Group		*/;	\
	sub		%sum, %x4, %sum		/*  IEU1			*/;	\
	sub		%sum, %x5, %sum		/*  IEU0	Group		*/;	\
	fcmpgt32	%fz, %V0, %x2		/*  FPM				*/;	\
	inc		%x6			/*  IEU0	Group		*/;	\
	inc		%x7			/*  IEU1			*/;	\
	srl		%x6, 1, %x6		/*  IEU0	Group		*/;	\
	inc		%x1			/*  IEU1			*/;	\
	srl		%x7, 1, %x7		/*  IEU0	Group		*/;	\
	add		%sum, %x6, %sum		/*  IEU1			*/;	\
	srl		%x1, 1, %x1		/*  IEU0	Group		*/;	\
	sub		%sum, %x7, %sum		/*  IEU1			*/;	\
	inc		%x2			/*  IEU0	Group		*/;	\
	add		%sum, %x1, %sum		/*  IEU1			*/;	\
	srl		%x2, 1, %x2		/*  IEU0	Group		*/;	\
	sub		%sum, %x2, %sum		/*  IEU0	Group		*/;	\
	addcc		%sum, %x8, %sum		/*  IEU1	Group		*/;	\
	bcs,a,pn	%xcc, 33f		/*  CTI				*/;	\
	 add		%sum, 1, %sum		/*  IEU0	(Group)		*/;	\
33:						/*  That's it			*/;

247	.text
248	.globl		csum_partial_copy_vis
249	.align		32
250/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp.
251 * csum_partial_copy_from_user
252 * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
253 */
254csum_partial_copy_vis:
255	andcc		%dst, 7, %g0		/*  IEU1	Group		*/
256	be,pt		%icc, 4f		/*  CTI				*/
257	 and		%dst, 0x38, %o4		/*  IEU0			*/
258	mov		1, %g5			/*  IEU0	Group		*/
259	andcc		%dst, 2, %g0		/*  IEU1			*/
260	be,pt		%icc, 1f		/*  CTI				*/
261	 and		%dst, 4, %g7		/*  IEU0	Group		*/
262	lduha		[%src] %asi, %g2	/*  Load			*/
263	sub		%len, 2, %len		/*  IEU0	Group		*/
264	add		%dst, 2, %dst		/*  IEU1			*/
265	andcc		%dst, 4, %g7		/*  IEU1	Group		*/
266	sll		%g5, 16, %g5		/*  IEU0			*/
267	sth		%g2, [%dst - 2]		/*  Store	Group		*/
268	sll		%g2, 16, %g2		/*  IEU0	 		*/
269	add		%src, 2, %src		/*  IEU1			*/
270	addcc		%g2, %sum, %sum		/*  IEU1	Group		*/
271	bcs,a,pn	%icc, 1f		/*  CTI				*/
272	 add		%sum, %g5, %sum		/*  IEU0			*/
2731:	lduwa		[%src] %asi, %g2	/*  Load			*/
274	brz,a,pn	%g7, 4f			/*  CTI+IEU1	Group		*/
275	 and		%dst, 0x38, %o4		/*  IEU0			*/
276	add		%dst, 4, %dst		/*  IEU0	Group		*/
277	sub		%len, 4, %len		/*  IEU1			*/
278	addcc		%g2, %sum, %sum		/*  IEU1	Group		*/
279	bcs,a,pn	%icc, 1f		/*  CTI				*/
280	 add		%sum, 1, %sum		/*  IEU0			*/
2811:	and		%dst, 0x38, %o4		/*  IEU0	Group		*/
282	stw		%g2, [%dst - 4]		/*  Store			*/
283	add		%src, 4, %src		/*  IEU1			*/
2844:
285#ifdef __KERNEL__
286	VISEntry
287#endif
288	mov		%src, %g7		/*  IEU1	Group		*/
289	fzero		%f48			/*  FPA				*/
290	alignaddr	%src, %g0, %src		/*  Single	Group		*/
291	subcc		%g7, %src, %g7		/*  IEU1	Group		*/
292	be,pt		%xcc, 1f		/*  CTI				*/
293	 mov		0x40, %g1		/*  IEU0			*/
294	lduwa		[%src] %asi, %g2	/*  Load	Group		*/
295	subcc		%sum, %g2, %sum		/*  IEU1	Group+load stall*/
296	bcs,a,pn	%icc, 1f		/*  CTI				*/
297	 sub		%sum, 1, %sum		/*  IEU0			*/
2981:	srl		%sum, 0, %sum		/*  IEU0	Group		*/
299	clr		%g5			/*  IEU1			*/
300	brz,pn		%o4, 3f			/*  CTI+IEU1	Group		*/
301	 sub		%g1, %o4, %g1		/*  IEU0			*/
302	ldda		[%src] %asi, %f0	/*  Load			*/
303	clr		%o4			/*  IEU0	Group		*/
304	andcc		%dst, 8, %g0		/*  IEU1			*/
305	be,pn		%icc, 1f		/*  CTI				*/
306	 ldda		[%src + 8] %asi, %f2	/*  Load	Group		*/
307	add		%src, 8, %src		/*  IEU0			*/
308	sub		%len, 8, %len		/*  IEU1			*/
309	fpadd32		%f0, %f48, %f50		/*  FPA				*/
310	addcc		%dst, 8, %dst		/*  IEU1	Group		*/
311	faligndata	%f0, %f2, %f16		/*  FPA				*/
312	fcmpgt32	%f48, %f50, %o4		/*  FPM		Group		*/
313	fmovd		%f2, %f0		/*  FPA		Group		*/
314	ldda		[%src + 8] %asi, %f2	/*  Load			*/
315	std		%f16, [%dst - 8]	/*  Store			*/
316	fmovd		%f50, %f48		/*  FPA				*/
3171:	andcc		%g1, 0x10, %g0		/*  IEU1	Group		*/
318	be,pn		%icc, 1f		/*  CTI				*/
319	 and		%g1, 0x20, %g1		/*  IEU0			*/
320	fpadd32		%f0, %f48, %f50		/*  FPA				*/
321	ldda		[%src + 16] %asi, %f4	/*  Load	Group		*/
322	add		%src, 16, %src		/*  IEU0			*/
323	add		%dst, 16, %dst		/*  IEU1			*/
324	faligndata	%f0, %f2, %f16		/*  FPA				*/
325	fcmpgt32	%f48, %f50, %g5		/*  FPM		Group		*/
326	sub		%len, 16, %len		/*  IEU0			*/
327	inc		%o4			/*  IEU1			*/
328	std		%f16, [%dst - 16]	/*  Store	Group		*/
329	fpadd32		%f2, %f50, %f48		/*  FPA				*/
330	srl		%o4, 1, %o5		/*  IEU0			*/
331	faligndata	%f2, %f4, %f18		/*  FPA		Group		*/
332	std		%f18, [%dst - 8]	/*  Store			*/
333	fcmpgt32	%f50, %f48, %o4		/*  FPM		Group		*/
334	add		%o5, %sum, %sum		/*  IEU0			*/
335	ldda		[%src + 8] %asi, %f2	/*  Load			*/
336	fmovd		%f4, %f0		/*  FPA				*/
3371:	brz,a,pn	%g1, 4f			/*  CTI+IEU1	Group		*/
338	 rd		%asi, %g2		/*  LSU		Group + 4 bubbles*/
339	inc		%g5			/*  IEU0			*/
340	fpadd32		%f0, %f48, %f50		/*  FPA				*/
341	ldda		[%src + 16] %asi, %f4	/*  Load	Group		*/
342	srl		%g5, 1, %g5		/*  IEU0			*/
343	add		%dst, 32, %dst		/*  IEU1			*/
344	faligndata	%f0, %f2, %f16		/*  FPA				*/
345	fcmpgt32	%f48, %f50, %o5		/*  FPM		Group		*/
346	inc		%o4			/*  IEU0			*/
347	ldda		[%src + 24] %asi, %f6	/*  Load			*/
348	srl		%o4, 1, %o4		/*  IEU0	Group		*/
349	add		%g5, %sum, %sum		/*  IEU1			*/
350	ldda		[%src + 32] %asi, %f8	/*  Load			*/
351	fpadd32		%f2, %f50, %f48		/*  FPA				*/
352	faligndata	%f2, %f4, %f18		/*  FPA		Group		*/
353	sub		%len, 32, %len		/*  IEU0			*/
354	std		%f16, [%dst - 32]	/*  Store			*/
355	fcmpgt32	%f50, %f48, %g3		/*  FPM		Group		*/
356	inc		%o5			/*  IEU0			*/
357	add		%o4, %sum, %sum		/*  IEU1			*/
358	fpadd32		%f4, %f48, %f50		/*  FPA				*/
359	faligndata	%f4, %f6, %f20		/*  FPA		Group		*/
360	srl		%o5, 1, %o5		/*  IEU0			*/
361	fcmpgt32	%f48, %f50, %g5		/*  FPM		Group		*/
362	add		%o5, %sum, %sum		/*  IEU0			*/
363	std		%f18, [%dst - 24]	/*  Store			*/
364	fpadd32		%f6, %f50, %f48		/*  FPA				*/
365	inc		%g3			/*  IEU0	Group		*/
366	std		%f20, [%dst - 16]	/*  Store			*/
367	add		%src, 32, %src		/*  IEU1			*/
368	faligndata	%f6, %f8, %f22		/*  FPA				*/
369	fcmpgt32	%f50, %f48, %o4		/*  FPM		Group		*/
370	srl		%g3, 1, %g3		/*  IEU0			*/
371	std		%f22, [%dst - 8]	/*  Store			*/
372	add		%g3, %sum, %sum		/*  IEU0	Group		*/
3733:	rd		%asi, %g2		/*  LSU		Group + 4 bubbles*/
374#ifdef __KERNEL__
3754:	sethi		%hi(vis0s), %g7		/*  IEU0	Group		*/
376	or		%g2, ASI_BLK_OR, %g2	/*  IEU1			*/
377#else
3784:	rd		%pc, %g7		/*  LSU		Group + 4 bubbles*/
379#endif
380	inc		%g5			/*  IEU0	Group		*/
381	and		%src, 0x38, %g3		/*  IEU1			*/
382	membar		#StoreLoad		/*  LSU		Group		*/
383	srl		%g5, 1, %g5		/*  IEU0			*/
384	inc		%o4			/*  IEU1			*/
385	sll		%g3, 8, %g3		/*  IEU0	Group		*/
386	sub		%len, 0xc0, %len	/*  IEU1			*/
387	addcc		%g5, %sum, %sum		/*  IEU1	Group		*/
388	srl		%o4, 1, %o4		/*  IEU0			*/
389	add		%g7, %g3, %g7		/*  IEU0	Group		*/
390	add		%o4, %sum, %sum		/*  IEU1			*/
391#ifdef __KERNEL__
392	jmpl		%g7 + %lo(vis0s), %g0	/*  CTI+IEU1	Group		*/
393#else
394	jmpl		%g7 + (vis0s - 4b), %g0	/*  CTI+IEU1	Group		*/
395#endif
396	 fzero		%f32			/*  FPA				*/
397
398	.align		2048
399vis0s:	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
400	add		%src, 128, %src		/*  IEU0	Group		*/
401	ldda		[%src-128] %asi, %f0	/*  Load	Group		*/
402	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
403	fmovd		%f48, %f62		/*  FPA		Group	f0 available*/
404	faligndata	%f0, %f2, %f48		/*  FPA		Group	f2 available*/
405	fcmpgt32	%f32, %f2, %x1		/*  FPM		Group	f4 available*/
406	fpadd32		%f0, %f62, %f0		/*  FPA				*/
407	fcmpgt32	%f32, %f4, %x2		/*  FPM		Group	f6 available*/
408	faligndata	%f2, %f4, %f50		/*  FPA				*/
409	fcmpgt32	%f62, %f0, %x3		/*  FPM		Group	f8 available*/
410	faligndata	%f4, %f6, %f52		/*  FPA				*/
411	fcmpgt32	%f32, %f6, %x4		/*  FPM		Group	f10 available*/
412	inc		%x1			/*  IEU0			*/
413	faligndata	%f6, %f8, %f54		/*  FPA				*/
414	fcmpgt32	%f32, %f8, %x5		/*  FPM		Group	f12 available*/
415	srl		%x1, 1, %x1		/*  IEU0			*/
416	inc		%x2			/*  IEU1			*/
417	faligndata	%f8, %f10, %f56		/*  FPA				*/
418	fcmpgt32	%f32, %f10, %x6		/*  FPM		Group	f14 available*/
419	srl		%x2, 1, %x2		/*  IEU0			*/
420	add		%sum, %x1, %sum		/*  IEU1			*/
421	faligndata	%f10, %f12, %f58	/*  FPA				*/
422	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
423	inc		%x3			/*  IEU0			*/
424	add		%sum, %x2, %sum		/*  IEU1			*/
425	faligndata	%f12, %f14, %f60	/*  FPA				*/
426	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
427	srl		%x3, 1, %x3		/*  IEU0			*/
428	inc		%x4			/*  IEU1			*/
429	fmovd		%f14, %f62		/*  FPA				*/
430	srl		%x4, 1, %x4		/*  IEU0	Group		*/
431	add		%sum, %x3, %sum		/*  IEU1			*/
432vis0:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
433			,f48,f50,f52,f54,f56,f58,f60,f62,f62,
434			,LDBLK(f32),	STBLK,,,,,,,,
435			,bcs,pn %icc, vis0e1)
436	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
437			,f48,f50,f52,f54,f56,f58,f60,f62,f62,
438			,LDBLK(f0),	STBLK,,,,,,,,
439			,bcs,pn %icc, vis0e2)
440	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
441			,f48,f50,f52,f54,f56,f58,f60,f62,f62,
442			,LDBLK(f16),	STBLK,,,,,,,,
443			,bcc,pt %icc, vis0)
444vis0e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
445			,f48,f50,f52,f54,f56,f58,f60,f62,f32,
446			,SYNC,		STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
447			,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
448vis0e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
449			,f48,f50,f52,f54,f56,f58,f60,f62,f0,
450			,SYNC,		STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
451			,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
452vis0e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
453			,f48,f50,f52,f54,f56,f58,f60,f62,f16,
454			,SYNC,		STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
455			,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
456	.align		2048
457vis1s:	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
458	add		%src, 128 - 8, %src	/*  IEU0	Group		*/
459	ldda		[%src-128] %asi, %f0	/*  Load	Group		*/
460	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
461	fmovd		%f0, %f58		/*  FPA		Group		*/
462	fmovd		%f48, %f0		/*  FPA		Group		*/
463	fcmpgt32	%f32, %f2, %x2		/*  FPM		Group		*/
464	faligndata	%f2, %f4, %f48		/*  FPA				*/
465	fcmpgt32	%f32, %f4, %x3		/*  FPM		Group		*/
466	faligndata	%f4, %f6, %f50		/*  FPA				*/
467	fcmpgt32	%f32, %f6, %x4		/*  FPM		Group		*/
468	faligndata	%f6, %f8, %f52		/*  FPA				*/
469	fcmpgt32	%f32, %f8, %x5		/*  FPM		Group		*/
470	inc		%x2			/*  IEU1			*/
471	faligndata	%f8, %f10, %f54		/*  FPA				*/
472	fcmpgt32	%f32, %f10, %x6		/*  FPM		Group		*/
473	srl		%x2, 1, %x2		/*  IEU0			*/
474	faligndata	%f10, %f12, %f56	/*  FPA				*/
475	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
476	inc		%x3			/*  IEU0			*/
477	add		%sum, %x2, %sum		/*  IEU1			*/
478	faligndata	%f12, %f14, %f58	/*  FPA				*/
479	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
480	srl		%x3, 1, %x3		/*  IEU0			*/
481	inc		%x4			/*  IEU1			*/
482	fmovd		%f14, %f60		/*  FPA				*/
483	srl		%x4, 1, %x4		/*  IEU0	Group		*/
484	add		%sum, %x3, %sum		/*  IEU1			*/
485vis1:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
486			,f62,f48,f50,f52,f54,f56,f58,f60,f60,
487			,LDBLK(f32),	,STBLK,,,,,,,
488			,bcs,pn %icc, vis1e1)
489	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
490			,f62,f48,f50,f52,f54,f56,f58,f60,f60,
491			,LDBLK(f0),	,STBLK,,,,,,,
492			,bcs,pn %icc, vis1e2)
493	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
494			,f62,f48,f50,f52,f54,f56,f58,f60,f60,
495			,LDBLK(f16),	,STBLK,,,,,,,
496			,bcc,pt %icc, vis1)
497vis1e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
498			,f62,f48,f50,f52,f54,f56,f58,f60,f32,
499			,SYNC,		,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
500			,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
501vis1e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
502			,f62,f48,f50,f52,f54,f56,f58,f60,f0,
503			,SYNC,		,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
504			,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
505vis1e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
506			,f62,f48,f50,f52,f54,f56,f58,f60,f16,
507			,SYNC,		,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
508			,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
509	.align		2048
510vis2s:	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
511	add		%src, 128 - 16, %src	/*  IEU0	Group		*/
512	ldda		[%src-128] %asi, %f0	/*  Load	Group		*/
513	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
514	fmovd		%f0, %f56		/*  FPA		Group		*/
515	fmovd		%f48, %f0		/*  FPA		Group		*/
516	sub		%dst, 64, %dst		/*  IEU0			*/
517	fpsub32		%f2, %f2, %f2		/*  FPA		Group		*/
518	fcmpgt32	%f32, %f4, %x3		/*  FPM		Group		*/
519	faligndata	%f4, %f6, %f48		/*  FPA				*/
520	fcmpgt32	%f32, %f6, %x4		/*  FPM		Group		*/
521	faligndata	%f6, %f8, %f50		/*  FPA				*/
522	fcmpgt32	%f32, %f8, %x5		/*  FPM		Group		*/
523	faligndata	%f8, %f10, %f52		/*  FPA				*/
524	fcmpgt32	%f32, %f10, %x6		/*  FPM		Group		*/
525	faligndata	%f10, %f12, %f54	/*  FPA				*/
526	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
527	inc		%x3			/*  IEU0			*/
528	faligndata	%f12, %f14, %f56	/*  FPA				*/
529	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
530	srl		%x3, 1, %x3		/*  IEU0			*/
531	inc		%x4			/*  IEU1			*/
532	fmovd		%f14, %f58		/*  FPA				*/
533	srl		%x4, 1, %x4		/*  IEU0	Group		*/
534	add		%sum, %x3, %sum		/*  IEU1			*/
535vis2:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
536			,f60,f62,f48,f50,f52,f54,f56,f58,f58,
537			,LDBLK(f32),	,,STBLK,,,,,,
538			,bcs,pn %icc, vis2e1)
539	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
540			,f60,f62,f48,f50,f52,f54,f56,f58,f58,
541			,LDBLK(f0),	,,STBLK,,,,,,
542			,bcs,pn %icc, vis2e2)
543	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
544			,f60,f62,f48,f50,f52,f54,f56,f58,f58,
545			,LDBLK(f16),	,,STBLK,,,,,,
546			,bcc,pt %icc, vis2)
547vis2e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
548			,f60,f62,f48,f50,f52,f54,f56,f58,f32,
549			,SYNC,		,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
550			,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
551vis2e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
552			,f60,f62,f48,f50,f52,f54,f56,f58,f0,
553			,SYNC,		,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
554			,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
555vis2e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
556			,f60,f62,f48,f50,f52,f54,f56,f58,f16,
557			,SYNC,		,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
558			,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
559	.align		2048
560vis3s:	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
561	add		%src, 128 - 24, %src	/*  IEU0	Group		*/
562	ldda		[%src-128] %asi, %f0	/*  Load	Group		*/
563	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
564	fmovd		%f0, %f54		/*  FPA		Group		*/
565	fmovd		%f48, %f0		/*  FPA		Group		*/
566	sub		%dst, 64, %dst		/*  IEU0			*/
567	fpsub32		%f2, %f2, %f2		/*  FPA		Group		*/
568	fpsub32		%f4, %f4, %f4		/*  FPA		Group		*/
569	fcmpgt32	%f32, %f6, %x4		/*  FPM		Group		*/
570	faligndata	%f6, %f8, %f48		/*  FPA				*/
571	fcmpgt32	%f32, %f8, %x5		/*  FPM		Group		*/
572	faligndata	%f8, %f10, %f50		/*  FPA				*/
573	fcmpgt32	%f32, %f10, %x6		/*  FPM		Group		*/
574	faligndata	%f10, %f12, %f52	/*  FPA				*/
575	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
576	faligndata	%f12, %f14, %f54	/*  FPA				*/
577	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
578	fmovd		%f14, %f56		/*  FPA				*/
579	inc		%x4			/*  IEU0			*/
580	srl		%x4, 1, %x4		/*  IEU0	Group		*/
581vis3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
582			,f58,f60,f62,f48,f50,f52,f54,f56,f56,
583			,LDBLK(f32),	,,,STBLK,,,,,
584			,bcs,pn %icc, vis3e1)
585	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
586			,f58,f60,f62,f48,f50,f52,f54,f56,f56,
587			,LDBLK(f0),	,,,STBLK,,,,,
588			,bcs,pn %icc, vis3e2)
589	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
590			,f58,f60,f62,f48,f50,f52,f54,f56,f56,
591			,LDBLK(f16),	,,,STBLK,,,,,
592			,bcc,pt %icc, vis3)
593vis3e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
594			,f58,f60,f62,f48,f50,f52,f54,f56,f32,
595			,SYNC,		,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
596			,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
597vis3e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
598			,f58,f60,f62,f48,f50,f52,f54,f56,f0,
599			,SYNC,		,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
600			,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
601vis3e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
602			,f58,f60,f62,f48,f50,f52,f54,f56,f16,
603			,SYNC,		,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
604			,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
605	.align		2048
606vis4s:	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
607	add		%src, 128 - 32, %src	/*  IEU0	Group		*/
608	ldda		[%src-128] %asi, %f0	/*  Load	Group		*/
609	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
610	fmovd		%f0, %f52		/*  FPA		Group		*/
611	fmovd		%f48, %f0		/*  FPA		Group		*/
612	sub		%dst, 64, %dst		/*  IEU0			*/
613	fpsub32		%f2, %f2, %f2		/*  FPA		Group		*/
614	fpsub32		%f4, %f4, %f4		/*  FPA		Group		*/
615	fpsub32		%f6, %f6, %f6		/*  FPA		Group		*/
616	clr		%x4			/*  IEU0			*/
617	fcmpgt32	%f32, %f8, %x5		/*  FPM		Group		*/
618	faligndata	%f8, %f10, %f48		/*  FPA				*/
619	fcmpgt32	%f32, %f10, %x6		/*  FPM		Group		*/
620	faligndata	%f10, %f12, %f50	/*  FPA				*/
621	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
622	faligndata	%f12, %f14, %f52	/*  FPA				*/
623	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
624	fmovd		%f14, %f54		/*  FPA				*/
625vis4:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
626			,f56,f58,f60,f62,f48,f50,f52,f54,f54,
627			,LDBLK(f32),	,,,,STBLK,,,,
628			,bcs,pn %icc, vis4e1)
629	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
630			,f56,f58,f60,f62,f48,f50,f52,f54,f54,
631			,LDBLK(f0),	,,,,STBLK,,,,
632			,bcs,pn %icc, vis4e2)
633	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
634			,f56,f58,f60,f62,f48,f50,f52,f54,f54,
635			,LDBLK(f16),	,,,,STBLK,,,,
636			,bcc,pt %icc, vis4)
637vis4e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
638			,f56,f58,f60,f62,f48,f50,f52,f54,f32,
639			,SYNC,		,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
640			,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
641vis4e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
642			,f56,f58,f60,f62,f48,f50,f52,f54,f0,
643			,SYNC,		,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
644			,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
645vis4e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
646			,f56,f58,f60,f62,f48,f50,f52,f54,f16,
647			,SYNC,		,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
648			,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
649	.align		2048
650vis5s:	add		%src, 128 - 40, %src	/*  IEU0	Group		*/
651	ldda		[%src-88] %asi, %f10	/*  Load	Group		*/
652	ldda		[%src-80] %asi, %f12	/*  Load	Group		*/
653	ldda		[%src-72] %asi, %f14	/*  Load	Group		*/
654	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
655	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
656	fmovd		%f48, %f0		/*  FPA		Group		*/
657	fmuld		%f32, %f32, %f2		/*  FPM				*/
658	clr		%x4			/*  IEU0			*/
659	faddd		%f32, %f32, %f4		/*  FPA		Group		*/
660	fmuld		%f32, %f32, %f6		/*  FPM				*/
661	clr		%x5			/*  IEU0			*/
662	faddd		%f32, %f32, %f8		/*  FPA		Group		*/
663	fcmpgt32	%f32, %f10, %x6		/*  FPM		Group		*/
664	sub		%dst, 64, %dst		/*  IEU0			*/
665	faligndata	%f10, %f12, %f48	/*  FPA				*/
666	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
667	faligndata	%f12, %f14, %f50	/*  FPA				*/
668	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
669	fmovd		%f14, %f52		/*  FPA				*/
670vis5:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
671			,f54,f56,f58,f60,f62,f48,f50,f52,f52,
672			,LDBLK(f32),	,,,,,STBLK,,,
673			,bcs,pn %icc, vis5e1)
674	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
675			,f54,f56,f58,f60,f62,f48,f50,f52,f52,
676			,LDBLK(f0),	,,,,,STBLK,,,
677			,bcs,pn %icc, vis5e2)
678	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
679			,f54,f56,f58,f60,f62,f48,f50,f52,f52,
680			,LDBLK(f16),	,,,,,STBLK,,,
681			,bcc,pt %icc, vis5)
682vis5e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
683			,f54,f56,f58,f60,f62,f48,f50,f52,f32,
684			,SYNC,		,,,,,STBLK,ST(f48,64),ST(f50,72),
685			,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
686vis5e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
687			,f54,f56,f58,f60,f62,f48,f50,f52,f0,
688			,SYNC,		,,,,,STBLK,ST(f48,64),ST(f50,72),
689			,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
690vis5e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
691			,f54,f56,f58,f60,f62,f48,f50,f52,f16,
692			,SYNC,		,,,,,STBLK,ST(f48,64),ST(f50,72),
693			,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
694	.align		2048
695vis6s:	add		%src, 128 - 48, %src	/*  IEU0	Group		*/
696	ldda		[%src-80] %asi, %f12	/*  Load	Group		*/
697	ldda		[%src-72] %asi, %f14	/*  Load	Group		*/
698	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
699	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
700	fmovd		%f48, %f0		/*  FPA		Group		*/
701	fmuld		%f32, %f32, %f2		/*  FPM				*/
702	clr		%x4			/*  IEU0			*/
703	faddd		%f32, %f32, %f4		/*  FPA		Group		*/
704	fmuld		%f32, %f32, %f6		/*  FPM				*/
705	clr		%x5			/*  IEU0			*/
706	faddd		%f32, %f32, %f8		/*  FPA		Group		*/
707	fmuld		%f32, %f32, %f10	/*  FPM				*/
708	clr		%x6			/*  IEU0			*/
709	fcmpgt32	%f32, %f12, %x7		/*  FPM		Group		*/
710	sub		%dst, 64, %dst		/*  IEU0			*/
711	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
712	faligndata	%f12, %f14, %f48	/*  FPA				*/
713	fmovd		%f14, %f50		/*  FPA		Group		*/
/*
 * vis6: main loop, unrolled three times.  Each DO_THE_TRICK step names two
 * of the three register banks (f0-f14, f16-f30, f32-f46) as its operands
 * and asks LDBLK to refill the remaining one, so the banks rotate from step
 * to step.  Every step stores through STBLK.
 *
 * The first two steps leave the loop with "bcs" (to vis6e1 / vis6e2); the
 * third one loops back with "bcc" and otherwise falls through into vis6e3.
 * NOTE(review): the condition codes tested here are produced inside
 * DO_THE_TRICK, which is defined above this excerpt - presumably by the
 * %len update; confirm.
 */
714vis6:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
715			,f52,f54,f56,f58,f60,f62,f48,f50,f50,
716			,LDBLK(f32),	,,,,,,STBLK,,
717			,bcs,pn %icc, vis6e1)
718	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
719			,f52,f54,f56,f58,f60,f62,f48,f50,f50,
720			,LDBLK(f0),	,,,,,,STBLK,,
721			,bcs,pn %icc, vis6e2)
722	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
723			,f52,f54,f56,f58,f60,f62,f48,f50,f50,
724			,LDBLK(f16),	,,,,,,STBLK,,
725			,bcc,pt %icc, vis6)
/*
 * Loop exits.  Each one runs a final step on the bank pair that follows the
 * step it was left from, passes SYNC in place of LDBLK (no further block
 * load), stores STBLK plus ST(f48,64), then advances %dst by 72, adds
 * 192 - 2*8 to %len and branches to the e1/e2/e3 entry named in its last
 * argument.
 */
726vis6e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
727			,f52,f54,f56,f58,f60,f62,f48,f50,f32,
728			,SYNC,		,,,,,,STBLK,ST(f48,64),
729			,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
730vis6e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
731			,f52,f54,f56,f58,f60,f62,f48,f50,f0,
732			,SYNC,		,,,,,,STBLK,ST(f48,64),
733			,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
734vis6e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
735			,f52,f54,f56,f58,f60,f62,f48,f50,f16,
736			,SYNC,		,,,,,,STBLK,ST(f48,64),
737			,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
/*
 * vis7s: priming sequence executed once before the vis7 loop.
 *
 * %src is advanced by 128 - 56 = 72, so the two loads below read from the
 * incoming %src + 0 (into f14, before %asi is rewritten) and %src + 8
 * (into f16, after "wr %g2, ASI_BLK_XOR, %asi").
 * NOTE(review): presumably the first is a plain 8-byte load and the second
 * a 64-byte block load - this depends on the %asi and %g2 values set up
 * above this excerpt; confirm.
 *
 * f0 is seeded from f48, f2-f12 are produced from f32 with alternating
 * faddd/fmuld, and x4-x7 are cleared.
 * NOTE(review): this only clears f2-f12 if %f32 holds zero at this point
 * - confirm against the setup code.
 *
 * x8 receives the fcmpgt32 mask of f32 against the loaded doubleword,
 * %dst is moved back by 64 and f48 is preloaded with f14 for the first
 * pass through vis7.
 */
738	.align		2048
739vis7s:	add		%src, 128 - 56, %src	/*  IEU0	Group		*/
740	ldda		[%src-72] %asi, %f14	/*  Load	Group		*/
741	wr		%g2, ASI_BLK_XOR, %asi	/*  LSU		Group		*/
742	ldda		[%src-64] %asi, %f16	/*  Load	Group		*/
743	fmovd		%f48, %f0		/*  FPA		Group		*/
744	fmuld		%f32, %f32, %f2		/*  FPM				*/
745	clr		%x4			/*  IEU0			*/
746	faddd		%f32, %f32, %f4		/*  FPA		Group		*/
747	fmuld		%f32, %f32, %f6		/*  FPM				*/
748	clr		%x5			/*  IEU0			*/
749	faddd		%f32, %f32, %f8		/*  FPA		Group		*/
750	fmuld		%f32, %f32, %f10	/*  FPM				*/
751	clr		%x6			/*  IEU0			*/
752	faddd		%f32, %f32, %f12	/*  FPA		Group		*/
753	clr		%x7			/*  IEU0			*/
754	fcmpgt32	%f32, %f14, %x8		/*  FPM		Group		*/
755	sub		%dst, 64, %dst		/*  IEU0			*/
756	fmovd		%f14, %f48		/*  FPA				*/
/*
 * vis7: main loop, unrolled three times.  Each DO_THE_TRICK step names two
 * of the three register banks (f0-f14, f16-f30, f32-f46) as its operands
 * and asks LDBLK to refill the remaining one, so the banks rotate from step
 * to step.  Every step stores through STBLK, which here sits in the last
 * store slot of the macro.
 *
 * The first two steps leave the loop with "bcs" (to vis7e1 / vis7e2); the
 * third one loops back with "bcc" and otherwise falls through into vis7e3.
 * NOTE(review): the condition codes tested here are produced inside
 * DO_THE_TRICK, which is defined above this excerpt - presumably by the
 * %len update; confirm.
 */
757vis7:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
758			,f50,f52,f54,f56,f58,f60,f62,f48,f48,
759			,LDBLK(f32),	,,,,,,,STBLK,
760			,bcs,pn %icc, vis7e1)
761	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
762			,f50,f52,f54,f56,f58,f60,f62,f48,f48,
763			,LDBLK(f0),	,,,,,,,STBLK,
764			,bcs,pn %icc, vis7e2)
765	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
766			,f50,f52,f54,f56,f58,f60,f62,f48,f48,
767			,LDBLK(f16),	,,,,,,,STBLK,
768			,bcc,pt %icc, vis7)
/*
 * Loop exits.  Each one runs a final step on the bank pair that follows the
 * step it was left from, passes SYNC in place of LDBLK (no further block
 * load), stores only through STBLK, then advances %dst by 64, adds
 * 192 - 1*8 to %len and branches to the e1/e2/e3 entry named in its last
 * argument.
 */
769vis7e3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
770			,f50,f52,f54,f56,f58,f60,f62,f48,f32,
771			,SYNC,		,,,,,,,STBLK,
772			,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
773vis7e1:	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
774			,f50,f52,f54,f56,f58,f60,f62,f48,f0,
775			,SYNC,		,,,,,,,STBLK,
776			,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
777vis7e2:	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
778			,f50,f52,f54,f56,f58,f60,f62,f48,f16,
779			,SYNC,		,,,,,,,STBLK,
780			,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
/*
 * e1 / e2 / e3: common landing points for all visNeM exits above.  The
 * three entries differ only in which register bank is handed to
 * END_THE_TRICK1 (f0-f14, f16-f30 or f32-f46, followed by the first
 * register of the next bank); the last argument is f6 in all three.
 * NOTE(review): END_THE_TRICK1 is defined above this excerpt.  Presumably
 * it finishes accumulating the named bank and then transfers control to
 * "ett" rather than falling through into the next entry - confirm.
 */
781e1:	END_THE_TRICK1(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
782e2:	END_THE_TRICK1(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
783e3:	END_THE_TRICK1(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
/*
 * ett: end of the block-transfer phase.
 *
 *  1. Rewrite %asi.  Kernel build: %asi = asi ^ (asi >> 3) ^ ASI_BLK_XOR1.
 *     User build: %asi = asi ^ ASI_BLK_XOR.  With the definitions at the
 *     top of the file both expressions evaluate to ASI_P when %asi holds
 *     ASI_BLK_P on entry.
 *  2. x3 = low three bits of %gsr.  NOTE(review): presumably the
 *     alignment offset left there by an alignaddr in the setup code above
 *     this excerpt - confirm.
 *  3. Copy and checksum the remaining whole doublewords (second "1:" loop),
 *     then flush the last pending doubleword held in f6.
 *  4. Run END_THE_TRICK2, wait for outstanding memory operations and, in
 *     the kernel build, leave VIS mode.
 */
784ett:	rd		%asi, %x4		/*  LSU		Group+4bubbles	*/
785	rd		%gsr, %x3		/*  LSU		Group+4bubbles	*/
786#ifdef __KERNEL__
787	srl		%x4, 3, %x5		/*  IEU0	Group		*/
788	xor		%x4, ASI_BLK_XOR1, %x4	/*  IEU1			*/
789	wr		%x4, %x5, %asi		/*  LSU		Group+4bubbles	*/
790#else
791	wr		%x4, ASI_BLK_XOR, %asi	/*  LSU		Group+4bubbles	*/
792#endif
/*
 * If the %gsr offset is zero and nothing is left to copy, store f6 at
 * [%dst - 8] (annulled delay slot: executed only when the brz is taken)
 * and skip straight to 2.  f10 is zeroed in the bne delay slot on both
 * paths; it is the running 2 x 32-bit partial sum used by the loop below.
 */
793	andcc		%x3, 7, %x3		/*  IEU1	Group		*/
794	add		%dst, 8, %dst		/*  IEU0			*/
795	bne,pn		%icc, 1f		/*  CTI				*/
796	 fzero		%f10			/*  FPA				*/
797	brz,a,pn	%len, 2f		/*  CTI+IEU1	Group		*/
798	 std		%f6, [%dst - 8]		/*  Store			*/
/*
 * Fewer than 8 bytes left: go to 3.  The "sub %src, 64" in the delay slot
 * is not annulled, so %src is moved back by 64 on both paths.
 */
799	1:	cmp		%len, 8			/*  IEU1			*/
800	blu,pn		%icc, 3f		/*  CTI				*/
801	 sub		%src, 64, %src		/*  IEU0	Group		*/
/*
 * Doubleword loop, 8 bytes per iteration:
 *   f2   = next source doubleword
 *   f12  = f10 + f2 as two 32-bit lanes
 *   f14  = faligndata(f6, f2), stored at the previous %dst - 8
 *   x5   = fcmpgt32(f10, f12): per-lane mask where the old partial sum
 *          compares greater than the new one (used as the carry indicator)
 *   (x5 + 1) >> 1 maps the mask values 0,1,2,3 to 0,1,1,2, i.e. the number
 *   of set lanes, which is added to %sum in the loop-branch delay slot
 *   (not annulled, so it also runs on the final iteration).
 * f6 and f10 then take over f2 and f12 for the next round.
 */
802	1:	ldda		[%src] %asi, %f2	/*  Load	Group		*/
803	fpadd32		%f10, %f2, %f12		/*  FPA		Group+load stall*/
804	add		%src, 8, %src		/*  IEU0			*/
805	add		%dst, 8, %dst		/*  IEU1			*/
806	faligndata	%f6, %f2, %f14		/*  FPA		Group		*/
807	fcmpgt32	%f10, %f12, %x5		/*  FPM		Group		*/
808	std		%f14, [%dst - 16]	/*  Store			*/
809	fmovd		%f2, %f6		/*  FPA				*/
810	fmovd		%f12, %f10		/*  FPA		Group		*/
811	sub		%len, 8, %len		/*  IEU1			*/
812	fzero		%f16			/*  FPA		Group - FPU nop	*/
813	fzero		%f18			/*  FPA		Group - FPU nop	*/
814	inc		%x5			/*  IEU0			*/
815	srl		%x5, 1, %x5		/*  IEU0	Group (regdep)	*/
816	cmp		%len, 8			/*  IEU1			*/
817	bgeu,pt		%icc, 1b		/*  CTI				*/
818	 add		%x5, %sum, %sum		/*  IEU0	Group		*/
/*
 * Flush the pending doubleword in f6.  With a zero %gsr offset all eight
 * bytes are stored (annulled delay slot, taken path only).  Otherwise only
 * the word in f7 is stored at [%dst - 8], %dst is moved back by 4 and %len
 * is increased by 4 so the scalar tail at 26 handles the rest.
 */
819	3:	brz,a,pt	%x3, 2f			/*  CTI+IEU1			*/
820	 std		%f6, [%dst - 8]		/*  Store	Group		*/
821	st		%f7, [%dst - 8]		/*  Store	Group		*/
822	sub		%dst, 4, %dst		/*  IEU0			*/
823	add		%len, 4, %len		/*  IEU1			*/
/*
 * 2: run END_THE_TRICK2 over f48-f62 / f10 / f12, then membar #Sync.
 * In the kernel build %sp is lowered by 8 around the macro and VISExit and
 * restored afterwards.
 * NOTE(review): END_THE_TRICK2 and VISExit are defined outside this
 * excerpt; presumably the macro folds the FP partial sums into %sum and
 * needs the 8 bytes of stack as scratch - confirm.
 */
824	2:
825#ifdef __KERNEL__
826	sub		%sp, 8, %sp		/*  IEU0	Group		*/
827#endif
828	END_THE_TRICK2(	f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
829	membar		#Sync			/*  LSU		Group		*/
830#ifdef __KERNEL__
831	VISExit
832	add		%sp, 8, %sp		/*  IEU0	Group		*/
833#endif
/*
 * 23/24/25: fold the 64-bit accumulator in %sum down to 32 bits and return.
 *
 * If bytes remain (%len != 0) the scalar tail at 26 runs first and comes
 * back to 25.  The sllx at 24 sits in the brnz delay slot and is not
 * annulled, so %g1 = %sum << 32 is computed on both paths.
 *
 * At 25, %sum + (%sum << 32) places the sum of the two 32-bit halves in the
 * upper word; srlx moves it down without touching the condition codes, and
 * the carry out of the 64-bit add is then added back in by the annulled
 * delay slot (executed only when bcs is taken).  The result is left in
 * %src, which is %o0.
 *
 * User build: return with the result zero-extended to 32 bits.
 * Kernel build: %g4 (used as scratch x4 in this file) is reloaded with
 * %uhi(PAGE_OFFSET) << 32 before returning.
 * NOTE(review): presumably the kernel expects that value in %g4 - confirm.
 */
83423:	brnz,pn		%len, 26f		/*  CTI+IEU1	Group		*/
83524:	 sllx		%sum, 32, %g1		/*  IEU0			*/
83625:	addcc		%sum, %g1, %src		/*  IEU1	Group		*/
837	srlx		%src, 32, %src		/*  IEU0	Group (regdep)	*/
838	bcs,a,pn	%xcc, 1f		/*  CTI				*/
839	 add		%src, 1, %src		/*  IEU1			*/
840#ifndef __KERNEL__
8411:	retl					/*  CTI		Group brk forced*/
842	 srl		%src, 0, %src		/*  IEU0			*/
843#else
8441:	sethi		%uhi(PAGE_OFFSET), %g4	/*  IEU0	Group		*/
845	retl					/*  CTI		Group brk forced*/
846	 sllx		%g4, 32, %g4		/*  IEU0			*/
847#endif
/*
 * 26: scalar tail.  Copies and checksums the remaining bytes by testing
 * bits 3, 2, 1 and 0 of %len in turn, then rejoins the fold at 25.
 *
 * 8-byte step: two 32-bit loads are stored back as two words and combined
 * into one 64-bit value (first word in the upper half) that is added to
 * %sum with end-around carry.  The first lduwa sits in a non-annulled
 * delay slot, so it is issued even when the 8-byte step is skipped.
 */
84826:	andcc		%len, 8, %g0		/*  IEU1	Group		*/
849	be,pn		%icc, 1f		/*  CTI				*/
850	 lduwa		[%src] %asi, %o4	/*  Load			*/
851	lduwa		[%src+4] %asi, %g2	/*  Load	Group		*/
852	add		%src, 8, %src		/*  IEU0			*/
853	add		%dst, 8, %dst		/*  IEU1			*/
854	sllx		%o4, 32, %g5		/*  IEU0	Group		*/
855	stw		%o4, [%dst - 8]		/*  Store			*/
856	or		%g5, %g2, %g5		/*  IEU0	Group		*/
857	stw		%g2, [%dst - 4]		/*  Store			*/
858	addcc		%g5, %sum, %sum		/*  IEU1	Group		*/
859	bcs,a,pn	%xcc, 1f		/*  CTI				*/
860	 add		%sum, 1, %sum		/*  IEU0			*/
/*
 * 4-, 2- and 1-byte steps.  Each "be,a" has an annulled delay slot, so the
 * clr runs only when the step is skipped; otherwise the loaded value is
 * stored to %dst and shifted into its slot:
 *   %g2 = word     << 32
 *   %g3 = halfword << 16
 *   %o5 = byte     << 8
 */
8611:	andcc		%len, 4, %g0		/*  IEU1	Group		*/
862	be,a,pn		%icc, 1f		/*  CTI				*/
863	 clr		%g2			/*  IEU0			*/
864	lduwa		[%src] %asi, %g7	/*  Load			*/
865	add		%src, 4, %src		/*  IEU0	Group		*/
866	add		%dst, 4, %dst		/*  IEU1			*/
867	sllx		%g7, 32, %g2		/*  IEU0	Group		*/
868	stw		%g7, [%dst - 4]		/*  Store			*/
8691:	andcc		%len, 2, %g0		/*  IEU1			*/
870	be,a,pn		%icc, 1f		/*  CTI				*/
871	 clr		%g3			/*  IEU0	Group		*/
872	lduha		[%src] %asi, %g7	/*  Load			*/
873	add		%src, 2, %src		/*  IEU1			*/
874	add		%dst, 2, %dst		/*  IEU0	Group		*/
875	sll		%g7, 16, %g3		/*  IEU0	Group		*/
876	sth		%g7, [%dst - 2]		/*  Store			*/
8771:	andcc		%len, 1, %g0		/*  IEU1			*/
878	be,a,pn		%icc, 1f		/*  CTI				*/
879	 clr		%o5			/*  IEU0	Group		*/
880	lduba		[%src] %asi, %g7	/*  Load			*/
881	sll		%g7, 8, %o5		/*  IEU0	Group		*/
882	stb		%g7, [%dst]		/*  Store			*/
/*
 * Merge the three partial values, add them to %sum with end-around carry
 * and branch back to 25; the delay slot recomputes %g1 = %sum << 32 from
 * the updated sum, as the fold at 25 expects.
 */
8831:	or		%g2, %g3, %g3		/*  IEU1			*/
884	or		%o5, %g3, %g3		/*  IEU0	Group (regdep)	*/
885	addcc		%g3, %sum, %sum		/*  IEU1	Group (regdep)	*/
886	bcs,a,pn	%xcc, 1f		/*  CTI				*/
887	 add		%sum, 1, %sum		/*  IEU0			*/
8881:	ba,pt		%xcc, 25b		/*  CTI		Group		*/
889	 sllx		%sum, 32, %g1		/*  IEU0			*/
890
/*
 * Kernel build only: "end" marks the end of the routine's code, and one
 * entry is emitted into the __ex_table section naming
 * csum_partial_copy_vis, 0, end and cpc_handler.
 * NOTE(review): presumably this is a range entry that sends faults taken
 * between csum_partial_copy_vis and end to cpc_handler - confirm against
 * the sparc64 exception table format.
 */
891#ifdef __KERNEL__
892end:
893
894	.section	__ex_table
895	.align		4
896	.word		csum_partial_copy_vis, 0, end, cpc_handler
897#endif
898