! Preamble: ABI selection and section setup.
! STPTR/SIZE_T abstract over 32- vs 64-bit pointer stores so the same
! source serves both sparcv9 ILP32 and __arch64__ LP64 ABIs.
! LOCALS is the offset of the first usable stack slot above the
! register-window save area (STACK_BIAS is 2047 on 64-bit SPARC,
! 0 on 32-bit — defined in crypto/sparc_arch.h).
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
# define	STPTR	stx
# define	SIZE_T	8
#else
# define	STPTR	st
# define	SIZE_T	4
#endif
#define	LOCALS	(STACK_BIAS+STACK_FRAME)

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

! int poly1305_init(void *ctx = %i0, const unsigned char *key = %i1,
!                   void *func[2] = %i2)
! Zeroes the 130-bit hash value at ctx[0..16], clamps and stores the
! 16-byte r portion of the key at ctx+32, and — when the CPU supports
! VIS3 or FMA — publishes pointers to the faster blocks/emit routines
! through func[].  Returns 1 if func[] was filled in, 0 otherwise
! (null key, or plain-umul CPU).
! NOTE(review): key may be unaligned; ldxa with ASI 0x88
! (little-endian primary) plus the shift/merge below handles that.
.globl	poly1305_init
.align	32
poly1305_init:
	save	%sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld	[%g1],%g1

	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp	%g1,SPARCV9_FMADD
	be	.Lpoly1305_init_fma
	nop

	stx	%g0,[%i0+0]
	stx	%g0,[%i0+8]		! zero hash value
	brz,pn	%i1,.Lno_key
	stx	%g0,[%i0+16]

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	sll	%i5,3,%i5		! *8
	neg	%i5,%i4

	sethi	%hi(0x0ffffffc),%o4
	set	8,%o1
	or	%o4,%lo(0x0ffffffc),%o4
	set	16,%o2
	sllx	%o4,32,%o5
	or	%o4,%o5,%o5		! 0x0ffffffc0ffffffc
	or	%o5,3,%o4		! 0x0ffffffc0fffffff

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian key
	brz,pt	%i5,.Lkey_aligned
	ldxa	[%i1+%o1]0x88,%o1

	ldxa	[%i1+%o2]0x88,%o2
	srlx	%o0,%i5,%o0
	sllx	%o1,%i4,%o7
	srlx	%o1,%i5,%o1
	or	%o7,%o0,%o0
	sllx	%o2,%i4,%o2
	or	%o2,%o1,%o1

.Lkey_aligned:
	and	%o4,%o0,%o0		! clamp r per Poly1305 spec
	and	%o5,%o1,%o1
	stx	%o0,[%i0+32+0]		! store key
	stx	%o1,[%i0+32+8]

	andcc	%g1,SPARCV9_VIS3,%g0
	be	.Lno_key
	nop

1:	call	.+8
	add	%o7,poly1305_blocks_vis3-1b,%o7

	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR	%o7,[%i2]
	STPTR	%o5,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init,#function
.size	poly1305_init,.-poly1305_init
92
93.globl	poly1305_blocks
94.align	32
95poly1305_blocks:
96	save	%sp,-STACK_FRAME,%sp
97	srln	%i2,4,%i2
98
99	brz,pn	%i2,.Lno_data
100	nop
101
102	ld	[%i0+32+0],%l1		! load key
103	ld	[%i0+32+4],%l0
104	ld	[%i0+32+8],%l3
105	ld	[%i0+32+12],%l2
106
107	ld	[%i0+0],%o1		! load hash value
108	ld	[%i0+4],%o0
109	ld	[%i0+8],%o3
110	ld	[%i0+12],%o2
111	ld	[%i0+16],%l7
112
113	and	%i1,7,%i5		! alignment factor
114	andn	%i1,7,%i1
115	set	8,%g2
116	sll	%i5,3,%i5		! *8
117	set	16,%g3
118	neg	%i5,%i4
119
120	srl	%l1,2,%l4
121	srl	%l2,2,%l5
122	add	%l1,%l4,%l4
123	srl	%l3,2,%l6
124	add	%l2,%l5,%l5
125	add	%l3,%l6,%l6
126
127.Loop:
128	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
129	brz,pt	%i5,.Linp_aligned
130	ldxa	[%i1+%g2]0x88,%g2
131
132	ldxa	[%i1+%g3]0x88,%g3
133	srlx	%g1,%i5,%g1
134	sllx	%g2,%i4,%o5
135	srlx	%g2,%i5,%g2
136	or	%o5,%g1,%g1
137	sllx	%g3,%i4,%g3
138	or	%g3,%g2,%g2
139
140.Linp_aligned:
141	srlx	%g1,32,%o4
142	addcc	%g1,%o0,%o0		! accumulate input
143	srlx	%g2,32,%o5
144	addccc	%o4,%o1,%o1
145	addccc	%g2,%o2,%o2
146	addccc	%o5,%o3,%o3
147	addc	%i3,%l7,%l7
148
149	umul	%l0,%o0,%g1
150	umul	%l1,%o0,%g2
151	umul	%l2,%o0,%g3
152	umul	%l3,%o0,%g4
153	 sub	%i2,1,%i2
154	 add	%i1,16,%i1
155
156	umul	%l6,%o1,%o4
157	umul	%l0,%o1,%o5
158	umul	%l1,%o1,%o7
159	add	%o4,%g1,%g1
160	add	%o5,%g2,%g2
161	umul	%l2,%o1,%o4
162	add	%o7,%g3,%g3
163	add	%o4,%g4,%g4
164
165	umul	%l5,%o2,%o5
166	umul	%l6,%o2,%o7
167	umul	%l0,%o2,%o4
168	add	%o5,%g1,%g1
169	add	%o7,%g2,%g2
170	umul	%l1,%o2,%o5
171	add	%o4,%g3,%g3
172	add	%o5,%g4,%g4
173
174	umul	%l4,%o3,%o7
175	umul	%l5,%o3,%o4
176	umul	%l6,%o3,%o5
177	add	%o7,%g1,%g1
178	add	%o4,%g2,%g2
179	umul	%l0,%o3,%o7
180	add	%o5,%g3,%g3
181	add	%o7,%g4,%g4
182
183	umul	%l4,%l7,%o4
184	umul	%l5,%l7,%o5
185	umul	%l6,%l7,%o7
186	umul	%l0,%l7,%l7
187	add	%o4,%g2,%g2
188	add	%o5,%g3,%g3
189	srlx	%g1,32,%o1
190	add	%o7,%g4,%g4
191	srlx	%g2,32,%o2
192
193	addcc	%g2,%o1,%o1
194	srlx	%g3,32,%o3
195	 set	8,%g2
196	addccc	%g3,%o2,%o2
197	srlx	%g4,32,%o4
198	 set	16,%g3
199	addccc	%g4,%o3,%o3
200	addc	%o4,%l7,%l7
201
202	srl	%l7,2,%o4		! final reduction step
203	andn	%l7,3,%o5
204	and	%l7,3,%l7
205	add	%o5,%o4,%o4
206
207	addcc	%o4,%g1,%o0
208	addccc	%g0,%o1,%o1
209	addccc	%g0,%o2,%o2
210	addccc	%g0,%o3,%o3
211	brnz,pt	%i2,.Loop
212	addc	%g0,%l7,%l7
213
214	st	%o1,[%i0+0]		! store hash value
215	st	%o0,[%i0+4]
216	st	%o3,[%i0+8]
217	st	%o2,[%i0+12]
218	st	%l7,[%i0+16]
219
220.Lno_data:
221	ret
222	restore
223.type	poly1305_blocks,#function
224.size	poly1305_blocks,.-poly1305_blocks
! VIS3 variant of poly1305_blocks: same arguments
! (%i0=ctx, %i1=inp, %i2=len, %i3=padbit), but the hash is kept as
! 2x64+2 bits and the multiply uses mulx/umulxhi plus the VIS3
! carry-chain ops addxc/addxccc (emitted as .word for pre-VIS3
! assemblers).  Reached only via the function pointer published by
! poly1305_init, never called directly.
.align	32
poly1305_blocks_vis3:
	save	%sp,-STACK_FRAME,%sp
	srln	%i2,4,%i2		! len /= 16

	brz,pn	%i2,.Lno_data
	nop

	ldx	[%i0+32+0],%o3		! load key
	ldx	[%i0+32+8],%o4

	ldx	[%i0+0],%o0		! load hash value
	ldx	[%i0+8],%o1
	ld	[%i0+16],%o2

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	set	8,%l1
	sll	%i5,3,%i5		! *8
	set	16,%l2
	neg	%i5,%i4

	srlx	%o4,2,%o5		! s1 = r1 + r1>>2
	b	.Loop_vis3
	add	%o4,%o5,%o5

.Loop_vis3:
	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
	brz,pt	%i5,.Linp_aligned_vis3
	ldxa	[%i1+%l1]0x88,%g2

	ldxa	[%i1+%l2]0x88,%g3	! unaligned: merge across words
	srlx	%g1,%i5,%g1
	sllx	%g2,%i4,%o7
	srlx	%g2,%i5,%g2
	or	%o7,%g1,%g1
	sllx	%g3,%i4,%g3
	or	%g3,%g2,%g2

.Linp_aligned_vis3:
	addcc	%g1,%o0,%o0		! accumulate input
	 sub	%i2,1,%i2
	.word	0x93b08269 !addxccc	%g2,%o1,%o1
	 add	%i1,16,%i1

	mulx	%o3,%o0,%g1		! r0*h0
	.word	0x95b6c22a !addxc	%i3,%o2,%o2
	.word	0x85b2c2c8 !umulxhi	%o3,%o0,%g2
	mulx	%o5,%o1,%g4		! s1*h1
	.word	0x9fb342c9 !umulxhi	%o5,%o1,%o7
	addcc	%g4,%g1,%g1
	mulx	%o4,%o0,%g4		! r1*h0
	.word	0x85b3c222 !addxc	%o7,%g2,%g2
	.word	0x87b302c8 !umulxhi	%o4,%o0,%g3
	addcc	%g4,%g2,%g2
	mulx	%o3,%o1,%g4		! r0*h1
	.word	0x87b00223 !addxc	%g0,%g3,%g3
	.word	0x9fb2c2c9 !umulxhi	%o3,%o1,%o7
	addcc	%g4,%g2,%g2
	mulx	%o5,%o2,%g4		! s1*h2
	.word	0x87b3c223 !addxc	%o7,%g3,%g3
	mulx	%o3,%o2,%o7		! r0*h2
	addcc	%g4,%g2,%g2
	.word	0x87b3c223 !addxc	%o7,%g3,%g3

	srlx	%g3,2,%g4		! final reduction step
	andn	%g3,3,%o7
	and	%g3,3,%o2
	add	%o7,%g4,%g4		! (h>>130)*5 folded back in

	addcc	%g4,%g1,%o0
	.word	0x93b00262 !addxccc	%g0,%g2,%o1
	brnz,pt	%i2,.Loop_vis3
	.word	0x95b0022a !addxc	%g0,%o2,%o2

	stx	%o0,[%i0+0]		! store hash value
	stx	%o1,[%i0+8]
	st	%o2,[%i0+16]

	ret
	restore
.type	poly1305_blocks_vis3,#function
.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3
! void poly1305_emit(void *ctx = %i0, unsigned char mac[16] = %i1,
!                    const unsigned int nonce[4] = %i2)
! Final step: conditionally subtracts the modulus 2^130-5 from the
! accumulated hash (via add-5-and-test on bit 130), adds the nonce,
! and stores the 16-byte tag little-endian one byte at a time
! (output buffer may be unaligned).
.globl	poly1305_emit
.align	32
poly1305_emit:
	save	%sp,-STACK_FRAME,%sp

	ld	[%i0+0],%o1		! load hash value
	ld	[%i0+4],%o0
	ld	[%i0+8],%o3
	ld	[%i0+12],%o2
	ld	[%i0+16],%l7

	addcc	%o0,5,%l0		! compare to modulus
	addccc	%o1,0,%l1
	addccc	%o2,0,%l2
	addccc	%o3,0,%l3
	addc	%l7,0,%l7
	andcc	%l7,4,%g0		! did it carry/borrow?

	movnz	%icc,%l0,%o0		! select h+5-2^130 if h >= 2^130-5
	ld	[%i2+0],%l0		! load nonce
	movnz	%icc,%l1,%o1
	ld	[%i2+4],%l1
	movnz	%icc,%l2,%o2
	ld	[%i2+8],%l2
	movnz	%icc,%l3,%o3
	ld	[%i2+12],%l3

	addcc	%l0,%o0,%o0		! accumulate nonce
	addccc	%l1,%o1,%o1
	addccc	%l2,%o2,%o2
	addc	%l3,%o3,%o3

	srl	%o0,8,%l0
	stb	%o0,[%i1+0]		! store little-endian result
	srl	%o0,16,%l1
	stb	%l0,[%i1+1]
	srl	%o0,24,%l2
	stb	%l1,[%i1+2]
	stb	%l2,[%i1+3]

	srl	%o1,8,%l0
	stb	%o1,[%i1+4]
	srl	%o1,16,%l1
	stb	%l0,[%i1+5]
	srl	%o1,24,%l2
	stb	%l1,[%i1+6]
	stb	%l2,[%i1+7]

	srl	%o2,8,%l0
	stb	%o2,[%i1+8]
	srl	%o2,16,%l1
	stb	%l0,[%i1+9]
	srl	%o2,24,%l2
	stb	%l1,[%i1+10]
	stb	%l2,[%i1+11]

	srl	%o3,8,%l0
	stb	%o3,[%i1+12]
	srl	%o3,16,%l1
	stb	%l0,[%i1+13]
	srl	%o3,24,%l2
	stb	%l1,[%i1+14]
	stb	%l2,[%i1+15]

	ret
	restore
.type	poly1305_emit,#function
.size	poly1305_emit,.-poly1305_emit
! FMA (floating-point) flavour of key setup, entered either directly
! or via the tail-branch from poly1305_init (.Lpoly1305_init_fma).
! Converts the clamped key to base-2^32 limbs held in biased
! double-precision values, pre-computes hi/lo splits and the *5/2^130
! "s" multipliers, and stores them all into the context for
! poly1305_blocks_fma.  %fsr is switched to round-to-zero around the
! conversions and restored afterwards.
.align	32
poly1305_init_fma:
	save	%sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7	! PC-relative address of constants

	ldd	[%o7+8*0],%f16			! load constants
	ldd	[%o7+8*1],%f18
	ldd	[%o7+8*2],%f20
	ldd	[%o7+8*3],%f22
	ldd	[%o7+8*5],%f26

	std	%f16,[%i0+8*0]		! initial hash value, biased 0
	std	%f18,[%i0+8*1]
	std	%f20,[%i0+8*2]
	std	%f22,[%i0+8*3]

	brz,pn	%i1,.Lno_key_fma
	nop

	stx	%fsr,[%sp+LOCALS]		! save original %fsr
	ldx	[%o7+8*6],%fsr			! load new %fsr

	std	%f16,[%i0+8*4] 		! key "template"
	std	%f18,[%i0+8*5]
	std	%f20,[%i0+8*6]
	std	%f22,[%i0+8*7]

	and	%i1,7,%l2
	andn	%i1,7,%i1			! align pointer
	mov	8,%l0
	sll	%l2,3,%l2
	mov	16,%l1
	neg	%l2,%l3

	ldxa	[%i1+%g0]0x88,%o0		! load little-endian key
	ldxa	[%i1+%l0]0x88,%o2

	brz	%l2,.Lkey_aligned_fma
	sethi	%hi(0xf0000000),%l0		!   0xf0000000

	ldxa	[%i1+%l1]0x88,%o4

	srlx	%o0,%l2,%o0			! align data
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o2
	or	%o1,%o0,%o0
	sllx	%o4,%l3,%o3
	or	%o3,%o2,%o2

.Lkey_aligned_fma:
	or	%l0,3,%l1			!   0xf0000003
	srlx	%o0,32,%o1
	andn	%o0,%l0,%o0			! &=0x0fffffff
	andn	%o1,%l1,%o1			! &=0x0ffffffc
	srlx	%o2,32,%o3
	andn	%o2,%l1,%o2
	andn	%o3,%l1,%o3

	st	%o0,[%i0+36]		! fill "template"
	st	%o1,[%i0+44]
	st	%o2,[%i0+52]
	st	%o3,[%i0+60]

	ldd	[%i0+8*4],%f0 		! load [biased] key
	ldd	[%i0+8*5],%f4
	ldd	[%i0+8*6],%f8
	ldd	[%i0+8*7],%f12

	fsubd	%f0,%f16, %f0		! r0
	 ldd	[%o7+8*7],%f16 		! more constants
	fsubd	%f4,%f18,%f4		! r1
	 ldd	[%o7+8*8],%f18
	fsubd	%f8,%f20,%f8		! r2
	 ldd	[%o7+8*9],%f20
	fsubd	%f12,%f22,%f12		! r3
	 ldd	[%o7+8*10],%f22

	fmuld	%f26,%f4,%f52	! s1
	fmuld	%f26,%f8,%f40	! s2
	fmuld	%f26,%f12,%f44	! s3

	faddd	%f0,%f16, %f2		! split r[i] into hi+lo halves
	faddd	%f4,%f18,%f6
	faddd	%f8,%f20,%f10
	faddd	%f12,%f22,%f14

	fsubd	%f2,%f16, %f2
	 ldd	[%o7+8*11],%f16		! more constants
	fsubd	%f6,%f18,%f6
	 ldd	[%o7+8*12],%f18
	fsubd	%f10,%f20,%f10
	 ldd	[%o7+8*13],%f20
	fsubd	%f14,%f22,%f14

	fsubd	%f0,%f2,%f0
	 std	%f2,[%i0+8*5] 		! r0hi
	fsubd	%f4,%f6,%f4
	 std	%f6,[%i0+8*7] 		! r1hi
	fsubd	%f8,%f10,%f8
	 std	%f10,[%i0+8*9] 		! r2hi
	fsubd	%f12,%f14,%f12
	 std	%f14,[%i0+8*11]		! r3hi

	faddd	%f52,%f16, %f54		! split s[i] likewise
	faddd	%f40,%f18,%f42
	faddd	%f44,%f20,%f46

	fsubd	%f54,%f16, %f54
	fsubd	%f42,%f18,%f42
	fsubd	%f46,%f20,%f46

	fsubd	%f52,%f54,%f52
	fsubd	%f40,%f42,%f40
	fsubd	%f44,%f46,%f44

	ldx	[%sp+LOCALS],%fsr		! restore %fsr

	std	%f0,[%i0+8*4] 		! r0lo
	std	%f4,[%i0+8*6] 		! r1lo
	std	%f8,[%i0+8*8] 		! r2lo
	std	%f12,[%i0+8*10]		! r3lo

	std	%f54,[%i0+8*13]
	std	%f42,[%i0+8*15]
	std	%f46,[%i0+8*17]

	std	%f52,[%i0+8*12]
	std	%f40,[%i0+8*14]
	std	%f44,[%i0+8*16]

	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR	%o0,[%i2]			! publish FMA entry points
	STPTR	%o1,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0			! return 1

.Lno_key_fma:
	ret
	restore	%g0,%g0,%o0			! return 0
.type	poly1305_init_fma,#function
.size	poly1305_init_fma,.-poly1305_init_fma
523
! FMA block function (%i0=ctx, %i1=inp, %i2=len, %i3=padbit), reached
! via the pointer published by poly1305_init_fma.  The hash is kept in
! base 2^32 as biased doubles; each iteration multiplies by the
! pre-split key limbs with fmaddd and re-normalizes base 2^48 -> 2^32
! using the "add big constant, subtract it back" rounding trick, which
! depends on the truncating %fsr loaded below.  Input loads are
! modulo-scheduled one iteration ahead; %l1 (stride) is zeroed via
! movrz on the last block so the pointer stops advancing.
.align	32
poly1305_blocks_fma:
	save	%sp,-STACK_FRAME-48,%sp
	srln	%i2,4,%i2		! len /= 16

	brz,pn	%i2,.Labort
	sub	%i2,1,%i2

1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7	! PC-relative address of constants

	ldd	[%o7+8*0],%f16			! load constants
	ldd	[%o7+8*1],%f18
	ldd	[%o7+8*2],%f20
	ldd	[%o7+8*3],%f22
	ldd	[%o7+8*4],%f24
	ldd	[%o7+8*5],%f26

	ldd	[%i0+8*0],%f0 		! load [biased] hash value
	ldd	[%i0+8*1],%f4
	ldd	[%i0+8*2],%f8
	ldd	[%i0+8*3],%f12

	std	%f16,[%sp+LOCALS+8*0]		! input "template"
	sethi	%hi((1023+52+96)<<20),%o3
	std	%f18,[%sp+LOCALS+8*1]
	or	%i3,%o3,%o3			! fold padbit into top limb's exponent word
	std	%f20,[%sp+LOCALS+8*2]
	st	%o3,[%sp+LOCALS+8*3]

	and	%i1,7,%l2
	andn	%i1,7,%i1			! align pointer
	mov	8,%l0
	sll	%l2,3,%l2
	mov	16,%l1
	neg	%l2,%l3

	ldxa	[%i1+%g0]0x88,%o0		! load little-endian input
	brz	%l2,.Linp_aligned_fma
	ldxa	[%i1+%l0]0x88,%o2

	ldxa	[%i1+%l1]0x88,%o4
	add	%i1,8,%i1

	srlx	%o0,%l2,%o0			! align data
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o2
	or	%o1,%o0,%o0
	sllx	%o4,%l3,%o3
	srlx	%o4,%l2,%o4			! pre-shift
	or	%o3,%o2,%o2

.Linp_aligned_fma:
	srlx	%o0,32,%o1
	movrz	%i2,0,%l1
	srlx	%o2,32,%o3
	add	%l1,%i1,%i1			! conditional advance

	st	%o0,[%sp+LOCALS+8*0+4]		! fill "template"
	st	%o1,[%sp+LOCALS+8*1+4]
	st	%o2,[%sp+LOCALS+8*2+4]
	st	%o3,[%sp+LOCALS+8*3+4]

	ldd	[%i0+8*4],%f28 		! load key
	ldd	[%i0+8*5],%f30
	ldd	[%i0+8*6],%f32
	ldd	[%i0+8*7],%f34
	ldd	[%i0+8*8],%f36
	ldd	[%i0+8*9],%f38
	ldd	[%i0+8*10],%f48
	ldd	[%i0+8*11],%f50
	ldd	[%i0+8*12],%f52
	ldd	[%i0+8*13],%f54
	ldd	[%i0+8*14],%f40
	ldd	[%i0+8*15],%f42
	ldd	[%i0+8*16],%f44
	ldd	[%i0+8*17],%f46

	stx	%fsr,[%sp+LOCALS+8*4]		! save original %fsr
	ldx	[%o7+8*6],%fsr			! load new %fsr

	subcc	%i2,1,%i2
	movrz	%i2,0,%l1

	ldd	[%sp+LOCALS+8*0],%f56		! load biased input
	ldd	[%sp+LOCALS+8*1],%f58
	ldd	[%sp+LOCALS+8*2],%f60
	ldd	[%sp+LOCALS+8*3],%f62

	fsubd	%f0,%f16, %f0		! de-bias hash value
	fsubd	%f4,%f18,%f4
	 ldxa	[%i1+%g0]0x88,%o0		! modulo-scheduled input load
	fsubd	%f8,%f20,%f8
	fsubd	%f12,%f22,%f12
	 ldxa	[%i1+%l0]0x88,%o2

	fsubd	%f56,%f16, %f56  		! de-bias input
	fsubd	%f58,%f18,%f58
	fsubd	%f60,%f20,%f60
	fsubd	%f62,%f22,%f62

	brz	%l2,.Linp_aligned_fma2
	add	%l1,%i1,%i1			! conditional advance

	sllx	%o0,%l3,%o1			! align data
	srlx	%o0,%l2,%o3
	or	%o1,%o4,%o0
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o4			! pre-shift
	or	%o3,%o1,%o2
.Linp_aligned_fma2:
	srlx	%o0,32,%o1
	srlx	%o2,32,%o3

	faddd	%f0,%f56,%f56			! accumulate input
	 stw	%o0,[%sp+LOCALS+8*0+4]
	faddd	%f4,%f58,%f58
	 stw	%o1,[%sp+LOCALS+8*1+4]
	faddd	%f8,%f60,%f60
	 stw	%o2,[%sp+LOCALS+8*2+4]
	faddd	%f12,%f62,%f62
	 stw	%o3,[%sp+LOCALS+8*3+4]

	b	.Lentry_fma
	nop

.align	16
.Loop_fma:
	ldxa	[%i1+%g0]0x88,%o0		! modulo-scheduled input load
	ldxa	[%i1+%l0]0x88,%o2
	movrz	%i2,0,%l1

	faddd	%f52,%f0,%f0 		! accumulate input
	faddd	%f54,%f2,%f2
	faddd	%f62,%f8,%f8
	faddd	%f60,%f10,%f10

	brz,pn	%l2,.Linp_aligned_fma3
	add	%l1,%i1,%i1			! conditional advance

	sllx	%o0,%l3,%o1			! align data
	srlx	%o0,%l2,%o3
	or	%o1,%o4,%o0
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o4			! pre-shift
	or	%o3,%o1,%o2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	%f20,%f4,%f52
	 srlx	%o0,32,%o1
	faddd	%f20,%f6,%f54
	 srlx	%o2,32,%o3
	faddd	%f24,%f12,%f60
	 st	%o0,[%sp+LOCALS+8*0+4]		! fill "template"
	faddd	%f24,%f14,%f62
	 st	%o1,[%sp+LOCALS+8*1+4]
	faddd	%f18,%f0,%f48
	 st	%o2,[%sp+LOCALS+8*2+4]
	faddd	%f18,%f2,%f50
	 st	%o3,[%sp+LOCALS+8*3+4]
	faddd	%f22,%f8,%f56
	faddd	%f22,%f10,%f58

	fsubd	%f52,%f20,%f52
	fsubd	%f54,%f20,%f54
	fsubd	%f60,%f24,%f60
	fsubd	%f62,%f24,%f62
	fsubd	%f48,%f18,%f48
	fsubd	%f50,%f18,%f50
	fsubd	%f56,%f22,%f56
	fsubd	%f58,%f22,%f58

	fsubd	%f4,%f52,%f4
	fsubd	%f6,%f54,%f6
	fsubd	%f12,%f60,%f12
	fsubd	%f14,%f62,%f14
	fsubd	%f8,%f56,%f8
	fsubd	%f10,%f58,%f10
	fsubd	%f0,%f48,%f0
	fsubd	%f2,%f50,%f2

	faddd	%f4,%f48,%f4		! carry low parts into next limb
	faddd	%f6,%f50,%f6
	faddd	%f12,%f56,%f12
	faddd	%f14,%f58,%f14
	faddd	%f8,%f52,%f8
	faddd	%f10,%f54,%f10
	.word	0x81be805d !fmaddd	%f26,%f60,%f0,%f0
	.word	0x85be845f !fmaddd	%f26,%f62,%f2,%f2

	faddd	%f4,%f6,%f58
	 ldd	[%i0+8*12],%f52		! reload constants
	faddd	%f12,%f14,%f62
	 ldd	[%i0+8*13],%f54
	faddd	%f8,%f10,%f60
	 ldd	[%i0+8*10],%f48
	faddd	%f0,%f2,%f56
	 ldd	[%i0+8*11],%f50

.Lentry_fma:
	fmuld	%f58,%f44,%f0		! h *= r, column by column
	fmuld	%f58,%f46,%f2
	fmuld	%f58,%f32,%f8
	fmuld	%f58,%f34,%f10
	fmuld	%f58,%f28,%f4
	fmuld	%f58,%f30,%f6
	fmuld	%f58,%f36,%f12
	fmuld	%f58,%f38,%f14

	.word	0x81bfc055 !fmaddd	%f62,%f52,%f0,%f0
	.word	0x85bfc457 !fmaddd	%f62,%f54,%f2,%f2
	.word	0x91bfd04d !fmaddd	%f62,%f44,%f8,%f8
	.word	0x95bfd44f !fmaddd	%f62,%f46,%f10,%f10
	.word	0x89bfc849 !fmaddd	%f62,%f40,%f4,%f4
	.word	0x8dbfcc4b !fmaddd	%f62,%f42,%f6,%f6
	.word	0x99bfd85c !fmaddd	%f62,%f28,%f12,%f12
	.word	0x9dbfdc5e !fmaddd	%f62,%f30,%f14,%f14

	.word	0x81bf4049 !fmaddd	%f60,%f40,%f0,%f0
	.word	0x85bf444b !fmaddd	%f60,%f42,%f2,%f2
	.word	0x91bf505c !fmaddd	%f60,%f28,%f8,%f8
	.word	0x95bf545e !fmaddd	%f60,%f30,%f10,%f10
	.word	0x89bf484d !fmaddd	%f60,%f44,%f4,%f4
	 ldd	[%sp+LOCALS+8*0],%f52		! load [biased] input
	.word	0x8dbf4c4f !fmaddd	%f60,%f46,%f6,%f6
	 ldd	[%sp+LOCALS+8*1],%f54
	.word	0x99bf5841 !fmaddd	%f60,%f32,%f12,%f12
	 ldd	[%sp+LOCALS+8*2],%f62
	.word	0x9dbf5c43 !fmaddd	%f60,%f34,%f14,%f14
	 ldd	[%sp+LOCALS+8*3],%f60

	.word	0x81be405c !fmaddd	%f56,%f28,%f0,%f0
	 fsubd	%f52,%f16, %f52  		! de-bias input
	.word	0x85be445e !fmaddd	%f56,%f30,%f2,%f2
	 fsubd	%f54,%f18,%f54
	.word	0x91be5045 !fmaddd	%f56,%f36,%f8,%f8
	 fsubd	%f62,%f20,%f62
	.word	0x95be5447 !fmaddd	%f56,%f38,%f10,%f10
	 fsubd	%f60,%f22,%f60
	.word	0x89be4841 !fmaddd	%f56,%f32,%f4,%f4
	.word	0x8dbe4c43 !fmaddd	%f56,%f34,%f6,%f6
	.word	0x99be5851 !fmaddd	%f56,%f48,%f12,%f12
	.word	0x9dbe5c53 !fmaddd	%f56,%f50,%f14,%f14

	bcc	SIZE_T_CC,.Loop_fma
	subcc	%i2,1,%i2

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	%f0,%f18,%f48
	faddd	%f2,%f18,%f50
	faddd	%f8,%f22,%f56
	faddd	%f10,%f22,%f58
	faddd	%f4,%f20,%f52
	faddd	%f6,%f20,%f54
	faddd	%f12,%f24,%f60
	faddd	%f14,%f24,%f62

	fsubd	%f48,%f18,%f48
	fsubd	%f50,%f18,%f50
	fsubd	%f56,%f22,%f56
	fsubd	%f58,%f22,%f58
	fsubd	%f52,%f20,%f52
	fsubd	%f54,%f20,%f54
	fsubd	%f60,%f24,%f60
	fsubd	%f62,%f24,%f62

	fsubd	%f4,%f52,%f4
	fsubd	%f6,%f54,%f6
	fsubd	%f12,%f60,%f12
	fsubd	%f14,%f62,%f14
	fsubd	%f8,%f56,%f8
	fsubd	%f10,%f58,%f10
	fsubd	%f0,%f48,%f0
	fsubd	%f2,%f50,%f2

	faddd	%f4,%f48,%f4
	faddd	%f6,%f50,%f6
	faddd	%f12,%f56,%f12
	faddd	%f14,%f58,%f14
	faddd	%f8,%f52,%f8
	faddd	%f10,%f54,%f10
	.word	0x81be805d !fmaddd	%f26,%f60,%f0,%f0
	.word	0x85be845f !fmaddd	%f26,%f62,%f2,%f2

	faddd	%f4,%f6,%f58
	faddd	%f12,%f14,%f62
	faddd	%f8,%f10,%f60
	faddd	%f0,%f2,%f56

	faddd	%f58,%f18,%f58  		! bias
	faddd	%f62,%f22,%f62
	faddd	%f60,%f20,%f60
	faddd	%f56,%f16, %f56

	ldx	[%sp+LOCALS+8*4],%fsr		! restore saved %fsr

	std	%f58,[%i0+8*1]			! store [biased] hash value
	std	%f62,[%i0+8*3]
	std	%f60,[%i0+8*2]
	std	%f56,[%i0+8*0]

.Labort:
	ret
	restore
.type	poly1305_blocks_fma,#function
.size	poly1305_blocks_fma,.-poly1305_blocks_fma
! Emit for the FMA representation (%i0=ctx, %i1=mac, %i2=nonce).
! Strips the exponent bias from the four stored doubles to recover the
! base-2^32 limbs, performs one partial reduction (the FMA loop leaves
! h only partially reduced), then the usual conditional subtraction of
! 2^130-5 — here branch-free via a sign-extended mask — nonce addition,
! and byte-wise little-endian store of the 16-byte tag.
.align	32
poly1305_emit_fma:
	save	%sp,-STACK_FRAME,%sp

	ld	[%i0+8*0+0],%l5		! load hash
	ld	[%i0+8*0+4],%l0
	ld	[%i0+8*1+0],%o0
	ld	[%i0+8*1+4],%l1
	ld	[%i0+8*2+0],%o1
	ld	[%i0+8*2+4],%l2
	ld	[%i0+8*3+0],%o2
	ld	[%i0+8*3+4],%l3

	sethi	%hi(0xfff00000),%o3
	andn	%l5,%o3,%l5			! mask exponent
	andn	%o0,%o3,%o0
	andn	%o1,%o3,%o1
	andn	%o2,%o3,%o2			! can be partially reduced...
	mov	3,%o3

	srl	%o2,2,%i3			! ... so reduce
	and	%o2,%o3,%l4
	andn	%o2,%o3,%o2
	add	%i3,%o2,%o2			! (carry>>2)*5 via +carry + carry>>2

	addcc	%o2,%l0,%l0
	addccc	%l5,%l1,%l1
	addccc	%o0,%l2,%l2
	addccc	%o1,%l3,%l3
	addc	%g0,%l4,%l4

	addcc	%l0,5,%l5			! compare to modulus
	addccc	%l1,0,%o0
	addccc	%l2,0,%o1
	addccc	%l3,0,%o2
	addc	%l4,0,%o3

	srl	%o3,2,%o3			! did it carry/borrow?
	neg	%o3,%o3
	sra	%o3,31,%o3			! mask

	andn	%l0,%o3,%l0			! branch-free select h vs h+5-2^130
	and	%l5,%o3,%l5
	andn	%l1,%o3,%l1
	and	%o0,%o3,%o0
	or	%l5,%l0,%l0
	ld	[%i2+0],%l5			! load nonce
	andn	%l2,%o3,%l2
	and	%o1,%o3,%o1
	or	%o0,%l1,%l1
	ld	[%i2+4],%o0
	andn	%l3,%o3,%l3
	and	%o2,%o3,%o2
	or	%o1,%l2,%l2
	ld	[%i2+8],%o1
	or	%o2,%l3,%l3
	ld	[%i2+12],%o2

	addcc	%l5,%l0,%l0			! accumulate nonce
	addccc	%o0,%l1,%l1
	addccc	%o1,%l2,%l2
	addc	%o2,%l3,%l3

	stb	%l0,[%i1+0]			! write little-endian result
	srl	%l0,8,%l0
	stb	%l1,[%i1+4]
	srl	%l1,8,%l1
	stb	%l2,[%i1+8]
	srl	%l2,8,%l2
	stb	%l3,[%i1+12]
	srl	%l3,8,%l3

	stb	%l0,[%i1+1]
	srl	%l0,8,%l0
	stb	%l1,[%i1+5]
	srl	%l1,8,%l1
	stb	%l2,[%i1+9]
	srl	%l2,8,%l2
	stb	%l3,[%i1+13]
	srl	%l3,8,%l3

	stb	%l0,[%i1+2]
	srl	%l0,8,%l0
	stb	%l1,[%i1+6]
	srl	%l1,8,%l1
	stb	%l2,[%i1+10]
	srl	%l2,8,%l2
	stb	%l3,[%i1+14]
	srl	%l3,8,%l3

	stb	%l0,[%i1+3]
	stb	%l1,[%i1+7]
	stb	%l2,[%i1+11]
	stb	%l3,[%i1+15]

	ret
	restore
.type	poly1305_emit_fma,#function
.size	poly1305_emit_fma,.-poly1305_emit_fma
! Constant pool for the FMA code path: exponent-bias doubles used to
! split/renormalize limbs, the 5/2^130 multiplier, and the %fsr image
! (round-toward-zero, all FP exceptions masked).  Indexed from %o7 as
! .Lconsts_fma+8*n throughout the FMA routines above.
.align	64
.Lconsts_fma:
.word	0x43300000,0x00000000		! 2^(52+0)
.word	0x45300000,0x00000000		! 2^(52+32)
.word	0x47300000,0x00000000		! 2^(52+64)
.word	0x49300000,0x00000000		! 2^(52+96)
.word	0x4b500000,0x00000000		! 2^(52+130)

.word	0x37f40000,0x00000000		! 5/2^130
.word	0,1<<30				! fsr: truncate, no exceptions

.word	0x44300000,0x00000000		! 2^(52+16+0)
.word	0x46300000,0x00000000		! 2^(52+16+32)
.word	0x48300000,0x00000000		! 2^(52+16+64)
.word	0x4a300000,0x00000000		! 2^(52+16+96)
.word	0x3e300000,0x00000000		! 2^(52+16+0-96)
.word	0x40300000,0x00000000		! 2^(52+16+32-96)
.word	0x42300000,0x00000000		! 2^(52+16+64-96)
.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro@openssl.org>"
.align	4
950