.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    or, if the above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *    A quick-n-dirty way to fuse the module into the library,
 *    provided that the library is already configured and built
 *    (in the 0.9.2 case, with the no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    A quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. The V8plus architecture? What kind of beast is that?
 * A. Well, it's more of a programming model than an architecture...
 *    It's actually a v9-compliant CPU, i.e. *any* UltraSPARC, running
 *    under special conditions, namely when the kernel doesn't preserve
 *    the upper 32 bits of the otherwise 64-bit registers during a
 *    context switch.
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release targeted UltraSPARC only. A SuperSPARC
 *    version is now provided alongside it. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the code
 *    for an explanation). But what's so special about this UltraSPARC
 *    implementation? Why didn't I let the compiler do the job? The
 *    trouble is that most available compilers (well, SC5.0 is the only
 *    exception) don't attempt to take advantage of UltraSPARC's
 *    64-bitness under 32-bit kernels even though it's perfectly
 *    possible (see the next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch is
 *    that you may actually rely upon %o0-%o5 and %g1-%g4 being fully
 *    preserved if you're in a leaf function, i.e. one that never calls
 *    any other function. All functions in this module are leaf, and
 *    10 registers is a handful. As a matter of fact, the non-"comba"
 *    routines don't even require that much, and I could afford not to
 *    allocate a stack frame for 'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
 *
 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain any
 *    position-dependent references, so it's safe to include it in a
 *    shared library as is.
 *
 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In any case, below is what I
 *    experienced with the crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *    As you can see, it's damn hard to beat the new Sun C compiler,
 *    so it is first and foremost GNU C users who will appreciate this
 *    assembler implementation:-)
 */

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally the unrolled loop looked like this:
 *	    for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	    }
 *	Now I unroll according to the following scheme:
 *	    while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	    }
 *	    if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	    }
 */
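
/*
 * For illustration only: a minimal, compilable C sketch of the
 * unrolling scheme above applied to a concrete word operation (a
 * plain 32x32->64 multiply with carry propagation, roughly what
 * bn_mul_words does). The function name mul_words_unrolled and the
 * word32 typedef are my own, not part of this module or of OpenSSL.
 *
 *	typedef unsigned int word32;
 *
 *	static word32 mul_words_unrolled(word32 *rp, const word32 *ap,
 *					 int n, word32 w)
 *	{
 *		unsigned long long acc = 0;	// carry lives in the upper half
 *
 *		while (n & ~3) {		// main, fully unrolled part
 *			acc += (unsigned long long)ap[0]*w; rp[0] = (word32)acc; acc >>= 32;
 *			acc += (unsigned long long)ap[1]*w; rp[1] = (word32)acc; acc >>= 32;
 *			acc += (unsigned long long)ap[2]*w; rp[2] = (word32)acc; acc >>= 32;
 *			acc += (unsigned long long)ap[3]*w; rp[3] = (word32)acc; acc >>= 32;
 *			ap += 4; rp += 4; n -= 4;
 *		}
 *		while (n--) {			// tail, at most 3 iterations
 *			acc += (unsigned long long)*ap++ * w;
 *			*rp++ = (word32)acc; acc >>= 32;
 *		}
 *		return (word32)acc;		// propagated carry
 *	}
 */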

/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
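/*
 * For reference, a minimal C sketch of what this routine computes,
 * assuming a 32-bit BN_ULONG (the v8plus model used here); this is
 * not the source the assembler below was derived from.
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num,
 *				  BN_ULONG w)
 *	{
 *		unsigned long long c = 0;	// carry accumulator
 *		int i;
 *
 *		for (i = 0; i < num; i++) {
 *			c += (unsigned long long)ap[i]*w + rp[i];
 *			rp[i] = (BN_ULONG)c;	// low 32 bits
 *			c >>= 32;		// carry into the next word
 *		}
 *		return (BN_ULONG)c;		// final carry word
 *	}
 */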
bn_mul_add_words:
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
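/*
 * A hedged C sketch of the computation, assuming a 32-bit BN_ULONG:
 * rp[i] gets the low 32 bits of ap[i]*w plus the running carry, and
 * the final carry word is returned.
 *
 *	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num,
 *			      BN_ULONG w)
 *	{
 *		unsigned long long c = 0;
 *		int i;
 *
 *		for (i = 0; i < num; i++) {
 *			c += (unsigned long long)ap[i]*w;
 *			rp[i] = (BN_ULONG)c;
 *			c >>= 32;
 *		}
 *		return (BN_ULONG)c;
 *	}
 */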
bn_mul_words:
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align  32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
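/*
 * A hedged C sketch of the computation, assuming a 32-bit BN_ULONG:
 * each input word is squared into two output words, so r[] holds
 * 2*n words on return.
 *
 *	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			unsigned long long t = (unsigned long long)a[i]*a[i];
 *			r[2*i]   = (BN_ULONG)t;		// low word
 *			r[2*i+1] = (BN_ULONG)(t >> 32);	// high word
 *		}
 *	}
 */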
bn_sqr_words:
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
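/*
 * A hedged C sketch of the computation, assuming a 32-bit BN_ULONG:
 * the double word h:l is divided by d and the low 32 bits of the
 * quotient are returned, which is exactly what the sllx/or/udivx/srl
 * sequence below does.
 *
 *	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 *	{
 *		unsigned long long n = ((unsigned long long)h << 32) | l;
 *
 *		return (BN_ULONG)(n / d);
 *	}
 */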
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw	%o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
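/*
 * A hedged C sketch of the computation, assuming a 32-bit BN_ULONG:
 * rp[i] = ap[i] + bp[i] + carry, with the final carry (0 or 1)
 * returned.
 *
 *	BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
 *			      int n)
 *	{
 *		unsigned long long c = 0;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			c += (unsigned long long)ap[i] + bp[i];
 *			rp[i] = (BN_ULONG)c;
 *			c >>= 32;		// carry is 0 or 1
 *		}
 *		return (BN_ULONG)c;
 *	}
 */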
bn_add_words:
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_add_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
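/*
 * A hedged C sketch of the computation, assuming a 32-bit BN_ULONG:
 * rp[i] = ap[i] - bp[i] - borrow, with the final borrow (0 or 1)
 * returned.
 *
 *	BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
 *			      int n)
 *	{
 *		BN_ULONG borrow = 0;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			unsigned long long t = (unsigned long long)ap[i]
 *					     - bp[i] - borrow;
 *			rp[i] = (BN_ULONG)t;
 *			borrow = (BN_ULONG)(t >> 32) & 1;	// 1 on underflow
 *		}
 *		return borrow;
 *	}
 */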
bn_sub_words:
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_sub_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:		! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * The code below depends on the fact that the upper parts of the
 * %l0-%l7 and %i0-%i7 registers are zeroed by the kernel after a
 * context switch. In previous versions this comment stated that "the
 * trouble is that it's not feasible to implement the mumbo-jumbo in
 * less V9 instructions:-(", which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of the
 * multicycle, non-pairable 'rd %y,%rd' instructions.
 *
 *							Andy.
 */
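
/*
 * A hedged C sketch of the mul_add_c() primitive referred to in the
 * comments below: the comba routines accumulate each 96-bit column
 * sum in the word triple c1:c2:c3. In the assembler the low 64 bits
 * live in c_12 and the carry out of the 64-bit add is caught by the
 * 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair; t_2 holds 1<<32, so the
 * carries collect in the upper half of c_3 and can simply be or'ed
 * back into c_12 once the column's low word has been stored. The
 * macro below is a generic definition, not a transcript of the code.
 *
 *	#define mul_add_c(a,b,c1,c2,c3) do {				\
 *		unsigned long long t = (unsigned long long)(a)*(b);	\
 *		unsigned long long s = (unsigned long long)(c1) + (BN_ULONG)t;\
 *		(c1) = (BN_ULONG)s;					\
 *		s = (s>>32) + (c2) + (BN_ULONG)(t>>32);			\
 *		(c2) = (BN_ULONG)s;					\
 *		(c3) += (BN_ULONG)(s>>32);				\
 *	} while (0)
 */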

#define FRAME_SIZE	-96

/*
 * Here is the register usage map for *all* routines below.
 */
#define t_1	%o0
#define	t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o4
#define	b_4	%o5
#define	b_5	%o7
#define	b_6	%g1
#define	b_7	%g4

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
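/*
 * A hedged C sketch of the comba idea for the 4-word case (the
 * 8-word routine below uses the same scheme, just with more
 * columns): the product is formed column by column, least
 * significant first, accumulating every a[i]*b[j] with i+j == k
 * before the result word r[k] is stored. The function name
 * mul_comba4_sketch is my own; a 32-bit BN_ULONG is assumed.
 *
 *	void mul_comba4_sketch(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 *	{
 *		unsigned long long acc = 0, t;	// 96-bit column: c3:acc
 *		BN_ULONG c3 = 0;
 *		int i, k;
 *
 *		for (k = 0; k < 7; k++) {	// columns r[0]..r[6]
 *			for (i = 0; i < 4; i++) {
 *				int j = k - i;
 *				if (j < 0 || j > 3)
 *					continue;
 *				t = (unsigned long long)a[i]*b[j];
 *				acc += t;
 *				if (acc < t)	// 64-bit overflow
 *					c3++;
 *			}
 *			r[k] = (BN_ULONG)acc;	// store the column's low word
 *			acc = (acc >> 32) | ((unsigned long long)c3 << 32);
 *			c3 = 0;
 *		}
 *		r[7] = (BN_ULONG)acc;		// final carry word
 *	}
 *
 * In the assembler the same accumulator is kept in c_12 (low 64 bits)
 * and c_3 (carries), as described in the comment above.
 */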
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	st	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)

.align	32

.global bn_sqr_comba8
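/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */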
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	st	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32
