.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    or if the above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library.
 *    Provided that the library is already configured and built
 *    (in the 0.9.2 case with the no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually any v9-compliant CPU, i.e. *any* UltraSPARC, run
 *    under special conditions, namely when the kernel doesn't
 *    preserve the upper 32 bits of the otherwise 64-bit registers
 *    during a context switch.
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
 *    version is provided alongside. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the
 *    code for an explanation). But what's so special about this
 *    UltraSPARC implementation? Why didn't I let the compiler do the
 *    job? The trouble is that most of the available compilers (well,
 *    SC5.0 is the only exception) don't attempt to take advantage of
 *    UltraSPARC's 64-bitness under 32-bit kernels even though it's
 *    perfectly possible (see the next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch is
 *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
 *    preserved if you're in a leaf function, i.e. one that never
 *    calls any other functions. All functions in this module are leaf
 *    and 10 registers is a handful. As a matter of fact the
 *    non-"comba" routines don't require even that much, and I could
 *    even afford not to allocate a stack frame of their own for
 *    'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
 *
 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain any
 *    position-dependent code and it's safe to include it in a shared
 *    library as is.
 *
 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In any case, below is what I
 *    experience with the crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *    As you can see it's damn hard to beat the new Sun C compiler,
 *    and it's first and foremost GNU C users who will appreciate this
 *    assembler implementation:-)
 */

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in a slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally the unrolled loop looked like this:
 *	    for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	    }
 *	I unroll according to the following:
 *	    while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	    }
 *	    if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	    }
 */

/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
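/*
 * Reference semantics in C, as a sketch only (this mirrors the
 * generic crypto/bn/bn_asm.c code this module replaces; assumes
 * 32-bit BN_ULONG and a 64-bit unsigned long long):
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap,
 *				  int num, BN_ULONG w)
 *	{
 *		BN_ULONG c = 0;
 *		while (num--) {
 *			unsigned long long t =
 *			    (unsigned long long)w * *ap++ + *rp + c;
 *			*rp++ = (BN_ULONG)t;		// low word
 *			c = (BN_ULONG)(t >> 32);	// carry
 *		}
 *		return c;
 *	}
 */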
bn_mul_add_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
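/*
 * Same as bn_mul_add_words minus the accumulation into rp[]; a
 * sketch of the semantics (assumes 32-bit BN_ULONG):
 *
 *	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap,
 *			      int num, BN_ULONG w)
 *	{
 *		BN_ULONG c = 0;
 *		while (num--) {
 *			unsigned long long t =
 *			    (unsigned long long)w * *ap++ + c;
 *			*rp++ = (BN_ULONG)t;		// low word
 *			c = (BN_ULONG)(t >> 32);	// carry
 *		}
 *		return c;
 *	}
 */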
bn_mul_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align	32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
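/*
 * A sketch of the semantics (assumes 32-bit BN_ULONG): each input
 * word yields two output words, so r[] holds 2*n words on return:
 *
 *	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 *	{
 *		while (n--) {
 *			unsigned long long t =
 *			    (unsigned long long)*a * *a;
 *			a++;
 *			*r++ = (BN_ULONG)t;		// low word
 *			*r++ = (BN_ULONG)(t >> 32);	// high word
 *		}
 *	}
 */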
bn_sqr_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
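/*
 * A sketch of the semantics (assumes 32-bit BN_ULONG): a 64/32-bit
 * division with the quotient truncated to 32 bits, just as the
 * trailing 'srl' below does; callers are expected to arrange h < d
 * so that the quotient actually fits:
 *
 *	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 *	{
 *		return (BN_ULONG)
 *		    ((((unsigned long long)h << 32) | l) / d);
 *	}
 */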
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw	%o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
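/*
 * A sketch of the semantics (assumes 32-bit BN_ULONG); the return
 * value is the final carry, 0 or 1:
 *
 *	BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap,
 *			      BN_ULONG *bp, int n)
 *	{
 *		BN_ULONG c = 0;
 *		while (n--) {
 *			unsigned long long t =
 *			    (unsigned long long)*ap++ + *bp++ + c;
 *			*rp++ = (BN_ULONG)t;
 *			c = (BN_ULONG)(t >> 32);	// carry out
 *		}
 *		return c;
 *	}
 */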
bn_add_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_add_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
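/*
 * A sketch of the semantics (assumes 32-bit BN_ULONG); the return
 * value is the final borrow, 0 or 1:
 *
 *	BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap,
 *			      BN_ULONG *bp, int n)
 *	{
 *		BN_ULONG b = 0;
 *		while (n--) {
 *			unsigned long long t =
 *			    (unsigned long long)*ap++ - *bp++ - b;
 *			*rp++ = (BN_ULONG)t;
 *			b = (BN_ULONG)(t >> 63);	// borrow out
 *		}
 *		return b;
 *	}
 */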
bn_sub_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_sub_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:		! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)
/*
 * The code below depends on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in less V9
 * instructions:-(" which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
 *
 *							Andy.
 */
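
/*
 * To make the fully unrolled "comba" code below easier to follow,
 * here is a C sketch of the step that every mulx/addcc/bcs,a/add
 * quadruplet performs (assuming 32-bit words; c_12 holds the two
 * low words of a 96-bit column accumulator, c_3 collects carries
 * into the third word, and t_2 is preloaded with 1<<32):
 *
 *	typedef unsigned int		u32;
 *	typedef unsigned long long	u64;
 *
 *	static void mul_add_c(u64 a, u64 b, u64 *c12, u64 *c3)
 *	{
 *		u64 t1 = a * b;			// mulx
 *		u64 s  = *c12 + t1;		// addcc
 *		if (s < t1)			// bcs,a	%xcc,.+8
 *			*c3 += 1ULL << 32;	//  add	c_3,t_2,c_3
 *		*c12 = s;
 *	}
 *
 * and at the end of every column (r[k] being the k-th result word):
 *
 *	r[k] = (u32)*c12;		// stuw	t_1,rp(k)
 *	*c12 = (*c12 >> 32) | *c3;	// srlx + or
 *	*c3  = 0;			// clr	c_3
 */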

#define FRAME_SIZE	-96

/*
 * Here is the register usage map for *all* routines below.
 */
#define t_1	%o0
#define	t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o4
#define	b_4	%o5
#define	b_5	%o7
#define	b_6	%g1
#define	b_7	%g4

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
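/*
 * bn_mul_comba8 computes the full 16-word product column by column
 * (Comba's scheme). A sketch of the loop this code unrolls
 * completely, reusing the u32/u64 types and the carry trick from
 * the comment above:
 *
 *	u64 acc = 0;	// low two words of the column accumulator
 *	u32 hi  = 0;	// third word, the carry collector
 *	for (k = 0; k < 15; k++) {
 *		for (i = (k > 7 ? k - 7 : 0); i <= (k < 7 ? k : 7); i++) {
 *			u64 t = (u64)a[i] * b[k - i];
 *			acc += t;
 *			hi  += (acc < t);	// carry into third word
 *		}
 *		r[k] = (u32)acc;
 *		acc  = (acc >> 32) | ((u64)hi << 32);
 *		hi   = 0;
 *	}
 *	r[15] = (u32)acc;
 */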
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)

.align	32

.global bn_sqr_comba8
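/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Same column scheme as bn_mul_comba8 above. The sqr_add_c2() steps
 * add the cross product a[i]*a[j], i!=j, into the accumulator twice
 * (doubling it up front could overflow 64 bits), which is why a
 * single mulx below is followed by two addcc/bcs,a pairs. A sketch
 * of one such step, in the same u32/u64 terms as above:
 *
 *	u64 t = (u64)a[i] * a[j];	// mulx
 *	acc += t; hi += (acc < t);	// first  addcc/bcs,a
 *	acc += t; hi += (acc < t);	// second addcc/bcs,a
 */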
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32
