mips3.s revision 277195
1.rdata
2.asciiz	"mips3.s, Version 1.1"
3.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
4
5/*
6 * ====================================================================
7 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
8 * project.
9 *
10 * Rights for redistribution and usage in source and binary forms are
11 * granted according to the OpenSSL license. Warranty of any kind is
12 * disclaimed.
13 * ====================================================================
14 */
15
16/*
17 * This is my modest contribution to the OpenSSL project (see
18 * http://www.openssl.org/ for more information about it) and is
19 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
20 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
21 *
22 * The module is designed to work with either of the "new" MIPS ABI(5),
23 * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
24 * IRIX 5.x not only because it doesn't support new ABIs but also
25 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
26 * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
27 * cause illegal instruction exception:-(
28 *
29 * In addition the code depends on preprocessor flags set up by MIPSpro
30 * compiler driver (either as or cc) and therefore (probably?) can't be
31 * compiled by the GNU assembler. GNU C driver manages fine though...
32 * I mean as long as -mmips-as is specified or is the default option,
33 * because then it simply invokes /usr/bin/as which in turn takes
34 * perfect care of the preprocessor definitions. Another neat feature
35 * offered by the MIPSpro assembler is an optimization pass. This gave
36 * me the opportunity to have the code looking more regular as all those
37 * architecture dependent instruction rescheduling details were left to
38 * the assembler. Cool, huh?
39 *
40 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
41 * goes way over 3 times faster!
42 *
43 *					<appro@fy.chalmers.se>
44 */
45#include <asm.h>
46#include <regdef.h>
47
48#if _MIPS_ISA>=4
49#define	MOVNZ(cond,dst,src)	\
50	movn	dst,src,cond
51#else
52#define	MOVNZ(cond,dst,src)	\
53	.set	noreorder;	\
54	bnezl	cond,.+8;	\
55	move	dst,src;	\
56	.set	reorder
57#endif
58
59.text
60
61.set	noat
62.set	reorder
63
64#define	MINUS4	v1
65
66.align	5
67LEAF(bn_mul_add_words)
68	.set	noreorder
69	bgtzl	a2,.L_bn_mul_add_words_proceed
70	ld	t0,0(a1)
71	jr	ra
72	move	v0,zero
73	.set	reorder
74
75.L_bn_mul_add_words_proceed:
76	li	MINUS4,-4
77	and	ta0,a2,MINUS4
78	move	v0,zero
79	beqz	ta0,.L_bn_mul_add_words_tail
80
81.L_bn_mul_add_words_loop:
82	dmultu	t0,a3
83	ld	t1,0(a0)
84	ld	t2,8(a1)
85	ld	t3,8(a0)
86	ld	ta0,16(a1)
87	ld	ta1,16(a0)
88	daddu	t1,v0
89	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
90				 * values", but it seems to work fine
91				 * even on 64-bit registers. */
92	mflo	AT
93	mfhi	t0
94	daddu	t1,AT
95	daddu	v0,t0
96	sltu	AT,t1,AT
97	sd	t1,0(a0)
98	daddu	v0,AT
99
100	dmultu	t2,a3
101	ld	ta2,24(a1)
102	ld	ta3,24(a0)
103	daddu	t3,v0
104	sltu	v0,t3,v0
105	mflo	AT
106	mfhi	t2
107	daddu	t3,AT
108	daddu	v0,t2
109	sltu	AT,t3,AT
110	sd	t3,8(a0)
111	daddu	v0,AT
112
113	dmultu	ta0,a3
114	subu	a2,4
115	PTR_ADD	a0,32
116	PTR_ADD	a1,32
117	daddu	ta1,v0
118	sltu	v0,ta1,v0
119	mflo	AT
120	mfhi	ta0
121	daddu	ta1,AT
122	daddu	v0,ta0
123	sltu	AT,ta1,AT
124	sd	ta1,-16(a0)
125	daddu	v0,AT
126
127
128	dmultu	ta2,a3
129	and	ta0,a2,MINUS4
130	daddu	ta3,v0
131	sltu	v0,ta3,v0
132	mflo	AT
133	mfhi	ta2
134	daddu	ta3,AT
135	daddu	v0,ta2
136	sltu	AT,ta3,AT
137	sd	ta3,-8(a0)
138	daddu	v0,AT
139	.set	noreorder
140	bgtzl	ta0,.L_bn_mul_add_words_loop
141	ld	t0,0(a1)
142
143	bnezl	a2,.L_bn_mul_add_words_tail
144	ld	t0,0(a1)
145	.set	reorder
146
147.L_bn_mul_add_words_return:
148	jr	ra
149
150.L_bn_mul_add_words_tail:
151	dmultu	t0,a3
152	ld	t1,0(a0)
153	subu	a2,1
154	daddu	t1,v0
155	sltu	v0,t1,v0
156	mflo	AT
157	mfhi	t0
158	daddu	t1,AT
159	daddu	v0,t0
160	sltu	AT,t1,AT
161	sd	t1,0(a0)
162	daddu	v0,AT
163	beqz	a2,.L_bn_mul_add_words_return
164
165	ld	t0,8(a1)
166	dmultu	t0,a3
167	ld	t1,8(a0)
168	subu	a2,1
169	daddu	t1,v0
170	sltu	v0,t1,v0
171	mflo	AT
172	mfhi	t0
173	daddu	t1,AT
174	daddu	v0,t0
175	sltu	AT,t1,AT
176	sd	t1,8(a0)
177	daddu	v0,AT
178	beqz	a2,.L_bn_mul_add_words_return
179
180	ld	t0,16(a1)
181	dmultu	t0,a3
182	ld	t1,16(a0)
183	daddu	t1,v0
184	sltu	v0,t1,v0
185	mflo	AT
186	mfhi	t0
187	daddu	t1,AT
188	daddu	v0,t0
189	sltu	AT,t1,AT
190	sd	t1,16(a0)
191	daddu	v0,AT
192	jr	ra
193END(bn_mul_add_words)
194
195.align	5
196LEAF(bn_mul_words)
197	.set	noreorder
198	bgtzl	a2,.L_bn_mul_words_proceed
199	ld	t0,0(a1)
200	jr	ra
201	move	v0,zero
202	.set	reorder
203
204.L_bn_mul_words_proceed:
205	li	MINUS4,-4
206	and	ta0,a2,MINUS4
207	move	v0,zero
208	beqz	ta0,.L_bn_mul_words_tail
209
210.L_bn_mul_words_loop:
211	dmultu	t0,a3
212	ld	t2,8(a1)
213	ld	ta0,16(a1)
214	ld	ta2,24(a1)
215	mflo	AT
216	mfhi	t0
217	daddu	v0,AT
218	sltu	t1,v0,AT
219	sd	v0,0(a0)
220	daddu	v0,t1,t0
221
222	dmultu	t2,a3
223	subu	a2,4
224	PTR_ADD	a0,32
225	PTR_ADD	a1,32
226	mflo	AT
227	mfhi	t2
228	daddu	v0,AT
229	sltu	t3,v0,AT
230	sd	v0,-24(a0)
231	daddu	v0,t3,t2
232
233	dmultu	ta0,a3
234	mflo	AT
235	mfhi	ta0
236	daddu	v0,AT
237	sltu	ta1,v0,AT
238	sd	v0,-16(a0)
239	daddu	v0,ta1,ta0
240
241
242	dmultu	ta2,a3
243	and	ta0,a2,MINUS4
244	mflo	AT
245	mfhi	ta2
246	daddu	v0,AT
247	sltu	ta3,v0,AT
248	sd	v0,-8(a0)
249	daddu	v0,ta3,ta2
250	.set	noreorder
251	bgtzl	ta0,.L_bn_mul_words_loop
252	ld	t0,0(a1)
253
254	bnezl	a2,.L_bn_mul_words_tail
255	ld	t0,0(a1)
256	.set	reorder
257
258.L_bn_mul_words_return:
259	jr	ra
260
261.L_bn_mul_words_tail:
262	dmultu	t0,a3
263	subu	a2,1
264	mflo	AT
265	mfhi	t0
266	daddu	v0,AT
267	sltu	t1,v0,AT
268	sd	v0,0(a0)
269	daddu	v0,t1,t0
270	beqz	a2,.L_bn_mul_words_return
271
272	ld	t0,8(a1)
273	dmultu	t0,a3
274	subu	a2,1
275	mflo	AT
276	mfhi	t0
277	daddu	v0,AT
278	sltu	t1,v0,AT
279	sd	v0,8(a0)
280	daddu	v0,t1,t0
281	beqz	a2,.L_bn_mul_words_return
282
283	ld	t0,16(a1)
284	dmultu	t0,a3
285	mflo	AT
286	mfhi	t0
287	daddu	v0,AT
288	sltu	t1,v0,AT
289	sd	v0,16(a0)
290	daddu	v0,t1,t0
291	jr	ra
292END(bn_mul_words)
293
294.align	5
295LEAF(bn_sqr_words)
296	.set	noreorder
297	bgtzl	a2,.L_bn_sqr_words_proceed
298	ld	t0,0(a1)
299	jr	ra
300	move	v0,zero
301	.set	reorder
302
303.L_bn_sqr_words_proceed:
304	li	MINUS4,-4
305	and	ta0,a2,MINUS4
306	move	v0,zero
307	beqz	ta0,.L_bn_sqr_words_tail
308
309.L_bn_sqr_words_loop:
310	dmultu	t0,t0
311	ld	t2,8(a1)
312	ld	ta0,16(a1)
313	ld	ta2,24(a1)
314	mflo	t1
315	mfhi	t0
316	sd	t1,0(a0)
317	sd	t0,8(a0)
318
319	dmultu	t2,t2
320	subu	a2,4
321	PTR_ADD	a0,64
322	PTR_ADD	a1,32
323	mflo	t3
324	mfhi	t2
325	sd	t3,-48(a0)
326	sd	t2,-40(a0)
327
328	dmultu	ta0,ta0
329	mflo	ta1
330	mfhi	ta0
331	sd	ta1,-32(a0)
332	sd	ta0,-24(a0)
333
334
335	dmultu	ta2,ta2
336	and	ta0,a2,MINUS4
337	mflo	ta3
338	mfhi	ta2
339	sd	ta3,-16(a0)
340	sd	ta2,-8(a0)
341
342	.set	noreorder
343	bgtzl	ta0,.L_bn_sqr_words_loop
344	ld	t0,0(a1)
345
346	bnezl	a2,.L_bn_sqr_words_tail
347	ld	t0,0(a1)
348	.set	reorder
349
350.L_bn_sqr_words_return:
351	move	v0,zero
352	jr	ra
353
354.L_bn_sqr_words_tail:
355	dmultu	t0,t0
356	subu	a2,1
357	mflo	t1
358	mfhi	t0
359	sd	t1,0(a0)
360	sd	t0,8(a0)
361	beqz	a2,.L_bn_sqr_words_return
362
363	ld	t0,8(a1)
364	dmultu	t0,t0
365	subu	a2,1
366	mflo	t1
367	mfhi	t0
368	sd	t1,16(a0)
369	sd	t0,24(a0)
370	beqz	a2,.L_bn_sqr_words_return
371
372	ld	t0,16(a1)
373	dmultu	t0,t0
374	mflo	t1
375	mfhi	t0
376	sd	t1,32(a0)
377	sd	t0,40(a0)
378	jr	ra
379END(bn_sqr_words)
380
381.align	5
382LEAF(bn_add_words)
383	.set	noreorder
384	bgtzl	a3,.L_bn_add_words_proceed
385	ld	t0,0(a1)
386	jr	ra
387	move	v0,zero
388	.set	reorder
389
390.L_bn_add_words_proceed:
391	li	MINUS4,-4
392	and	AT,a3,MINUS4
393	move	v0,zero
394	beqz	AT,.L_bn_add_words_tail
395
396.L_bn_add_words_loop:
397	ld	ta0,0(a2)
398	subu	a3,4
399	ld	t1,8(a1)
400	and	AT,a3,MINUS4
401	ld	t2,16(a1)
402	PTR_ADD	a2,32
403	ld	t3,24(a1)
404	PTR_ADD	a0,32
405	ld	ta1,-24(a2)
406	PTR_ADD	a1,32
407	ld	ta2,-16(a2)
408	ld	ta3,-8(a2)
409	daddu	ta0,t0
410	sltu	t8,ta0,t0
411	daddu	t0,ta0,v0
412	sltu	v0,t0,ta0
413	sd	t0,-32(a0)
414	daddu	v0,t8
415
416	daddu	ta1,t1
417	sltu	t9,ta1,t1
418	daddu	t1,ta1,v0
419	sltu	v0,t1,ta1
420	sd	t1,-24(a0)
421	daddu	v0,t9
422
423	daddu	ta2,t2
424	sltu	t8,ta2,t2
425	daddu	t2,ta2,v0
426	sltu	v0,t2,ta2
427	sd	t2,-16(a0)
428	daddu	v0,t8
429
430	daddu	ta3,t3
431	sltu	t9,ta3,t3
432	daddu	t3,ta3,v0
433	sltu	v0,t3,ta3
434	sd	t3,-8(a0)
435	daddu	v0,t9
436
437	.set	noreorder
438	bgtzl	AT,.L_bn_add_words_loop
439	ld	t0,0(a1)
440
441	bnezl	a3,.L_bn_add_words_tail
442	ld	t0,0(a1)
443	.set	reorder
444
445.L_bn_add_words_return:
446	jr	ra
447
448.L_bn_add_words_tail:
449	ld	ta0,0(a2)
450	daddu	ta0,t0
451	subu	a3,1
452	sltu	t8,ta0,t0
453	daddu	t0,ta0,v0
454	sltu	v0,t0,ta0
455	sd	t0,0(a0)
456	daddu	v0,t8
457	beqz	a3,.L_bn_add_words_return
458
459	ld	t1,8(a1)
460	ld	ta1,8(a2)
461	daddu	ta1,t1
462	subu	a3,1
463	sltu	t9,ta1,t1
464	daddu	t1,ta1,v0
465	sltu	v0,t1,ta1
466	sd	t1,8(a0)
467	daddu	v0,t9
468	beqz	a3,.L_bn_add_words_return
469
470	ld	t2,16(a1)
471	ld	ta2,16(a2)
472	daddu	ta2,t2
473	sltu	t8,ta2,t2
474	daddu	t2,ta2,v0
475	sltu	v0,t2,ta2
476	sd	t2,16(a0)
477	daddu	v0,t8
478	jr	ra
479END(bn_add_words)
480
481.align	5
482LEAF(bn_sub_words)
483	.set	noreorder
484	bgtzl	a3,.L_bn_sub_words_proceed
485	ld	t0,0(a1)
486	jr	ra
487	move	v0,zero
488	.set	reorder
489
490.L_bn_sub_words_proceed:
491	li	MINUS4,-4
492	and	AT,a3,MINUS4
493	move	v0,zero
494	beqz	AT,.L_bn_sub_words_tail
495
496.L_bn_sub_words_loop:
497	ld	ta0,0(a2)
498	subu	a3,4
499	ld	t1,8(a1)
500	and	AT,a3,MINUS4
501	ld	t2,16(a1)
502	PTR_ADD	a2,32
503	ld	t3,24(a1)
504	PTR_ADD	a0,32
505	ld	ta1,-24(a2)
506	PTR_ADD	a1,32
507	ld	ta2,-16(a2)
508	ld	ta3,-8(a2)
509	sltu	t8,t0,ta0
510	dsubu	t0,ta0
511	dsubu	ta0,t0,v0
512	sd	ta0,-32(a0)
513	MOVNZ	(t0,v0,t8)
514
515	sltu	t9,t1,ta1
516	dsubu	t1,ta1
517	dsubu	ta1,t1,v0
518	sd	ta1,-24(a0)
519	MOVNZ	(t1,v0,t9)
520
521
522	sltu	t8,t2,ta2
523	dsubu	t2,ta2
524	dsubu	ta2,t2,v0
525	sd	ta2,-16(a0)
526	MOVNZ	(t2,v0,t8)
527
528	sltu	t9,t3,ta3
529	dsubu	t3,ta3
530	dsubu	ta3,t3,v0
531	sd	ta3,-8(a0)
532	MOVNZ	(t3,v0,t9)
533
534	.set	noreorder
535	bgtzl	AT,.L_bn_sub_words_loop
536	ld	t0,0(a1)
537
538	bnezl	a3,.L_bn_sub_words_tail
539	ld	t0,0(a1)
540	.set	reorder
541
542.L_bn_sub_words_return:
543	jr	ra
544
545.L_bn_sub_words_tail:
546	ld	ta0,0(a2)
547	subu	a3,1
548	sltu	t8,t0,ta0
549	dsubu	t0,ta0
550	dsubu	ta0,t0,v0
551	MOVNZ	(t0,v0,t8)
552	sd	ta0,0(a0)
553	beqz	a3,.L_bn_sub_words_return
554
555	ld	t1,8(a1)
556	subu	a3,1
557	ld	ta1,8(a2)
558	sltu	t9,t1,ta1
559	dsubu	t1,ta1
560	dsubu	ta1,t1,v0
561	MOVNZ	(t1,v0,t9)
562	sd	ta1,8(a0)
563	beqz	a3,.L_bn_sub_words_return
564
565	ld	t2,16(a1)
566	ld	ta2,16(a2)
567	sltu	t8,t2,ta2
568	dsubu	t2,ta2
569	dsubu	ta2,t2,v0
570	MOVNZ	(t2,v0,t8)
571	sd	ta2,16(a0)
572	jr	ra
573END(bn_sub_words)
574
575#undef	MINUS4
576
577.align 5
578LEAF(bn_div_3_words)
579	.set	reorder
580	move	a3,a0		/* we know that bn_div_words doesn't
581				 * touch a3, ta2, ta3 and preserves a2
582				 * so that we can save two arguments
583				 * and return address in registers
584				 * instead of stack:-)
585				 */
586	ld	a0,(a3)
587	move	ta2,a1
588	ld	a1,-8(a3)
589	bne	a0,a2,.L_bn_div_3_words_proceed
590	li	v0,-1
591	jr	ra
592.L_bn_div_3_words_proceed:
593	move	ta3,ra
594	bal	bn_div_words
595	move	ra,ta3
596	dmultu	ta2,v0
597	ld	t2,-16(a3)
598	move	ta0,zero
599	mfhi	t1
600	mflo	t0
601	sltu	t8,t1,v1
602.L_bn_div_3_words_inner_loop:
603	bnez	t8,.L_bn_div_3_words_inner_loop_done
604	sgeu	AT,t2,t0
605	seq	t9,t1,v1
606	and	AT,t9
607	sltu	t3,t0,ta2
608	daddu	v1,a2
609	dsubu	t1,t3
610	dsubu	t0,ta2
611	sltu	t8,t1,v1
612	sltu	ta0,v1,a2
613	or	t8,ta0
614	.set	noreorder
615	beqzl	AT,.L_bn_div_3_words_inner_loop
616	dsubu	v0,1
617	.set	reorder
618.L_bn_div_3_words_inner_loop_done:
619	jr	ra
620END(bn_div_3_words)
621
622.align	5
623LEAF(bn_div_words)
624	.set	noreorder
625	bnezl	a2,.L_bn_div_words_proceed
626	move	v1,zero
627	jr	ra
628	li	v0,-1		/* I'd rather signal div-by-zero
629				 * which can be done with 'break 7' */
630
631.L_bn_div_words_proceed:
632	bltz	a2,.L_bn_div_words_body
633	move	t9,v1
634	dsll	a2,1
635	bgtz	a2,.-4
636	addu	t9,1
637
638	.set	reorder
639	negu	t1,t9
640	li	t2,-1
641	dsll	t2,t1
642	and	t2,a0
643	dsrl	AT,a1,t1
644	.set	noreorder
645	bnezl	t2,.+8
646	break	6		/* signal overflow */
647	.set	reorder
648	dsll	a0,t9
649	dsll	a1,t9
650	or	a0,AT
651
652#define	QT	ta0
653#define	HH	ta1
654#define	DH	v1
655.L_bn_div_words_body:
656	dsrl	DH,a2,32
657	sgeu	AT,a0,a2
658	.set	noreorder
659	bnezl	AT,.+8
660	dsubu	a0,a2
661	.set	reorder
662
663	li	QT,-1
664	dsrl	HH,a0,32
665	dsrl	QT,32	/* q=0xffffffff */
666	beq	DH,HH,.L_bn_div_words_skip_div1
667	ddivu	zero,a0,DH
668	mflo	QT
669.L_bn_div_words_skip_div1:
670	dmultu	a2,QT
671	dsll	t3,a0,32
672	dsrl	AT,a1,32
673	or	t3,AT
674	mflo	t0
675	mfhi	t1
676.L_bn_div_words_inner_loop1:
677	sltu	t2,t3,t0
678	seq	t8,HH,t1
679	sltu	AT,HH,t1
680	and	t2,t8
681	sltu	v0,t0,a2
682	or	AT,t2
683	.set	noreorder
684	beqz	AT,.L_bn_div_words_inner_loop1_done
685	dsubu	t1,v0
686	dsubu	t0,a2
687	b	.L_bn_div_words_inner_loop1
688	dsubu	QT,1
689	.set	reorder
690.L_bn_div_words_inner_loop1_done:
691
692	dsll	a1,32
693	dsubu	a0,t3,t0
694	dsll	v0,QT,32
695
696	li	QT,-1
697	dsrl	HH,a0,32
698	dsrl	QT,32	/* q=0xffffffff */
699	beq	DH,HH,.L_bn_div_words_skip_div2
700	ddivu	zero,a0,DH
701	mflo	QT
702.L_bn_div_words_skip_div2:
703#undef	DH
704	dmultu	a2,QT
705	dsll	t3,a0,32
706	dsrl	AT,a1,32
707	or	t3,AT
708	mflo	t0
709	mfhi	t1
710.L_bn_div_words_inner_loop2:
711	sltu	t2,t3,t0
712	seq	t8,HH,t1
713	sltu	AT,HH,t1
714	and	t2,t8
715	sltu	v1,t0,a2
716	or	AT,t2
717	.set	noreorder
718	beqz	AT,.L_bn_div_words_inner_loop2_done
719	dsubu	t1,v1
720	dsubu	t0,a2
721	b	.L_bn_div_words_inner_loop2
722	dsubu	QT,1
723	.set	reorder
724.L_bn_div_words_inner_loop2_done:
725#undef	HH
726
727	dsubu	a0,t3,t0
728	or	v0,QT
729	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
730	dsrl	a2,t9		/* restore a2 */
731	jr	ra
732#undef	QT
733END(bn_div_words)
734
735#define	a_0	t0
736#define	a_1	t1
737#define	a_2	t2
738#define	a_3	t3
739#define	b_0	ta0
740#define	b_1	ta1
741#define	b_2	ta2
742#define	b_3	ta3
743
744#define	a_4	s0
745#define	a_5	s2
746#define	a_6	s4
747#define	a_7	a1	/* once we load a[7] we don't need a anymore */
748#define	b_4	s1
749#define	b_5	s3
750#define	b_6	s5
751#define	b_7	a2	/* once we load b[7] we don't need b anymore */
752
753#define	t_1	t8
754#define	t_2	t9
755
756#define	c_1	v0
757#define	c_2	v1
758#define	c_3	a3
759
760#define	FRAME_SIZE	48
761
762.align	5
763LEAF(bn_mul_comba8)
764	.set	noreorder
765	PTR_SUB	sp,FRAME_SIZE
766	.frame	sp,64,ra
767	.set	reorder
768	ld	a_0,0(a1)	/* If compiled with -mips3 option on
769				 * R5000 box assembler barks on this
770				 * line with "shouldn't have mult/div
771				 * as last instruction in bb (R10K
772				 * bug)" warning. If anybody out there
773				 * has a clue about how to circumvent
774				 * this do send me a note.
775				 *		<appro@fy.chalmers.se>
776				 */
777	ld	b_0,0(a2)
778	ld	a_1,8(a1)
779	ld	a_2,16(a1)
780	ld	a_3,24(a1)
781	ld	b_1,8(a2)
782	ld	b_2,16(a2)
783	ld	b_3,24(a2)
784	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
785	sd	s0,0(sp)
786	sd	s1,8(sp)
787	sd	s2,16(sp)
788	sd	s3,24(sp)
789	sd	s4,32(sp)
790	sd	s5,40(sp)
791	mflo	c_1
792	mfhi	c_2
793
794	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
795	ld	a_4,32(a1)
796	ld	a_5,40(a1)
797	ld	a_6,48(a1)
798	ld	a_7,56(a1)
799	ld	b_4,32(a2)
800	ld	b_5,40(a2)
801	mflo	t_1
802	mfhi	t_2
803	daddu	c_2,t_1
804	sltu	AT,c_2,t_1
805	daddu	c_3,t_2,AT
806	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
807	ld	b_6,48(a2)
808	ld	b_7,56(a2)
809	sd	c_1,0(a0)	/* r[0]=c1; */
810	mflo	t_1
811	mfhi	t_2
812	daddu	c_2,t_1
813	sltu	AT,c_2,t_1
814	daddu	t_2,AT
815	daddu	c_3,t_2
816	sltu	c_1,c_3,t_2
817	sd	c_2,8(a0)	/* r[1]=c2; */
818
819	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
820	mflo	t_1
821	mfhi	t_2
822	daddu	c_3,t_1
823	sltu	AT,c_3,t_1
824	daddu	t_2,AT
825	daddu	c_1,t_2
826	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
827	mflo	t_1
828	mfhi	t_2
829	daddu	c_3,t_1
830	sltu	AT,c_3,t_1
831	daddu	t_2,AT
832	daddu	c_1,t_2
833	sltu	c_2,c_1,t_2
834	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
835	mflo	t_1
836	mfhi	t_2
837	daddu	c_3,t_1
838	sltu	AT,c_3,t_1
839	daddu	t_2,AT
840	daddu	c_1,t_2
841	sltu	AT,c_1,t_2
842	daddu	c_2,AT
843	sd	c_3,16(a0)	/* r[2]=c3; */
844
845	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
846	mflo	t_1
847	mfhi	t_2
848	daddu	c_1,t_1
849	sltu	AT,c_1,t_1
850	daddu	t_2,AT
851	daddu	c_2,t_2
852	sltu	c_3,c_2,t_2
853	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
854	mflo	t_1
855	mfhi	t_2
856	daddu	c_1,t_1
857	sltu	AT,c_1,t_1
858	daddu	t_2,AT
859	daddu	c_2,t_2
860	sltu	AT,c_2,t_2
861	daddu	c_3,AT
862	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
863	mflo	t_1
864	mfhi	t_2
865	daddu	c_1,t_1
866	sltu	AT,c_1,t_1
867	daddu	t_2,AT
868	daddu	c_2,t_2
869	sltu	AT,c_2,t_2
870	daddu	c_3,AT
871	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
872	mflo	t_1
873	mfhi	t_2
874	daddu	c_1,t_1
875	sltu	AT,c_1,t_1
876	daddu	t_2,AT
877	daddu	c_2,t_2
878	sltu	AT,c_2,t_2
879	daddu	c_3,AT
880	sd	c_1,24(a0)	/* r[3]=c1; */
881
882	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
883	mflo	t_1
884	mfhi	t_2
885	daddu	c_2,t_1
886	sltu	AT,c_2,t_1
887	daddu	t_2,AT
888	daddu	c_3,t_2
889	sltu	c_1,c_3,t_2
890	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
891	mflo	t_1
892	mfhi	t_2
893	daddu	c_2,t_1
894	sltu	AT,c_2,t_1
895	daddu	t_2,AT
896	daddu	c_3,t_2
897	sltu	AT,c_3,t_2
898	daddu	c_1,AT
899	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
900	mflo	t_1
901	mfhi	t_2
902	daddu	c_2,t_1
903	sltu	AT,c_2,t_1
904	daddu	t_2,AT
905	daddu	c_3,t_2
906	sltu	AT,c_3,t_2
907	daddu	c_1,AT
908	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
909	mflo	t_1
910	mfhi	t_2
911	daddu	c_2,t_1
912	sltu	AT,c_2,t_1
913	daddu	t_2,AT
914	daddu	c_3,t_2
915	sltu	AT,c_3,t_2
916	daddu	c_1,AT
917	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
918	mflo	t_1
919	mfhi	t_2
920	daddu	c_2,t_1
921	sltu	AT,c_2,t_1
922	daddu	t_2,AT
923	daddu	c_3,t_2
924	sltu	AT,c_3,t_2
925	daddu	c_1,AT
926	sd	c_2,32(a0)	/* r[4]=c2; */
927
928	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
929	mflo	t_1
930	mfhi	t_2
931	daddu	c_3,t_1
932	sltu	AT,c_3,t_1
933	daddu	t_2,AT
934	daddu	c_1,t_2
935	sltu	c_2,c_1,t_2
936	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
937	mflo	t_1
938	mfhi	t_2
939	daddu	c_3,t_1
940	sltu	AT,c_3,t_1
941	daddu	t_2,AT
942	daddu	c_1,t_2
943	sltu	AT,c_1,t_2
944	daddu	c_2,AT
945	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
946	mflo	t_1
947	mfhi	t_2
948	daddu	c_3,t_1
949	sltu	AT,c_3,t_1
950	daddu	t_2,AT
951	daddu	c_1,t_2
952	sltu	AT,c_1,t_2
953	daddu	c_2,AT
954	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
955	mflo	t_1
956	mfhi	t_2
957	daddu	c_3,t_1
958	sltu	AT,c_3,t_1
959	daddu	t_2,AT
960	daddu	c_1,t_2
961	sltu	AT,c_1,t_2
962	daddu	c_2,AT
963	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
964	mflo	t_1
965	mfhi	t_2
966	daddu	c_3,t_1
967	sltu	AT,c_3,t_1
968	daddu	t_2,AT
969	daddu	c_1,t_2
970	sltu	AT,c_1,t_2
971	daddu	c_2,AT
972	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
973	mflo	t_1
974	mfhi	t_2
975	daddu	c_3,t_1
976	sltu	AT,c_3,t_1
977	daddu	t_2,AT
978	daddu	c_1,t_2
979	sltu	AT,c_1,t_2
980	daddu	c_2,AT
981	sd	c_3,40(a0)	/* r[5]=c3; */
982
983	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
984	mflo	t_1
985	mfhi	t_2
986	daddu	c_1,t_1
987	sltu	AT,c_1,t_1
988	daddu	t_2,AT
989	daddu	c_2,t_2
990	sltu	c_3,c_2,t_2
991	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
992	mflo	t_1
993	mfhi	t_2
994	daddu	c_1,t_1
995	sltu	AT,c_1,t_1
996	daddu	t_2,AT
997	daddu	c_2,t_2
998	sltu	AT,c_2,t_2
999	daddu	c_3,AT
1000	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
1001	mflo	t_1
1002	mfhi	t_2
1003	daddu	c_1,t_1
1004	sltu	AT,c_1,t_1
1005	daddu	t_2,AT
1006	daddu	c_2,t_2
1007	sltu	AT,c_2,t_2
1008	daddu	c_3,AT
1009	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
1010	mflo	t_1
1011	mfhi	t_2
1012	daddu	c_1,t_1
1013	sltu	AT,c_1,t_1
1014	daddu	t_2,AT
1015	daddu	c_2,t_2
1016	sltu	AT,c_2,t_2
1017	daddu	c_3,AT
1018	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
1019	mflo	t_1
1020	mfhi	t_2
1021	daddu	c_1,t_1
1022	sltu	AT,c_1,t_1
1023	daddu	t_2,AT
1024	daddu	c_2,t_2
1025	sltu	AT,c_2,t_2
1026	daddu	c_3,AT
1027	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
1028	mflo	t_1
1029	mfhi	t_2
1030	daddu	c_1,t_1
1031	sltu	AT,c_1,t_1
1032	daddu	t_2,AT
1033	daddu	c_2,t_2
1034	sltu	AT,c_2,t_2
1035	daddu	c_3,AT
1036	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
1037	mflo	t_1
1038	mfhi	t_2
1039	daddu	c_1,t_1
1040	sltu	AT,c_1,t_1
1041	daddu	t_2,AT
1042	daddu	c_2,t_2
1043	sltu	AT,c_2,t_2
1044	daddu	c_3,AT
1045	sd	c_1,48(a0)	/* r[6]=c1; */
1046
1047	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
1048	mflo	t_1
1049	mfhi	t_2
1050	daddu	c_2,t_1
1051	sltu	AT,c_2,t_1
1052	daddu	t_2,AT
1053	daddu	c_3,t_2
1054	sltu	c_1,c_3,t_2
1055	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
1056	mflo	t_1
1057	mfhi	t_2
1058	daddu	c_2,t_1
1059	sltu	AT,c_2,t_1
1060	daddu	t_2,AT
1061	daddu	c_3,t_2
1062	sltu	AT,c_3,t_2
1063	daddu	c_1,AT
1064	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
1065	mflo	t_1
1066	mfhi	t_2
1067	daddu	c_2,t_1
1068	sltu	AT,c_2,t_1
1069	daddu	t_2,AT
1070	daddu	c_3,t_2
1071	sltu	AT,c_3,t_2
1072	daddu	c_1,AT
1073	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
1074	mflo	t_1
1075	mfhi	t_2
1076	daddu	c_2,t_1
1077	sltu	AT,c_2,t_1
1078	daddu	t_2,AT
1079	daddu	c_3,t_2
1080	sltu	AT,c_3,t_2
1081	daddu	c_1,AT
1082	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
1083	mflo	t_1
1084	mfhi	t_2
1085	daddu	c_2,t_1
1086	sltu	AT,c_2,t_1
1087	daddu	t_2,AT
1088	daddu	c_3,t_2
1089	sltu	AT,c_3,t_2
1090	daddu	c_1,AT
1091	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
1092	mflo	t_1
1093	mfhi	t_2
1094	daddu	c_2,t_1
1095	sltu	AT,c_2,t_1
1096	daddu	t_2,AT
1097	daddu	c_3,t_2
1098	sltu	AT,c_3,t_2
1099	daddu	c_1,AT
1100	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
1101	mflo	t_1
1102	mfhi	t_2
1103	daddu	c_2,t_1
1104	sltu	AT,c_2,t_1
1105	daddu	t_2,AT
1106	daddu	c_3,t_2
1107	sltu	AT,c_3,t_2
1108	daddu	c_1,AT
1109	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
1110	mflo	t_1
1111	mfhi	t_2
1112	daddu	c_2,t_1
1113	sltu	AT,c_2,t_1
1114	daddu	t_2,AT
1115	daddu	c_3,t_2
1116	sltu	AT,c_3,t_2
1117	daddu	c_1,AT
1118	sd	c_2,56(a0)	/* r[7]=c2; */
1119
1120	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
1121	mflo	t_1
1122	mfhi	t_2
1123	daddu	c_3,t_1
1124	sltu	AT,c_3,t_1
1125	daddu	t_2,AT
1126	daddu	c_1,t_2
1127	sltu	c_2,c_1,t_2
1128	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
1129	mflo	t_1
1130	mfhi	t_2
1131	daddu	c_3,t_1
1132	sltu	AT,c_3,t_1
1133	daddu	t_2,AT
1134	daddu	c_1,t_2
1135	sltu	AT,c_1,t_2
1136	daddu	c_2,AT
1137	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
1138	mflo	t_1
1139	mfhi	t_2
1140	daddu	c_3,t_1
1141	sltu	AT,c_3,t_1
1142	daddu	t_2,AT
1143	daddu	c_1,t_2
1144	sltu	AT,c_1,t_2
1145	daddu	c_2,AT
1146	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
1147	mflo	t_1
1148	mfhi	t_2
1149	daddu	c_3,t_1
1150	sltu	AT,c_3,t_1
1151	daddu	t_2,AT
1152	daddu	c_1,t_2
1153	sltu	AT,c_1,t_2
1154	daddu	c_2,AT
1155	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
1156	mflo	t_1
1157	mfhi	t_2
1158	daddu	c_3,t_1
1159	sltu	AT,c_3,t_1
1160	daddu	t_2,AT
1161	daddu	c_1,t_2
1162	sltu	AT,c_1,t_2
1163	daddu	c_2,AT
1164	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
1165	mflo	t_1
1166	mfhi	t_2
1167	daddu	c_3,t_1
1168	sltu	AT,c_3,t_1
1169	daddu	t_2,AT
1170	daddu	c_1,t_2
1171	sltu	AT,c_1,t_2
1172	daddu	c_2,AT
1173	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
1174	mflo	t_1
1175	mfhi	t_2
1176	daddu	c_3,t_1
1177	sltu	AT,c_3,t_1
1178	daddu	t_2,AT
1179	daddu	c_1,t_2
1180	sltu	AT,c_1,t_2
1181	daddu	c_2,AT
1182	sd	c_3,64(a0)	/* r[8]=c3; */
1183
1184	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
1185	mflo	t_1
1186	mfhi	t_2
1187	daddu	c_1,t_1
1188	sltu	AT,c_1,t_1
1189	daddu	t_2,AT
1190	daddu	c_2,t_2
1191	sltu	c_3,c_2,t_2
1192	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
1193	mflo	t_1
1194	mfhi	t_2
1195	daddu	c_1,t_1
1196	sltu	AT,c_1,t_1
1197	daddu	t_2,AT
1198	daddu	c_2,t_2
1199	sltu	AT,c_2,t_2
1200	daddu	c_3,AT
1201	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
1202	mflo	t_1
1203	mfhi	t_2
1204	daddu	c_1,t_1
1205	sltu	AT,c_1,t_1
1206	daddu	t_2,AT
1207	daddu	c_2,t_2
1208	sltu	AT,c_2,t_2
1209	daddu	c_3,AT
1210	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
1211	mflo	t_1
1212	mfhi	t_2
1213	daddu	c_1,t_1
1214	sltu	AT,c_1,t_1
1215	daddu	t_2,AT
1216	daddu	c_2,t_2
1217	sltu	AT,c_2,t_2
1218	daddu	c_3,AT
1219	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
1220	mflo	t_1
1221	mfhi	t_2
1222	daddu	c_1,t_1
1223	sltu	AT,c_1,t_1
1224	daddu	t_2,AT
1225	daddu	c_2,t_2
1226	sltu	AT,c_2,t_2
1227	daddu	c_3,AT
1228	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
1229	mflo	t_1
1230	mfhi	t_2
1231	daddu	c_1,t_1
1232	sltu	AT,c_1,t_1
1233	daddu	t_2,AT
1234	daddu	c_2,t_2
1235	sltu	AT,c_2,t_2
1236	daddu	c_3,AT
1237	sd	c_1,72(a0)	/* r[9]=c1; */
1238
1239	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
1240	mflo	t_1
1241	mfhi	t_2
1242	daddu	c_2,t_1
1243	sltu	AT,c_2,t_1
1244	daddu	t_2,AT
1245	daddu	c_3,t_2
1246	sltu	c_1,c_3,t_2
1247	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
1248	mflo	t_1
1249	mfhi	t_2
1250	daddu	c_2,t_1
1251	sltu	AT,c_2,t_1
1252	daddu	t_2,AT
1253	daddu	c_3,t_2
1254	sltu	AT,c_3,t_2
1255	daddu	c_1,AT
1256	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
1257	mflo	t_1
1258	mfhi	t_2
1259	daddu	c_2,t_1
1260	sltu	AT,c_2,t_1
1261	daddu	t_2,AT
1262	daddu	c_3,t_2
1263	sltu	AT,c_3,t_2
1264	daddu	c_1,AT
1265	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
1266	mflo	t_1
1267	mfhi	t_2
1268	daddu	c_2,t_1
1269	sltu	AT,c_2,t_1
1270	daddu	t_2,AT
1271	daddu	c_3,t_2
1272	sltu	AT,c_3,t_2
1273	daddu	c_1,AT
1274	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
1275	mflo	t_1
1276	mfhi	t_2
1277	daddu	c_2,t_1
1278	sltu	AT,c_2,t_1
1279	daddu	t_2,AT
1280	daddu	c_3,t_2
1281	sltu	AT,c_3,t_2
1282	daddu	c_1,AT
1283	sd	c_2,80(a0)	/* r[10]=c2; */
1284
1285	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
1286	mflo	t_1
1287	mfhi	t_2
1288	daddu	c_3,t_1
1289	sltu	AT,c_3,t_1
1290	daddu	t_2,AT
1291	daddu	c_1,t_2
1292	sltu	c_2,c_1,t_2
1293	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
1294	mflo	t_1
1295	mfhi	t_2
1296	daddu	c_3,t_1
1297	sltu	AT,c_3,t_1
1298	daddu	t_2,AT
1299	daddu	c_1,t_2
1300	sltu	AT,c_1,t_2
1301	daddu	c_2,AT
1302	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
1303	mflo	t_1
1304	mfhi	t_2
1305	daddu	c_3,t_1
1306	sltu	AT,c_3,t_1
1307	daddu	t_2,AT
1308	daddu	c_1,t_2
1309	sltu	AT,c_1,t_2
1310	daddu	c_2,AT
1311	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
1312	mflo	t_1
1313	mfhi	t_2
1314	daddu	c_3,t_1
1315	sltu	AT,c_3,t_1
1316	daddu	t_2,AT
1317	daddu	c_1,t_2
1318	sltu	AT,c_1,t_2
1319	daddu	c_2,AT
1320	sd	c_3,88(a0)	/* r[11]=c3; */
1321
1322	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
1323	mflo	t_1
1324	mfhi	t_2
1325	daddu	c_1,t_1
1326	sltu	AT,c_1,t_1
1327	daddu	t_2,AT
1328	daddu	c_2,t_2
1329	sltu	c_3,c_2,t_2
1330	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
1331	mflo	t_1
1332	mfhi	t_2
1333	daddu	c_1,t_1
1334	sltu	AT,c_1,t_1
1335	daddu	t_2,AT
1336	daddu	c_2,t_2
1337	sltu	AT,c_2,t_2
1338	daddu	c_3,AT
1339	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
1340	mflo	t_1
1341	mfhi	t_2
1342	daddu	c_1,t_1
1343	sltu	AT,c_1,t_1
1344	daddu	t_2,AT
1345	daddu	c_2,t_2
1346	sltu	AT,c_2,t_2
1347	daddu	c_3,AT
1348	sd	c_1,96(a0)	/* r[12]=c1; */
1349
1350	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
1351	mflo	t_1
1352	mfhi	t_2
1353	daddu	c_2,t_1
1354	sltu	AT,c_2,t_1
1355	daddu	t_2,AT
1356	daddu	c_3,t_2
1357	sltu	c_1,c_3,t_2
1358	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
1359	mflo	t_1
1360	mfhi	t_2
1361	daddu	c_2,t_1
1362	sltu	AT,c_2,t_1
1363	daddu	t_2,AT
1364	daddu	c_3,t_2
1365	sltu	AT,c_3,t_2
1366	daddu	c_1,AT
1367	sd	c_2,104(a0)	/* r[13]=c2; */
1368
1369	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
1370	ld	s0,0(sp)
1371	ld	s1,8(sp)
1372	ld	s2,16(sp)
1373	ld	s3,24(sp)
1374	ld	s4,32(sp)
1375	ld	s5,40(sp)
1376	mflo	t_1
1377	mfhi	t_2
1378	daddu	c_3,t_1
1379	sltu	AT,c_3,t_1
1380	daddu	t_2,AT
1381	daddu	c_1,t_2
1382	sd	c_3,112(a0)	/* r[14]=c3; */
1383	sd	c_1,120(a0)	/* r[15]=c1; */
1384
1385	PTR_ADD	sp,FRAME_SIZE
1386
1387	jr	ra
1388END(bn_mul_comba8)
1389
1390.align	5
1391LEAF(bn_mul_comba4)
1392	.set	reorder
1393	ld	a_0,0(a1)
1394	ld	b_0,0(a2)
1395	ld	a_1,8(a1)
1396	ld	a_2,16(a1)
1397	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
1398	ld	a_3,24(a1)
1399	ld	b_1,8(a2)
1400	ld	b_2,16(a2)
1401	ld	b_3,24(a2)
1402	mflo	c_1
1403	mfhi	c_2
1404	sd	c_1,0(a0)
1405
1406	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
1407	mflo	t_1
1408	mfhi	t_2
1409	daddu	c_2,t_1
1410	sltu	AT,c_2,t_1
1411	daddu	c_3,t_2,AT
1412	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
1413	mflo	t_1
1414	mfhi	t_2
1415	daddu	c_2,t_1
1416	sltu	AT,c_2,t_1
1417	daddu	t_2,AT
1418	daddu	c_3,t_2
1419	sltu	c_1,c_3,t_2
1420	sd	c_2,8(a0)
1421
1422	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
1423	mflo	t_1
1424	mfhi	t_2
1425	daddu	c_3,t_1
1426	sltu	AT,c_3,t_1
1427	daddu	t_2,AT
1428	daddu	c_1,t_2
1429	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
1430	mflo	t_1
1431	mfhi	t_2
1432	daddu	c_3,t_1
1433	sltu	AT,c_3,t_1
1434	daddu	t_2,AT
1435	daddu	c_1,t_2
1436	sltu	c_2,c_1,t_2
1437	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
1438	mflo	t_1
1439	mfhi	t_2
1440	daddu	c_3,t_1
1441	sltu	AT,c_3,t_1
1442	daddu	t_2,AT
1443	daddu	c_1,t_2
1444	sltu	AT,c_1,t_2
1445	daddu	c_2,AT
1446	sd	c_3,16(a0)
1447
1448	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
1449	mflo	t_1
1450	mfhi	t_2
1451	daddu	c_1,t_1
1452	sltu	AT,c_1,t_1
1453	daddu	t_2,AT
1454	daddu	c_2,t_2
1455	sltu	c_3,c_2,t_2
1456	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
1457	mflo	t_1
1458	mfhi	t_2
1459	daddu	c_1,t_1
1460	sltu	AT,c_1,t_1
1461	daddu	t_2,AT
1462	daddu	c_2,t_2
1463	sltu	AT,c_2,t_2
1464	daddu	c_3,AT
1465	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
1466	mflo	t_1
1467	mfhi	t_2
1468	daddu	c_1,t_1
1469	sltu	AT,c_1,t_1
1470	daddu	t_2,AT
1471	daddu	c_2,t_2
1472	sltu	AT,c_2,t_2
1473	daddu	c_3,AT
1474	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
1475	mflo	t_1
1476	mfhi	t_2
1477	daddu	c_1,t_1
1478	sltu	AT,c_1,t_1
1479	daddu	t_2,AT
1480	daddu	c_2,t_2
1481	sltu	AT,c_2,t_2
1482	daddu	c_3,AT
1483	sd	c_1,24(a0)
1484
1485	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
1486	mflo	t_1
1487	mfhi	t_2
1488	daddu	c_2,t_1
1489	sltu	AT,c_2,t_1
1490	daddu	t_2,AT
1491	daddu	c_3,t_2
1492	sltu	c_1,c_3,t_2
1493	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
1494	mflo	t_1
1495	mfhi	t_2
1496	daddu	c_2,t_1
1497	sltu	AT,c_2,t_1
1498	daddu	t_2,AT
1499	daddu	c_3,t_2
1500	sltu	AT,c_3,t_2
1501	daddu	c_1,AT
1502	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
1503	mflo	t_1
1504	mfhi	t_2
1505	daddu	c_2,t_1
1506	sltu	AT,c_2,t_1
1507	daddu	t_2,AT
1508	daddu	c_3,t_2
1509	sltu	AT,c_3,t_2
1510	daddu	c_1,AT
1511	sd	c_2,32(a0)
1512
1513	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
1514	mflo	t_1
1515	mfhi	t_2
1516	daddu	c_3,t_1
1517	sltu	AT,c_3,t_1
1518	daddu	t_2,AT
1519	daddu	c_1,t_2
1520	sltu	c_2,c_1,t_2
1521	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
1522	mflo	t_1
1523	mfhi	t_2
1524	daddu	c_3,t_1
1525	sltu	AT,c_3,t_1
1526	daddu	t_2,AT
1527	daddu	c_1,t_2
1528	sltu	AT,c_1,t_2
1529	daddu	c_2,AT
1530	sd	c_3,40(a0)
1531
1532	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
1533	mflo	t_1
1534	mfhi	t_2
1535	daddu	c_1,t_1
1536	sltu	AT,c_1,t_1
1537	daddu	t_2,AT
1538	daddu	c_2,t_2
1539	sd	c_1,48(a0)
1540	sd	c_2,56(a0)
1541
1542	jr	ra
1543END(bn_mul_comba4)
1544
1545#undef	a_4
1546#undef	a_5
1547#undef	a_6
1548#undef	a_7
1549#define	a_4	b_0
1550#define	a_5	b_1
1551#define	a_6	b_2
1552#define	a_7	b_3
1553
/*
 * bn_sqr_comba8 -- r[0..15] = a[0..7] squared, 64-bit words.
 *
 * NOTE(review): the C-level prototype is presumably
 *	void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a);
 * as in crypto/bn/bn_asm.c -- confirm against the header.
 *
 * In:	a0 = r, sixteen doublewords are stored at 0..120(a0)
 *	a1 = a, eight doublewords are loaded from 0..56(a1)
 * Scratch: a_0..a_7, c_1..c_3, t_1, t_2, AT and a2 are overwritten.
 * No stack is used.
 *
 * The result is produced one column (output word) at a time.  Three
 * accumulators hold the running column sum as (low,mid,high) and rotate
 * through c_1,c_2,c_3 from column to column; after a column's products
 * are added, its low word is stored and the roles shift by one.
 *
 * Notation in the comments below:
 *	(x,y,z) += a[i]*a[i]	add the 128-bit square to x,y; carry into z
 *	(x,y,z) += 2*a[i]*a[j]	add the 128-bit product twice, all carries
 *				into z
 * For the first doubled product of a column, z is overwritten with the
 * carry instead of being added to ("z restarts"); z is the register
 * whose value was stored as the previous result word.
 *
 * The code is assembled under ".set reorder", so instruction scheduling
 * and the jr delay slot are left to the assembler.
 */
.align	5
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	dmultu	a_0,a_0		/* (c1,c2) = a[0]*a[0] */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0] */

	/*
	 * (c2,c3,c1) += 2*a[0]*a[1]; c3 and c1 are first written here.
	 * The product is doubled by shifting t_2:t_1 left one bit.
	 */
	dmultu	a_0,a_1
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* c1 = bit shifted out of the high word */
	dsll	t_2,1
	slt	a2,t_1,zero	/* a2 = bit moving from low to high word */
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	/*
	 * t_2+AT wraps when t_2 is all ones and AT is 1, e.g. for
	 * a[0]=0xFFFFFFFFFFFFFFFE, a[1]=0x8000000000000001.  Propagate
	 * that carry into c1 instead of dropping it.
	 */
	sltu	AT,c_3,AT
	daddu	c_1,AT
	sd	c_2,8(a0)	/* r[1] */

	dmultu	a_2,a_0		/* (c3,c1,c2) += 2*a[2]*a[0]; c2 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_1,a_1		/* (c3,c1,c2) += a[1]*a[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2] */

	dmultu	a_0,a_3		/* (c1,c2,c3) += 2*a[0]*a[3]; c3 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_1,a_2		/* (c1,c2,c3) += 2*a[1]*a[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	sd	c_1,24(a0)	/* r[3] */

	dmultu	a_4,a_0		/* (c2,c3,c1) += 2*a[4]*a[0]; c1 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_3,a_1		/* (c2,c3,c1) += 2*a[3]*a[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_2,a_2		/* (c2,c3,c1) += a[2]*a[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4] */

	dmultu	a_0,a_5		/* (c3,c1,c2) += 2*a[0]*a[5]; c2 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_1,a_4		/* (c3,c1,c2) += 2*a[1]*a[4] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_2,a_3		/* (c3,c1,c2) += 2*a[2]*a[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	sd	c_3,40(a0)	/* r[5] */

	dmultu	a_6,a_0		/* (c1,c2,c3) += 2*a[6]*a[0]; c3 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_5,a_1		/* (c1,c2,c3) += 2*a[5]*a[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_4,a_2		/* (c1,c2,c3) += 2*a[4]*a[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_3,a_3		/* (c1,c2,c3) += a[3]*a[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6] */

	dmultu	a_0,a_7		/* (c2,c3,c1) += 2*a[0]*a[7]; c1 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_1,a_6		/* (c2,c3,c1) += 2*a[1]*a[6] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_2,a_5		/* (c2,c3,c1) += 2*a[2]*a[5] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_3,a_4		/* (c2,c3,c1) += 2*a[3]*a[4] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	sd	c_2,56(a0)	/* r[7] */

	dmultu	a_7,a_1		/* (c3,c1,c2) += 2*a[7]*a[1]; c2 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_6,a_2		/* (c3,c1,c2) += 2*a[6]*a[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_5,a_3		/* (c3,c1,c2) += 2*a[5]*a[3] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_4,a_4		/* (c3,c1,c2) += a[4]*a[4] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8] */

	dmultu	a_2,a_7		/* (c1,c2,c3) += 2*a[2]*a[7]; c3 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_3,a_6		/* (c1,c2,c3) += 2*a[3]*a[6] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_4,a_5		/* (c1,c2,c3) += 2*a[4]*a[5] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	sd	c_1,72(a0)	/* r[9] */

	dmultu	a_7,a_3		/* (c2,c3,c1) += 2*a[7]*a[3]; c1 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_6,a_4		/* (c2,c3,c1) += 2*a[6]*a[4] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_5,a_5		/* (c2,c3,c1) += a[5]*a[5] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10] */

	dmultu	a_4,a_7		/* (c3,c1,c2) += 2*a[4]*a[7]; c2 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_5,a_6		/* (c3,c1,c2) += 2*a[5]*a[6] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	sd	c_3,88(a0)	/* r[11] */

	dmultu	a_7,a_5		/* (c1,c2,c3) += 2*a[7]*a[5]; c3 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_6,a_6		/* (c1,c2,c3) += a[6]*a[6] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12] */

	dmultu	a_6,a_7		/* (c2,c3,c1) += 2*a[6]*a[7]; c1 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	sd	c_2,104(a0)	/* r[13] */

	dmultu	a_7,a_7		/* (c3,c1) += a[7]*a[7]; last column */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14] */
	sd	c_1,120(a0)	/* r[15] */

	jr	ra
END(bn_sqr_comba8)
2067
/*
 * bn_sqr_comba4 -- r[0..7] = a[0..3] squared, 64-bit words.
 *
 * NOTE(review): the C-level prototype is presumably
 *	void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a);
 * as in crypto/bn/bn_asm.c -- confirm against the header.
 *
 * In:	a0 = r, eight doublewords are stored at 0..56(a0)
 *	a1 = a, four doublewords are loaded from 0..24(a1)
 * Scratch: a_0..a_3, c_1..c_3, t_1, t_2, AT and a2 are overwritten.
 * No stack is used.
 *
 * The result is produced one column (output word) at a time.  Three
 * accumulators hold the running column sum as (low,mid,high) and rotate
 * through c_1,c_2,c_3 from column to column; after a column's products
 * are added, its low word is stored and the roles shift by one.
 *
 * Notation in the comments below:
 *	(x,y,z) += a[i]*a[i]	add the 128-bit square to x,y; carry into z
 *	(x,y,z) += 2*a[i]*a[j]	add the 128-bit product twice, all carries
 *				into z
 * For the first doubled product of a column, z is overwritten with the
 * carry instead of being added to ("z restarts"); z is the register
 * whose value was stored as the previous result word.
 *
 * The code is assembled under ".set reorder", so instruction scheduling
 * and the jr delay slot are left to the assembler.
 */
.align	5
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	dmultu	a_0,a_0		/* (c1,c2) = a[0]*a[0] */
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0] */

	/*
	 * (c2,c3,c1) += 2*a[0]*a[1]; c3 and c1 are first written here.
	 * The product is doubled by shifting t_2:t_1 left one bit.
	 */
	dmultu	a_0,a_1
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* c1 = bit shifted out of the high word */
	dsll	t_2,1
	slt	a2,t_1,zero	/* a2 = bit moving from low to high word */
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	/*
	 * t_2+AT wraps when t_2 is all ones and AT is 1, e.g. for
	 * a[0]=0xFFFFFFFFFFFFFFFE, a[1]=0x8000000000000001.  Propagate
	 * that carry into c1 instead of dropping it.
	 */
	sltu	AT,c_3,AT
	daddu	c_1,AT
	sd	c_2,8(a0)	/* r[1] */

	dmultu	a_2,a_0		/* (c3,c1,c2) += 2*a[2]*a[0]; c2 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_1,a_1		/* (c3,c1,c2) += a[1]*a[1] */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2] */

	dmultu	a_0,a_3		/* (c1,c2,c3) += 2*a[0]*a[3]; c3 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_1,a_2		/* (c1,c2,c3) += 2*a[1]*a[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	sd	c_1,24(a0)	/* r[3] */

	dmultu	a_3,a_1		/* (c2,c3,c1) += 2*a[3]*a[1]; c1 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_2,a_2		/* (c2,c3,c1) += a[2]*a[2] */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4] */

	dmultu	a_2,a_3		/* (c3,c1,c2) += 2*a[2]*a[3]; c2 restarts */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	sd	c_3,40(a0)	/* r[5] */

	dmultu	a_3,a_3		/* (c1,c2) += a[3]*a[3]; last column */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)	/* r[6] */
	sd	c_2,56(a0)	/* r[7] */

	jr	ra
END(bn_sqr_comba4)
2202