168651Skris;
268651Skris; PA-RISC 64-bit implementation of bn_asm code
368651Skris;
468651Skris; This code is approximately 2x faster than the C version
568651Skris; for RSA/DSA.
668651Skris;
768651Skris; See http://devresource.hp.com/  for more details on the PA-RISC
868651Skris; architecture.  Also see the book "PA-RISC 2.0 Architecture"
968651Skris; by Gerry Kane for information on the instruction set architecture.
1068651Skris;
1168651Skris; Code written by Chris Ruemmler (with some help from the HP C
1268651Skris; compiler).
1368651Skris;
1468651Skris; The code compiles with HP's assembler
1568651Skris;
1668651Skris
1768651Skris	.level	2.0W
1868651Skris	.space	$TEXT$
1968651Skris	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
2068651Skris
2168651Skris;
2268651Skris; Global Register definitions used for the routines.
2368651Skris;
2468651Skris; Some information about HP's runtime architecture for 64-bits.
2568651Skris;
2668651Skris; "Caller save" means the calling function must save the register
2768651Skris; if it wants the register to be preserved.
2868651Skris; "Callee save" means if a function uses the register, it must save
2968651Skris; the value before using it.
3068651Skris;
3168651Skris; For the floating point registers
3268651Skris;
3368651Skris;    "caller save" registers: fr4-fr11, fr22-fr31
3468651Skris;    "callee save" registers: fr12-fr21
3568651Skris;    "special" registers: fr0-fr3 (status and exception registers)
3668651Skris;
3768651Skris; For the integer registers
3868651Skris;     value zero             :  r0
3968651Skris;     "caller save" registers: r1,r19-r26
4068651Skris;     "callee save" registers: r3-r18
4168651Skris;     return register        :  r2  (rp)
4268651Skris;     return values          :  r28  (ret0,ret1)
4368651Skris;     Stack pointer          :  r30  (sp)
4468651Skris;     global data pointer    :  r27  (dp)
4568651Skris;     argument pointer       :  r29  (ap)
4668651Skris;     millicode return ptr   ; r31  (also a caller save register)
4768651Skris
4868651Skris
4968651Skris;
5068651Skris; Arguments to the routines
5168651Skris;
; Symbolic names for the incoming argument registers.  Two pairs share a
; register because different routines take different arguments in that slot:
; num/b_ptr are both %r24 (arg2) and w/n are both %r23 (arg3).
5268651Skrisr_ptr       .reg %r26    ; arg0: result pointer (all routines below)
5368651Skrisa_ptr       .reg %r25    ; arg1: first source pointer
5468651Skrisb_ptr       .reg %r24    ; arg2 in bn_add_words/bn_sub_words: second source pointer
5568651Skrisnum         .reg %r24    ; arg2 in the mul/sqr routines: word count
5668651Skrisw           .reg %r23    ; arg3 in the mul routines: multiplier word
5768651Skrisn           .reg %r23    ; arg3 in bn_add_words/bn_sub_words: word count
5868651Skris
5968651Skris
6068651Skris;
6168651Skris; Globals used in some routines
6268651Skris;
6368651Skris
6468651Skristop_overflow .reg %r29    ; loaded with 1 << 32 by bn_mul_add_words/bn_mul_words
6568651Skrishigh_mask    .reg %r22    ; value 0xffffffff80000000L (built in bn_sqr_words)
6668651Skris
6768651Skris
6868651Skris;------------------------------------------------------------------------------
6968651Skris;
7068651Skris; bn_mul_add_words
7168651Skris;
7268651Skris;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
7368651Skris;								int num, BN_ULONG w)
7468651Skris;
7568651Skris; arg0 = r_ptr
7668651Skris; arg1 = a_ptr
7768651Skris; arg2 = num
7868651Skris; arg3 = w
7968651Skris;
8068651Skris; Local register definitions
8168651Skris;
8268651Skris
; Register aliases used by bn_mul_add_words, bn_mul_words and bn_sqr_words.
; The "_1" variants are the second word handled by the 2x unrolled loops.
; ht_1 .. rp_val_1 map onto %r3-%r9, which the register table at the top of
; the file lists as callee save; the routines that use them save and restore
; them on the stack.
8368651Skrisfm1          .reg %fr22   ; XMPYU result m1 (ht * fw_l), word 0
8468651Skrisfm           .reg %fr23   ; XMPYU result m  (lt * fw_h), word 0; ht*lt in bn_sqr_words
8568651Skrisht_temp      .reg %fr24   ; XMPYU result ht * fw_h, word 0; ht*ht in bn_sqr_words
8668651Skrisht_temp_1    .reg %fr25   ; same as ht_temp, word 1
8768651Skrislt_temp      .reg %fr26   ; XMPYU result lt * fw_l, word 0; lt*lt in bn_sqr_words
8868651Skrislt_temp_1    .reg %fr27   ; same as lt_temp, word 1
8968651Skrisfm1_1        .reg %fr28   ; same as fm1, word 1
9068651Skrisfm_1         .reg %fr29   ; same as fm, word 1
9168651Skris
9268651Skrisfw_h         .reg %fr7L   ; left half of fw  (used as the high 32 bits of w)
9368651Skrisfw_l         .reg %fr7R   ; right half of fw (used as the low 32 bits of w)
9468651Skrisfw           .reg %fr7    ; w, loaded from its stack slot with FLDD
9568651Skris
9668651Skrisfht_0        .reg %fr8L   ; left half of t_float_0:  ht of word 0
9768651Skrisflt_0        .reg %fr8R   ; right half of t_float_0: lt of word 0
9868651Skrist_float_0    .reg %fr8    ; 64-bit source word 0
9968651Skris
10068651Skrisfht_1        .reg %fr9L   ; left half of t_float_1:  ht of word 1
10168651Skrisflt_1        .reg %fr9R   ; right half of t_float_1: lt of word 1
10268651Skrist_float_1    .reg %fr9    ; 64-bit source word 1
10368651Skris
10468651Skristmp_0        .reg %r31    ; scratch, word 0
10568651Skristmp_1        .reg %r21    ; scratch, word 1
10668651Skrism_0          .reg %r20    ; m, word 0
10768651Skrism_1          .reg %r19    ; m, word 1
10868651Skrisht_0         .reg %r1     ; high partial product / high result word, word 0
10968651Skrisht_1         .reg %r3     ; high partial product / high result word, word 1
11068651Skrislt_0         .reg %r4     ; low partial product / low result word, word 0
11168651Skrislt_1         .reg %r5     ; low partial product / low result word, word 1
11268651Skrism1_0         .reg %r6     ; m1, word 0
11368651Skrism1_1         .reg %r7     ; m1, word 1
11468651Skrisrp_val       .reg %r8     ; value loaded from r_ptr[0] (bn_mul_add_words)
11568651Skrisrp_val_1     .reg %r9     ; value loaded from r_ptr[1] (bn_mul_add_words)
11668651Skris
;
; bn_mul_add_words: for each of the num words, forms
;     a_ptr[i]*w + r_ptr[i] + carry
; stores the low 64 bits back to r_ptr[i] and keeps the high 64 bits as the
; carry into the next word.  The last carry is returned in %ret0; %ret0 is 0
; when num <= 0.
;
; Each 64x64 multiply is built from four 32x32 XMPYU products which are moved
; from the FP registers to general registers through scratch slots at
; -8..-64(%sp).
;
; Frame: %r3-%r9 are stored at 0..48(%sp) and w at 56(%sp) BEFORE %sp is
; bumped by 128, so afterwards they are found at -128..-80(%sp) and -72(%sp).
;
; In the line comments below "ht++" on an ADD,DC means "add the carry out of
; the preceding ADD into ht".
;
11768651Skrisbn_mul_add_words
11868651Skris	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
11968651Skris	.proc
12068651Skris	.callinfo frame=128
12168651Skris    .entry
12268651Skris	.align 64
12368651Skris
12468651Skris    STD     %r3,0(%sp)          ; save r3
12568651Skris    STD     %r4,8(%sp)          ; save r4
12668651Skris	NOP                         ; Needed to make the loop 16-byte aligned
12768651Skris	NOP                         ; Needed to make the loop 16-byte aligned
12868651Skris
12968651Skris    STD     %r5,16(%sp)         ; save r5
13068651Skris    STD     %r6,24(%sp)         ; save r6
13168651Skris    STD     %r7,32(%sp)         ; save r7
13268651Skris    STD     %r8,40(%sp)         ; save r8
13368651Skris
13468651Skris    STD     %r9,48(%sp)         ; save r9
13568651Skris    COPY    %r0,%ret0           ; return 0 by default
13668651Skris    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
13768651Skris	STD     w,56(%sp)           ; store w on stack
13868651Skris
13968651Skris    CMPIB,>= 0,num,bn_mul_add_words_exit  ; if (num <= 0) then exit
14068651Skris	LDO     128(%sp),%sp       ; bump stack (delay slot: done on both paths)
14168651Skris
14268651Skris	;
14368651Skris	; The loop is unrolled twice, so if there is only 1 number
14468651Skris    ; then go straight to the cleanup code.
14568651Skris	;
14668651Skris	CMPIB,= 1,num,bn_mul_add_words_single_top
14768651Skris	FLDD    -72(%sp),fw     ; load up w into fp register fw (fw_h/fw_l)
14868651Skris
14968651Skris	;
15068651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
15168651Skris	;
15268651Skris	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
15368651Skris    ; two 32-bit multiplies can be issued per cycle.
15468651Skris    ;
15568651Skrisbn_mul_add_words_unroll2
15668651Skris
15768651Skris    FLDD    0(a_ptr),t_float_0       ; load up 64-bit value (fr8L) ht(L)/lt(R)
15868651Skris    FLDD    8(a_ptr),t_float_1       ; load up 64-bit value (fr9) ht(L)/lt(R)
15968651Skris    LDD     0(r_ptr),rp_val          ; rp[0]
16068651Skris    LDD     8(r_ptr),rp_val_1        ; rp[1]
16168651Skris
16268651Skris    XMPYU   fht_0,fw_l,fm1           ; m1[0] = fht_0*fw_l
16368651Skris    XMPYU   fht_1,fw_l,fm1_1         ; m1[1] = fht_1*fw_l
16468651Skris    FSTD    fm1,-16(%sp)             ; -16(sp) = m1[0]
16568651Skris    FSTD    fm1_1,-48(%sp)           ; -48(sp) = m1[1]
16668651Skris
16768651Skris    XMPYU   flt_0,fw_h,fm            ; m[0] = flt_0*fw_h
16868651Skris    XMPYU   flt_1,fw_h,fm_1          ; m[1] = flt_1*fw_h
16968651Skris    FSTD    fm,-8(%sp)               ; -8(sp) = m[0]
17068651Skris    FSTD    fm_1,-40(%sp)            ; -40(sp) = m[1]
17168651Skris
17268651Skris    XMPYU   fht_0,fw_h,ht_temp       ; ht_temp   = fht_0*fw_h
17368651Skris    XMPYU   fht_1,fw_h,ht_temp_1     ; ht_temp_1 = fht_1*fw_h
17468651Skris    FSTD    ht_temp,-24(%sp)         ; -24(sp)   = ht_temp
17568651Skris    FSTD    ht_temp_1,-56(%sp)       ; -56(sp)   = ht_temp_1
17668651Skris
17768651Skris    XMPYU   flt_0,fw_l,lt_temp       ; lt_temp   = flt_0*fw_l
17868651Skris    XMPYU   flt_1,fw_l,lt_temp_1     ; lt_temp_1 = flt_1*fw_l
17968651Skris    FSTD    lt_temp,-32(%sp)         ; -32(sp) = lt_temp
18068651Skris    FSTD    lt_temp_1,-64(%sp)       ; -64(sp) = lt_temp_1
18168651Skris
18268651Skris    LDD     -8(%sp),m_0              ; m[0]
18368651Skris    LDD     -40(%sp),m_1             ; m[1]
18468651Skris    LDD     -16(%sp),m1_0            ; m1[0]
18568651Skris    LDD     -48(%sp),m1_1            ; m1[1]
18668651Skris
18768651Skris    LDD     -24(%sp),ht_0            ; ht[0]
18868651Skris    LDD     -56(%sp),ht_1            ; ht[1]
18968651Skris    ADD,L   m1_0,m_0,tmp_0           ; tmp_0 = m[0] + m1[0];
19068651Skris    ADD,L   m1_1,m_1,tmp_1           ; tmp_1 = m[1] + m1[1];
19168651Skris
19268651Skris    LDD     -32(%sp),lt_0
19368651Skris    LDD     -64(%sp),lt_1
19468651Skris    CMPCLR,*>>= tmp_0,m1_0, %r0      ; if (m[0] < m1[0])
19568651Skris    ADD,L   ht_0,top_overflow,ht_0   ; ht[0] += (1<<32)
19668651Skris
19768651Skris    CMPCLR,*>>= tmp_1,m1_1,%r0       ; if (m[1] < m1[1])
19868651Skris    ADD,L   ht_1,top_overflow,ht_1   ; ht[1] += (1<<32)
19968651Skris    EXTRD,U tmp_0,31,32,m_0          ; m[0]>>32
20068651Skris    DEPD,Z  tmp_0,31,32,m1_0         ; m1[0] = m[0]<<32
20168651Skris
20268651Skris    EXTRD,U tmp_1,31,32,m_1          ; m[1]>>32
20368651Skris    DEPD,Z  tmp_1,31,32,m1_1         ; m1[1] = m[1]<<32
20468651Skris    ADD,L   ht_0,m_0,ht_0            ; ht[0]+= (m[0]>>32)
20568651Skris    ADD,L   ht_1,m_1,ht_1            ; ht[1]+= (m[1]>>32)
20668651Skris
20768651Skris    ADD     lt_0,m1_0,lt_0           ; lt[0] = lt[0]+m1[0];
20868651Skris	ADD,DC  ht_0,%r0,ht_0            ; ht[0]++
20968651Skris    ADD     lt_1,m1_1,lt_1           ; lt[1] = lt[1]+m1[1];
21068651Skris    ADD,DC  ht_1,%r0,ht_1            ; ht[1]++
21168651Skris
21268651Skris    ADD    %ret0,lt_0,lt_0           ; lt[0] = lt[0] + c;
21368651Skris	ADD,DC  ht_0,%r0,ht_0            ; ht[0]++
21468651Skris    ADD     lt_0,rp_val,lt_0         ; lt[0] = lt[0]+rp[0]
21568651Skris    ADD,DC  ht_0,%r0,ht_0            ; ht[0]++
21668651Skris
21768651Skris	LDO    -2(num),num               ; num = num - 2;
21868651Skris    ADD     ht_0,lt_1,lt_1           ; lt[1] = lt[1] + ht_0 (c);
21968651Skris    ADD,DC  ht_1,%r0,ht_1            ; ht[1]++
22068651Skris    STD     lt_0,0(r_ptr)            ; rp[0] = lt[0]
22168651Skris
22268651Skris    ADD     lt_1,rp_val_1,lt_1       ; lt[1] = lt[1]+rp[1]
22368651Skris    ADD,DC  ht_1,%r0,%ret0           ; ht[1]++ ; result becomes the new carry c
22468651Skris    LDO     16(a_ptr),a_ptr          ; a_ptr += 2
22568651Skris
22668651Skris    STD     lt_1,8(r_ptr)            ; rp[1] = lt[1]
22768651Skris	CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
22868651Skris    LDO     16(r_ptr),r_ptr          ; r_ptr += 2 (delay slot)
22968651Skris
23068651Skris    CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
23168651Skris
23268651Skris	;
23368651Skris	; Cleanup: handle the single remaining word (entered directly when
23468651Skris	; num == 1, or by falling through when one word is left over)
23568651Skrisbn_mul_add_words_single_top
23668651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
23768651Skris    LDD     0(r_ptr),rp_val           ; rp[0]
23868651Skris    LDO     8(a_ptr),a_ptr            ; a_ptr++
23968651Skris    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
24068651Skris    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
24168651Skris    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
24268651Skris    FSTD    fm,-8(%sp)                ; -8(sp) = m
24368651Skris    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
24468651Skris    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
24568651Skris    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
24668651Skris    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
24768651Skris
24868651Skris    LDD     -8(%sp),m_0
24968651Skris    LDD    -16(%sp),m1_0              ; m1 = temp1
25068651Skris    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
25168651Skris    LDD     -24(%sp),ht_0
25268651Skris    LDD     -32(%sp),lt_0
25368651Skris
25468651Skris    CMPCLR,*>>= tmp_0,m1_0,%r0        ; if (m < m1)
25568651Skris    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
25668651Skris
25768651Skris    EXTRD,U tmp_0,31,32,m_0           ; m>>32
25868651Skris    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
25968651Skris
26068651Skris    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
26168651Skris    ADD     lt_0,m1_0,tmp_0           ; tmp_0 = lt+m1;
26268651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht++
26368651Skris    ADD     %ret0,tmp_0,lt_0          ; lt = lt + c;
26468651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht++
26568651Skris    ADD     lt_0,rp_val,lt_0          ; lt = lt+rp[0]
26668651Skris    ADD,DC  ht_0,%r0,%ret0            ; ht++ ; result is the returned carry
26768651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
26868651Skris
26968651Skrisbn_mul_add_words_exit
27068651Skris    .EXIT
27168651Skris    LDD     -80(%sp),%r9              ; restore r9
27268651Skris    LDD     -88(%sp),%r8              ; restore r8
27368651Skris    LDD     -96(%sp),%r7              ; restore r7
27468651Skris    LDD     -104(%sp),%r6             ; restore r6
27568651Skris    LDD     -112(%sp),%r5             ; restore r5
27668651Skris    LDD     -120(%sp),%r4             ; restore r4
27768651Skris    BVE     (%rp)
27868651Skris    LDD,MB  -128(%sp),%r3             ; restore r3 and pop the 128-byte frame
27968651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
28068651Skris
28168651Skris;----------------------------------------------------------------------------
28268651Skris;
28368651Skris;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
28468651Skris;
28568651Skris; arg0 = rp
28668651Skris; arg1 = ap
28768651Skris; arg2 = num
28868651Skris; arg3 = w
28968651Skris
;
; bn_mul_words: for each of the num words, forms a_ptr[i]*w + carry, stores
; the low 64 bits to r_ptr[i] and keeps the high 64 bits as the carry into
; the next word.  The last carry is returned in %ret0; %ret0 is 0 when
; num <= 0.
;
; The 64x64 multiply is built from four 32x32 XMPYU products, moved to the
; general registers through scratch slots at -8..-64(%sp).
;
; Frame: %r3-%r7 are stored at 0..32(%sp) and w at 56(%sp) BEFORE %sp is
; bumped by 128, so afterwards they are found at -128..-96(%sp) and -72(%sp).
;
; In the line comments below "ht++" on an ADD,DC means "add the carry out of
; the preceding ADD into ht".
;
29068651Skrisbn_mul_words
29168651Skris	.proc
29268651Skris	.callinfo frame=128
29368651Skris    .entry
29468651Skris	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
29568651Skris	.align 64
29668651Skris
29768651Skris    STD     %r3,0(%sp)          ; save r3
29868651Skris    STD     %r4,8(%sp)          ; save r4
29968651Skris    STD     %r5,16(%sp)         ; save r5
30068651Skris    STD     %r6,24(%sp)         ; save r6
30168651Skris
30268651Skris    STD     %r7,32(%sp)         ; save r7
30368651Skris    COPY    %r0,%ret0           ; return 0 by default
30468651Skris    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
30568651Skris	STD     w,56(%sp)           ; w on stack
30668651Skris
30768651Skris    CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
30868651Skris	LDO     128(%sp),%sp       ; bump stack (delay slot: done on both paths)
30968651Skris
31068651Skris	;
31168651Skris	; See if only 1 word to do, thus just do cleanup
31268651Skris	;
31368651Skris	CMPIB,= 1,num,bn_mul_words_single_top
31468651Skris	FLDD    -72(%sp),fw     ; load up w into fp register fw (fw_h/fw_l)
31568651Skris
31668651Skris	;
31768651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
31868651Skris	;
31968651Skris	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
32068651Skris    ; two 32-bit multiplies can be issued per cycle.
32168651Skris    ;
32268651Skrisbn_mul_words_unroll2
32368651Skris
32468651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
32568651Skris    FLDD    8(a_ptr),t_float_1        ; load up 64-bit value (fr9) ht(L)/lt(R)
32668651Skris    XMPYU   fht_0,fw_l,fm1            ; m1[0] = fht_0*fw_l
32768651Skris    XMPYU   fht_1,fw_l,fm1_1          ; m1[1] = fht_1*fw_l
32868651Skris
32968651Skris    FSTD    fm1,-16(%sp)              ; -16(sp) = m1[0]
33068651Skris    FSTD    fm1_1,-48(%sp)            ; -48(sp) = m1[1]
33168651Skris    XMPYU   flt_0,fw_h,fm             ; m[0] = flt_0*fw_h
33268651Skris    XMPYU   flt_1,fw_h,fm_1           ; m[1] = flt_1*fw_h
33368651Skris
33468651Skris    FSTD    fm,-8(%sp)                ; -8(sp) = m[0]
33568651Skris    FSTD    fm_1,-40(%sp)             ; -40(sp) = m[1]
33668651Skris    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = fht_0*fw_h
33768651Skris    XMPYU   fht_1,fw_h,ht_temp_1      ; ht_temp_1 = fht_1*fw_h
33868651Skris
33968651Skris    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht[0]
34068651Skris    FSTD    ht_temp_1,-56(%sp)        ; -56(sp) = ht[1]
34168651Skris    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = flt_0*fw_l
34268651Skris    XMPYU   flt_1,fw_l,lt_temp_1      ; lt_temp_1 = flt_1*fw_l
34368651Skris
34468651Skris    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt[0]
34568651Skris    FSTD    lt_temp_1,-64(%sp)        ; -64(sp) = lt[1]
34668651Skris    LDD     -8(%sp),m_0
34768651Skris    LDD     -40(%sp),m_1
34868651Skris
34968651Skris    LDD    -16(%sp),m1_0
35068651Skris    LDD    -48(%sp),m1_1
35168651Skris    LDD     -24(%sp),ht_0
35268651Skris    LDD     -56(%sp),ht_1
35368651Skris
35468651Skris    ADD,L   m1_0,m_0,tmp_0            ; tmp_0 = m + m1;
35568651Skris    ADD,L   m1_1,m_1,tmp_1            ; tmp_1 = m + m1;
35668651Skris    LDD     -32(%sp),lt_0
35768651Skris    LDD     -64(%sp),lt_1
35868651Skris
35968651Skris    CMPCLR,*>>= tmp_0,m1_0, %r0       ; if (m < m1)
36068651Skris    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
36168651Skris    CMPCLR,*>>= tmp_1,m1_1,%r0        ; if (m < m1)
36268651Skris    ADD,L   ht_1,top_overflow,ht_1    ; ht += (1<<32)
36368651Skris
36468651Skris    EXTRD,U tmp_0,31,32,m_0           ; m>>32
36568651Skris    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
36668651Skris    EXTRD,U tmp_1,31,32,m_1           ; m>>32
36768651Skris    DEPD,Z  tmp_1,31,32,m1_1          ; m1 = m<<32
36868651Skris
36968651Skris    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
37068651Skris    ADD,L   ht_1,m_1,ht_1             ; ht+= (m>>32)
37168651Skris    ADD     lt_0,m1_0,lt_0            ; lt = lt+m1;
37268651Skris	ADD,DC  ht_0,%r0,ht_0             ; ht++
37368651Skris
37468651Skris    ADD     lt_1,m1_1,lt_1            ; lt = lt+m1;
37568651Skris    ADD,DC  ht_1,%r0,ht_1             ; ht++
37668651Skris    ADD    %ret0,lt_0,lt_0            ; lt = lt + c (ret0);
37768651Skris	ADD,DC  ht_0,%r0,ht_0             ; ht++
37868651Skris
37968651Skris    ADD     ht_0,lt_1,lt_1            ; lt = lt + c (ht_0)
38068651Skris    ADD,DC  ht_1,%r0,ht_1             ; ht++
38168651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
38268651Skris    STD     lt_1,8(r_ptr)             ; rp[1] = lt
38368651Skris
38468651Skris	COPY    ht_1,%ret0                ; carry = ht
38568651Skris	LDO    -2(num),num                ; num = num - 2;
38668651Skris    LDO     16(a_ptr),a_ptr           ; ap += 2
38768651Skris	CMPIB,<= 2,num,bn_mul_words_unroll2
38868651Skris    LDO     16(r_ptr),r_ptr           ; rp += 2 (delay slot)
38968651Skris
39068651Skris    CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
39168651Skris
39268651Skris	;
39368651Skris	; Cleanup: handle the single remaining word (entered directly when
39468651Skris	; num == 1, or by falling through when one word is left over)
39568651Skrisbn_mul_words_single_top
39668651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
39768651Skris
39868651Skris    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
39968651Skris    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
40068651Skris    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
40168651Skris    FSTD    fm,-8(%sp)                ; -8(sp) = m
40268651Skris    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
40368651Skris    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
40468651Skris    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
40568651Skris    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
40668651Skris
40768651Skris    LDD     -8(%sp),m_0
40868651Skris    LDD    -16(%sp),m1_0
40968651Skris    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
41068651Skris    LDD     -24(%sp),ht_0
41168651Skris    LDD     -32(%sp),lt_0
41268651Skris
41368651Skris    CMPCLR,*>>= tmp_0,m1_0,%r0        ; if (m < m1)
41468651Skris    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
41568651Skris
41668651Skris    EXTRD,U tmp_0,31,32,m_0           ; m>>32
41768651Skris    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
41868651Skris
41968651Skris    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
42068651Skris    ADD     lt_0,m1_0,lt_0            ; lt= lt+m1;
42168651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht++
42268651Skris
42368651Skris    ADD     %ret0,lt_0,lt_0           ; lt = lt + c;
42468651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht++
42568651Skris
42668651Skris    COPY    ht_0,%ret0                ; copy carry
42768651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
42868651Skris
42968651Skrisbn_mul_words_exit
43068651Skris    .EXIT
43168651Skris    LDD     -96(%sp),%r7              ; restore r7
43268651Skris    LDD     -104(%sp),%r6             ; restore r6
43368651Skris    LDD     -112(%sp),%r5             ; restore r5
43468651Skris    LDD     -120(%sp),%r4             ; restore r4
43568651Skris    BVE     (%rp)
43668651Skris    LDD,MB  -128(%sp),%r3             ; restore r3 and pop the 128-byte frame
43768651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
43868651Skris
43968651Skris;----------------------------------------------------------------------------
44068651Skris;
44168651Skris;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
44268651Skris;
44368651Skris; arg0 = rp
44468651Skris; arg1 = ap
44568651Skris; arg2 = num
44668651Skris;
44768651Skris
;
; bn_sqr_words: for each of the num source words, stores the 128-bit square
; of a_ptr[i] as two result words: low half at r_ptr[2*i], high half at
; r_ptr[2*i+1].  Nothing is written when num <= 0.
;
; With a = ht*2^32 + lt:
;     a^2 = ht*ht*2^64 + 2*(ht*lt)*2^32 + lt*lt
; m = ht*lt is computed once.  The doubled middle term is m << 33, which is
; split into the low 31 bits of m shifted left by 33 (added to lt*lt) and
; m >> 31 (added to ht*ht); the carry out of the low add goes into ht.
;
; Frame: %r3-%r5 are stored at 0..16(%sp) before %sp is bumped by 128;
; scratch slots at -8..-56(%sp) move XMPYU results to the general registers.
;
44868651Skrisbn_sqr_words
44968651Skris	.proc
45068651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
45168651Skris	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
45268651Skris    .entry
45368651Skris	.align 64
45468651Skris
45568651Skris    STD     %r3,0(%sp)          ; save r3
45668651Skris    STD     %r4,8(%sp)          ; save r4
45768651Skris	NOP
45868651Skris    STD     %r5,16(%sp)         ; save r5
45968651Skris
46068651Skris    CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
46168651Skris	LDO     128(%sp),%sp       ; bump stack (delay slot: done on both paths)
46268651Skris
46368651Skris	;
46468651Skris	; If only 1, then go straight to cleanup
46568651Skris	;
46668651Skris	CMPIB,= 1,num,bn_sqr_words_single_top
46768651Skris    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
46868651Skris
46968651Skris	;
47068651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
47168651Skris	;
47268651Skris
47368651Skrisbn_sqr_words_unroll2
47468651Skris    FLDD    0(a_ptr),t_float_0        ; a[0]
47568651Skris    FLDD    8(a_ptr),t_float_1        ; a[1]
47668651Skris    XMPYU   fht_0,flt_0,fm            ; m[0]
47768651Skris    XMPYU   fht_1,flt_1,fm_1          ; m[1]
47868651Skris
47968651Skris    FSTD    fm,-24(%sp)               ; store m[0]
48068651Skris    FSTD    fm_1,-56(%sp)             ; store m[1]
48168651Skris    XMPYU   flt_0,flt_0,lt_temp       ; lt[0]
48268651Skris    XMPYU   flt_1,flt_1,lt_temp_1     ; lt[1]
48368651Skris
48468651Skris    FSTD    lt_temp,-16(%sp)          ; store lt[0]
48568651Skris    FSTD    lt_temp_1,-48(%sp)        ; store lt[1]
48668651Skris    XMPYU   fht_0,fht_0,ht_temp       ; ht[0]
48768651Skris    XMPYU   fht_1,fht_1,ht_temp_1     ; ht[1]
48868651Skris
48968651Skris    FSTD    ht_temp,-8(%sp)           ; store ht[0]
49068651Skris    FSTD    ht_temp_1,-40(%sp)        ; store ht[1]
49168651Skris    LDD     -24(%sp),m_0
49268651Skris    LDD     -56(%sp),m_1
49368651Skris
49468651Skris    AND     m_0,high_mask,tmp_0       ; m[0] & Mask
49568651Skris    AND     m_1,high_mask,tmp_1       ; m[1] & Mask
49668651Skris    DEPD,Z  m_0,30,31,m_0             ; m[0] << 32+1
49768651Skris    DEPD,Z  m_1,30,31,m_1             ; m[1] << 32+1
49868651Skris
49968651Skris    LDD     -16(%sp),lt_0
50068651Skris    LDD     -48(%sp),lt_1
50168651Skris    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m[0]&Mask >> 32-1
50268651Skris    EXTRD,U tmp_1,32,33,tmp_1         ; tmp_1 = m[1]&Mask >> 32-1
50368651Skris
50468651Skris    LDD     -8(%sp),ht_0
50568651Skris    LDD     -40(%sp),ht_1
50668651Skris    ADD,L   ht_0,tmp_0,ht_0           ; ht[0] += tmp_0
50768651Skris    ADD,L   ht_1,tmp_1,ht_1           ; ht[1] += tmp_1
50868651Skris
50968651Skris    ADD     lt_0,m_0,lt_0             ; lt = lt+m
51068651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry from the add above
51168651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
51268651Skris    STD     ht_0,8(r_ptr)             ; rp[1] = ht[0]
51368651Skris
51468651Skris    ADD     lt_1,m_1,lt_1             ; lt = lt+m
51568651Skris    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry from the add above
51668651Skris    STD     lt_1,16(r_ptr)            ; rp[2] = lt[1]
51768651Skris    STD     ht_1,24(r_ptr)            ; rp[3] = ht[1]
51868651Skris
51968651Skris	LDO    -2(num),num                ; num = num - 2;
52068651Skris    LDO     16(a_ptr),a_ptr           ; ap += 2
52168651Skris	CMPIB,<= 2,num,bn_sqr_words_unroll2
52268651Skris    LDO     32(r_ptr),r_ptr           ; rp += 4 (delay slot)
52368651Skris
52468651Skris    CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
52568651Skris
52668651Skris	;
52768651Skris	; Cleanup: handle the single remaining word (entered directly when
52868651Skris	; num == 1, or by falling through when one word is left over)
52968651Skrisbn_sqr_words_single_top
53068651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
53168651Skris
53268651Skris    XMPYU   fht_0,flt_0,fm            ; m
53368651Skris    FSTD    fm,-24(%sp)               ; store m
53468651Skris
53568651Skris    XMPYU   flt_0,flt_0,lt_temp       ; lt
53668651Skris    FSTD    lt_temp,-16(%sp)          ; store lt
53768651Skris
53868651Skris    XMPYU   fht_0,fht_0,ht_temp       ; ht
53968651Skris    FSTD    ht_temp,-8(%sp)           ; store ht
54068651Skris
54168651Skris    LDD     -24(%sp),m_0              ; load m
54268651Skris    AND     m_0,high_mask,tmp_0       ; m & Mask
54368651Skris    DEPD,Z  m_0,30,31,m_0             ; m << 32+1
54468651Skris    LDD     -16(%sp),lt_0             ; lt
54568651Skris
54668651Skris    LDD     -8(%sp),ht_0              ; ht
54768651Skris    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m&Mask >> 32-1
54868651Skris    ADD     m_0,lt_0,lt_0             ; lt = lt+m
54968651Skris    ADD,L   ht_0,tmp_0,ht_0           ; ht += tmp_0
55068651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht += carry from the lt add (the ADD,L between leaves it intact)
55168651Skris
55268651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
55368651Skris    STD     ht_0,8(r_ptr)             ; rp[1] = ht
55468651Skris
55568651Skrisbn_sqr_words_exit
55668651Skris    .EXIT
55768651Skris    LDD     -112(%sp),%r5       ; restore r5
55868651Skris    LDD     -120(%sp),%r4       ; restore r4
55968651Skris    BVE     (%rp)
56068651Skris    LDD,MB  -128(%sp),%r3       ; restore r3 and pop the 128-byte frame
56168651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
56268651Skris
56368651Skris
56468651Skris;----------------------------------------------------------------------------
56568651Skris;
56668651Skris;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
56768651Skris;
56868651Skris; arg0 = rp
56968651Skris; arg1 = ap
57068651Skris; arg2 = bp
57168651Skris; arg3 = n
57268651Skris
;
; bn_add_words: for each of the n words computes a_ptr[i] + b_ptr[i] + c,
; stores the 64-bit sum to r_ptr[i] and carries c into the next word.  c is
; kept in %ret0, starts at 0, and its final value is the return value
; (0 when n <= 0).
;
; Only %r20-%r22 and the argument registers are used and nothing is stored
; on the stack, so the routine has no prologue or epilogue saves.
;
57368651Skrist  .reg %r22    ; a word, then a word + c
57468651Skrisb  .reg %r21    ; b word
57568651Skrisl  .reg %r20    ; sum word written to r_ptr
57668651Skris
57768651Skrisbn_add_words
57868651Skris	.proc
57968651Skris    .entry
58068651Skris	.callinfo
58168651Skris	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
58268651Skris	.align 64
58368651Skris
58468651Skris    CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
58568651Skris    COPY    %r0,%ret0           ; return 0 by default
58668651Skris
58768651Skris	;
58868651Skris	; If 2 or more numbers do the loop
58968651Skris	;
59068651Skris	CMPIB,= 1,n,bn_add_words_single_top
59168651Skris	NOP
59268651Skris
59368651Skris	;
59468651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
59568651Skris	;
59668651Skrisbn_add_words_unroll2
59768651Skris	LDD     0(a_ptr),t
59868651Skris	LDD     0(b_ptr),b
59968651Skris	ADD     t,%ret0,t                    ; t = t+c;
60068651Skris	ADD,DC  %r0,%r0,%ret0                ; set c to carry
60168651Skris	ADD     t,b,l                        ; l = t + b[0]
60268651Skris	ADD,DC  %ret0,%r0,%ret0              ; c+= carry
60368651Skris	STD     l,0(r_ptr)
60468651Skris
60568651Skris	LDD     8(a_ptr),t
60668651Skris	LDD     8(b_ptr),b
60768651Skris	ADD     t,%ret0,t                     ; t = t+c;
60868651Skris	ADD,DC  %r0,%r0,%ret0                 ; set c to carry
60968651Skris	ADD     t,b,l                         ; l = t + b[1]
61068651Skris	ADD,DC  %ret0,%r0,%ret0               ; c+= carry
61168651Skris	STD     l,8(r_ptr)
61268651Skris
61368651Skris	LDO     -2(n),n
61468651Skris	LDO     16(a_ptr),a_ptr
61568651Skris	LDO     16(b_ptr),b_ptr
61668651Skris
61768651Skris	CMPIB,<= 2,n,bn_add_words_unroll2
61868651Skris	LDO     16(r_ptr),r_ptr              ; r_ptr += 2 (delay slot)
61968651Skris
62068651Skris    CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
62168651Skris
62268651Skrisbn_add_words_single_top
62368651Skris	LDD     0(a_ptr),t
62468651Skris	LDD     0(b_ptr),b
62568651Skris
62668651Skris	ADD     t,%ret0,t                 ; t = t+c;
62768651Skris	ADD,DC  %r0,%r0,%ret0             ; set c to carry (could use CMPCLR??)
62868651Skris	ADD     t,b,l                     ; l = t + b[0]
62968651Skris	ADD,DC  %ret0,%r0,%ret0           ; c+= carry
63068651Skris	STD     l,0(r_ptr)
63168651Skris
63268651Skrisbn_add_words_exit
63368651Skris    .EXIT
63468651Skris    BVE     (%rp)
63568651Skris	NOP
63668651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
63768651Skris
63868651Skris;----------------------------------------------------------------------------
63968651Skris;
64068651Skris;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
64168651Skris;
64268651Skris; arg0 = rp
64368651Skris; arg1 = ap
64468651Skris; arg2 = bp
64568651Skris; arg3 = n
64668651Skris
;
; bn_sub_words: for each of the n words stores a_ptr[i] - b_ptr[i] - c to
; r_ptr[i], where c is the borrow kept in %ret0 (starts at 0).  The final c
; is the return value (0 when n <= 0).
;
; Borrow update per word, as coded below:
;     sub_tmp2 = (t1 > t2) ? 0 : 1        (unsigned 64-bit compare)
;     if (t1 != t2) c = sub_tmp2          (c is left unchanged when t1 == t2)
;
; Only %r19-%r22 and the argument registers are used and nothing is stored
; on the stack, so the routine has no prologue or epilogue saves.
;
64768651Skrist1       .reg %r22    ; a word
64868651Skrist2       .reg %r21    ; b word
64968651Skrissub_tmp1 .reg %r20    ; difference written to r_ptr
65068651Skrissub_tmp2 .reg %r19    ; candidate borrow (0 or 1)
65168651Skris
65268651Skris
65368651Skrisbn_sub_words
65468651Skris	.proc
65568651Skris	.callinfo
65668651Skris	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
65768651Skris    .entry
65868651Skris	.align 64
65968651Skris
66068651Skris    CMPIB,>=  0,n,bn_sub_words_exit ; if (n <= 0) then exit
66168651Skris    COPY    %r0,%ret0           ; return 0 by default
66268651Skris
66368651Skris	;
66468651Skris	; If 2 or more numbers do the loop
66568651Skris	;
66668651Skris	CMPIB,= 1,n,bn_sub_words_single_top
66768651Skris	NOP
66868651Skris
66968651Skris	;
67068651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
67168651Skris	;
67268651Skrisbn_sub_words_unroll2
67368651Skris	LDD     0(a_ptr),t1
67468651Skris	LDD     0(b_ptr),t2
67568651Skris	SUB     t1,t2,sub_tmp1           ; t3 = t1-t2;
67668651Skris	SUB     sub_tmp1,%ret0,sub_tmp1  ; t3 = t3- c;
67768651Skris
67868651Skris	CMPCLR,*>> t1,t2,sub_tmp2        ; sub_tmp2 = 0; skip next if t1 > t2
67968651Skris	LDO      1(%r0),sub_tmp2         ; sub_tmp2 = 1 (t1 <= t2)
68068651Skris
68168651Skris	CMPCLR,*= t1,t2,%r0              ; skip next if t1 == t2 (keep old c)
68268651Skris	COPY    sub_tmp2,%ret0           ; c = sub_tmp2
68368651Skris	STD     sub_tmp1,0(r_ptr)
68468651Skris
68568651Skris	LDD     8(a_ptr),t1
68668651Skris	LDD     8(b_ptr),t2
68768651Skris	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
68868651Skris	SUB     sub_tmp1,%ret0,sub_tmp1   ; t3 = t3- c;
68968651Skris	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
69068651Skris	LDO      1(%r0),sub_tmp2          ; sub_tmp2 = 1 (t1 <= t2)
69168651Skris
69268651Skris	CMPCLR,*= t1,t2,%r0               ; skip next if t1 == t2 (keep old c)
69368651Skris	COPY    sub_tmp2,%ret0            ; c = sub_tmp2
69468651Skris	STD     sub_tmp1,8(r_ptr)
69568651Skris
69668651Skris	LDO     -2(n),n
69768651Skris	LDO     16(a_ptr),a_ptr
69868651Skris	LDO     16(b_ptr),b_ptr
69968651Skris
70068651Skris	CMPIB,<= 2,n,bn_sub_words_unroll2
70168651Skris	LDO     16(r_ptr),r_ptr           ; r_ptr += 2 (delay slot)
70268651Skris
70368651Skris    CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
70468651Skris
70568651Skrisbn_sub_words_single_top
70668651Skris	LDD     0(a_ptr),t1
70768651Skris	LDD     0(b_ptr),t2
70868651Skris	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
70968651Skris	SUB     sub_tmp1,%ret0,sub_tmp1   ; t3 = t3- c;
71068651Skris	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
71168651Skris	LDO      1(%r0),sub_tmp2          ; sub_tmp2 = 1 (t1 <= t2)
71268651Skris
71368651Skris	CMPCLR,*= t1,t2,%r0               ; skip next if t1 == t2 (keep old c)
71468651Skris	COPY    sub_tmp2,%ret0            ; c = sub_tmp2
71568651Skris
71668651Skris	STD     sub_tmp1,0(r_ptr)
71768651Skris
71868651Skrisbn_sub_words_exit
71968651Skris    .EXIT
72068651Skris    BVE     (%rp)
72168651Skris	NOP
72268651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
72368651Skris
72468651Skris;------------------------------------------------------------------------------
72568651Skris;
72668651Skris; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
72768651Skris;
72868651Skris; arg0 = h
72968651Skris; arg1 = l
73068651Skris; arg2 = d
73168651Skris;
73268651Skris; This is mainly just modified assembly from the compiler, thus the
73368651Skris; lack of variable names.
73468651Skris;
; Behaviour visible in the code below:
;   - %ret0 is preset to -1 and returned unchanged when d == 0
;   - i = BN_num_bits_word(d); when i != 64 and h > (1 << i) the routine
;     reports through fprintf and then calls abort
;   - otherwise d is shifted left by (64 - i), h:l is shifted left by the
;     same amount, and two 32-bit quotient digits are produced;
;     the return value is (first q << 32) | second q
;
; Register roles once the prologue has run:
;   %r3  = d        %r4  = h        %r5  = l
;   %r6  = dh       %r8  = dl       %r9  = remaining digit count
;   %r10 = ret      %r19 = 0xffffffff00000000
;   %r29 = q        %r31 = th       %r23 = tl
;
; Frame (offsets relative to %r30 after the 352-byte bump):
;   -368 saved rp, -352..-296 saved %r3-%r10, -288 saved %r27,
;   -280 d, -272 q, -264/-256/-240/-232 partial products
;
73568651Skris;------------------------------------------------------------------------------
73668651Skrisbn_div_words
73768651Skris	.proc
73868651Skris	.callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
73968651Skris	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
74068651Skris	.IMPORT	BN_num_bits_word,CODE,NO_RELOCATION
74168651Skris	.IMPORT	__iob,DATA
74268651Skris	.IMPORT	fprintf,CODE,NO_RELOCATION
74368651Skris	.IMPORT	abort,CODE,NO_RELOCATION
74468651Skris	.IMPORT	$$div2U,MILLICODE
74568651Skris    .entry
74668651Skris    STD     %r2,-16(%r30)               ; save rp below the incoming sp
74768651Skris    STD,MA  %r3,352(%r30)               ; save r3 at old sp, then sp += 352
74868651Skris    STD     %r4,-344(%r30)
74968651Skris    STD     %r5,-336(%r30)
75068651Skris    STD     %r6,-328(%r30)
75168651Skris    STD     %r7,-320(%r30)
75268651Skris    STD     %r8,-312(%r30)
75368651Skris    STD     %r9,-304(%r30)
75468651Skris    STD     %r10,-296(%r30)
75568651Skris
75668651Skris    STD     %r27,-288(%r30)             ; save gp
75768651Skris
75868651Skris    COPY    %r24,%r3           ; save d
75968651Skris    COPY    %r26,%r4           ; save h (high 64-bits)
76068651Skris    LDO      -1(%r0),%ret0     ; return -1 by default
76168651Skris
76268651Skris    CMPB,*=  %r0,%arg2,$D3     ; if (d == 0) return -1
76368651Skris    COPY    %r25,%r5           ; delay slot: save l (low 64-bits)
76468651Skris
76568651Skris    LDO     -48(%r30),%r29     ; create ap
76668651Skris    .CALL   ;in=26,29;out=28;
76768651Skris    B,L     BN_num_bits_word,%r2   ; i = BN_num_bits_word(d)
76868651Skris    COPY    %r3,%r26               ; delay slot: arg0 = d
76968651Skris    LDD     -288(%r30),%r27    ; restore gp
77068651Skris    LDI     64,%r21
77168651Skris
77268651Skris    CMPB,=  %r21,%ret0,$00000012   ;if (i == 64) (forward)
77368651Skris    COPY    %ret0,%r24             ; i
77468651Skris    MTSARCM %r24                   ; SAR = complement of i
77568651Skris    DEPDI,Z -1,%sar,1,%r29         ; r29 = single bit selected by SAR (1<<i)
77668651Skris    CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
77768651Skris
77868651Skris$00000012
77968651Skris    SUBI    64,%r24,%r31                       ; i = 64 - i;
78068651Skris    CMPCLR,*<< %r4,%r3,%r0                     ; if (h >= d)
78168651Skris    SUB     %r4,%r3,%r4                        ; h -= d
78268651Skris    CMPB,=  %r31,%r0,$0000001A                 ; if (i)
78368651Skris    COPY    %r0,%r10                           ; ret = 0
78468651Skris    MTSARCM %r31                               ; i to shift
78568651Skris    DEPD,Z  %r3,%sar,64,%r3                    ; d <<= i;
78668651Skris    SUBI    64,%r31,%r19                       ; 64 - i; redundant
78768651Skris    MTSAR   %r19                               ; (64 -i) to shift
78868651Skris    SHRPD   %r4,%r5,%sar,%r4                   ; h = (h:l) >> (64-i)
78968651Skris    MTSARCM %r31                               ; i to shift
79068651Skris    DEPD,Z  %r5,%sar,64,%r5                    ; l <<= i;
79168651Skris
79268651Skris$0000001A
79368651Skris    DEPDI,Z -1,31,32,%r19                      ; r19 = 0xffffffff00000000
79468651Skris    EXTRD,U %r3,31,32,%r6                      ; dh = d >> 32 (upper 32 bits)
79568651Skris    EXTRD,U %r3,63,32,%r8                      ; dl = d & 0xffffffff
79668651Skris    LDO     2(%r0),%r9                         ; count = 2 quotient digits
79768651Skris    STD    %r3,-280(%r30)                      ; "d" to stack
79868651Skris
; Top of the per-digit loop: pick an initial q, then correct it.
79968651Skris$0000001C
80068651Skris    DEPDI,Z -1,63,32,%r29                      ; q = 0x00000000ffffffff
80168651Skris    EXTRD,U %r4,31,32,%r31                     ; h >> 32
80268651Skris    CMPB,*=,N  %r31,%r6,$D2     	       ; if ((h>>32) != dh)(forward) div
80368651Skris    COPY    %r4,%r26
80468651Skris    EXTRD,U %r4,31,32,%r25
80568651Skris    COPY    %r6,%r24
80668651Skris    .CALL   ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
80768651Skris    B,L     $$div2U,%r2                     ; presumably q = h / dh -- confirm against millicode docs
80868651Skris    EXTRD,U %r6,31,32,%r23
80968651Skris    DEPD    %r28,31,32,%r29                 ; merge r28 into the upper 32 bits of q
81068651Skris$D2
81168651Skris    STD     %r29,-272(%r30)                   ; q
81268651Skris    AND     %r5,%r19,%r24                   ; l & 0xffffffff00000000;
81368651Skris    EXTRD,U %r24,31,32,%r24                 ; r24 = upper 32 bits of l
81468651Skris    FLDD    -272(%r30),%fr7                 ; q
81568651Skris    FLDD    -280(%r30),%fr8                 ; d
81668651Skris    XMPYU   %fr8L,%fr7L,%fr10               ; dh * (q >> 32)
81768651Skris    FSTD    %fr10,-256(%r30)
81868651Skris    XMPYU   %fr8L,%fr7R,%fr22               ; dh * (q & 0xffffffff)
81968651Skris    FSTD    %fr22,-264(%r30)
82068651Skris    XMPYU   %fr8R,%fr7L,%fr11               ; dl * (q >> 32)
82168651Skris    XMPYU   %fr8R,%fr7R,%fr23               ; dl * (q & 0xffffffff)
82268651Skris    FSTD    %fr11,-232(%r30)
82368651Skris    FSTD    %fr23,-240(%r30)
82468651Skris    LDD     -256(%r30),%r28
82568651Skris    DEPD,Z  %r28,31,32,%r2                  ; r2 is scratch here (rp is saved on the stack)
82668651Skris    LDD     -264(%r30),%r20
82768651Skris    ADD,L   %r20,%r2,%r31                   ; th = sum of the two dh partial products
82868651Skris    LDD     -232(%r30),%r22
82968651Skris    DEPD,Z  %r22,31,32,%r22
83068651Skris    LDD     -240(%r30),%r21
83168651Skris    B       $00000024       ; enter loop
83268651Skris    ADD,L   %r21,%r22,%r23  ; delay slot: tl = sum of the two dl partial products
83368651Skris
; Correction loop: while q is too large, q--, th -= dh, tl -= dl.
83468651Skris$0000002A
83568651Skris    LDO     -1(%r29),%r29               ; q--
83668651Skris    SUB     %r23,%r8,%r23               ; tl -= dl
83768651Skris$00000024
83868651Skris    SUB     %r4,%r31,%r25               ; t = h - th
83968651Skris    AND     %r25,%r19,%r26
84068651Skris    CMPB,*<>,N      %r0,%r26,$00000046  ; if (t & 0xffffffff00000000) break (forward)
84168651Skris    DEPD,Z  %r25,31,32,%r20             ; t << 32
84268651Skris    OR      %r20,%r24,%r21              ; (t << 32) | (l >> 32)
84368651Skris    CMPB,*<<,N  %r21,%r23,$0000002A ;(backward) loop again while that value < tl
84468651Skris    SUB     %r31,%r6,%r31               ; th -= dh (only runs when the branch is taken)
84568651Skris;-------------Break path---------------------
84668651Skris
84768651Skris$00000046
84868651Skris    DEPD,Z  %r23,31,32,%r25              ;tl
84968651Skris    EXTRD,U %r23,31,32,%r26              ;t
85068651Skris    AND     %r25,%r19,%r24               ;tl = (tl<<32)&0xffffffff00000000L
85168651Skris    ADD,L   %r31,%r26,%r31               ;th += t;
85268651Skris    CMPCLR,*>>=     %r5,%r24,%r0         ;if (l<tl)
85368651Skris    LDO     1(%r31),%r31                 ; th++;
85468651Skris    CMPB,*<<=,N     %r31,%r4,$00000036   ;if (h < th) (forward)
85568651Skris    LDO     -1(%r29),%r29                ;q--;
85668651Skris    ADD,L   %r4,%r3,%r4                  ;h += d;
85768651Skris$00000036
85868651Skris    ADDIB,=,N       -1,%r9,$D1 ;if (--count == 0) break (forward)
85968651Skris    SUB     %r5,%r24,%r28                ; l -= tl;
86068651Skris    SUB     %r4,%r31,%r24                ; h -= th;
86168651Skris    SHRPD   %r24,%r28,32,%r4             ; h = ((h<<32)|(l>>32));
86268651Skris    DEPD,Z  %r29,31,32,%r10              ; ret = q<<32
86368651Skris    b      $0000001C
86468651Skris    DEPD,Z  %r28,31,32,%r5               ; l = l << 32
86568651Skris
86668651Skris$D1
86768651Skris    OR      %r10,%r29,%r28           ; ret |= q
; Common epilogue: $D3 reloads rp, $D0 restores %r4-%r10 and returns.
86868651Skris$D3
86968651Skris    LDD     -368(%r30),%r2           ; reload rp saved in the prologue
87068651Skris$D0
87168651Skris    LDD     -296(%r30),%r10
87268651Skris    LDD     -304(%r30),%r9
87368651Skris    LDD     -312(%r30),%r8
87468651Skris    LDD     -320(%r30),%r7
87568651Skris    LDD     -328(%r30),%r6
87668651Skris    LDD     -336(%r30),%r5
87768651Skris    LDD     -344(%r30),%r4
87868651Skris    BVE     (%r2)
87968651Skris        .EXIT
88068651Skris    LDD,MB  -352(%r30),%r3           ; delay slot: sp -= 352, then restore r3
88168651Skris
; Error path: builds the fprintf arguments position-independently, prints,
; then calls abort.  C$4 is defined outside this section of the file
; (presumably the format string -- confirm).
88268651Skrisbn_div_err_case
88368651Skris    MFIA    %r6                      ; r6 = current instruction address
88468651Skris    ADDIL   L'bn_div_words-bn_div_err_case,%r6,%r1
88568651Skris    LDO     R'bn_div_words-bn_div_err_case(%r1),%r6   ; r6 = address of bn_div_words
88668651Skris    ADDIL   LT'__iob,%r27,%r1
88768651Skris    LDD     RT'__iob(%r1),%r26       ; r26 = address of __iob
88868651Skris    ADDIL   L'C$4-bn_div_words,%r6,%r1
88968651Skris    LDO     R'C$4-bn_div_words(%r1),%r25              ; arg1 = address of C$4
89068651Skris    LDO     64(%r26),%r26            ; arg0 = __iob + 64 (presumably stderr -- confirm)
89168651Skris    .CALL           ;in=24,25,26,29;out=28;
89268651Skris    B,L     fprintf,%r2
89368651Skris    LDO     -48(%r30),%r29           ; delay slot: create ap
89468651Skris    LDD     -288(%r30),%r27          ; restore gp
89568651Skris    .CALL           ;in=29;
89668651Skris    B,L     abort,%r2
89768651Skris    LDO     -48(%r30),%r29           ; delay slot: create ap
89868651Skris    LDD     -288(%r30),%r27          ; restore gp
89968651Skris    B       $D0
90068651Skris    LDD     -368(%r30),%r2           ; delay slot: reload rp
90168651Skris	.PROCEND	;in=24,25,26,29;out=28;
90268651Skris
90368651Skris;----------------------------------------------------------------------------
90468651Skris;
90568651Skris; Registers to hold 64-bit values to manipulate.  The "L" part
90668651Skris; of the register corresponds to the upper 32 bits, while the "R"
90768651Skris; part corresponds to the lower 32 bits.
90868651Skris;
90968651Skris; Note that b6 and b7 map to %fr12 and %fr13, which are callee-save
91068651Skris; registers, so code that uses them must save and restore them.
91168651Skris;
91268651Skris;
91368651Skris; Floating point registers used to hold the operand words (a0-a7, b0-b1).
91468651Skris; These don't collide with the ftemp* temporaries and
91568651Skris; are all caller-save registers.
91668651Skris;
91768651Skrisa0        .reg %fr22
91868651Skrisa0L       .reg %fr22L
91968651Skrisa0R       .reg %fr22R
92068651Skris
92168651Skrisa1        .reg %fr23
92268651Skrisa1L       .reg %fr23L
92368651Skrisa1R       .reg %fr23R
92468651Skris
92568651Skrisa2        .reg %fr24
92668651Skrisa2L       .reg %fr24L
92768651Skrisa2R       .reg %fr24R
92868651Skris
92968651Skrisa3        .reg %fr25
93068651Skrisa3L       .reg %fr25L
93168651Skrisa3R       .reg %fr25R
93268651Skris
93368651Skrisa4        .reg %fr26
93468651Skrisa4L       .reg %fr26L
93568651Skrisa4R       .reg %fr26R
93668651Skris
93768651Skrisa5        .reg %fr27
93868651Skrisa5L       .reg %fr27L
93968651Skrisa5R       .reg %fr27R
94068651Skris
94168651Skrisa6        .reg %fr28
94268651Skrisa6L       .reg %fr28L
94368651Skrisa6R       .reg %fr28R
94468651Skris
94568651Skrisa7        .reg %fr29
94668651Skrisa7L       .reg %fr29L
94768651Skrisa7R       .reg %fr29R
94868651Skris
94968651Skrisb0        .reg %fr30
95068651Skrisb0L       .reg %fr30L
95168651Skrisb0R       .reg %fr30R
95268651Skris
95368651Skrisb1        .reg %fr31
95468651Skrisb1L       .reg %fr31L
95568651Skrisb1R       .reg %fr31R
95668651Skris
95768651Skris;
95868651Skris; Temporary floating point variables; these are all caller-save
95968651Skris; registers (they receive the XMPYU partial products in the macros below).
96068651Skris;
96168651Skrisftemp1    .reg %fr4
96268651Skrisftemp2    .reg %fr5
96368651Skrisftemp3    .reg %fr6
96468651Skrisftemp4    .reg %fr7
96568651Skris
96668651Skris;
96768651Skris; The B set of registers when used.
96868651Skris; b2-b5 (%fr8-%fr11) are caller-save; b6/b7 (%fr12/%fr13) are callee-save.
96968651Skris
97068651Skrisb2        .reg %fr8
97168651Skrisb2L       .reg %fr8L
97268651Skrisb2R       .reg %fr8R
97368651Skris
97468651Skrisb3        .reg %fr9
97568651Skrisb3L       .reg %fr9L
97668651Skrisb3R       .reg %fr9R
97768651Skris
97868651Skrisb4        .reg %fr10
97968651Skrisb4L       .reg %fr10L
98068651Skrisb4R       .reg %fr10R
98168651Skris
98268651Skrisb5        .reg %fr11
98368651Skrisb5L       .reg %fr11L
98468651Skrisb5R       .reg %fr11R
98568651Skris
98668651Skrisb6        .reg %fr12
98768651Skrisb6L       .reg %fr12L
98868651Skrisb6R       .reg %fr12R
98968651Skris
99068651Skrisb7        .reg %fr13
99168651Skrisb7L       .reg %fr13L
99268651Skrisb7R       .reg %fr13R
99368651Skris
; General registers used by the comba macros.  %r19-%r23, %r28, %r31 and %r1
; are caller-save; %r3-%r6 are callee-save, so every routine that expands
; the macros has to preserve ht, lt, m and c3.
99468651Skrisc1           .reg %r21   ; only reg
99568651Skristemp1        .reg %r20   ; only reg
99668651Skristemp2        .reg %r19   ; only reg
99768651Skristemp3        .reg %r31   ; only reg
99868651Skris
99968651Skrism1           .reg %r28   ; second cross product in SQR_ADD_C2 / MUL_ADD_C
100068651Skrisc2           .reg %r23
100168651Skrishigh_one     .reg %r1    ; callers load it with 1 << 32
100268651Skrisht           .reg %r6    ; callee-save: high half of the running product
100368651Skrislt           .reg %r5    ; callee-save: low half of the running product
100468651Skrism            .reg %r4    ; callee-save: cross product
100568651Skrisc3           .reg %r3    ; callee-save
100668651Skris
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Adds the square of the 64-bit word held as A0L:A0R (upper:lower 32 bits)
; into the carry triple, C1 being the lowest word:
;     lt = A0R*A0R + (m << 33);  ht = A0L*A0L + (m >> 31) + carries
;     C1 += lt;  C2 += ht + carry;  C3 += carry
; where m = A0L*A0R.
;
; Requires high_mask to be set up by the caller.
; Uses ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the three doublewords
; at -24(%sp), -16(%sp) and -8(%sp) as scratch.
100768651SkrisSQR_ADD_C  .macro  A0L,A0R,C1,C2,C3
100868651Skris    XMPYU   A0L,A0R,ftemp1       ; m
100968651Skris    FSTD    ftemp1,-24(%sp)      ; store m
101068651Skris
101168651Skris    XMPYU   A0R,A0R,ftemp2       ; lt
101268651Skris    FSTD    ftemp2,-16(%sp)      ; store lt
101368651Skris
101468651Skris    XMPYU   A0L,A0L,ftemp3       ; ht
101568651Skris    FSTD    ftemp3,-8(%sp)       ; store ht
101668651Skris
101768651Skris    LDD     -24(%sp),m           ; load m
101868651Skris    AND     m,high_mask,temp2    ; m & Mask
101968651Skris    DEPD,Z  m,30,31,temp3        ; temp3 = m << (32+1)
102068651Skris    LDD     -16(%sp),lt          ; lt
102168651Skris
102268651Skris    LDD     -8(%sp),ht           ; ht
102368651Skris    EXTRD,U temp2,32,33,temp1    ; temp1 = (m & Mask) >> (32-1)
102468651Skris    ADD     temp3,lt,lt          ; lt += m << 33 (sets carry)
102568651Skris    ADD,L   ht,temp1,ht          ; ht += temp1 (ADD,L leaves the carry alone)
102668651Skris    ADD,DC  ht,%r0,ht            ; ht += carry out of the lt addition
102768651Skris
102868651Skris    ADD     C1,lt,C1             ; c1=c1+lt
102968651Skris    ADD,DC  ht,%r0,ht            ; ht += carry out of c1
103068651Skris
103168651Skris    ADD     C2,ht,C2             ; c2=c2+ht
103268651Skris    ADD,DC  C3,%r0,C3            ; c3 += carry out of c2
103368651Skris.endm
103468651Skris
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Forms the 128-bit product of the words A0L:A0R and A1L:A1R from four
; 32x32 partial products, doubles it, and adds the result into the carry
; triple (C1 lowest, C3 highest).
;
; Requires high_one (1 << 32) to be set up by the caller.
; Uses ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the four doublewords
; at -32(%sp) .. -8(%sp) as scratch.
103568651SkrisSQR_ADD_C2 .macro  A0L,A0R,A1L,A1R,C1,C2,C3
103668651Skris    XMPYU   A0L,A1R,ftemp1          ; m1 = A0L*A1R
103768651Skris    FSTD    ftemp1,-16(%sp)         ;
103868651Skris    XMPYU   A0R,A1L,ftemp2          ; m = A0R*A1L
103968651Skris    FSTD    ftemp2,-8(%sp)          ;
104068651Skris    XMPYU   A0R,A1R,ftemp3          ; lt = A0R*A1R
104168651Skris    FSTD    ftemp3,-32(%sp)
104268651Skris    XMPYU   A0L,A1L,ftemp4          ; ht = A0L*A1L
104368651Skris    FSTD    ftemp4,-24(%sp)         ;
104468651Skris
104568651Skris    LDD     -8(%sp),m               ; m
104668651Skris    LDD     -16(%sp),m1             ; m1
104768651Skris    ADD,L   m,m1,m                  ; m+m1
104868651Skris
104968651Skris    DEPD,Z  m,31,32,temp3           ; (m+m1<<32)
105068651Skris    LDD     -24(%sp),ht             ; ht
105168651Skris
105268651Skris    CMPCLR,*>>= m,m1,%r0            ; if (m < m1), i.e. m+m1 wrapped
105368651Skris    ADD,L   ht,high_one,ht          ; ht+=high_one
105468651Skris
105568651Skris    EXTRD,U m,31,32,temp1           ; m >> 32
105668651Skris    LDD     -32(%sp),lt             ; lt
105768651Skris    ADD,L   ht,temp1,ht             ; ht+= m>>32
105868651Skris    ADD     lt,temp3,lt             ; lt += (m+m1) << 32
105968651Skris    ADD,DC  ht,%r0,ht               ; ht += carry out of lt
106068651Skris
106168651Skris    ADD     ht,ht,ht                ; ht=ht+ht;
106268651Skris    ADD,DC  C3,%r0,C3               ; add in carry (c3++)
106368651Skris
106468651Skris    ADD     lt,lt,lt                ; lt=lt+lt;
106568651Skris    ADD,DC  ht,%r0,ht               ; add in carry (ht++)
106668651Skris
106768651Skris    ADD     C1,lt,C1                ; c1=c1+lt
106868651Skris    ADD,DC,*NUV ht,%r0,ht           ; add in carry (ht++)
106968651Skris    LDO     1(C3),C3              ; bump c3 if overflow,nullify otherwise
107068651Skris
107168651Skris    ADD     C2,ht,C2                ; c2 = c2 + ht
107268651Skris    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
107368651Skris.endm
107468651Skris
107568651Skris;
107668651Skris;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
107768651Skris; arg0 = r_ptr
107868651Skris; arg1 = a_ptr
107968651Skris;
; Loads a[0..7] into a0-a7, then builds the 16 result words r[0..15] one
; column at a time with SQR_ADD_C (a[i]*a[i]) and SQR_ADD_C2 (2*a[i]*a[j]).
; c1/c2/c3 rotate as the three-word column accumulator: after each store
; the register just written is cleared and becomes the new top word.
;
; Frame: %r3-%r6 are stored at 0..24(%sp) and %sp is then raised by 128,
; so the restores read -128..-104(%sp); the macros use -32..-8(%sp) as
; scratch.
; NOTE(review): the saves are issued before %sp is bumped -- confirm that
; is acceptable under the runtime's stack rules.
108068651Skris
108168651Skrisbn_sqr_comba8
108268651Skris	.PROC
108368651Skris	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
108468651Skris	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
108568651Skris    .ENTRY
108668651Skris	.align 64
108768651Skris
108868651Skris    STD     %r3,0(%sp)          ; save r3 (c3)
108968651Skris    STD     %r4,8(%sp)          ; save r4 (m)
109068651Skris    STD     %r5,16(%sp)         ; save r5 (lt)
109168651Skris    STD     %r6,24(%sp)         ; save r6 (ht)
109268651Skris
109368651Skris	;
109468651Skris	; Zero out carries
109568651Skris	;
109668651Skris	COPY     %r0,c1
109768651Skris	COPY     %r0,c2
109868651Skris	COPY     %r0,c3
109968651Skris
110068651Skris	LDO      128(%sp),%sp       ; bump stack
110168651Skris    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
110268651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
110368651Skris
110468651Skris	;
110568651Skris	; Load up all of the values we are going to use
110668651Skris	;
110768651Skris    FLDD     0(a_ptr),a0
110868651Skris    FLDD     8(a_ptr),a1
110968651Skris    FLDD    16(a_ptr),a2
111068651Skris    FLDD    24(a_ptr),a3
111168651Skris    FLDD    32(a_ptr),a4
111268651Skris    FLDD    40(a_ptr),a5
111368651Skris    FLDD    48(a_ptr),a6
111468651Skris    FLDD    56(a_ptr),a7
111568651Skris
111668651Skris	SQR_ADD_C a0L,a0R,c1,c2,c3
111768651Skris	STD     c1,0(r_ptr)          ; r[0] = c1;
111868651Skris	COPY    %r0,c1
111968651Skris
112068651Skris	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
112168651Skris	STD     c2,8(r_ptr)          ; r[1] = c2;
112268651Skris	COPY    %r0,c2
112368651Skris
112468651Skris	SQR_ADD_C a1L,a1R,c3,c1,c2
112568651Skris	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
112668651Skris	STD     c3,16(r_ptr)            ; r[2] = c3;
112768651Skris	COPY    %r0,c3
112868651Skris
112968651Skris	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
113068651Skris	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
113168651Skris	STD     c1,24(r_ptr)           ; r[3] = c1;
113268651Skris	COPY    %r0,c1
113368651Skris
113468651Skris	SQR_ADD_C a2L,a2R,c2,c3,c1
113568651Skris	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
113668651Skris	SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
113768651Skris	STD     c2,32(r_ptr)          ; r[4] = c2;
113868651Skris	COPY    %r0,c2
113968651Skris
114068651Skris	SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
114168651Skris	SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
114268651Skris	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
114368651Skris	STD     c3,40(r_ptr)          ; r[5] = c3;
114468651Skris	COPY    %r0,c3
114568651Skris
114668651Skris	SQR_ADD_C a3L,a3R,c1,c2,c3
114768651Skris	SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
114868651Skris	SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
114968651Skris	SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
115068651Skris	STD     c1,48(r_ptr)          ; r[6] = c1;
115168651Skris	COPY    %r0,c1
115268651Skris
115368651Skris	SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
115468651Skris	SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
115568651Skris	SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
115668651Skris	SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
115768651Skris	STD     c2,56(r_ptr)          ; r[7] = c2;
115868651Skris	COPY    %r0,c2
115968651Skris
116068651Skris	SQR_ADD_C a4L,a4R,c3,c1,c2
116168651Skris	SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
116268651Skris	SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
116368651Skris	SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
116468651Skris	STD     c3,64(r_ptr)          ; r[8] = c3;
116568651Skris	COPY    %r0,c3
116668651Skris
116768651Skris	SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
116868651Skris	SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
116968651Skris	SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
117068651Skris	STD     c1,72(r_ptr)          ; r[9] = c1;
117168651Skris	COPY    %r0,c1
117268651Skris
117368651Skris	SQR_ADD_C a5L,a5R,c2,c3,c1
117468651Skris	SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
117568651Skris	SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
117668651Skris	STD     c2,80(r_ptr)          ; r[10] = c2;
117768651Skris	COPY    %r0,c2
117868651Skris
117968651Skris	SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
118068651Skris	SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
118168651Skris	STD     c3,88(r_ptr)          ; r[11] = c3;
118268651Skris	COPY    %r0,c3
118368651Skris
118468651Skris	SQR_ADD_C a6L,a6R,c1,c2,c3
118568651Skris	SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
118668651Skris	STD     c1,96(r_ptr)          ; r[12] = c1;
118768651Skris	COPY    %r0,c1
118868651Skris
118968651Skris	SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
119068651Skris	STD     c2,104(r_ptr)         ; r[13] = c2;
119168651Skris	COPY    %r0,c2
119268651Skris
119368651Skris	SQR_ADD_C a7L,a7R,c3,c1,c2
119468651Skris	STD     c3, 112(r_ptr)       ; r[14] = c3
119568651Skris	STD     c1, 120(r_ptr)       ; r[15] = c1
119668651Skris
119768651Skris    .EXIT
119868651Skris    LDD     -104(%sp),%r6        ; restore r6
119968651Skris    LDD     -112(%sp),%r5        ; restore r5
120068651Skris    LDD     -120(%sp),%r4        ; restore r4
120168651Skris    BVE     (%rp)
120268651Skris    LDD,MB  -128(%sp),%r3        ; delay slot: pop the frame, then restore r3
120368651Skris
120468651Skris	.PROCEND
120568651Skris
120668651Skris;-----------------------------------------------------------------------------
120768651Skris;
120868651Skris;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
120968651Skris; arg0 = r_ptr
121068651Skris; arg1 = a_ptr
121168651Skris;
; Loads a[0..3] into a0-a3, then builds the 8 result words r[0..7] one
; column at a time with SQR_ADD_C (a[i]*a[i]) and SQR_ADD_C2 (2*a[i]*a[j]).
; c1/c2/c3 rotate as the three-word column accumulator: after each store
; the register just written is cleared and becomes the new top word.
;
; Only the four words a[0..3] are read.  Earlier revisions also loaded
; a[4..7] into a4-a7 without ever using them, which touched 32 bytes past
; the words this routine works on; those loads have been removed.
;
; Frame: %r3-%r6 are stored at 0..24(%sp) and %sp is then raised by 128,
; so the restores read -128..-104(%sp); the macros use -32..-8(%sp) as
; scratch.
121268651Skris
121368651Skrisbn_sqr_comba4
121468651Skris	.proc
121568651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
121668651Skris	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
121768651Skris    .entry
121868651Skris	.align 64
121968651Skris    STD     %r3,0(%sp)          ; save r3 (c3)
122068651Skris    STD     %r4,8(%sp)          ; save r4 (m)
122168651Skris    STD     %r5,16(%sp)         ; save r5 (lt)
122268651Skris    STD     %r6,24(%sp)         ; save r6 (ht)
122368651Skris
122468651Skris	;
122568651Skris	; Zero out carries
122668651Skris	;
122768651Skris	COPY     %r0,c1
122868651Skris	COPY     %r0,c2
122968651Skris	COPY     %r0,c3
123068651Skris
123168651Skris	LDO      128(%sp),%sp       ; bump stack
123268651Skris    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
123368651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
123468651Skris
123568651Skris	;
123668651Skris	; Load up all of the values we are going to use (a[0..3] only)
123768651Skris	;
123868651Skris    FLDD     0(a_ptr),a0
123968651Skris    FLDD     8(a_ptr),a1
124068651Skris    FLDD    16(a_ptr),a2
124168651Skris    FLDD    24(a_ptr),a3
124668651Skris
124768651Skris	SQR_ADD_C a0L,a0R,c1,c2,c3
124868651Skris
124968651Skris	STD     c1,0(r_ptr)          ; r[0] = c1;
125068651Skris	COPY    %r0,c1
125168651Skris
125268651Skris	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
125368651Skris
125468651Skris	STD     c2,8(r_ptr)          ; r[1] = c2;
125568651Skris	COPY    %r0,c2
125668651Skris
125768651Skris	SQR_ADD_C a1L,a1R,c3,c1,c2
125868651Skris	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
125968651Skris
126068651Skris	STD     c3,16(r_ptr)            ; r[2] = c3;
126168651Skris	COPY    %r0,c3
126268651Skris
126368651Skris	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
126468651Skris	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
126568651Skris
126668651Skris	STD     c1,24(r_ptr)           ; r[3] = c1;
126768651Skris	COPY    %r0,c1
126868651Skris
126968651Skris	SQR_ADD_C a2L,a2R,c2,c3,c1
127068651Skris	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
127168651Skris
127268651Skris	STD     c2,32(r_ptr)           ; r[4] = c2;
127368651Skris	COPY    %r0,c2
127468651Skris
127568651Skris	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
127668651Skris	STD     c3,40(r_ptr)           ; r[5] = c3;
127768651Skris	COPY    %r0,c3
127868651Skris
127968651Skris	SQR_ADD_C a3L,a3R,c1,c2,c3
128068651Skris	STD     c1,48(r_ptr)           ; r[6] = c1;
128168651Skris	STD     c2,56(r_ptr)           ; r[7] = c2;
128268651Skris
128368651Skris    .EXIT
128468651Skris    LDD     -104(%sp),%r6        ; restore r6
128568651Skris    LDD     -112(%sp),%r5        ; restore r5
128668651Skris    LDD     -120(%sp),%r4        ; restore r4
128768651Skris    BVE     (%rp)
128868651Skris    LDD,MB  -128(%sp),%r3        ; delay slot: pop the frame, then restore r3
128968651Skris
129068651Skris	.PROCEND
129168651Skris
129268651Skris
129368651Skris;---------------------------------------------------------------------------
129468651Skris
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Forms the 128-bit product of the words A0L:A0R and B0L:B0R from four
; 32x32 partial products and adds it into the carry triple
; (C1 lowest, C3 highest).
;
; Requires high_one (1 << 32) to be set up by the caller.
; Uses ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the four doublewords
; at -32(%sp) .. -8(%sp) as scratch.
129568651SkrisMUL_ADD_C  .macro  A0L,A0R,B0L,B0R,C1,C2,C3
129668651Skris    XMPYU   A0L,B0R,ftemp1        ; m1 = A0L*B0R
129768651Skris    FSTD    ftemp1,-16(%sp)       ;
129868651Skris    XMPYU   A0R,B0L,ftemp2        ; m = A0R*B0L
129968651Skris    FSTD    ftemp2,-8(%sp)        ;
130068651Skris    XMPYU   A0R,B0R,ftemp3        ; lt = A0R*B0R
130168651Skris    FSTD    ftemp3,-32(%sp)
130268651Skris    XMPYU   A0L,B0L,ftemp4        ; ht = A0L*B0L
130368651Skris    FSTD    ftemp4,-24(%sp)       ;
130468651Skris
130568651Skris    LDD     -8(%sp),m             ; m
130668651Skris    LDD     -16(%sp),m1           ; m1
130768651Skris    ADD,L   m,m1,m                ; m+m1
130868651Skris
130968651Skris    DEPD,Z  m,31,32,temp3         ; (m+m1<<32)
131068651Skris    LDD     -24(%sp),ht           ; ht
131168651Skris
131268651Skris    CMPCLR,*>>= m,m1,%r0          ; if (m < m1), i.e. m+m1 wrapped
131368651Skris    ADD,L   ht,high_one,ht        ; ht+=high_one
131468651Skris
131568651Skris    EXTRD,U m,31,32,temp1         ; m >> 32
131668651Skris    LDD     -32(%sp),lt           ; lt
131768651Skris    ADD,L   ht,temp1,ht           ; ht+= m>>32
131868651Skris    ADD     lt,temp3,lt           ; lt += (m+m1) << 32
131968651Skris    ADD,DC  ht,%r0,ht             ; ht += carry out of lt
132068651Skris
132168651Skris    ADD     C1,lt,C1              ; c1=c1+lt
132268651Skris    ADD,DC  ht,%r0,ht             ; ht += carry out of c1
132368651Skris
132468651Skris    ADD     C2,ht,C2              ; c2 = c2 + ht
132568651Skris    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
132668651Skris.endm
132768651Skris
132868651Skris
132968651Skris;
133068651Skris;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
133168651Skris; arg0 = r_ptr
133268651Skris; arg1 = a_ptr
133368651Skris; arg2 = b_ptr
133468651Skris;
; Loads a[0..7] into a0-a7 and b[0..7] into b0-b7, then builds the 16
; result words r[0..15] one column at a time: column k accumulates every
; a[i]*b[j] with i + j == k through MUL_ADD_C.  c1/c2/c3 rotate as the
; three-word column accumulator: after each store the register just
; written is cleared and becomes the new top word.
;
; Frame: %r3-%r6 are stored at 0..24(%sp) and %fr12/%fr13 (b6/b7,
; callee-save) at 32/40(%sp); %sp is then raised by 128, so the restores
; read -128..-88(%sp).  MUL_ADD_C uses -32..-8(%sp) as scratch.
133568651Skris
133668651Skrisbn_mul_comba8
133768651Skris	.proc
133868651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
133968651Skris	.EXPORT	bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
134068651Skris    .entry
134168651Skris	.align 64
134268651Skris
134368651Skris    STD     %r3,0(%sp)          ; save r3
134468651Skris    STD     %r4,8(%sp)          ; save r4
134568651Skris    STD     %r5,16(%sp)         ; save r5
134668651Skris    STD     %r6,24(%sp)         ; save r6
134768651Skris    FSTD    %fr12,32(%sp)       ; save fr12 (b6)
134868651Skris    FSTD    %fr13,40(%sp)       ; save fr13 (b7)
134968651Skris
135068651Skris	;
135168651Skris	; Zero out carries
135268651Skris	;
135368651Skris	COPY     %r0,c1
135468651Skris	COPY     %r0,c2
135568651Skris	COPY     %r0,c3
135668651Skris
135768651Skris	LDO      128(%sp),%sp       ; bump stack
135868651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
135968651Skris
136068651Skris	;
136168651Skris	; Load up all of the values we are going to use
136268651Skris	;
136368651Skris    FLDD      0(a_ptr),a0
136468651Skris    FLDD      8(a_ptr),a1
136568651Skris    FLDD     16(a_ptr),a2
136668651Skris    FLDD     24(a_ptr),a3
136768651Skris    FLDD     32(a_ptr),a4
136868651Skris    FLDD     40(a_ptr),a5
136968651Skris    FLDD     48(a_ptr),a6
137068651Skris    FLDD     56(a_ptr),a7
137168651Skris
137268651Skris    FLDD      0(b_ptr),b0
137368651Skris    FLDD      8(b_ptr),b1
137468651Skris    FLDD     16(b_ptr),b2
137568651Skris    FLDD     24(b_ptr),b3
137668651Skris    FLDD     32(b_ptr),b4
137768651Skris    FLDD     40(b_ptr),b5
137868651Skris    FLDD     48(b_ptr),b6
137968651Skris    FLDD     56(b_ptr),b7
138068651Skris
138168651Skris	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
138268651Skris	STD       c1,0(r_ptr)            ; r[0] = c1
138368651Skris	COPY      %r0,c1
138468651Skris
138568651Skris	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
138668651Skris	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
138768651Skris	STD       c2,8(r_ptr)            ; r[1] = c2
138868651Skris	COPY      %r0,c2
138968651Skris
139068651Skris	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
139168651Skris	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
139268651Skris	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
139368651Skris	STD       c3,16(r_ptr)           ; r[2] = c3
139468651Skris	COPY      %r0,c3
139568651Skris
139668651Skris	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
139768651Skris	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
139868651Skris	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
139968651Skris	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
140068651Skris	STD       c1,24(r_ptr)           ; r[3] = c1
140168651Skris	COPY      %r0,c1
140268651Skris
140368651Skris	MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
140468651Skris	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
140568651Skris	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
140668651Skris	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
140768651Skris	MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
140868651Skris	STD       c2,32(r_ptr)           ; r[4] = c2
140968651Skris	COPY      %r0,c2
141068651Skris
141168651Skris	MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
141268651Skris	MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
141368651Skris	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
141468651Skris	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
141568651Skris	MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
141668651Skris	MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
141768651Skris	STD       c3,40(r_ptr)           ; r[5] = c3
141868651Skris	COPY      %r0,c3
141968651Skris
142068651Skris	MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
142168651Skris	MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
142268651Skris	MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
142368651Skris	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
142468651Skris	MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
142568651Skris	MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
142668651Skris	MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
142768651Skris	STD       c1,48(r_ptr)           ; r[6] = c1
142868651Skris	COPY      %r0,c1
142968651Skris
143068651Skris	MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
143168651Skris	MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
143268651Skris	MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
143368651Skris	MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
143468651Skris	MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
143568651Skris	MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
143668651Skris	MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
143768651Skris	MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
143868651Skris	STD       c2,56(r_ptr)           ; r[7] = c2
143968651Skris	COPY      %r0,c2
144068651Skris
144168651Skris	MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
144268651Skris	MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
144368651Skris	MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
144468651Skris	MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
144568651Skris	MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
144668651Skris	MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
144768651Skris	MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
144868651Skris	STD       c3,64(r_ptr)           ; r[8] = c3
144968651Skris	COPY      %r0,c3
145068651Skris
145168651Skris	MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
145268651Skris	MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
145368651Skris	MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
145468651Skris	MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
145568651Skris	MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
145668651Skris	MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
145768651Skris	STD       c1,72(r_ptr)           ; r[9] = c1
145868651Skris	COPY      %r0,c1
145968651Skris
146068651Skris	MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
146168651Skris	MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
146268651Skris	MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
146368651Skris	MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
146468651Skris	MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
146568651Skris	STD       c2,80(r_ptr)           ; r[10] = c2
146668651Skris	COPY      %r0,c2
146768651Skris
146868651Skris	MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
146968651Skris	MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
147068651Skris	MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
147168651Skris	MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
147268651Skris	STD       c3,88(r_ptr)           ; r[11] = c3
147368651Skris	COPY      %r0,c3
147468651Skris
147568651Skris	MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
147668651Skris	MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
147768651Skris	MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
147868651Skris	STD       c1,96(r_ptr)           ; r[12] = c1
147968651Skris	COPY      %r0,c1
148068651Skris
148168651Skris	MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
148268651Skris	MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
148368651Skris	STD       c2,104(r_ptr)          ; r[13] = c2
148468651Skris	COPY      %r0,c2
148568651Skris
148668651Skris	MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
148768651Skris	STD       c3,112(r_ptr)          ; r[14] = c3
148868651Skris	STD       c1,120(r_ptr)          ; r[15] = c1
148968651Skris
149068651Skris    .EXIT
149168651Skris    FLDD    -88(%sp),%fr13       ; restore fr13 (b7)
149268651Skris    FLDD    -96(%sp),%fr12       ; restore fr12 (b6)
149368651Skris    LDD     -104(%sp),%r6        ; restore r6
149468651Skris    LDD     -112(%sp),%r5        ; restore r5
149568651Skris    LDD     -120(%sp),%r4        ; restore r4
149668651Skris    BVE     (%rp)
149768651Skris    LDD,MB  -128(%sp),%r3        ; delay slot: pop the frame, then restore r3
149868651Skris
149968651Skris	.PROCEND
150068651Skris
150168651Skris;-----------------------------------------------------------------------------
150268651Skris; bn_mul_comba4: loads four 64-bit words each from a and b, issues 16 MUL_ADD_C steps column by column, and stores eight 64-bit words to r.
150368651Skris;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
150468651Skris; arg0 = r_ptr
150568651Skris; arg1 = a_ptr
150668651Skris; arg2 = b_ptr
150768651Skris; Saves and restores r3-r6 and fr12-fr13 in a 128-byte stack frame; writes r[0..7] through r_ptr.
150868651Skris; NOTE(review): .callinfo names only ENTRY_GR=%r3, yet r3-r6 and fr12-fr13 are saved below -- confirm the unwind info is intended.
150968651Skrisbn_mul_comba4
151068651Skris	.proc
151168651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
151268651Skris	.EXPORT	bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
151368651Skris    .entry
151468651Skris	.align 64
151568651Skris
151668651Skris    STD     %r3,0(%sp)          ; save r3
151768651Skris    STD     %r4,8(%sp)          ; save r4
151868651Skris    STD     %r5,16(%sp)         ; save r5
151968651Skris    STD     %r6,24(%sp)         ; save r6
152068651Skris    FSTD    %fr12,32(%sp)       ; save fr12
152168651Skris    FSTD    %fr13,40(%sp)       ; save fr13
152268651Skris
152368651Skris	;
152468651Skris	; Zero out carries
152568651Skris	;
152668651Skris	COPY     %r0,c1
152768651Skris	COPY     %r0,c2
152868651Skris	COPY     %r0,c3
152968651Skris
153068651Skris	LDO      128(%sp),%sp       ; bump stack by 128 bytes (saved registers now sit at -128..-88(%sp))
153168651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
153268651Skris
153368651Skris	;
153468651Skris	; Load up all of the values we are going to use
153568651Skris	;
153668651Skris    FLDD      0(a_ptr),a0        ; a0 = a[0]
153768651Skris    FLDD      8(a_ptr),a1        ; a1 = a[1]
153868651Skris    FLDD     16(a_ptr),a2        ; a2 = a[2]
153968651Skris    FLDD     24(a_ptr),a3        ; a3 = a[3]
154068651Skris
154168651Skris    FLDD      0(b_ptr),b0        ; b0 = b[0]
154268651Skris    FLDD      8(b_ptr),b1        ; b1 = b[1]
154368651Skris    FLDD     16(b_ptr),b2        ; b2 = b[2]
154468651Skris    FLDD     24(b_ptr),b3        ; b3 = b[3]
154568651Skris	; column 0 (i+j = 0): a0*b0; NOTE(review): MUL_ADD_C is defined outside this view, presumably multiply-accumulate into the three carry words
154668651Skris	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
154768651Skris	STD       c1,0(r_ptr)        ; r[0] = c1
154868651Skris	COPY      %r0,c1             ; clear c1; it is passed as the third accumulator in the next column
154968651Skris	; column 1 (i+j = 1): a0*b1, a1*b0
155068651Skris	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
155168651Skris	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
155268651Skris	STD       c2,8(r_ptr)        ; r[1] = c2
155368651Skris	COPY      %r0,c2             ; clear c2 for reuse as the third accumulator
155468651Skris	; column 2 (i+j = 2): a2*b0, a1*b1, a0*b2
155568651Skris	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
155668651Skris	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
155768651Skris	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
155868651Skris	STD       c3,16(r_ptr)       ; r[2] = c3
155968651Skris	COPY      %r0,c3             ; clear c3 for reuse as the third accumulator
156068651Skris	; column 3 (i+j = 3): a0*b3, a1*b2, a2*b1, a3*b0
156168651Skris	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
156268651Skris	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
156368651Skris	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
156468651Skris	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
156568651Skris	STD       c1,24(r_ptr)       ; r[3] = c1
156668651Skris	COPY      %r0,c1             ; clear c1 for reuse as the third accumulator
156768651Skris	; column 4 (i+j = 4): a3*b1, a2*b2, a1*b3
156868651Skris	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
156968651Skris	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
157068651Skris	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
157168651Skris	STD       c2,32(r_ptr)       ; r[4] = c2
157268651Skris	COPY      %r0,c2             ; clear c2 for reuse as the third accumulator
157368651Skris	; column 5 (i+j = 5): a2*b3, a3*b2
157468651Skris	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
157568651Skris	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
157668651Skris	STD       c3,40(r_ptr)       ; r[5] = c3
157768651Skris	COPY      %r0,c3             ; clear c3 for reuse as the third accumulator
157868651Skris	; column 6 (i+j = 6): a3*b3; the two remaining words are stored as r[6] and r[7]
157968651Skris	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
158068651Skris	STD       c1,48(r_ptr)       ; r[6] = c1
158168651Skris	STD       c2,56(r_ptr)       ; r[7] = c2
158268651Skris
158368651Skris    .EXIT
158468651Skris    FLDD    -88(%sp),%fr13       ; restore fr13 (saved at 40(%sp) before the 128-byte bump)
158568651Skris    FLDD    -96(%sp),%fr12       ; restore fr12 (saved at 32(%sp) before the 128-byte bump)
158668651Skris    LDD     -104(%sp),%r6        ; restore r6
158768651Skris    LDD     -112(%sp),%r5        ; restore r5
158868651Skris    LDD     -120(%sp),%r4        ; restore r4
158968651Skris    BVE     (%rp)                ; return to caller; the next instruction runs in the delay slot
159068651Skris    LDD,MB  -128(%sp),%r3        ; pop the 128-byte frame (sp -= 128) and restore r3
159168651Skris
159268651Skris	.PROCEND
159368651Skris
159468651Skris
159568651Skris	.SPACE	$TEXT$
159668651Skris	.SUBSPA	$CODE$
159768651Skris	.SPACE	$PRIVATE$,SORT=16
159868651Skris	.IMPORT	$global$,DATA	; import the $global$ data symbol
159968651Skris	.SPACE	$TEXT$
160068651Skris	.SUBSPA	$CODE$
160172613Skris	.SUBSPA	$LIT$,ACCESS=0x2c	; switch to the $LIT$ subspace for the string literal below
160268651SkrisC$4	; label for the literal data that follows; NOTE(review): its user is not visible in this part of the file
160368651Skris	.ALIGN	8	; align to 8 bytes
160468651Skris	.STRINGZ	"Division would overflow (%d)\n"
160568651Skris	.END	; end of the assembly source
1606