168651Skris;
268651Skris; PA-RISC 2.0 implementation of bn_asm code, based on the
368651Skris; 64-bit version of the code.  This code is effectively the
468651Skris; same as the 64-bit version except the register model is
568651Skris; slightly different given all values must be 32-bit between
668651Skris; function calls.  Thus the 64-bit return values are returned
768651Skris; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
868651Skris;
968651Skris;
1068651Skris; This code is approximately 2x faster than the C version
1168651Skris; for RSA/DSA.
1268651Skris;
1368651Skris; See http://devresource.hp.com/  for more details on the PA-RISC
1468651Skris; architecture.  Also see the book "PA-RISC 2.0 Architecture"
1568651Skris; by Gerry Kane for information on the instruction set architecture.
1668651Skris;
1768651Skris; Code written by Chris Ruemmler (with some help from the HP C
1868651Skris; compiler).
1968651Skris;
2068651Skris; The code compiles with HP's assembler
2168651Skris;
2255714Skris
; Assembler setup: PA-RISC 2.0 instruction level ("N" presumably selects the
; narrow/32-bit runtime model described in the header -- confirm against the
; HP assembler manual), code placed in the $CODE$ subspace of $TEXT$.
2368651Skris	.level	2.0N
2468651Skris	.space	$TEXT$
2568651Skris	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
2668651Skris
2768651Skris;
2868651Skris; Global Register definitions used for the routines.
2968651Skris;
3068651Skris; Some information about HP's runtime architecture for 32-bits.
3168651Skris;
3268651Skris; "Caller save" means the calling function must save the register
3368651Skris; if it wants the register to be preserved.
3468651Skris; "Callee save" means if a function uses the register, it must save
3568651Skris; the value before using it.
3668651Skris;
3768651Skris; For the floating point registers
3868651Skris;
3968651Skris;    "caller save" registers: fr4-fr11, fr22-fr31
4068651Skris;    "callee save" registers: fr12-fr21
4168651Skris;    "special" registers: fr0-fr3 (status and exception registers)
4268651Skris;
4368651Skris; For the integer registers
4468651Skris;     value zero             :  r0
4568651Skris;     "caller save" registers: r1,r19-r26
4668651Skris;     "callee save" registers: r3-r18
4768651Skris;     return register        :  r2  (rp)
4868651Skris;     return values          :  r28,r29  (ret0,ret1)
4968651Skris;     Stack pointer          :  r30  (sp)
5068651Skris;     millicode return ptr   :  r31  (also a caller save register)
5168651Skris
5268651Skris
5368651Skris;
5468651Skris; Arguments to the routines
5568651Skris;
5668651Skrisr_ptr       .reg %r26    ; result pointer (rp / r)
5768651Skrisa_ptr       .reg %r25    ; first source pointer (ap / a)
5868651Skrisb_ptr       .reg %r24    ; second source pointer, used by bn_add_words/bn_sub_words
5968651Skrisnum         .reg %r24    ; word count for the mul/sqr routines (same register as b_ptr)
6068651Skrisn           .reg %r23    ; word count for bn_add_words/bn_sub_words
6168651Skris
6268651Skris;
6368651Skris; Note that the "w" argument for bn_mul_add_words and bn_mul_words
6468651Skris; is passed on the stack at a delta of -56 from the top of stack
6568651Skris; as the routine is entered.
6668651Skris;
6768651Skris
6868651Skris;
6968651Skris; Globals used in some routines
7068651Skris;
7168651Skris
7268651Skristop_overflow .reg %r23    ; set to 1 << 32 by the multiply routines (same register as n)
7368651Skrishigh_mask    .reg %r22    ; value 0xffffffff80000000L
7468651Skris
7568651Skris
7668651Skris;------------------------------------------------------------------------------
7768651Skris;
7868651Skris; bn_mul_add_words
7968651Skris;
8068651Skris;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
8168651Skris;								int num, BN_ULONG w)
8268651Skris;
8368651Skris; arg0 = r_ptr
8468651Skris; arg1 = a_ptr
8568651Skris; arg2 = num
8668651Skris; -56(sp) =  w
8768651Skris;
8868651Skris; Local register definitions
8968651Skris;
9068651Skris
; Floating-point registers receiving the 32x32 XMPYU partial products.
; The "_1" names belong to the second word of the 2x unrolled loops.
9168651Skrisfm1          .reg %fr22   ; m1[0] = ht[0] * fw_l
9268651Skrisfm           .reg %fr23   ; m[0]  = lt[0] * fw_h (ht*lt in bn_sqr_words)
9368651Skrisht_temp      .reg %fr24   ; high partial product, word 0
9468651Skrisht_temp_1    .reg %fr25   ; high partial product, word 1
9568651Skrislt_temp      .reg %fr26   ; low partial product, word 0
9668651Skrislt_temp_1    .reg %fr27   ; low partial product, word 1
9768651Skrisfm1_1        .reg %fr28   ; m1[1] = ht[1] * fw_l
9868651Skrisfm_1         .reg %fr29   ; m[1]  = lt[1] * fw_h (ht*lt in bn_sqr_words)
9968651Skris
; The multiplier w: L half = high 32 bits, R half = low 32 bits.
10068651Skrisfw_h         .reg %fr7L
10168651Skrisfw_l         .reg %fr7R
10268651Skrisfw           .reg %fr7
10368651Skris
; Source word 0: ht (L half) / lt (R half).
10468651Skrisfht_0        .reg %fr8L
10568651Skrisflt_0        .reg %fr8R
10668651Skrist_float_0    .reg %fr8
10768651Skris
; Source word 1: ht (L half) / lt (R half).
10868651Skrisfht_1        .reg %fr9L
10968651Skrisflt_1        .reg %fr9R
11068651Skrist_float_1    .reg %fr9
11168651Skris
; General registers used for the carry arithmetic.  %r3-%r9 fall in the
; callee-save range listed in the header, so the routines that use them
; save and restore them.
11268651Skristmp_0        .reg %r31    ; m + m1 sum, word 0
11368651Skristmp_1        .reg %r21    ; m + m1 sum, word 1
11468651Skrism_0          .reg %r20
11568651Skrism_1          .reg %r19
11668651Skrisht_0         .reg %r1
11768651Skrisht_1         .reg %r3
11868651Skrislt_0         .reg %r4
11968651Skrislt_1         .reg %r5
12068651Skrism1_0         .reg %r6
12168651Skrism1_1         .reg %r7
12268651Skrisrp_val       .reg %r8     ; rp[0] loaded for the accumulate step
12368651Skrisrp_val_1     .reg %r9     ; rp[1] loaded for the accumulate step
12468651Skris
; bn_mul_add_words: for each of num 64-bit words compute
;     rp[i] = low64(ap[i]*w + rp[i] + carry),  carry = high part
; The running carry lives in %ret1; on return its upper 32 bits are copied
; into %ret0 and the lower 32 bits remain in %ret1.
;
; Each 64x64 product is built from four 32x32 XMPYU partial products that
; are stored to scratch slots at -64..-8 from the bumped %sp and reloaded
; into general registers.  %r3-%r9 are saved at the bottom of the 128-byte
; frame on entry and restored at exit.
12555714Skrisbn_mul_add_words
12668651Skris	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
12768651Skris	.proc
12868651Skris	.callinfo frame=128
12968651Skris    .entry
13068651Skris	.align 64
13168651Skris
13268651Skris    STD     %r3,0(%sp)          ; save r3
13368651Skris    STD     %r4,8(%sp)          ; save r4
13468651Skris	NOP                         ; Needed to make the loop 16-byte aligned
13568651Skris	NOP                         ; needed to make the loop 16-byte aligned
13668651Skris
13768651Skris    STD     %r5,16(%sp)         ; save r5
13868651Skris	NOP
13968651Skris    STD     %r6,24(%sp)         ; save r6
14068651Skris    STD     %r7,32(%sp)         ; save r7
14168651Skris
14268651Skris    STD     %r8,40(%sp)         ; save r8
14368651Skris    STD     %r9,48(%sp)         ; save r9
14468651Skris    COPY    %r0,%ret1           ; carry = 0 (also the result when num <= 0)
14568651Skris    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
14668651Skris
14768651Skris    CMPIB,>= 0,num,bn_mul_add_words_exit  ; if (num <= 0) then exit
14868651Skris	LDO     128(%sp),%sp        ; bump stack (delay slot: runs on both paths)
14968651Skris
15068651Skris	;
15168651Skris	; The loop is unrolled twice, so if there is only 1 number
15268651Skris    ; then go straight to the cleanup code.
15368651Skris	;
15468651Skris	CMPIB,= 1,num,bn_mul_add_words_single_top
15568651Skris	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l); delay slot, runs on both paths
15668651Skris
15768651Skris	;
15868651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
15968651Skris	;
16068651Skris	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
16168651Skris    ; two 32-bit multiplies can be issued per cycle.
16268651Skris    ;
16368651Skrisbn_mul_add_words_unroll2
16468651Skris
16568651Skris    FLDD    0(a_ptr),t_float_0       ; load up 64-bit value (fr8) ht(L)/lt(R)
16668651Skris    FLDD    8(a_ptr),t_float_1       ; load up 64-bit value (fr9) ht(L)/lt(R)
16768651Skris    LDD     0(r_ptr),rp_val          ; rp[0]
16868651Skris    LDD     8(r_ptr),rp_val_1        ; rp[1]
16968651Skris
17068651Skris    XMPYU   fht_0,fw_l,fm1           ; m1[0] = fht_0*fw_l
17168651Skris    XMPYU   fht_1,fw_l,fm1_1         ; m1[1] = fht_1*fw_l
17268651Skris    FSTD    fm1,-16(%sp)             ; -16(sp) = m1[0]
17368651Skris    FSTD    fm1_1,-48(%sp)           ; -48(sp) = m1[1]
17468651Skris
17568651Skris    XMPYU   flt_0,fw_h,fm            ; m[0] = flt_0*fw_h
17668651Skris    XMPYU   flt_1,fw_h,fm_1          ; m[1] = flt_1*fw_h
17768651Skris    FSTD    fm,-8(%sp)               ; -8(sp) = m[0]
17868651Skris    FSTD    fm_1,-40(%sp)            ; -40(sp) = m[1]
17968651Skris
18068651Skris    XMPYU   fht_0,fw_h,ht_temp       ; ht_temp   = fht_0*fw_h
18168651Skris    XMPYU   fht_1,fw_h,ht_temp_1     ; ht_temp_1 = fht_1*fw_h
18268651Skris    FSTD    ht_temp,-24(%sp)         ; -24(sp)   = ht_temp
18368651Skris    FSTD    ht_temp_1,-56(%sp)       ; -56(sp)   = ht_temp_1
18468651Skris
18568651Skris    XMPYU   flt_0,fw_l,lt_temp       ; lt_temp   = flt_0*fw_l
18668651Skris    XMPYU   flt_1,fw_l,lt_temp_1     ; lt_temp_1 = flt_1*fw_l
18768651Skris    FSTD    lt_temp,-32(%sp)         ; -32(sp) = lt_temp
18868651Skris    FSTD    lt_temp_1,-64(%sp)       ; -64(sp) = lt_temp_1
18968651Skris
19068651Skris    LDD     -8(%sp),m_0              ; m[0]
19168651Skris    LDD     -40(%sp),m_1             ; m[1]
19268651Skris    LDD     -16(%sp),m1_0            ; m1[0]
19368651Skris    LDD     -48(%sp),m1_1            ; m1[1]
19468651Skris
19568651Skris    LDD     -24(%sp),ht_0            ; ht[0]
19668651Skris    LDD     -56(%sp),ht_1            ; ht[1]
19768651Skris    ADD,L   m1_0,m_0,tmp_0           ; tmp_0 = m[0] + m1[0];
19868651Skris    ADD,L   m1_1,m_1,tmp_1           ; tmp_1 = m[1] + m1[1];
19968651Skris
20068651Skris    LDD     -32(%sp),lt_0            ; lt[0]
20168651Skris    LDD     -64(%sp),lt_1            ; lt[1]
20268651Skris    CMPCLR,*>>= tmp_0,m1_0, %r0      ; skip next unless tmp_0 < m1[0] (the sum wrapped)
20368651Skris    ADD,L   ht_0,top_overflow,ht_0   ; ht[0] += (1<<32)
20468651Skris
20568651Skris    CMPCLR,*>>= tmp_1,m1_1,%r0       ; skip next unless tmp_1 < m1[1] (the sum wrapped)
20668651Skris    ADD,L   ht_1,top_overflow,ht_1   ; ht[1] += (1<<32)
20768651Skris    EXTRD,U tmp_0,31,32,m_0          ; m[0]>>32
20868651Skris    DEPD,Z  tmp_0,31,32,m1_0         ; m1[0] = m[0]<<32
20968651Skris
21068651Skris    EXTRD,U tmp_1,31,32,m_1          ; m[1]>>32
21168651Skris    DEPD,Z  tmp_1,31,32,m1_1         ; m1[1] = m[1]<<32
21268651Skris    ADD,L   ht_0,m_0,ht_0            ; ht[0]+= (m[0]>>32)
21368651Skris    ADD,L   ht_1,m_1,ht_1            ; ht[1]+= (m[1]>>32)
21468651Skris
21568651Skris    ADD     lt_0,m1_0,lt_0           ; lt[0] = lt[0]+m1[0];
21668651Skris	ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry out of the add above
21768651Skris    ADD     lt_1,m1_1,lt_1           ; lt[1] = lt[1]+m1[1];
21868651Skris    ADD,DC  ht_1,%r0,ht_1            ; ht[1] += carry out of the add above
21968651Skris
22068651Skris    ADD    %ret1,lt_0,lt_0           ; lt[0] = lt[0] + c;
22168651Skris	ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry
22268651Skris    ADD     lt_0,rp_val,lt_0         ; lt[0] = lt[0]+rp[0]
22368651Skris    ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry
22468651Skris
22568651Skris	LDO    -2(num),num               ; num = num - 2;
22668651Skris    ADD     ht_0,lt_1,lt_1           ; lt[1] = lt[1] + ht_0 (c);
22768651Skris    ADD,DC  ht_1,%r0,ht_1            ; ht[1] += carry
22868651Skris    STD     lt_0,0(r_ptr)            ; rp[0] = lt[0]
22968651Skris
23068651Skris    ADD     lt_1,rp_val_1,lt_1       ; lt[1] = lt[1]+rp[1]
23168651Skris    ADD,DC  ht_1,%r0,%ret1           ; c = ht[1] + carry (carry for the next word)
23268651Skris    LDO     16(a_ptr),a_ptr          ; a_ptr += 2
23368651Skris
23468651Skris    STD     lt_1,8(r_ptr)            ; rp[1] = lt[1]
23568651Skris	CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if at least 2 words remain
23668651Skris    LDO     16(r_ptr),r_ptr          ; r_ptr += 2 (delay slot)
23768651Skris
23868651Skris    CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
23968651Skris
24068651Skris	;
24168651Skris	; Single-word path: reached directly when num == 1, or by falling
24268651Skris	; through from the unrolled loop when one word is left over.
24368651Skrisbn_mul_add_words_single_top
24468651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
24568651Skris    LDD     0(r_ptr),rp_val           ; rp[0]
24668651Skris    LDO     8(a_ptr),a_ptr            ; a_ptr++
24768651Skris    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
24868651Skris    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
24968651Skris    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
25068651Skris    FSTD    fm,-8(%sp)                ; -8(sp) = m
25168651Skris    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
25268651Skris    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
25368651Skris    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
25468651Skris    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
25568651Skris
25668651Skris    LDD     -8(%sp),m_0               ; m
25768651Skris    LDD    -16(%sp),m1_0              ; m1 = temp1
25868651Skris    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
25968651Skris    LDD     -24(%sp),ht_0             ; ht
26068651Skris    LDD     -32(%sp),lt_0             ; lt
26168651Skris
26268651Skris    CMPCLR,*>>= tmp_0,m1_0,%r0        ; skip next unless tmp_0 < m1 (the sum wrapped)
26368651Skris    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
26468651Skris
26568651Skris    EXTRD,U tmp_0,31,32,m_0           ; m>>32
26668651Skris    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
26768651Skris
26868651Skris    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
26968651Skris    ADD     lt_0,m1_0,tmp_0           ; tmp_0 = lt+m1;
27068651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
27168651Skris    ADD     %ret1,tmp_0,lt_0          ; lt = lt + c;
27268651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
27368651Skris    ADD     lt_0,rp_val,lt_0          ; lt = lt+rp[0]
27468651Skris    ADD,DC  ht_0,%r0,%ret1            ; c = ht + carry (final carry)
27568651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
27668651Skris
27768651Skrisbn_mul_add_words_exit
27868651Skris    .EXIT
27968651Skris
28068651Skris    EXTRD,U %ret1,31,32,%ret0         ; for 32-bit, return in ret0/ret1 (ret0 = upper 32 bits of carry)
28168651Skris    LDD     -80(%sp),%r9              ; restore r9
28268651Skris    LDD     -88(%sp),%r8              ; restore r8
28368651Skris    LDD     -96(%sp),%r7              ; restore r7
28468651Skris    LDD     -104(%sp),%r6             ; restore r6
28568651Skris    LDD     -112(%sp),%r5             ; restore r5
28668651Skris    LDD     -120(%sp),%r4             ; restore r4
28768651Skris    BVE     (%rp)
28868651Skris    LDD,MB  -128(%sp),%r3             ; restore r3 and pop the 128-byte frame (delay slot)
28968651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
29068651Skris
29168651Skris;----------------------------------------------------------------------------
29268651Skris;
29368651Skris;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
29468651Skris;
29568651Skris; arg0 = rp
29668651Skris; arg1 = ap
29768651Skris; arg2 = num
29868651Skris; w on stack at -56(sp)
29968651Skris
; bn_mul_words: for each of num 64-bit words compute
;     rp[i] = low64(ap[i]*w + carry),  carry = high part
; The running carry lives in %ret1; on return its upper 32 bits are copied
; into %ret0 and the lower 32 bits remain in %ret1.
;
; Same partial-product scheme as bn_mul_add_words, without reading rp[],
; so only %r3-%r7 are saved and restored.
30055714Skrisbn_mul_words
30168651Skris	.proc
30268651Skris	.callinfo frame=128
30368651Skris    .entry
30468651Skris	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
30568651Skris	.align 64
30668651Skris
30768651Skris    STD     %r3,0(%sp)          ; save r3
30868651Skris    STD     %r4,8(%sp)          ; save r4
30968651Skris	NOP
31068651Skris    STD     %r5,16(%sp)         ; save r5
31168651Skris
31268651Skris    STD     %r6,24(%sp)         ; save r6
31368651Skris    STD     %r7,32(%sp)         ; save r7
31468651Skris    COPY    %r0,%ret1           ; carry = 0 (also the result when num <= 0)
31568651Skris    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
31668651Skris
31768651Skris    CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
31868651Skris	LDO     128(%sp),%sp    ; bump stack (delay slot: runs on both paths)
31968651Skris
32068651Skris	;
32168651Skris	; See if only 1 word to do, thus just do cleanup
32268651Skris	;
32368651Skris	CMPIB,= 1,num,bn_mul_words_single_top
32468651Skris	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l); delay slot, runs on both paths
32568651Skris
32668651Skris	;
32768651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
32868651Skris	;
32968651Skris	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
33068651Skris    ; two 32-bit multiplies can be issued per cycle.
33168651Skris    ;
33268651Skrisbn_mul_words_unroll2
33368651Skris
33468651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
33568651Skris    FLDD    8(a_ptr),t_float_1        ; load up 64-bit value (fr9) ht(L)/lt(R)
33668651Skris    XMPYU   fht_0,fw_l,fm1            ; m1[0] = fht_0*fw_l
33768651Skris    XMPYU   fht_1,fw_l,fm1_1          ; m1[1] = fht_1*fw_l
33868651Skris
33968651Skris    FSTD    fm1,-16(%sp)              ; -16(sp) = m1[0]
34068651Skris    FSTD    fm1_1,-48(%sp)            ; -48(sp) = m1[1]
34168651Skris    XMPYU   flt_0,fw_h,fm             ; m[0] = flt_0*fw_h
34268651Skris    XMPYU   flt_1,fw_h,fm_1           ; m[1] = flt_1*fw_h
34368651Skris
34468651Skris    FSTD    fm,-8(%sp)                ; -8(sp) = m[0]
34568651Skris    FSTD    fm_1,-40(%sp)             ; -40(sp) = m[1]
34668651Skris    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp   = fht_0*fw_h
34768651Skris    XMPYU   fht_1,fw_h,ht_temp_1      ; ht_temp_1 = fht_1*fw_h
34868651Skris
34968651Skris    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht[0]
35068651Skris    FSTD    ht_temp_1,-56(%sp)        ; -56(sp) = ht[1]
35168651Skris    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp   = flt_0*fw_l
35268651Skris    XMPYU   flt_1,fw_l,lt_temp_1      ; lt_temp_1 = flt_1*fw_l
35368651Skris
35468651Skris    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt[0]
35568651Skris    FSTD    lt_temp_1,-64(%sp)        ; -64(sp) = lt[1]
35668651Skris    LDD     -8(%sp),m_0               ; m[0]
35768651Skris    LDD     -40(%sp),m_1              ; m[1]
35868651Skris
35968651Skris    LDD    -16(%sp),m1_0              ; m1[0]
36068651Skris    LDD    -48(%sp),m1_1              ; m1[1]
36168651Skris    LDD     -24(%sp),ht_0             ; ht[0]
36268651Skris    LDD     -56(%sp),ht_1             ; ht[1]
36368651Skris
36468651Skris    ADD,L   m1_0,m_0,tmp_0            ; tmp_0 = m[0] + m1[0];
36568651Skris    ADD,L   m1_1,m_1,tmp_1            ; tmp_1 = m[1] + m1[1];
36668651Skris    LDD     -32(%sp),lt_0             ; lt[0]
36768651Skris    LDD     -64(%sp),lt_1             ; lt[1]
36868651Skris
36968651Skris    CMPCLR,*>>= tmp_0,m1_0, %r0       ; skip next unless tmp_0 < m1[0] (the sum wrapped)
37068651Skris    ADD,L   ht_0,top_overflow,ht_0    ; ht[0] += (1<<32)
37168651Skris    CMPCLR,*>>= tmp_1,m1_1,%r0        ; skip next unless tmp_1 < m1[1] (the sum wrapped)
37268651Skris    ADD,L   ht_1,top_overflow,ht_1    ; ht[1] += (1<<32)
37368651Skris
37468651Skris    EXTRD,U tmp_0,31,32,m_0           ; m[0]>>32
37568651Skris    DEPD,Z  tmp_0,31,32,m1_0          ; m1[0] = m[0]<<32
37668651Skris    EXTRD,U tmp_1,31,32,m_1           ; m[1]>>32
37768651Skris    DEPD,Z  tmp_1,31,32,m1_1          ; m1[1] = m[1]<<32
37868651Skris
37968651Skris    ADD,L   ht_0,m_0,ht_0             ; ht[0]+= (m[0]>>32)
38068651Skris    ADD,L   ht_1,m_1,ht_1             ; ht[1]+= (m[1]>>32)
38168651Skris    ADD     lt_0,m1_0,lt_0            ; lt[0] = lt[0]+m1[0];
38268651Skris	ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
38368651Skris
38468651Skris    ADD     lt_1,m1_1,lt_1            ; lt[1] = lt[1]+m1[1];
38568651Skris    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
38668651Skris    ADD    %ret1,lt_0,lt_0            ; lt[0] = lt[0] + c (ret1);
38768651Skris	ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
38868651Skris
38968651Skris    ADD     ht_0,lt_1,lt_1            ; lt[1] = lt[1] + c (ht_0)
39068651Skris    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
39168651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
39268651Skris    STD     lt_1,8(r_ptr)             ; rp[1] = lt[1]
39368651Skris
39468651Skris	COPY    ht_1,%ret1                ; carry = ht[1]
39568651Skris	LDO    -2(num),num                ; num = num - 2;
39668651Skris    LDO     16(a_ptr),a_ptr           ; ap += 2
39768651Skris	CMPIB,<= 2,num,bn_mul_words_unroll2 ; go again if at least 2 words remain
39868651Skris    LDO     16(r_ptr),r_ptr           ; rp += 2 (delay slot)
39968651Skris
40068651Skris    CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
40168651Skris
40268651Skris	;
40368651Skris	; Single-word path: reached directly when num == 1, or by falling
40468651Skris	; through from the unrolled loop when one word is left over.
40568651Skrisbn_mul_words_single_top
40668651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
40768651Skris
40868651Skris    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
40968651Skris    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
41068651Skris    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
41168651Skris    FSTD    fm,-8(%sp)                ; -8(sp) = m
41268651Skris    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
41368651Skris    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
41468651Skris    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
41568651Skris    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
41668651Skris
41768651Skris    LDD     -8(%sp),m_0               ; m
41868651Skris    LDD    -16(%sp),m1_0              ; m1
41968651Skris    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
42068651Skris    LDD     -24(%sp),ht_0             ; ht
42168651Skris    LDD     -32(%sp),lt_0             ; lt
42268651Skris
42368651Skris    CMPCLR,*>>= tmp_0,m1_0,%r0        ; skip next unless tmp_0 < m1 (the sum wrapped)
42468651Skris    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
42568651Skris
42668651Skris    EXTRD,U tmp_0,31,32,m_0           ; m>>32
42768651Skris    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
42868651Skris
42968651Skris    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
43068651Skris    ADD     lt_0,m1_0,lt_0            ; lt= lt+m1;
43168651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
43268651Skris
43368651Skris    ADD     %ret1,lt_0,lt_0           ; lt = lt + c;
43468651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
43568651Skris
43668651Skris    COPY    ht_0,%ret1                ; copy carry
43768651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
43868651Skris
43968651Skrisbn_mul_words_exit
44068651Skris    .EXIT
44168651Skris    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1 (ret0 = upper 32 bits of carry)
44268651Skris    LDD     -96(%sp),%r7              ; restore r7
44368651Skris    LDD     -104(%sp),%r6             ; restore r6
44468651Skris    LDD     -112(%sp),%r5             ; restore r5
44568651Skris    LDD     -120(%sp),%r4             ; restore r4
44668651Skris    BVE     (%rp)
44768651Skris    LDD,MB  -128(%sp),%r3             ; restore r3 and pop the 128-byte frame (delay slot)
44868651Skris	.PROCEND
44968651Skris
45068651Skris;----------------------------------------------------------------------------
45168651Skris;
45268651Skris;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
45368651Skris;
45468651Skris; arg0 = rp
45568651Skris; arg1 = ap
45668651Skris; arg2 = num
45768651Skris;
45868651Skris
; bn_sqr_words: for each of num 64-bit words a = ap[i], store the 128-bit
; square as rp[2i] = low 64 bits, rp[2i+1] = high 64 bits.
;
; With a = ht*2^32 + lt, the square is ht*ht*2^64 + 2*(ht*lt)*2^32 + lt*lt.
; The cross term m = ht*lt is split: (m << 33) is added to the low word and
; the top 33 bits of m, shifted right by 31, are added to the high word.
; Only %r3-%r5 are saved and restored; nothing is returned.
45955714Skrisbn_sqr_words
46068651Skris	.proc
46168651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
46268651Skris	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
46368651Skris    .entry
46468651Skris	.align 64
46568651Skris
46668651Skris    STD     %r3,0(%sp)          ; save r3
46768651Skris    STD     %r4,8(%sp)          ; save r4
46868651Skris	NOP
46968651Skris    STD     %r5,16(%sp)         ; save r5
47068651Skris
47168651Skris    CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
47268651Skris	LDO     128(%sp),%sp       ; bump stack (delay slot: runs on both paths)
47368651Skris
47468651Skris	;
47568651Skris	; If only 1, then go straight to cleanup
47668651Skris	;
47768651Skris	CMPIB,= 1,num,bn_sqr_words_single_top
47868651Skris    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L (delay slot, runs on both paths)
47968651Skris
48068651Skris	;
48168651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
48268651Skris	;
48368651Skris
48468651Skrisbn_sqr_words_unroll2
48568651Skris    FLDD    0(a_ptr),t_float_0        ; a[0]
48668651Skris    FLDD    8(a_ptr),t_float_1        ; a[1]
48768651Skris    XMPYU   fht_0,flt_0,fm            ; m[0] = ht[0]*lt[0]
48868651Skris    XMPYU   fht_1,flt_1,fm_1          ; m[1] = ht[1]*lt[1]
48968651Skris
49068651Skris    FSTD    fm,-24(%sp)               ; store m[0]
49168651Skris    FSTD    fm_1,-56(%sp)             ; store m[1]
49268651Skris    XMPYU   flt_0,flt_0,lt_temp       ; lt[0] = lt[0]*lt[0]
49368651Skris    XMPYU   flt_1,flt_1,lt_temp_1     ; lt[1] = lt[1]*lt[1]
49468651Skris
49568651Skris    FSTD    lt_temp,-16(%sp)          ; store lt[0]
49668651Skris    FSTD    lt_temp_1,-48(%sp)        ; store lt[1]
49768651Skris    XMPYU   fht_0,fht_0,ht_temp       ; ht[0] = ht[0]*ht[0]
49868651Skris    XMPYU   fht_1,fht_1,ht_temp_1     ; ht[1] = ht[1]*ht[1]
49968651Skris
50068651Skris    FSTD    ht_temp,-8(%sp)           ; store ht[0]
50168651Skris    FSTD    ht_temp_1,-40(%sp)        ; store ht[1]
50268651Skris    LDD     -24(%sp),m_0              ; m[0]
50368651Skris    LDD     -56(%sp),m_1              ; m[1]
50468651Skris
50568651Skris    AND     m_0,high_mask,tmp_0       ; m[0] & Mask
50668651Skris    AND     m_1,high_mask,tmp_1       ; m[1] & Mask
50768651Skris    DEPD,Z  m_0,30,31,m_0             ; m[0] << 32+1
50868651Skris    DEPD,Z  m_1,30,31,m_1             ; m[1] << 32+1
50968651Skris
51068651Skris    LDD     -16(%sp),lt_0             ; lt[0]
51168651Skris    LDD     -48(%sp),lt_1             ; lt[1]
51268651Skris    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m[0]&Mask >> 32-1
51368651Skris    EXTRD,U tmp_1,32,33,tmp_1         ; tmp_1 = m[1]&Mask >> 32-1
51468651Skris
51568651Skris    LDD     -8(%sp),ht_0              ; ht[0]
51668651Skris    LDD     -40(%sp),ht_1             ; ht[1]
51768651Skris    ADD,L   ht_0,tmp_0,ht_0           ; ht[0] += tmp_0
51868651Skris    ADD,L   ht_1,tmp_1,ht_1           ; ht[1] += tmp_1
51968651Skris
52068651Skris    ADD     lt_0,m_0,lt_0             ; lt[0] = lt[0]+m[0]
52168651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
52268651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
52368651Skris    STD     ht_0,8(r_ptr)             ; rp[1] = ht[0]
52468651Skris
52568651Skris    ADD     lt_1,m_1,lt_1             ; lt[1] = lt[1]+m[1]
52668651Skris    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
52768651Skris    STD     lt_1,16(r_ptr)            ; rp[2] = lt[1]
52868651Skris    STD     ht_1,24(r_ptr)            ; rp[3] = ht[1]
52968651Skris
53068651Skris	LDO    -2(num),num                ; num = num - 2;
53168651Skris    LDO     16(a_ptr),a_ptr           ; ap += 2
53268651Skris	CMPIB,<= 2,num,bn_sqr_words_unroll2 ; go again if at least 2 words remain
53368651Skris    LDO     32(r_ptr),r_ptr           ; rp += 4 (delay slot)
53468651Skris
53568651Skris    CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
53668651Skris
53768651Skris	;
53868651Skris	; Single-word path: reached directly when num == 1, or by falling
53968651Skris	; through from the unrolled loop when one word is left over.
54068651Skrisbn_sqr_words_single_top
54168651Skris    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
54268651Skris
54368651Skris    XMPYU   fht_0,flt_0,fm            ; m = ht*lt
54468651Skris    FSTD    fm,-24(%sp)               ; store m
54568651Skris
54668651Skris    XMPYU   flt_0,flt_0,lt_temp       ; lt = lt*lt
54768651Skris    FSTD    lt_temp,-16(%sp)          ; store lt
54868651Skris
54968651Skris    XMPYU   fht_0,fht_0,ht_temp       ; ht = ht*ht
55068651Skris    FSTD    ht_temp,-8(%sp)           ; store ht
55168651Skris
55268651Skris    LDD     -24(%sp),m_0              ; load m
55368651Skris    AND     m_0,high_mask,tmp_0       ; m & Mask
55468651Skris    DEPD,Z  m_0,30,31,m_0             ; m << 32+1
55568651Skris    LDD     -16(%sp),lt_0             ; lt
55668651Skris
55768651Skris    LDD     -8(%sp),ht_0              ; ht
55868651Skris    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m&Mask >> 32-1
55968651Skris    ADD     m_0,lt_0,lt_0             ; lt = lt+m
56068651Skris    ADD,L   ht_0,tmp_0,ht_0           ; ht += tmp_0 (sits between the ADD and its ADD,DC)
56168651Skris    ADD,DC  ht_0,%r0,ht_0             ; ht += carry from lt+m
56268651Skris
56368651Skris    STD     lt_0,0(r_ptr)             ; rp[0] = lt
56468651Skris    STD     ht_0,8(r_ptr)             ; rp[1] = ht
56568651Skris
56668651Skrisbn_sqr_words_exit
56768651Skris    .EXIT
56868651Skris    LDD     -112(%sp),%r5       ; restore r5
56968651Skris    LDD     -120(%sp),%r4       ; restore r4
57068651Skris    BVE     (%rp)
57168651Skris    LDD,MB  -128(%sp),%r3       ; restore r3 and pop the 128-byte frame (delay slot)
57268651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
57368651Skris
57468651Skris
57568651Skris;----------------------------------------------------------------------------
57668651Skris;
57768651Skris;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
57868651Skris;
57968651Skris; arg0 = rp
58068651Skris; arg1 = ap
58168651Skris; arg2 = bp
58268651Skris; arg3 = n
58368651Skris
58468651Skrist  .reg %r22    ; a[i], then a[i] + carry-in
58568651Skrisb  .reg %r21    ; b[i]
58668651Skrisl  .reg %r20    ; sum word stored to r[i]
58768651Skris
; bn_add_words: r[i] = a[i] + b[i] + carry for n 64-bit words.
; The carry is kept in %ret1 between words; each word does two adds
; (a + c, then + b) and collects the carry-out of both with ADD,DC.
; On return %ret0 receives the upper 32 bits of %ret1, and %ret1 keeps
; the lower 32 bits.  No stack frame is used and only caller-save
; registers are touched.
58868651Skrisbn_add_words
58968651Skris	.proc
59068651Skris    .entry
59168651Skris	.callinfo
59268651Skris	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
59368651Skris	.align 64
59468651Skris
59568651Skris    CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
59668651Skris    COPY    %r0,%ret1           ; carry = 0 (delay slot: also the result when n <= 0)
59768651Skris
59868651Skris	;
59968651Skris	; If 2 or more numbers do the loop
60068651Skris	;
60168651Skris	CMPIB,= 1,n,bn_add_words_single_top
60268651Skris	NOP
60368651Skris
60468651Skris	;
60568651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
60668651Skris	;
60768651Skrisbn_add_words_unroll2
60868651Skris	LDD     0(a_ptr),t                   ; t = a[0]
60968651Skris	LDD     0(b_ptr),b                   ; b = b[0]
61068651Skris	ADD     t,%ret1,t                    ; t = t+c;
61168651Skris	ADD,DC  %r0,%r0,%ret1                ; set c to carry
61268651Skris	ADD     t,b,l                        ; l = t + b[0]
61368651Skris	ADD,DC  %ret1,%r0,%ret1              ; c+= carry
61468651Skris	STD     l,0(r_ptr)                   ; r[0] = l
61568651Skris
61668651Skris	LDD     8(a_ptr),t                    ; t = a[1]
61768651Skris	LDD     8(b_ptr),b                    ; b = b[1]
61868651Skris	ADD     t,%ret1,t                     ; t = t+c;
61968651Skris	ADD,DC  %r0,%r0,%ret1                 ; set c to carry
62068651Skris	ADD     t,b,l                         ; l = t + b[1]
62168651Skris	ADD,DC  %ret1,%r0,%ret1               ; c+= carry
62268651Skris	STD     l,8(r_ptr)                    ; r[1] = l
62368651Skris
62468651Skris	LDO     -2(n),n                       ; n = n - 2
62568651Skris	LDO     16(a_ptr),a_ptr               ; a += 2
62668651Skris	LDO     16(b_ptr),b_ptr               ; b += 2
62768651Skris
62868651Skris	CMPIB,<= 2,n,bn_add_words_unroll2     ; go again if at least 2 words remain
62968651Skris	LDO     16(r_ptr),r_ptr               ; r += 2 (delay slot)
63068651Skris
63168651Skris    CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
63268651Skris
; Single-word path: n == 1 on entry, or one word left after the loop.
63368651Skrisbn_add_words_single_top
63468651Skris	LDD     0(a_ptr),t                ; t = a[0]
63568651Skris	LDD     0(b_ptr),b                ; b = b[0]
63668651Skris
63768651Skris	ADD     t,%ret1,t                 ; t = t+c;
63868651Skris	ADD,DC  %r0,%r0,%ret1             ; set c to carry (could use CMPCLR??)
63968651Skris	ADD     t,b,l                     ; l = t + b[0]
64068651Skris	ADD,DC  %ret1,%r0,%ret1           ; c+= carry
64168651Skris	STD     l,0(r_ptr)                ; r[0] = l
64268651Skris
64368651Skrisbn_add_words_exit
64468651Skris    .EXIT
64568651Skris    BVE     (%rp)
64668651Skris    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1 (delay slot)
64768651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
64868651Skris
64968651Skris;----------------------------------------------------------------------------
65068651Skris;
65168651Skris;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
65268651Skris;
65368651Skris; arg0 = rp
65468651Skris; arg1 = ap
65568651Skris; arg2 = bp
65668651Skris; arg3 = n
65768651Skris
65868651Skrist1       .reg %r22    ; a[i]
65968651Skrist2       .reg %r21    ; b[i]
66068651Skrissub_tmp1 .reg %r20    ; difference word stored to r[i]
66168651Skrissub_tmp2 .reg %r19    ; candidate borrow: 0 if t1 > t2, else 1
66268651Skris
66368651Skris
; bn_sub_words: r[i] = a[i] - b[i] - borrow for n 64-bit words.
; The borrow is kept in %ret1.  After each word it becomes (a[i] < b[i])
; when a[i] != b[i] and is left unchanged when a[i] == b[i].
; On return %ret0 receives the upper 32 bits of %ret1, and %ret1 keeps
; the lower 32 bits.  No stack frame is used and only caller-save
; registers are touched.
66468651Skrisbn_sub_words
66568651Skris	.proc
66668651Skris	.callinfo
66768651Skris	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
66868651Skris    .entry
66968651Skris	.align 64
67068651Skris
67168651Skris    CMPIB,>=  0,n,bn_sub_words_exit ; if (n <= 0) then exit
67268651Skris    COPY    %r0,%ret1           ; borrow = 0 (delay slot: also the result when n <= 0)
67368651Skris
67468651Skris	;
67568651Skris	; If 2 or more numbers do the loop
67668651Skris	;
67768651Skris	CMPIB,= 1,n,bn_sub_words_single_top
67868651Skris	NOP
67968651Skris
68068651Skris	;
68168651Skris	; This loop is unrolled 2 times (64-byte aligned as well)
68268651Skris	;
68368651Skrisbn_sub_words_unroll2
68468651Skris	LDD     0(a_ptr),t1              ; t1 = a[0]
68568651Skris	LDD     0(b_ptr),t2              ; t2 = b[0]
68668651Skris	SUB     t1,t2,sub_tmp1           ; t3 = t1-t2;
68768651Skris	SUB     sub_tmp1,%ret1,sub_tmp1  ; t3 = t3- c;
68868651Skris
68968651Skris	CMPCLR,*>> t1,t2,sub_tmp2        ; sub_tmp2 = 0; skip next if t1 > t2
69068651Skris	LDO      1(%r0),sub_tmp2         ; sub_tmp2 = 1 (t1 <= t2)
69168651Skris
69268651Skris	CMPCLR,*= t1,t2,%r0              ; skip next if t1 == t2 (borrow unchanged)
69368651Skris	COPY    sub_tmp2,%ret1           ; c = (t1 < t2)
69468651Skris	STD     sub_tmp1,0(r_ptr)        ; r[0] = t3
69568651Skris
69668651Skris	LDD     8(a_ptr),t1               ; t1 = a[1]
69768651Skris	LDD     8(b_ptr),t2               ; t2 = b[1]
69868651Skris	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
69968651Skris	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
70068651Skris	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
70168651Skris	LDO      1(%r0),sub_tmp2          ; sub_tmp2 = 1 (t1 <= t2)
70268651Skris
70368651Skris	CMPCLR,*= t1,t2,%r0               ; skip next if t1 == t2 (borrow unchanged)
70468651Skris	COPY    sub_tmp2,%ret1            ; c = (t1 < t2)
70568651Skris	STD     sub_tmp1,8(r_ptr)         ; r[1] = t3
70668651Skris
70768651Skris	LDO     -2(n),n                   ; n = n - 2
70868651Skris	LDO     16(a_ptr),a_ptr           ; a += 2
70968651Skris	LDO     16(b_ptr),b_ptr           ; b += 2
71068651Skris
71168651Skris	CMPIB,<= 2,n,bn_sub_words_unroll2 ; go again if at least 2 words remain
71268651Skris	LDO     16(r_ptr),r_ptr           ; r += 2 (delay slot)
71368651Skris
71468651Skris    CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
71568651Skris
; Single-word path: n == 1 on entry, or one word left after the loop.
71668651Skrisbn_sub_words_single_top
71768651Skris	LDD     0(a_ptr),t1               ; t1 = a[0]
71868651Skris	LDD     0(b_ptr),t2               ; t2 = b[0]
71968651Skris	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
72068651Skris	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
72168651Skris	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
72268651Skris	LDO      1(%r0),sub_tmp2          ; sub_tmp2 = 1 (t1 <= t2)
72368651Skris
72468651Skris	CMPCLR,*= t1,t2,%r0               ; skip next if t1 == t2 (borrow unchanged)
72568651Skris	COPY    sub_tmp2,%ret1            ; c = (t1 < t2)
72668651Skris
72768651Skris	STD     sub_tmp1,0(r_ptr)         ; r[0] = t3
72868651Skris
72968651Skrisbn_sub_words_exit
73068651Skris    .EXIT
73168651Skris    BVE     (%rp)
73268651Skris    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1 (delay slot)
73368651Skris	.PROCEND	;in=23,24,25,26,29;out=28;
73468651Skris
73568651Skris;------------------------------------------------------------------------------
73668651Skris;
73768651Skris; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
73868651Skris;
73968651Skris; arg0 = h
74068651Skris; arg1 = l
74168651Skris; arg2 = d
74268651Skris;
74368651Skris; This is mainly just output from the HP C compiler.
;
; How the code below actually picks up its 64-bit operands:
;   h = %r25 (upper word) : %r26 (lower word)
;   l = %r23 (upper word) : %r24 (lower word)
;   d = doubleword at -56(%sp) on entry (read as -248(%r30) once the
;       192-byte frame has been pushed)
; The 64-bit result is built in %r29 and its upper word is copied to %r28.
;
; Outline (this appears to follow the portable C bn_div_words; compare
; against that before changing anything here):
;   - d == 0 returns all ones.
;   - i = BN_num_bits_word(d); abort() when i != 64 and h > (1 << i).
;   - if h >= d then h -= d; d and h:l are shifted left by 64-i.
;   - two passes, each producing one 32-bit quotient digit q: an estimate
;     from h / dh ($$div2U millicode) followed by a correction loop.
;
; %r2 (rp) is reused to hold dh inside the loop, which is why rp is
; reloaded from the stack on every exit path.
74468651Skris;
74568651Skris;------------------------------------------------------------------------------
74668651Skrisbn_div_words
74755714Skris	.PROC
74868651Skris	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
74968651Skris	.IMPORT	BN_num_bits_word,CODE
750111147Snectar	;--- not PIC	.IMPORT	__iob,DATA
751111147Snectar	;--- not PIC	.IMPORT	fprintf,CODE
75268651Skris	.IMPORT	abort,CODE
75368651Skris	.IMPORT	$$div2U,MILLICODE
75468651Skris	.CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
75568651Skris        .ENTRY
75668651Skris        STW     %r2,-20(%r30)   ;offset 0x8ec  save rp
75768651Skris        STW,MA  %r3,192(%r30)   ;offset 0x8f0  save r3, then sp += 192
75868651Skris        STW     %r4,-188(%r30)  ;offset 0x8f4  save r4
75968651Skris        DEPD    %r5,31,32,%r6   ;offset 0x8f8  low word of r5 -> upper half of r6
76068651Skris        STD     %r6,-184(%r30)  ;offset 0x8fc  save r5 and r6 as one doubleword
76168651Skris        DEPD    %r7,31,32,%r8   ;offset 0x900  low word of r7 -> upper half of r8
76268651Skris        STD     %r8,-176(%r30)  ;offset 0x904  save r7 and r8 as one doubleword
76368651Skris        STW     %r9,-168(%r30)  ;offset 0x908  save r9
76468651Skris        LDD     -248(%r30),%r3  ;offset 0x90c  r3 = d
76568651Skris        COPY    %r26,%r4        ;offset 0x910  r4 = lower word of h
76668651Skris        COPY    %r24,%r5        ;offset 0x914  r5 = lower word of l
76768651Skris        DEPD    %r25,31,32,%r4  ;offset 0x918  r4 = h (upper word from r25)
76868651Skris        CMPB,*<>        %r3,%r0,$0006000C       ;offset 0x91c  d != 0: main path
76968651Skris        DEPD    %r23,31,32,%r5  ;offset 0x920  (delay slot) r5 = l (upper word from r23)
77068651Skris        MOVIB,TR        -1,%r29,$00060002       ;offset 0x924  d == 0: result = -1, exit
77168651Skris        EXTRD,U %r29,31,32,%r28 ;offset 0x928  (delay slot) r28 = upper word of result
77268651Skris$0006002A	; correction step: the estimate q was too large
77368651Skris        LDO     -1(%r29),%r29   ;offset 0x92c  q--
77468651Skris        SUB     %r23,%r7,%r23   ;offset 0x930  tl -= dl
77568651Skris$00060024	; correction loop test
77668651Skris        SUB     %r4,%r31,%r25   ;offset 0x934  t = h - th
77768651Skris        AND     %r25,%r19,%r26  ;offset 0x938  upper word of t
77868651Skris        CMPB,*<>,N      %r0,%r26,$00060046      ;offset 0x93c  nonzero: q is settled
77968651Skris        DEPD,Z  %r25,31,32,%r20 ;offset 0x940  t << 32
78068651Skris        OR      %r20,%r24,%r21  ;offset 0x944  (t << 32) | (l >> 32)
78168651Skris        CMPB,*<<,N      %r21,%r23,$0006002A     ;offset 0x948  below tl: correct q again
78268651Skris        SUB     %r31,%r2,%r31   ;offset 0x94c  (delay slot) th -= dh
78368651Skris$00060046
78468651Skris$0006002E
78568651Skris        DEPD,Z  %r23,31,32,%r25 ;offset 0x950  tl << 32
78668651Skris        EXTRD,U %r23,31,32,%r26 ;offset 0x954  tl >> 32
78768651Skris        AND     %r25,%r19,%r24  ;offset 0x958  r24 = tl << 32, upper word only
78868651Skris        ADD,L   %r31,%r26,%r31  ;offset 0x95c  th += tl >> 32
78968651Skris        CMPCLR,*>>=     %r5,%r24,%r0    ;offset 0x960  skip next if l >= r24
79068651Skris        LDO     1(%r31),%r31    ;offset 0x964  th++ (borrow for l - r24)
79168651Skris$00060032
79268651Skris        CMPB,*<<=,N     %r31,%r4,$00060036      ;offset 0x968  th <= h: no adjustment
79368651Skris        LDO     -1(%r29),%r29   ;offset 0x96c  q--
79468651Skris        ADD,L   %r4,%r3,%r4     ;offset 0x970  h += d
79568651Skris$00060036
79668651Skris        ADDIB,=,N       -1,%r8,$D0      ;offset 0x974  --count; zero: both digits done
79768651Skris        SUB     %r5,%r24,%r28   ;offset 0x978  r28 = l - r24
79868651Skris$0006003A
79968651Skris        SUB     %r4,%r31,%r24   ;offset 0x97c  r24 = h - th
80068651Skris        SHRPD   %r24,%r28,32,%r4        ;offset 0x980  h = (r24 << 32) | (r28 >> 32)
80168651Skris        DEPD,Z  %r29,31,32,%r9  ;offset 0x984  r9 = q << 32 (upper digit of result)
80268651Skris        DEPD,Z  %r28,31,32,%r5  ;offset 0x988  l = r28 << 32
80368651Skris$0006001C	; top of the per-digit loop
80468651Skris        EXTRD,U %r4,31,32,%r31  ;offset 0x98c  h >> 32
80568651Skris        CMPB,*<>,N      %r31,%r2,$00060020      ;offset 0x990  != dh: divide to estimate q
80668651Skris        MOVB,TR %r6,%r29,$D1    ;offset 0x994  else q = 0xffffffff
80768651Skris        STD     %r29,-152(%r30) ;offset 0x998  (delay slot) spill q for the FP multiplies
80868651Skris$0006000C	; d != 0
80968651Skris        EXTRD,U %r3,31,32,%r25  ;offset 0x99c  call argument: upper word of d
81068651Skris        COPY    %r3,%r26        ;offset 0x9a0  call argument: lower word of d
81168651Skris        EXTRD,U %r3,31,32,%r9   ;offset 0x9a4  keep upper word of d across the call
81268651Skris        EXTRD,U %r4,31,32,%r8   ;offset 0x9a8  keep upper word of h across the call
81368651Skris        .CALL   ARGW0=GR,ARGW1=GR,RTNVAL=GR     ;in=25,26;out=28;
81468651Skris        B,L     BN_num_bits_word,%r2    ;offset 0x9ac
81568651Skris        EXTRD,U %r5,31,32,%r7   ;offset 0x9b0  (delay slot) keep upper word of l
81668651Skris        LDI     64,%r20 ;offset 0x9b4
81768651Skris        DEPD    %r7,31,32,%r5   ;offset 0x9b8  put the upper word of l back
81868651Skris        DEPD    %r8,31,32,%r4   ;offset 0x9bc  put the upper word of h back
81968651Skris        DEPD    %r9,31,32,%r3   ;offset 0x9c0  put the upper word of d back
82068651Skris        CMPB,=  %r28,%r20,$00060012     ;offset 0x9c4  i == 64: skip the overflow check
82168651Skris        COPY    %r28,%r24       ;offset 0x9c8  (delay slot) r24 = i
82268651Skris        MTSARCM %r24    ;offset 0x9cc
82368651Skris        DEPDI,Z -1,%sar,1,%r19  ;offset 0x9d0  r19 = 1 << i
82468651Skris        CMPB,*>>,N      %r4,%r19,$D2    ;offset 0x9d4  h > (1 << i): abort
82568651Skris$00060012
82668651Skris        SUBI    64,%r24,%r31    ;offset 0x9d8  r31 = 64 - i (normalizing shift)
82768651Skris        CMPCLR,*<<      %r4,%r3,%r0     ;offset 0x9dc  skip next if h < d
82868651Skris        SUB     %r4,%r3,%r4     ;offset 0x9e0  h -= d
82968651Skris$00060016
83068651Skris        CMPB,=  %r31,%r0,$0006001A      ;offset 0x9e4  shift is 0: nothing to normalize
83168651Skris        COPY    %r0,%r9 ;offset 0x9e8  (delay slot) r9 = 0
83268651Skris        MTSARCM %r31    ;offset 0x9ec
83368651Skris        DEPD,Z  %r3,%sar,64,%r3 ;offset 0x9f0  d <<= shift
83468651Skris        SUBI    64,%r31,%r26    ;offset 0x9f4
83568651Skris        MTSAR   %r26    ;offset 0x9f8
83668651Skris        SHRPD   %r4,%r5,%sar,%r4        ;offset 0x9fc  h = (h << shift) | (l >> (64 - shift))
83768651Skris        MTSARCM %r31    ;offset 0xa00
83868651Skris        DEPD,Z  %r5,%sar,64,%r5 ;offset 0xa04  l <<= shift
83968651Skris$0006001A
84068651Skris        DEPDI,Z -1,31,32,%r19   ;offset 0xa08  r19 = 0xffffffff00000000
84168651Skris        AND     %r3,%r19,%r29   ;offset 0xa0c
84268651Skris        EXTRD,U %r29,31,32,%r2  ;offset 0xa10  r2 = dh = d >> 32 (rp is overwritten here)
84368651Skris        DEPDI,Z -1,63,32,%r6    ;offset 0xa14  r6 = 0x00000000ffffffff
84468651Skris        MOVIB,TR        2,%r8,$0006001C ;offset 0xa18  count = 2, enter the digit loop
84568651Skris        EXTRD,U %r3,63,32,%r7   ;offset 0xa1c  (delay slot) r7 = dl = lower word of d
84668651Skris$D2	; overflow check failed (the diagnostic fprintf is disabled below)
847111147Snectar        ;--- not PIC	ADDIL   LR'__iob-$global$,%r27,%r1      ;offset 0xa20
848111147Snectar        ;--- not PIC	LDIL    LR'C$7,%r21     ;offset 0xa24
849111147Snectar        ;--- not PIC	LDO     RR'__iob-$global$+32(%r1),%r26  ;offset 0xa28
850111147Snectar        ;--- not PIC	.CALL   ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR    ;in=24,25,26;out=28;
851111147Snectar        ;--- not PIC	B,L     fprintf,%r2     ;offset 0xa2c
852111147Snectar        ;--- not PIC	LDO     RR'C$7(%r21),%r25       ;offset 0xa30
85368651Skris        .CALL           ;
85468651Skris        B,L     abort,%r2       ;offset 0xa34
85568651Skris        NOP             ;offset 0xa38
85668651Skris        B       $D3     ;offset 0xa3c  only reached if abort() returns
85768651Skris        LDW     -212(%r30),%r2  ;offset 0xa40  (delay slot) reload rp
85868651Skris$00060020	; estimate q = h / dh with the 64-bit divide millicode
85968651Skris        COPY    %r4,%r26        ;offset 0xa44  dividend, lower word
86068651Skris        EXTRD,U %r4,31,32,%r25  ;offset 0xa48  dividend, upper word
86168651Skris        COPY    %r2,%r24        ;offset 0xa4c  divisor dh, lower word
86268651Skris        .CALL   ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
86368651Skris        B,L     $$div2U,%r31    ;offset 0xa50
86468651Skris        EXTRD,U %r2,31,32,%r23  ;offset 0xa54  (delay slot) divisor dh, upper word
86568651Skris        DEPD    %r28,31,32,%r29 ;offset 0xa58  q = r28 (upper) : r29 (lower)
86668651Skris$00060022
86768651Skris        STD     %r29,-152(%r30) ;offset 0xa5c  spill q for the FP multiplies
86868651Skris$D1	; th = dh * q and tl = dl * q via 32x32 XMPYU partial products
86968651Skris        AND     %r5,%r19,%r24   ;offset 0xa60
87068651Skris        EXTRD,U %r24,31,32,%r24 ;offset 0xa64  r24 = l >> 32
87168651Skris        STW     %r2,-160(%r30)  ;offset 0xa68  spill dh
87268651Skris        STW     %r7,-128(%r30)  ;offset 0xa6c  spill dl
87368651Skris        FLDD    -152(%r30),%fr4 ;offset 0xa70  fr4 = q
87468651Skris        FLDD    -152(%r30),%fr7 ;offset 0xa74  fr7 = q
87568651Skris        FLDW    -160(%r30),%fr8L        ;offset 0xa78  fr8L = dh
87668651Skris        FLDW    -128(%r30),%fr5L        ;offset 0xa7c  fr5L = dl
87768651Skris        XMPYU   %fr8L,%fr7L,%fr10       ;offset 0xa80  dh * upper word of q
87868651Skris        FSTD    %fr10,-136(%r30)        ;offset 0xa84
87968651Skris        XMPYU   %fr8L,%fr7R,%fr22       ;offset 0xa88  dh * lower word of q
88068651Skris        FSTD    %fr22,-144(%r30)        ;offset 0xa8c
88168651Skris        XMPYU   %fr5L,%fr4L,%fr11       ;offset 0xa90  dl * upper word of q
88268651Skris        XMPYU   %fr5L,%fr4R,%fr23       ;offset 0xa94  dl * lower word of q
88368651Skris        FSTD    %fr11,-112(%r30)        ;offset 0xa98
88468651Skris        FSTD    %fr23,-120(%r30)        ;offset 0xa9c
88568651Skris        LDD     -136(%r30),%r28 ;offset 0xaa0
88668651Skris        DEPD,Z  %r28,31,32,%r31 ;offset 0xaa4
88768651Skris        LDD     -144(%r30),%r20 ;offset 0xaa8
88868651Skris        ADD,L   %r20,%r31,%r31  ;offset 0xaac  r31 = th
88968651Skris        LDD     -112(%r30),%r22 ;offset 0xab0
89068651Skris        DEPD,Z  %r22,31,32,%r22 ;offset 0xab4
89168651Skris        LDD     -120(%r30),%r21 ;offset 0xab8
89268651Skris        B       $00060024       ;offset 0xabc  go test the estimate
89368651Skris        ADD,L   %r21,%r22,%r23  ;offset 0xac0  (delay slot) r23 = tl
89468651Skris$D0	; both digits done
89568651Skris        OR      %r9,%r29,%r29   ;offset 0xac4  result = (first q << 32) | second q
89668651Skris$00060040
89768651Skris        EXTRD,U %r29,31,32,%r28 ;offset 0xac8  r28 = upper word of result
89868651Skris$00060002
89968651Skris$L2
90068651Skris        LDW     -212(%r30),%r2  ;offset 0xacc  reload rp
90168651Skris$D3
90268651Skris        LDW     -168(%r30),%r9  ;offset 0xad0  restore r9
90368651Skris        LDD     -176(%r30),%r8  ;offset 0xad4  restore r8 ...
90468651Skris        EXTRD,U %r8,31,32,%r7   ;offset 0xad8  ... and r7 from its upper half
90568651Skris        LDD     -184(%r30),%r6  ;offset 0xadc  restore r6 ...
90668651Skris        EXTRD,U %r6,31,32,%r5   ;offset 0xae0  ... and r5 from its upper half
90768651Skris        LDW     -188(%r30),%r4  ;offset 0xae4  restore r4
90868651Skris        BVE     (%r2)   ;offset 0xae8
90968651Skris        .EXIT
91068651Skris        LDW,MB  -192(%r30),%r3  ;offset 0xaec  (delay slot) sp -= 192, restore r3
91168651Skris	.PROCEND	;in=23,25;out=28,29;fpin=105,107;
91255714Skris
91355714Skris
91468651Skris
91568651Skris
91668651Skris;----------------------------------------------------------------------------
91768651Skris;
91868651Skris; Registers to hold 64-bit values to manipulate.  The "L" part
91968651Skris; of the register corresponds to the upper 32 bits, while the "R"
92068651Skris; part corresponds to the lower 32 bits.
92168651Skris;
92268651Skris; Note that when using b6 and b7, the code must save these before
92368651Skris; using them because they are callee-save registers.
92468651Skris;
92568651Skris;
92668651Skris; Floating point registers to use to save values that
92768651Skris; are manipulated.  These don't collide with ftemp1-6 and
92868651Skris; are all caller-save registers.
92968651Skris;
93068651Skrisa0        .reg %fr22
93168651Skrisa0L       .reg %fr22L
93268651Skrisa0R       .reg %fr22R
93368651Skris
93468651Skrisa1        .reg %fr23
93568651Skrisa1L       .reg %fr23L
93668651Skrisa1R       .reg %fr23R
93768651Skris
93868651Skrisa2        .reg %fr24
93968651Skrisa2L       .reg %fr24L
94068651Skrisa2R       .reg %fr24R
94168651Skris
94268651Skrisa3        .reg %fr25
94368651Skrisa3L       .reg %fr25L
94468651Skrisa3R       .reg %fr25R
94568651Skris
94668651Skrisa4        .reg %fr26
94768651Skrisa4L       .reg %fr26L
94868651Skrisa4R       .reg %fr26R
94968651Skris
95068651Skrisa5        .reg %fr27
95168651Skrisa5L       .reg %fr27L
95268651Skrisa5R       .reg %fr27R
95368651Skris
95468651Skrisa6        .reg %fr28
95568651Skrisa6L       .reg %fr28L
95668651Skrisa6R       .reg %fr28R
95768651Skris
95868651Skrisa7        .reg %fr29
95968651Skrisa7L       .reg %fr29L
96068651Skrisa7R       .reg %fr29R
96168651Skris
96268651Skrisb0        .reg %fr30
96368651Skrisb0L       .reg %fr30L
96468651Skrisb0R       .reg %fr30R
96568651Skris
96668651Skrisb1        .reg %fr31
96768651Skrisb1L       .reg %fr31L
96868651Skrisb1R       .reg %fr31R
96968651Skris
97068651Skris;
97168651Skris; Temporary floating point variables, these are all caller-save
97268651Skris; registers.  The comba macros put their XMPYU partial products here
97368651Skris; before spilling them to the stack.
97468651Skrisftemp1    .reg %fr4
97568651Skrisftemp2    .reg %fr5
97668651Skrisftemp3    .reg %fr6
97768651Skrisftemp4    .reg %fr7
97868651Skris
97968651Skris;
98068651Skris; The rest of the B set of registers (b0 and b1 are defined above).
98168651Skris; b6 and b7 live in %fr12/%fr13; see the callee-save note above.
98268651Skris
98368651Skrisb2        .reg %fr8
98468651Skrisb2L       .reg %fr8L
98568651Skrisb2R       .reg %fr8R
98668651Skris
98768651Skrisb3        .reg %fr9
98868651Skrisb3L       .reg %fr9L
98968651Skrisb3R       .reg %fr9R
99068651Skris
99168651Skrisb4        .reg %fr10
99268651Skrisb4L       .reg %fr10L
99368651Skrisb4R       .reg %fr10R
99468651Skris
99568651Skrisb5        .reg %fr11
99668651Skrisb5L       .reg %fr11L
99768651Skrisb5R       .reg %fr11R
99868651Skris
99968651Skrisb6        .reg %fr12
100068651Skrisb6L       .reg %fr12L
100168651Skrisb6R       .reg %fr12R
100268651Skris
100368651Skrisb7        .reg %fr13
100468651Skrisb7L       .reg %fr13L
100568651Skrisb7R       .reg %fr13R
100668651Skris
100768651Skrisc1           .reg %r21   ; only reg
100868651Skristemp1        .reg %r20   ; only reg
100968651Skristemp2        .reg %r19   ; only reg
101068651Skristemp3        .reg %r31   ; only reg
101168651Skris
101268651Skrism1           .reg %r28   ; second cross product in SQR_ADD_C2 / MUL_ADD_C
101368651Skrisc2           .reg %r23   ; carry-chain accumulator, like c1 and c3
101468651Skrishigh_one     .reg %r1    ; the comba routines load 1 << 32 here
101568651Skrisht           .reg %r6    ; r3-r6 are saved on entry to, and restored
101668651Skrislt           .reg %r5    ; on exit from, each comba routine
101768651Skrism            .reg %r4
101868651Skrisc3           .reg %r3
101968651Skris
;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Adds the 128-bit square of the 64-bit value A0L:A0R into the three-word
; accumulator C3:C2:C1 (C1 is the least significant word).
;   A0L, A0R  - upper / lower 32-bit halves of one FP register
;   C1,C2,C3  - general registers, updated in place
; Clobbers ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the doubleword scratch
; slots at -24, -16 and -8(%sp).  Expects high_mask = 0xffffffff80000000.
;
; Do not reorder: every ADD,DC consumes the carry left by the ADD before it,
; and the ADD,L in between is relied on to leave that carry untouched.
;
102068651SkrisSQR_ADD_C  .macro  A0L,A0R,C1,C2,C3
102168651Skris    XMPYU   A0L,A0R,ftemp1       ; m  = high half * low half
102268651Skris    FSTD    ftemp1,-24(%sp)      ; store m
102368651Skris
102468651Skris    XMPYU   A0R,A0R,ftemp2       ; lt = low half squared
102568651Skris    FSTD    ftemp2,-16(%sp)      ; store lt
102668651Skris
102768651Skris    XMPYU   A0L,A0L,ftemp3       ; ht = high half squared
102868651Skris    FSTD    ftemp3,-8(%sp)       ; store ht
102968651Skris
103068651Skris    LDD     -24(%sp),m           ; load m
103168651Skris    AND     m,high_mask,temp2    ; keep the top 33 bits of m
103268651Skris    DEPD,Z  m,30,31,temp3        ; temp3 = m << 33, i.e. low word of (2*m) << 32
103368651Skris    LDD     -16(%sp),lt          ; lt
103468651Skris
103568651Skris    LDD     -8(%sp),ht           ; ht
103668651Skris    EXTRD,U temp2,32,33,temp1    ; temp1 = m >> 31, i.e. (2*m) >> 32
103768651Skris    ADD     temp3,lt,lt          ; lt += low part of the doubled cross term
103868651Skris    ADD,L   ht,temp1,ht          ; ht += high part of the doubled cross term
103968651Skris    ADD,DC  ht,%r0,ht            ; ht += carry out of lt
104068651Skris
104168651Skris    ADD     C1,lt,C1             ; c1=c1+lt
104268651Skris    ADD,DC  ht,%r0,ht            ; ht += carry out of c1
104368651Skris
104468651Skris    ADD     C2,ht,C2             ; c2=c2+ht
104568651Skris    ADD,DC  C3,%r0,C3            ; c3 += carry out of c2
104668651Skris.endm
104768651Skris
;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Adds twice the 128-bit product of the 64-bit values A0L:A0R and A1L:A1R
; into the three-word accumulator C3:C2:C1 (C1 least significant).  Used for
; the off-diagonal terms of a square, which occur twice.
;   A0L,A0R,A1L,A1R - upper / lower 32-bit halves of two FP registers
;   C1,C2,C3        - general registers, updated in place
; Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the doubleword
; scratch slots at -32, -24, -16 and -8(%sp).  Expects high_one = 1 << 32.
;
; Do not reorder: every ADD,DC consumes the carry left by the ADD before it,
; and the ADD,L instructions are relied on to leave that carry untouched.
;
104868651SkrisSQR_ADD_C2 .macro  A0L,A0R,A1L,A1R,C1,C2,C3
104968651Skris    XMPYU   A0L,A1R,ftemp1          ; m1 = A0 high * A1 low
105068651Skris    FSTD    ftemp1,-16(%sp)         ;
105168651Skris    XMPYU   A0R,A1L,ftemp2          ; m  = A0 low * A1 high
105268651Skris    FSTD    ftemp2,-8(%sp)          ;
105368651Skris    XMPYU   A0R,A1R,ftemp3          ; lt = A0 low * A1 low
105468651Skris    FSTD    ftemp3,-32(%sp)
105568651Skris    XMPYU   A0L,A1L,ftemp4          ; ht = A0 high * A1 high
105668651Skris    FSTD    ftemp4,-24(%sp)         ;
105768651Skris
105868651Skris    LDD     -8(%sp),m               ; load m
105968651Skris    LDD     -16(%sp),m1             ; load m1
106068651Skris    ADD,L   m,m1,m                  ; m+m1
106168651Skris
106268651Skris    DEPD,Z  m,31,32,temp3           ; (m+m1<<32)
106368651Skris    LDD     -24(%sp),ht             ; load ht
106468651Skris
106568651Skris    CMPCLR,*>>= m,m1,%r0            ; if (m < m1), i.e. m+m1 wrapped
106668651Skris    ADD,L   ht,high_one,ht          ; ht+=high_one
106768651Skris
106868651Skris    EXTRD,U m,31,32,temp1           ; m >> 32
106968651Skris    LDD     -32(%sp),lt             ; lt
107068651Skris    ADD,L   ht,temp1,ht             ; ht+= m>>32
107168651Skris    ADD     lt,temp3,lt             ; lt += (m+m1) << 32
107268651Skris    ADD,DC  ht,%r0,ht               ; ht += carry out of lt
107368651Skris
107468651Skris    ADD     ht,ht,ht                ; ht=ht+ht;
107568651Skris    ADD,DC  C3,%r0,C3               ; add in carry (c3++)
107668651Skris
107768651Skris    ADD     lt,lt,lt                ; lt=lt+lt;
107868651Skris    ADD,DC  ht,%r0,ht               ; add in carry (ht++)
107968651Skris
108068651Skris    ADD     C1,lt,C1                ; c1=c1+lt
108168651Skris    ADD,DC,*NUV ht,%r0,ht           ; add in carry (ht++)
108268651Skris    LDO     1(C3),C3              ; bump c3 if overflow,nullify otherwise
108368651Skris
108468651Skris    ADD     C2,ht,C2                ; c2 = c2 + ht
108568651Skris    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
108668651Skris.endm
108768651Skris
108868651Skris;
108968651Skris;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
109068651Skris; arg0 = r_ptr
109168651Skris; arg1 = a_ptr
;
; Squares the eight 64-bit words at a_ptr and stores the sixteen 64-bit
; result words at r_ptr.  The result is produced one word (column) at a
; time: SQR_ADD_C adds a diagonal term a[i]*a[i], SQR_ADD_C2 adds a doubled
; off-diagonal term 2*a[i]*a[j], and c1/c2/c3 rotate roles after each store
; so the two carry words of one column become the low words of the next.
;
; Stack use: r3-r6 are saved at 0..24(%sp), then %sp is raised by 128; the
; macros use -32..-8(%sp) of the raised frame as scratch.
;
; NOTE(review): .CALLINFO says ENTRY_GR=%r3 although r3-r6 are all saved
; here - confirm whether the unwind information needs to cover r4-r6.
109268651Skris;
109368651Skris
109468651Skrisbn_sqr_comba8
109555714Skris	.PROC
109668651Skris	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
109768651Skris	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
109868651Skris    .ENTRY
109968651Skris	.align 64
110068651Skris
110168651Skris    STD     %r3,0(%sp)          ; save r3
110268651Skris    STD     %r4,8(%sp)          ; save r4
110368651Skris    STD     %r5,16(%sp)         ; save r5
110468651Skris    STD     %r6,24(%sp)         ; save r6
110568651Skris
110668651Skris	;
110768651Skris	; Zero out carries
110868651Skris	;
110968651Skris	COPY     %r0,c1
111068651Skris	COPY     %r0,c2
111168651Skris	COPY     %r0,c3
111268651Skris
111368651Skris	LDO      128(%sp),%sp       ; bump stack
111468651Skris    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
111568651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
111668651Skris
111768651Skris	;
111868651Skris	; Load up all of the values we are going to use
111968651Skris	;
112068651Skris    FLDD     0(a_ptr),a0
112168651Skris    FLDD     8(a_ptr),a1
112268651Skris    FLDD    16(a_ptr),a2
112368651Skris    FLDD    24(a_ptr),a3
112468651Skris    FLDD    32(a_ptr),a4
112568651Skris    FLDD    40(a_ptr),a5
112668651Skris    FLDD    48(a_ptr),a6
112768651Skris    FLDD    56(a_ptr),a7
112868651Skris
112968651Skris	SQR_ADD_C a0L,a0R,c1,c2,c3
113068651Skris	STD     c1,0(r_ptr)          ; r[0] = c1;
113168651Skris	COPY    %r0,c1
113268651Skris
113368651Skris	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
113468651Skris	STD     c2,8(r_ptr)          ; r[1] = c2;
113568651Skris	COPY    %r0,c2
113668651Skris
113768651Skris	SQR_ADD_C a1L,a1R,c3,c1,c2
113868651Skris	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
113968651Skris	STD     c3,16(r_ptr)            ; r[2] = c3;
114068651Skris	COPY    %r0,c3
114168651Skris
114268651Skris	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
114368651Skris	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
114468651Skris	STD     c1,24(r_ptr)           ; r[3] = c1;
114568651Skris	COPY    %r0,c1
114668651Skris
114768651Skris	SQR_ADD_C a2L,a2R,c2,c3,c1
114868651Skris	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
114968651Skris	SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
115068651Skris	STD     c2,32(r_ptr)          ; r[4] = c2;
115168651Skris	COPY    %r0,c2
115268651Skris
115368651Skris	SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
115468651Skris	SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
115568651Skris	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
115668651Skris	STD     c3,40(r_ptr)          ; r[5] = c3;
115768651Skris	COPY    %r0,c3
115868651Skris
115968651Skris	SQR_ADD_C a3L,a3R,c1,c2,c3
116068651Skris	SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
116168651Skris	SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
116268651Skris	SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
116368651Skris	STD     c1,48(r_ptr)          ; r[6] = c1;
116468651Skris	COPY    %r0,c1
116568651Skris
116668651Skris	SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
116768651Skris	SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
116868651Skris	SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
116968651Skris	SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
117068651Skris	STD     c2,56(r_ptr)          ; r[7] = c2;
117168651Skris	COPY    %r0,c2
117268651Skris
117368651Skris	SQR_ADD_C a4L,a4R,c3,c1,c2
117468651Skris	SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
117568651Skris	SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
117668651Skris	SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
117768651Skris	STD     c3,64(r_ptr)          ; r[8] = c3;
117868651Skris	COPY    %r0,c3
117968651Skris
118068651Skris	SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
118168651Skris	SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
118268651Skris	SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
118368651Skris	STD     c1,72(r_ptr)          ; r[9] = c1;
118468651Skris	COPY    %r0,c1
118568651Skris
118668651Skris	SQR_ADD_C a5L,a5R,c2,c3,c1
118768651Skris	SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
118868651Skris	SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
118968651Skris	STD     c2,80(r_ptr)          ; r[10] = c2;
119068651Skris	COPY    %r0,c2
119168651Skris
119268651Skris	SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
119368651Skris	SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
119468651Skris	STD     c3,88(r_ptr)          ; r[11] = c3;
119568651Skris	COPY    %r0,c3
119668651Skris
119768651Skris	SQR_ADD_C a6L,a6R,c1,c2,c3
119868651Skris	SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
119968651Skris	STD     c1,96(r_ptr)          ; r[12] = c1;
120068651Skris	COPY    %r0,c1
120168651Skris
120268651Skris	SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
120368651Skris	STD     c2,104(r_ptr)         ; r[13] = c2;
120468651Skris	COPY    %r0,c2
120568651Skris
120668651Skris	SQR_ADD_C a7L,a7R,c3,c1,c2
120768651Skris	STD     c3, 112(r_ptr)       ; r[14] = c3
120868651Skris	STD     c1, 120(r_ptr)       ; r[15] = c1
120968651Skris
121068651Skris    .EXIT
121168651Skris    LDD     -104(%sp),%r6        ; restore r6
121268651Skris    LDD     -112(%sp),%r5        ; restore r5
121368651Skris    LDD     -120(%sp),%r4        ; restore r4
121468651Skris    BVE     (%rp)
121568651Skris    LDD,MB  -128(%sp),%r3        ; (delay slot) drop the frame, restore r3
121668651Skris
121768651Skris	.PROCEND
121868651Skris
121968651Skris;-----------------------------------------------------------------------------
122068651Skris;
122168651Skris;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
122268651Skris; arg0 = r_ptr
122368651Skris; arg1 = a_ptr
;
; Squares the four 64-bit words at a_ptr and stores the eight 64-bit result
; words at r_ptr.  The result is produced one word (column) at a time:
; SQR_ADD_C adds a diagonal term a[i]*a[i], SQR_ADD_C2 adds a doubled
; off-diagonal term 2*a[i]*a[j], and c1/c2/c3 rotate roles after each store.
;
; Only a[0..3] are read.  Earlier revisions also loaded a4..a7 from
; 32..56(a_ptr), which is past the end of the four-word input and was never
; used; those loads have been removed.
;
; Stack use: r3-r6 are saved at 0..24(%sp), then %sp is raised by 128; the
; macros use -32..-8(%sp) of the raised frame as scratch.
122468651Skris;
122568651Skris
122668651Skrisbn_sqr_comba4
122768651Skris	.proc
122868651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
122968651Skris	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
123068651Skris    .entry
123168651Skris	.align 64
123268651Skris    STD     %r3,0(%sp)          ; save r3
123368651Skris    STD     %r4,8(%sp)          ; save r4
123468651Skris    STD     %r5,16(%sp)         ; save r5
123568651Skris    STD     %r6,24(%sp)         ; save r6
123668651Skris
123768651Skris	;
123868651Skris	; Zero out carries
123968651Skris	;
124068651Skris	COPY     %r0,c1
124168651Skris	COPY     %r0,c2
124268651Skris	COPY     %r0,c3
124368651Skris
124468651Skris	LDO      128(%sp),%sp       ; bump stack
124568651Skris    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
124668651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
124768651Skris
124868651Skris	;
124968651Skris	; Load up all of the values we are going to use (a[0..3] only)
125068651Skris	;
125168651Skris    FLDD     0(a_ptr),a0
125268651Skris    FLDD     8(a_ptr),a1
125368651Skris    FLDD    16(a_ptr),a2
125468651Skris    FLDD    24(a_ptr),a3
125968651Skris
126068651Skris	SQR_ADD_C a0L,a0R,c1,c2,c3
126168651Skris
126268651Skris	STD     c1,0(r_ptr)          ; r[0] = c1;
126368651Skris	COPY    %r0,c1
126468651Skris
126568651Skris	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
126668651Skris
126768651Skris	STD     c2,8(r_ptr)          ; r[1] = c2;
126868651Skris	COPY    %r0,c2
126968651Skris
127068651Skris	SQR_ADD_C a1L,a1R,c3,c1,c2
127168651Skris	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
127268651Skris
127368651Skris	STD     c3,16(r_ptr)            ; r[2] = c3;
127468651Skris	COPY    %r0,c3
127568651Skris
127668651Skris	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
127768651Skris	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
127868651Skris
127968651Skris	STD     c1,24(r_ptr)           ; r[3] = c1;
128068651Skris	COPY    %r0,c1
128168651Skris
128268651Skris	SQR_ADD_C a2L,a2R,c2,c3,c1
128368651Skris	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
128468651Skris
128568651Skris	STD     c2,32(r_ptr)           ; r[4] = c2;
128668651Skris	COPY    %r0,c2
128768651Skris
128868651Skris	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
128968651Skris	STD     c3,40(r_ptr)           ; r[5] = c3;
129068651Skris	COPY    %r0,c3
129168651Skris
129268651Skris	SQR_ADD_C a3L,a3R,c1,c2,c3
129368651Skris	STD     c1,48(r_ptr)           ; r[6] = c1;
129468651Skris	STD     c2,56(r_ptr)           ; r[7] = c2;
129568651Skris
129668651Skris    .EXIT
129768651Skris    LDD     -104(%sp),%r6        ; restore r6
129868651Skris    LDD     -112(%sp),%r5        ; restore r5
129968651Skris    LDD     -120(%sp),%r4        ; restore r4
130068651Skris    BVE     (%rp)
130168651Skris    LDD,MB  -128(%sp),%r3        ; (delay slot) drop the frame, restore r3
130268651Skris
130368651Skris	.PROCEND
130468651Skris
130568651Skris
130668651Skris;---------------------------------------------------------------------------
130768651Skris
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Adds the 128-bit product of the 64-bit values A0L:A0R and B0L:B0R into
; the three-word accumulator C3:C2:C1 (C1 least significant).
;   A0L,A0R,B0L,B0R - upper / lower 32-bit halves of two FP registers
;   C1,C2,C3        - general registers, updated in place
; Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the doubleword
; scratch slots at -32, -24, -16 and -8(%sp).  Expects high_one = 1 << 32.
;
; Do not reorder: every ADD,DC consumes the carry left by the ADD before it,
; and the ADD,L instructions are relied on to leave that carry untouched.
;
130868651SkrisMUL_ADD_C  .macro  A0L,A0R,B0L,B0R,C1,C2,C3
130968651Skris    XMPYU   A0L,B0R,ftemp1        ; m1 = A high * B low
131068651Skris    FSTD    ftemp1,-16(%sp)       ;
131168651Skris    XMPYU   A0R,B0L,ftemp2        ; m  = A low * B high
131268651Skris    FSTD    ftemp2,-8(%sp)        ;
131368651Skris    XMPYU   A0R,B0R,ftemp3        ; lt = A low * B low
131468651Skris    FSTD    ftemp3,-32(%sp)
131568651Skris    XMPYU   A0L,B0L,ftemp4        ; ht = A high * B high
131668651Skris    FSTD    ftemp4,-24(%sp)       ;
131768651Skris
131868651Skris    LDD     -8(%sp),m             ; load m
131968651Skris    LDD     -16(%sp),m1           ; load m1
132068651Skris    ADD,L   m,m1,m                ; m+m1
132168651Skris
132268651Skris    DEPD,Z  m,31,32,temp3         ; (m+m1<<32)
132368651Skris    LDD     -24(%sp),ht           ; load ht
132468651Skris
132568651Skris    CMPCLR,*>>= m,m1,%r0          ; if (m < m1), i.e. m+m1 wrapped
132668651Skris    ADD,L   ht,high_one,ht        ; ht+=high_one
132768651Skris
132868651Skris    EXTRD,U m,31,32,temp1         ; m >> 32
132968651Skris    LDD     -32(%sp),lt           ; lt
133068651Skris    ADD,L   ht,temp1,ht           ; ht+= m>>32
133168651Skris    ADD     lt,temp3,lt           ; lt += (m+m1) << 32
133268651Skris    ADD,DC  ht,%r0,ht             ; ht += carry out of lt
133368651Skris
133468651Skris    ADD     C1,lt,C1              ; c1=c1+lt
133568651Skris    ADD,DC  ht,%r0,ht             ; ht += carry out of c1
133668651Skris
133768651Skris    ADD     C2,ht,C2              ; c2 = c2 + ht
133868651Skris    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
133968651Skris.endm
134068651Skris
134168651Skris
134268651Skris;
134368651Skris;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
134468651Skris; arg0 = r_ptr
134568651Skris; arg1 = a_ptr
134668651Skris; arg2 = b_ptr
;
; Multiplies the eight 64-bit words at a_ptr by the eight 64-bit words at
; b_ptr and stores the sixteen 64-bit result words at r_ptr.  The result is
; produced one word (column) at a time: each MUL_ADD_C adds one a[i]*b[j]
; with i+j equal to the column index, and c1/c2/c3 rotate roles after each
; store so the two carry words of one column become the low words of the
; next.
;
; Stack use: r3-r6 are saved at 0..24(%sp) and fr12/fr13 (aliased as b6/b7)
; at 32/40(%sp), then %sp is raised by 128; MUL_ADD_C uses -32..-8(%sp) of
; the raised frame as scratch.
134768651Skris;
134868651Skris
134968651Skrisbn_mul_comba8
135068651Skris	.proc
135168651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
135268651Skris	.EXPORT	bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
135368651Skris    .entry
135468651Skris	.align 64
135568651Skris
135668651Skris    STD     %r3,0(%sp)          ; save r3
135768651Skris    STD     %r4,8(%sp)          ; save r4
135868651Skris    STD     %r5,16(%sp)         ; save r5
135968651Skris    STD     %r6,24(%sp)         ; save r6
136068651Skris    FSTD    %fr12,32(%sp)       ; save fr12 (used as b6)
136168651Skris    FSTD    %fr13,40(%sp)       ; save fr13 (used as b7)
136268651Skris
136368651Skris	;
136468651Skris	; Zero out carries
136568651Skris	;
136668651Skris	COPY     %r0,c1
136768651Skris	COPY     %r0,c2
136868651Skris	COPY     %r0,c3
136968651Skris
137068651Skris	LDO      128(%sp),%sp       ; bump stack
137168651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
137268651Skris
137368651Skris	;
137468651Skris	; Load up all of the values we are going to use
137568651Skris	;
137668651Skris    FLDD      0(a_ptr),a0
137768651Skris    FLDD      8(a_ptr),a1
137868651Skris    FLDD     16(a_ptr),a2
137968651Skris    FLDD     24(a_ptr),a3
138068651Skris    FLDD     32(a_ptr),a4
138168651Skris    FLDD     40(a_ptr),a5
138268651Skris    FLDD     48(a_ptr),a6
138368651Skris    FLDD     56(a_ptr),a7
138468651Skris
138568651Skris    FLDD      0(b_ptr),b0
138668651Skris    FLDD      8(b_ptr),b1
138768651Skris    FLDD     16(b_ptr),b2
138868651Skris    FLDD     24(b_ptr),b3
138968651Skris    FLDD     32(b_ptr),b4
139068651Skris    FLDD     40(b_ptr),b5
139168651Skris    FLDD     48(b_ptr),b6
139268651Skris    FLDD     56(b_ptr),b7
139368651Skris
139468651Skris	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
139568651Skris	STD       c1,0(r_ptr)
139668651Skris	COPY      %r0,c1
139768651Skris
139868651Skris	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
139968651Skris	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
140068651Skris	STD       c2,8(r_ptr)
140168651Skris	COPY      %r0,c2
140268651Skris
140368651Skris	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
140468651Skris	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
140568651Skris	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
140668651Skris	STD       c3,16(r_ptr)
140768651Skris	COPY      %r0,c3
140868651Skris
140968651Skris	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
141068651Skris	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
141168651Skris	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
141268651Skris	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
141368651Skris	STD       c1,24(r_ptr)
141468651Skris	COPY      %r0,c1
141568651Skris
141668651Skris	MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
141768651Skris	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
141868651Skris	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
141968651Skris	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
142068651Skris	MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
142168651Skris	STD       c2,32(r_ptr)
142268651Skris	COPY      %r0,c2
142368651Skris
142468651Skris	MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
142568651Skris	MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
142668651Skris	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
142768651Skris	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
142868651Skris	MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
142968651Skris	MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
143068651Skris	STD       c3,40(r_ptr)
143168651Skris	COPY      %r0,c3
143268651Skris
143368651Skris	MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
143468651Skris	MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
143568651Skris	MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
143668651Skris	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
143768651Skris	MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
143868651Skris	MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
143968651Skris	MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
144068651Skris	STD       c1,48(r_ptr)
144168651Skris	COPY      %r0,c1
144268651Skris
144368651Skris	MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
144468651Skris	MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
144568651Skris	MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
144668651Skris	MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
144768651Skris	MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
144868651Skris	MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
144968651Skris	MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
145068651Skris	MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
145168651Skris	STD       c2,56(r_ptr)
145268651Skris	COPY      %r0,c2
145368651Skris
145468651Skris	MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
145568651Skris	MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
145668651Skris	MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
145768651Skris	MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
145868651Skris	MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
145968651Skris	MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
146068651Skris	MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
146168651Skris	STD       c3,64(r_ptr)
146268651Skris	COPY      %r0,c3
146368651Skris
146468651Skris	MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
146568651Skris	MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
146668651Skris	MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
146768651Skris	MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
146868651Skris	MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
146968651Skris	MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
147068651Skris	STD       c1,72(r_ptr)
147168651Skris	COPY      %r0,c1
147268651Skris
147368651Skris	MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
147468651Skris	MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
147568651Skris	MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
147668651Skris	MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
147768651Skris	MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
147868651Skris	STD       c2,80(r_ptr)
147968651Skris	COPY      %r0,c2
148068651Skris
148168651Skris	MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
148268651Skris	MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
148368651Skris	MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
148468651Skris	MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
148568651Skris	STD       c3,88(r_ptr)
148668651Skris	COPY      %r0,c3
148768651Skris
148868651Skris	MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
148968651Skris	MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
149068651Skris	MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
149168651Skris	STD       c1,96(r_ptr)
149268651Skris	COPY      %r0,c1
149368651Skris
149468651Skris	MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
149568651Skris	MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
149668651Skris	STD       c2,104(r_ptr)
149768651Skris	COPY      %r0,c2
149868651Skris
149968651Skris	MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
150068651Skris	STD       c3,112(r_ptr)
150168651Skris	STD       c1,120(r_ptr)
150268651Skris
150368651Skris    .EXIT
150468651Skris    FLDD    -88(%sp),%fr13       ; restore fr13
150568651Skris    FLDD    -96(%sp),%fr12       ; restore fr12
150668651Skris    LDD     -104(%sp),%r6        ; restore r6
150768651Skris    LDD     -112(%sp),%r5        ; restore r5
150868651Skris    LDD     -120(%sp),%r4        ; restore r4
150968651Skris    BVE     (%rp)
151068651Skris    LDD,MB  -128(%sp),%r3        ; (delay slot) drop the frame, restore r3
151168651Skris
151268651Skris	.PROCEND
151368651Skris
151468651Skris;-----------------------------------------------------------------------------
151568651Skris; Unrolled 4x4 doubleword multiply (per the prototype below); eight doublewords are stored through r_ptr.
151668651Skris;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
151768651Skris; arg0 = r_ptr   (output: STD at offsets 0..56)
151868651Skris; arg1 = a_ptr   (input: FLDD at offsets 0..24)
151968651Skris; arg2 = b_ptr   (input: FLDD at offsets 0..24)
152068651Skris; Uses a 128-byte frame; %r3-%r6 and %fr12-%fr13 are saved on entry and reloaded at exit.
152168651Skris
152268651Skrisbn_mul_comba4
152368651Skris	.proc
152468651Skris	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
152568651Skris	.EXPORT	bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
152668651Skris    .entry
152768651Skris	.align 64
152868651Skris
152968651Skris    STD     %r3,0(%sp)          ; save r3
153068651Skris    STD     %r4,8(%sp)          ; save r4
153168651Skris    STD     %r5,16(%sp)         ; save r5
153268651Skris    STD     %r6,24(%sp)         ; save r6
153368651Skris    FSTD    %fr12,32(%sp)       ; save fr12
153468651Skris    FSTD    %fr13,40(%sp)       ; save fr13
153568651Skris
153668651Skris	;
153768651Skris	; Zero out carries (c1,c2,c3 are the three words handed to every MUL_ADD_C below)
153868651Skris	; NOTE(review): MUL_ADD_C is defined outside this section; presumably it adds one aX*bY product into the three words named last -- confirm there.
153968651Skris	COPY     %r0,c1
154068651Skris	COPY     %r0,c2
154168651Skris	COPY     %r0,c3
154268651Skris
154368651Skris	LDO      128(%sp),%sp       ; bump stack by the 128-byte frame; the saves above now sit at -128..-88(%sp)
154468651Skris    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
154568651Skris
154668651Skris	;
154768651Skris	; Load up all of the values we are going to use (four doublewords each from a_ptr and b_ptr, via FLDD)
154868651Skris	;
154968651Skris    FLDD      0(a_ptr),a0
155068651Skris    FLDD      8(a_ptr),a1
155168651Skris    FLDD     16(a_ptr),a2
155268651Skris    FLDD     24(a_ptr),a3
155368651Skris
155468651Skris    FLDD      0(b_ptr),b0
155568651Skris    FLDD      8(b_ptr),b1
155668651Skris    FLDD     16(b_ptr),b2
155768651Skris    FLDD     24(b_ptr),b3
155868651Skris
155968651Skris	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
156068651Skris	STD       c1,0(r_ptr)          ; r[0] <- c1 (column 0: a0/b0)
156168651Skris	COPY      %r0,c1               ; c1 = 0; it becomes the last MUL_ADD_C operand of the next column
156268651Skris
156368651Skris	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
156468651Skris	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
156568651Skris	STD       c2,8(r_ptr)          ; r[1] <- c2 (column 1: a/b indices sum to 1)
156668651Skris	COPY      %r0,c2               ; c2 = 0 for reuse as the last operand
156768651Skris
156868651Skris	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
156968651Skris	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
157068651Skris	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
157168651Skris	STD       c3,16(r_ptr)         ; r[2] <- c3 (column 2: a/b indices sum to 2)
157268651Skris	COPY      %r0,c3               ; c3 = 0 for reuse as the last operand
157368651Skris
157468651Skris	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
157568651Skris	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
157668651Skris	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
157768651Skris	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
157868651Skris	STD       c1,24(r_ptr)         ; r[3] <- c1 (column 3: a/b indices sum to 3)
157968651Skris	COPY      %r0,c1               ; c1 = 0 for reuse as the last operand
158068651Skris
158168651Skris	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
158268651Skris	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
158368651Skris	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
158468651Skris	STD       c2,32(r_ptr)         ; r[4] <- c2 (column 4: a/b indices sum to 4)
158568651Skris	COPY      %r0,c2               ; c2 = 0 for reuse as the last operand
158668651Skris
158768651Skris	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
158868651Skris	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
158968651Skris	STD       c3,40(r_ptr)         ; r[5] <- c3 (column 5: a/b indices sum to 5)
159068651Skris	COPY      %r0,c3               ; c3 = 0 for reuse as the last operand
159168651Skris
159268651Skris	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
159368651Skris	STD       c1,48(r_ptr)         ; r[6] <- c1 (column 6: a3/b3)
159468651Skris	STD       c2,56(r_ptr)         ; r[7] <- c2, the second accumulator word; c3 is not stored
159568651Skris
159668651Skris    .EXIT
159768651Skris    FLDD    -88(%sp),%fr13       ; restore fr13 (saved at 40(%sp) before the 128-byte bump)
159868651Skris    FLDD    -96(%sp),%fr12       ; restore fr12 (saved at 32(%sp) before the 128-byte bump)
159968651Skris    LDD     -104(%sp),%r6        ; restore r6
160068651Skris    LDD     -112(%sp),%r5        ; restore r5
160168651Skris    LDD     -120(%sp),%r4        ; restore r4
160268651Skris    BVE     (%rp)                ; branch to the address in %rp; the next instruction fills the delay slot
160368651Skris    LDD,MB  -128(%sp),%r3        ; restore r3; ,MB first moves %sp down by 128, releasing the frame
160468651Skris
160568651Skris	.PROCEND
160668651Skris
160768651Skris
1608111147Snectar;--- not PIC	.SPACE	$TEXT$
1609111147Snectar;--- not PIC	.SUBSPA	$CODE$
1610111147Snectar;--- not PIC	.SPACE	$PRIVATE$,SORT=16
1611111147Snectar;--- not PIC	.IMPORT	$global$,DATA
1612111147Snectar;--- not PIC	.SPACE	$TEXT$
1613111147Snectar;--- not PIC	.SUBSPA	$CODE$
1614111147Snectar;--- not PIC	.SUBSPA	$LIT$,ACCESS=0x2c
1615111147Snectar;--- not PIC	C$7
1616111147Snectar;--- not PIC	.ALIGN	8
1617111147Snectar;--- not PIC	.STRINGZ	"Division would overflow (%d)\n"
161868651Skris	.END
1619