#	$NetBSD: bn_asm_vax.S,v 1.2 2018/03/08 03:14:31 christos Exp $
#
# w.j.m. 15-jan-1999
#
# it's magic ...
#
# ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) {
#	ULONG c = 0;
#	int i;
#	for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ;
#	return c;
# }

	.globl	bn_mul_add_words
	.type   bn_mul_add_words@function

# ULONG bn_mul_add_words(ULONG r[], ULONG a[], int n, ULONG w)
#
# For i = 0 .. n-1:  <carry, r[i]> = r[i] + carry + a[i] * w
# Returns the final carry word in r0.  For n <= 0 nothing is read or
# written and 0 is returned.
#
# EMUL treats all three inputs as signed, so after each multiply the high
# word is corrected once for every operand that has its top bit set.
#
# Register use:
#	r2 = r (advancing)	r3 = a (advancing)	r4 = words left
#	r5 = w			r6 = carry		r1:r0 = 64-bit result
# Entry mask 0x40 saves r6.

bn_mul_add_words:
	.word	0x40

	movl	4(%ap),%r2		# *r
	movl	8(%ap),%r3		# *a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# return value ("carry")

	# The loop below is bottom-tested (SOBGTR), so without this guard
	# n <= 0 would still process one word and store into r[0].
	tstl	%r4
	bleq	2f

0:	emul	%r5,(%r3),(%r2),%r0	# w * a[0] + r[0] -> r1:r0 (signed)

	# fixup for "negative" r[]: the addend was sign-extended, which
	# subtracted 1 from the high word
	tstl	(%r2)
	bgeq	1f
	incl	%r1			# add 1 to highword

1:	# add saved carry to result
	addl2	%r6,%r0
	adwc	$0,%r1

	# combined fixup for "negative" w, a[]
	tstl	%r5		# if w is negative...
	bgeq	1f
	addl2	(%r3),%r1	# ...add a[0] again to highword
1:	tstl	(%r3)		# if a[0] is negative...
	bgeq	1f
	addl2	%r5,%r1		# ...add w again to highword
1:
	movl	%r0,(%r2)+	# save low word in dest & advance *r
	addl2	$4,%r3		# advance *a
	movl	%r1,%r6		# high word becomes the carry for the next word

	sobgtr	%r4,0b		# loop while --n > 0

2:	movl	%r6,%r0		# return the carry
	ret
	.size  bn_mul_add_words, .-bn_mul_add_words

#	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
#;
#; w.j.m. 15-jan-1999
#;
#; it's magic ...
#;
#; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) {
#;	ULONG c = 0;
#;	int i;
#;	for(i = 0; i < n; i++) <c,r[i]> := a[i] * w + c ;
#;	return(c);
#; }
#
	.globl	bn_mul_words
	.type   bn_mul_words@function
# ULONG bn_mul_words(ULONG r[], ULONG a[], int n, ULONG w)
#
# For i = 0 .. n-1:  <carry, r[i]> = a[i] * w + carry
# Returns the final carry word in r0.  For n <= 0 nothing is read or
# written and 0 is returned.
#
# EMUL is a signed multiply-add, so the high word is corrected afterwards
# for every input (carry, w, a[i]) that has its top bit set.
#
# Register use:
#	r2 = r (advancing)	r3 = a (advancing)	r4 = words left
#	r5 = w			r6 = carry		r1:r0 = 64-bit result
# Entry mask 0x40 saves r6.
bn_mul_words:
	.word	0x40

	movl	4(%ap),%r2		# *r
	movl	8(%ap),%r3		# *a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# carry

	# The loop below is bottom-tested (SOBGTR), so without this guard
	# n <= 0 would still process one word and store into r[0].
	tstl	%r4
	bleq	2f

0:	emul	%r5,(%r3),%r6,%r0	# w * a[0] + carry -> r1:r0 (signed)

	# fixup for "negative" carry: the addend was sign-extended, which
	# subtracted 1 from the high word
	tstl	%r6
	bgeq	1f
	incl	%r1

1:	# combined fixup for "negative" w, a[]
	tstl	%r5			# if w is negative...
	bgeq	1f
	addl2	(%r3),%r1		# ...add a[0] again to highword
1:	tstl	(%r3)			# if a[0] is negative...
	bgeq	1f
	addl2	%r5,%r1			# ...add w again to highword

1:	movl	%r0,(%r2)+		# store low word & advance *r
	addl2	$4,%r3			# advance *a
	movl	%r1,%r6			# high word becomes the next carry

	sobgtr	%r4,0b			# loop while --n > 0

2:	movl	%r6,%r0			# return the carry
	ret
	.size  bn_mul_words, .-bn_mul_words


#	.title	vax_bn_sqr_words  unsigned square, 32*32=>64
#;
#; w.j.m. 15-jan-1999
#;
#; it's magic ...
#;
#; void bn_sqr_words(ULONG r[],ULONG a[],int n) {
#;	int i;
#;	for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ;
#; }
#
	.globl	bn_sqr_words
	.type   bn_sqr_words@function
# void bn_sqr_words(ULONG r[], ULONG a[], int n)
#
# For i = 0 .. n-1:  <r[2*i+1], r[2*i]> = a[i] * a[i]
# For n <= 0 nothing is read or written.
#
# Register use:
#	r2 = r (advancing by 8)	r3 = a (advancing by 4)	r4 = words left
#	r5 = current a[i]	r1:r0 = 64-bit square
# The entry mask is 0: no registers are saved.
bn_sqr_words:
	.word	0

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# n

	# The loop below is bottom-tested (SOBGTR), so without this guard
	# n <= 0 would still square one word and store into r[0], r[1].
	tstl	%r4
	bleq	2f

0:	movl	(%r3)+,%r5		# r5 = a[] & advance

	emul	%r5,%r5,$0,%r0		# a[0] * a[0] + 0 -> r1:r0 (signed)

	# fixup for "negative" a[]: both factors are the same value, so
	# the high-word correction is applied twice
	tstl	%r5
	bgeq	1f
	addl2	%r5,%r1
	addl2	%r5,%r1

1:	movq	%r0,(%r2)+		# store 64-bit result

	sobgtr	%r4,0b			# loop while --n > 0

2:	ret
	.size  bn_sqr_words, .-bn_sqr_words


#	.title	vax_bn_div_words  unsigned divide
#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
#; {
#;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
#; }
#;
#; Using EDIV would be very easy, if it didn't do signed calculations.
#; Any time any of the input numbers are signed, there are problems,
#; usually with integer overflow, at which point it returns useless
#; data (the quotient gets the value of l, and the remainder becomes 0).
#;
#; If it was just for the dividend, it would be very easy, just divide
#; it by 2 (unsigned), do the division, multiply the resulting quotient
#; and remainder by 2, add the bit that was dropped when dividing by 2
#; to the remainder, and do some adjustment so the remainder doesn't
#; end up larger than the divisor.  For some cases when the divisor is
#; negative (from EDIV's point of view, i.e. when the highest bit is set),
#; dividing the dividend by 2 isn't enough, and since some operations
#; might generate integer overflows even when the dividend is divided by
#; 4 (when the high part of the shifted down dividend ends up being exactly
#; half of the divisor, the result is the quotient 0x80000000, which is
#; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
#; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
#; In this case, a little extra fiddling with the remainder is required.
#;
#; So, the simplest way to handle this is always to divide the dividend
#; by 8, and to divide the divisor by 2 if its highest bit is set.
#; After EDIV has been used, the quotient gets multiplied by 8 if the
#; original divisor was positive, otherwise 4.  The remainder, oddly
#; enough, is *always* multiplied by 8.
#; NOTE: in the case mentioned above, where the high part of the shifted
#; down dividend ends up being exactly half the shifted down divisor, we
#; end up with a 33 bit quotient.  That's no problem however, it usually
#; means we have ended up with a too large remainder as well, and the
#; problem is fixed by the last part of the algorithm (next paragraph).
#;
#; The routine ends with comparing the resulting remainder with the
#; original divisor and if the remainder is larger, subtract the
#; original divisor from it, and increase the quotient by 1.  This is
#; done until the remainder is smaller than the divisor.
#;
#; The complete algorithm looks like this:
#;
#; d'    = d
#; l'    = l & 7
#; [h,l] = [h,l] >> 3
#; [q,r] = floor([h,l] / d)	# This is the EDIV operation
#; if (q < 0) q = -q		# I doubt this is necessary any more
#;
#; r'    = r >> 29
#; if (d' >= 0)
#;   q'  = q >> 29
#;   q   = q << 3
#; else
#;   q'  = q >> 30
#;   q   = q << 2
#; r     = (r << 3) + l'
#;
#; if (d' < 0 && (d' & 1))
#;   {
#;     [r',r] = [r',r] - [q',q]
#;     while ([r',r] < 0)
#;       {
#;         [r',r] = [r',r] + d'
#;         [q',q] = [q',q] - 1
#;       }
#;   }
#;
#; while ([r',r] >= d')
#;   {
#;     [r',r] = [r',r] - d'
#;     [q',q] = [q',q] + 1
#;   }
#;
#; return q
#
#;r2 = l, q
#;r3 = h, r
#;r4 = d
#;r5 = l'
#;r6 = r'
#;r7 = d'
#;r8 = q'
#
	.globl	bn_div_words
	.type   bn_div_words@function
# ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
#
# Divides the 64-bit value [h,l] by d using the signed EDIV instruction on
# a pre-scaled dividend/divisor, then repairs quotient and remainder.
# Returns the low 32 bits of the quotient in r0, or -1 if d is 0.
# Entry mask 0x1c0 saves r6, r7 and r8.
bn_div_words:
	.word	0x1c0

	movl	4(%ap),%r3		# h
	movl	8(%ap),%r2		# l
	movl	12(%ap),%r4		# d

	# Shift the 64-bit dividend [h,l] right by 3, keeping the three bits
	# that fall off the bottom in l'.
	bicl3	$-8,%r2,%r5		# l' = l & 7
	bicl3	$7,%r2,%r2		# clear the low 3 bits of l

	bicl3	$-8,%r3,%r6		# r6 = h & 7 (bits that move down into l)
	bicl3	$7,%r3,%r3		# clear the low 3 bits of h

	addl2	%r6,%r2			# park h's low bits in l's cleared low bits

	rotl	$-3,%r2,%r2		# l = l >> 3 (h's bits rotate into the top)
	rotl	$-3,%r3,%r3		# h = h >> 3

	movl	%r4,%r7			# d' = d

	clrl	%r6			# r' = 0
	clrl	%r8			# q' = 0

	tstl	%r4
	beql	0f			# Uh-oh, the divisor is 0...
	bgtr	1f
	# d has its top bit set: halve it (unsigned) so EDIV sees a
	# positive divisor.
	rotl	$-1,%r4,%r4	# If d is negative, shift it right.
	bicl2	$0x80000000,%r4	# Since d is then a large number, the
				# lowest bit is insignificant
				# (contradict that, and I'll fix the problem!)
1:
	# divisor r4, dividend r3:r2, quotient -> r2, remainder -> r3
	ediv	%r4,%r2,%r2,%r3		# Do the actual division

	tstl	%r2
	bgeq	1f
	mnegl	%r2,%r2		# if q < 0, negate it
1:
	# Scale the quotient back up: by 8 if d was used as-is, by 4 if d
	# was halved.  The bits rotated out of the top are kept in q'.
	tstl	%r7
	blss	1f
	rotl	$3,%r2,%r2	#   q = q << 3
	bicl3	$-8,%r2,%r8	#   q' gets the high bits from q
	bicl3	$7,%r2,%r2
	brb	2f

1:				# else
	rotl	$2,%r2,%r2	#   q = q << 2
	bicl3	$-4,%r2,%r8	#   q' gets the high bits from q
	bicl3	$3,%r2,%r2
2:
	# Scale the remainder back up by 8 and restore the dropped bits.
	rotl	$3,%r3,%r3	# r = r << 3
	bicl3	$-8,%r3,%r6	# r' gets the high bits from r
	bicl3	$7,%r3,%r3
	addl2	%r5,%r3		# r = r + l'

	# If d was halved and its dropped low bit was 1, subtract [q',q]
	# from the remainder, then add d' back (decrementing the quotient)
	# until the remainder is non-negative again.
	tstl	%r7
	bgeq	5f
	bitl	$1,%r7
	beql	5f		# if d' < 0 && d' & 1
	subl2	%r2,%r3		#   [r',r] = [r',r] - [q',q]
	sbwc	%r8,%r6
3:
	# N here comes from the sbwc above on the first pass and from the
	# adwc below on later passes, i.e. it is the sign of r'.
	bgeq	5f		#   while r < 0
	decl	%r2		#     [q',q] = [q',q] - 1
	sbwc	$0,%r8
	addl2	%r7,%r3		#     [r',r] = [r',r] + d'
	adwc	$0,%r6
	brb	3b

# The return points are placed in the middle to keep a short distance from
# all the branch points
1:
	# normal return: only the low quotient word is returned, q' is dropped
#	movl	%r3,%r1
	movl	%r2,%r0
	ret
0:
	# division by zero: return all ones
	movl	$-1,%r0
	ret
5:
	# Final normalization: done once r' == 0 and r < d' (unsigned).
	tstl	%r6
	bneq	6f
	cmpl	%r3,%r7
	blssu	1b		# while [r',r] >= d'
6:
	subl2	%r7,%r3		#   [r',r] = [r',r] - d'
	sbwc	$0,%r6
	incl	%r2		#   [q',q] = [q',q] + 1
	adwc	$0,%r8
	brb	5b
	.size  bn_div_words, .-bn_div_words


#	.title	vax_bn_add_words  unsigned add of two arrays
#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) {
#;	ULONG c = 0;
#;	int i;
#;	for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c;
#;	return(c);
#; }
#

	.globl	bn_add_words
	.type   bn_add_words@function
# ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n)
#
# For i = 0 .. n-1:  <carry, r[i]> = a[i] + b[i] + carry
# Returns the final carry (0 or 1) in r0; returns 0 when n <= 0.
#
# The carry lives in the C condition code for the whole loop, so the loop
# body may only contain instructions that leave C alone apart from the
# ADWC itself.
# The entry mask is 0: no registers are saved.
bn_add_words:
	.word	0

	movl	4(%ap),%r2	# r
	movl	8(%ap),%r3	# a
	movl	12(%ap),%r4	# b
	movl	16(%ap),%r5	# n
	clrl	%r0		# return value starts at 0

	tstl	%r5		# n <= 0: nothing to add (TSTL also clears C)
	bleq	1f

0:	movl	(%r3)+,%r1	# r1 = a[i]; carry untouched
	adwc	(%r4)+,%r1	# r1 += b[i] + C; carry used and touched
	movl	%r1,(%r2)+	# r[i] = r1; carry untouched
	sobgtr	%r5,0b		# loop while --n > 0; carry untouched

	adwc	$0,%r0		# r0 = final carry out of the last word
1:	ret
	.size  bn_add_words, .-bn_add_words

#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) {
#;	ULONG c = 0;
#;	int i;
#;	for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c;
#;	return(c);
#; }
#
	.globl	bn_sub_words
	.type   bn_sub_words@function
# ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n)
#
# For i = 0 .. n-1:  <borrow, r[i]> = a[i] - b[i] - borrow
# Returns the final borrow (0 or 1) in r0; returns 0 when n <= 0.
#
# The borrow lives in the C condition code for the whole loop, so the loop
# body may only contain instructions that leave C alone apart from the
# SBWC itself.
#
# r1 is the per-word temporary, exactly as in bn_add_words, so only
# r0-r5 are written and the entry mask saves nothing.
bn_sub_words:
	.word	0

	movl	4(%ap),%r2	# r
	movl	8(%ap),%r3	# a
	movl	12(%ap),%r4	# b
	movl	16(%ap),%r5	# n
	clrl	%r0		# return value starts at 0

	tstl	%r5		# n <= 0: nothing to subtract (TSTL also clears C)
	bleq	1f

0:	movl	(%r3)+,%r1	# r1 = a[i]; carry untouched
	sbwc	(%r4)+,%r1	# r1 -= b[i] + C; carry used and touched
	movl	%r1,(%r2)+	# r[i] = r1; carry untouched
	sobgtr	%r5,0b		# loop while --n > 0; carry untouched

1:	adwc	$0,%r0		# r0 = final borrow (C is clear when n <= 0)
	ret
	.size  bn_sub_words, .-bn_sub_words

#
#	Ragge 20-Sep-2003
#
#	Multiply a vector of 4/8 longword by another.
#	Uses two loops and 16/64 emuls.
#
	.globl	bn_mul_comba4
	.type   bn_mul_comba4@function
bn_mul_comba4:
	.word	0x3c0
	movl	$4,%r9		# 4*4
	brb	6f

	.globl	bn_mul_comba8
	.type   bn_mul_comba8@function
bn_mul_comba8:
	.word	0x3c0
	movl	$8,%r9		# 8*8

6:	movl	8(%ap),%r3	# a[]
	movl	12(%ap),%r7	# b[]
	brb	5f

	.globl	bn_sqr_comba4
	.type   bn_sqr_comba4@function
bn_sqr_comba4:
	.word	0x3c0
	movl	$4,%r9		# 4*4
	brb 0f

	.globl	bn_sqr_comba8
	.type   bn_sqr_comba8@function
bn_sqr_comba8:
	.word	0x3c0
	movl	$8,%r9		# 8*8

0:
	movl	8(%ap),%r3	# a[]
	movl	%r3,%r7		# a[]

5:	movl	4(%ap),%r5	# r[]
	movl	%r9,%r8

	clrq	(%r5)		# clear destinatino, for add.
	clrq	8(%r5)
	clrq	16(%r5)		# these only needed for comba8
	clrq	24(%r5)

2:	clrl	%r4		# carry
	movl	%r9,%r6		# inner loop count
	movl	(%r7)+,%r2	# value to multiply with

1:	emul	%r2,(%r3),%r4,%r0
	tstl	%r4
	bgeq	3f
	incl	%r1
3:	tstl	%r2
	bgeq	3f
	addl2	(%r3),%r1
3:	tstl	(%r3)
	bgeq	3f
	addl2	%r2,%r1

3:	addl2	%r0,(%r5)+	# add to destination
	adwc	$0,%r1		# remember carry
	movl	%r1,%r4		# add carry in next emul
	addl2	$4,%r3
	sobgtr	%r6,1b

	movl	%r4,(%r5)	# save highest add result

	ashl	$2,%r9,%r4
	subl2	%r4,%r3
	subl2	$4,%r4
	subl2	%r4,%r5

	sobgtr	%r8,2b

	ret
	.size  bn_mul_comba4, .-bn_mul_comba4