bn_asm_vax.S revision 1.2
1#	$NetBSD: bn_asm_vax.S,v 1.2 2018/03/08 03:14:31 christos Exp $
2#
3# w.j.m. 15-jan-1999
4#
5# it's magic ...
6#
# ULONG bn_mul_add_words(ULONG r[], ULONG a[], int n, ULONG w)
#
# C equivalent:
#	ULONG c = 0;
#	int i;
#	for (i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w;
#	return c;
#
# EMUL multiplies and accumulates signed longwords, so every step patches
# the high longword afterwards to obtain the unsigned result.
#
# Arguments:	4(ap) = r[], 8(ap) = a[], 12(ap) = n, 16(ap) = w
# Returns:	r0 = carry out of the last word, or 0 when n <= 0
# Registers:	r2 = cursor in r[], r3 = cursor in a[], r4 = words left,
#		r5 = w, r6 = running carry, r1:r0 = 64-bit partial result
#
# A count of zero or less returns 0 without reading a[] or writing r[].
# SOBGTR only tests at the bottom of the loop, so that case needs its own
# check before the first iteration.

	.globl	bn_mul_add_words
	.type	bn_mul_add_words,@function

bn_mul_add_words:
	.word	0x40			# entry mask: preserve r6

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# carry = 0, also the n <= 0 result

	tstl	%r4
	bleq	.Lmaw_done		# empty vector: leave memory untouched

.Lmaw_next:
	emul	%r5,(%r3),(%r2),%r0	# r1:r0 = w * a[i] + sext(r[i]), signed

	# r[i] with its top bit set was sign-extended, which put -1 into the
	# high word; take that back out.
	tstl	(%r2)
	bgeq	.Lmaw_carry
	incl	%r1

.Lmaw_carry:
	addl2	%r6,%r0			# add the carry of the previous word
	adwc	$0,%r1

	# Signed-to-unsigned product fixup: a "negative" w leaves the product
	# short by a[i] * 2^32, a "negative" a[i] leaves it short by w * 2^32.
	tstl	%r5
	bgeq	.Lmaw_fix_a
	addl2	(%r3),%r1
.Lmaw_fix_a:
	tstl	(%r3)
	bgeq	.Lmaw_store
	addl2	%r5,%r1

.Lmaw_store:
	movl	%r0,(%r2)+		# r[i] = low word, advance r
	addl2	$4,%r3			# advance a
	movl	%r1,%r6			# high word becomes the next carry

	sobgtr	%r4,.Lmaw_next		# while (--n > 0)

.Lmaw_done:
	movl	%r6,%r0			# return the carry
	ret
	.size	bn_mul_add_words, .-bn_mul_add_words
54
#	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
#
# w.j.m. 15-jan-1999
#
# ULONG bn_mul_words(ULONG r[], ULONG a[], int n, ULONG w)
#
# C equivalent:
#	ULONG c = 0;
#	int i;
#	for (i = 0; i < n; i++) <c,r[i]> := a[i] * w + c;
#	return c;
#
# Arguments:	4(ap) = r[], 8(ap) = a[], 12(ap) = n, 16(ap) = w
# Returns:	r0 = carry out of the last word, or 0 when n <= 0
# Registers:	r2 = cursor in r[], r3 = cursor in a[], r4 = words left,
#		r5 = w, r6 = running carry, r1:r0 = 64-bit partial result
#
# A count of zero or less returns 0 without reading a[] or writing r[].
# SOBGTR only tests at the bottom of the loop, so that case is checked
# before the first iteration.

	.globl	bn_mul_words
	.type	bn_mul_words,@function
bn_mul_words:
	.word	0x40			# entry mask: preserve r6

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# carry = 0, also the n <= 0 result

	tstl	%r4
	bleq	.Lmw_done		# empty vector: leave memory untouched

.Lmw_next:
	emul	%r5,(%r3),%r6,%r0	# r1:r0 = w * a[i] + sext(carry), signed

	# A carry with its top bit set was sign-extended, which put -1 into
	# the high word; take that back out.
	tstl	%r6
	bgeq	.Lmw_fix_w
	incl	%r1

	# Signed-to-unsigned product fixup: a "negative" w leaves the product
	# short by a[i] * 2^32, a "negative" a[i] leaves it short by w * 2^32.
.Lmw_fix_w:
	tstl	%r5
	bgeq	.Lmw_fix_a
	addl2	(%r3),%r1
.Lmw_fix_a:
	tstl	(%r3)
	bgeq	.Lmw_store
	addl2	%r5,%r1

.Lmw_store:
	movl	%r0,(%r2)+		# r[i] = low word, advance r
	addl2	$4,%r3			# advance a
	movl	%r1,%r6			# high word becomes the next carry

	sobgtr	%r4,.Lmw_next		# while (--n > 0)

.Lmw_done:
	movl	%r6,%r0			# return the carry
	ret
	.size	bn_mul_words, .-bn_mul_words
103
104
105
#	.title	vax_bn_sqr_words  unsigned square, 32*32=>64
#
# w.j.m. 15-jan-1999
#
# void bn_sqr_words(ULONG r[], ULONG a[], int n)
#
# C equivalent:
#	int i;
#	for (i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i];
#
# Arguments:	4(ap) = r[], 8(ap) = a[], 12(ap) = n
# Registers:	r2 = cursor in r[], r3 = cursor in a[], r4 = words left,
#		r5 = a[i], r1:r0 = 64-bit square
#
# A count of zero or less returns without reading a[] or writing r[].
# SOBGTR only tests at the bottom of the loop, so that case is checked
# before the first iteration.

	.globl	bn_sqr_words
	.type	bn_sqr_words,@function
bn_sqr_words:
	.word	0			# entry mask: no registers saved

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# n

	tstl	%r4
	bleq	.Lsqw_done		# empty vector: leave memory untouched

.Lsqw_next:
	movl	(%r3)+,%r5		# r5 = a[i], advance a

	emul	%r5,%r5,$0,%r0		# r1:r0 = a[i] * a[i] + 0, signed

	# An a[i] with its top bit set was taken as a[i] - 2^32, so the signed
	# square is short by 2 * a[i] * 2^32 (modulo 2^64): add a[i] to the
	# high word twice.
	tstl	%r5
	bgeq	.Lsqw_store
	addl2	%r5,%r1
	addl2	%r5,%r1

.Lsqw_store:
	movq	%r0,(%r2)+		# store the 64-bit result, advance r

	sobgtr	%r4,.Lsqw_next		# while (--n > 0)

.Lsqw_done:
	ret
	.size	bn_sqr_words, .-bn_sqr_words
142
143
144#	.title	vax_bn_div_words  unsigned divide
145#;
146#; Richard Levitte 20-Nov-2000
147#;
148#; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
149#; {
150#;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
151#; }
152#;
153#; Using EDIV would be very easy, if it didn't do signed calculations.
154#; Any time any of the input numbers are signed, there are problems,
155#; usually with integer overflow, at which point it returns useless
156#; data (the quotient gets the value of l, and the remainder becomes 0).
157#;
158#; If it was just for the dividend, it would be very easy, just divide
159#; it by 2 (unsigned), do the division, multiply the resulting quotient
160#; and remainder by 2, add the bit that was dropped when dividing by 2
161#; to the remainder, and do some adjustment so the remainder doesn't
162#; end up larger than the divisor.  For some cases when the divisor is
163#; negative (from EDIV's point of view, i.e. when the highest bit is set),
164#; dividing the dividend by 2 isn't enough, and since some operations
165#; might generate integer overflows even when the dividend is divided by
166#; 4 (when the high part of the shifted down dividend ends up being exactly
167#; half of the divisor, the result is the quotient 0x80000000, which is
168#; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
169#; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
170#; In this case, a little extra fiddling with the remainder is required.
171#;
172#; So, the simplest way to handle this is always to divide the dividend
173#; by 8, and to divide the divisor by 2 if its highest bit is set.
174#; After EDIV has been used, the quotient gets multiplied by 8 if the
175#; original divisor was positive, otherwise 4.  The remainder, oddly
176#; enough, is *always* multiplied by 8.
177#; NOTE: in the case mentioned above, where the high part of the shifted
178#; down dividend ends up being exactly half the shifted down divisor, we
179#; end up with a 33 bit quotient.  That's no problem however, it usually
180#; means we have ended up with a too large remainder as well, and the
181#; problem is fixed by the last part of the algorithm (next paragraph).
182#;
183#; The routine ends with comparing the resulting remainder with the
184#; original divisor and if the remainder is larger, subtract the
185#; original divisor from it, and increase the quotient by 1.  This is
186#; done until the remainder is smaller than the divisor.
187#;
188#; The complete algorithm looks like this:
189#;
190#; d'    = d
191#; l'    = l & 7
192#; [h,l] = [h,l] >> 3
193#; [q,r] = floor([h,l] / d)	# This is the EDIV operation
194#; if (q < 0) q = -q		# I doubt this is necessary any more
195#;
196#; r'    = r >> 29
197#; if (d' >= 0)
198#;   q'  = q >> 29
199#;   q   = q << 3
200#; else
201#;   q'  = q >> 30
202#;   q   = q << 2
203#; r     = (r << 3) + l'
204#;
205#; if (d' < 0 && (d' & 1))
206#;   {
207#;     [r',r] = [r',r] - [q',q]
208#;     while ([r',r] < 0)
209#;       {
210#;         [r',r] = [r',r] + d'
211#;         [q',q] = [q',q] - 1
212#;       }
213#;   }
214#;
215#; while ([r',r] >= d')
216#;   {
217#;     [r',r] = [r',r] - d'
218#;     [q',q] = [q',q] + 1
219#;   }
220#;
221#; return q
222#
223#;r2 = l, q
224#;r3 = h, r
225#;r4 = d
226#;r5 = l'
227#;r6 = r'
228#;r7 = d'
229#;r8 = q'
230#
# bn_div_words: unsigned divide of the 64-bit value [h,l] by d.
# The quotient is returned in r0; a zero divisor returns -1 instead.
# h, l and d are the first, second and third longword arguments.
# Working registers: r2 = l then q, r3 = h then r, r4 = divisor handed
# to EDIV, r5 = low 3 bits of l, r6 = high part of r, r7 = original d,
# r8 = high part of q.
231	.globl	bn_div_words
232	.type   bn_div_words@function
233bn_div_words:
234	.word	0x1c0			# entry mask bits 6-8: r6, r7, r8
235
236	movl	4(%ap),%r3		# h
237	movl	8(%ap),%r2		# l
238	movl	12(%ap),%r4		# d
239
# Shift the 64-bit dividend right by 3: the low 3 bits of h are dropped
# into the cleared low 3 bits of l, and the rotate then carries them to
# the top of l.
240	bicl3	$-8,%r2,%r5		# l' = l & 7
241	bicl3	$7,%r2,%r2		# l with its low 3 bits cleared
242
243	bicl3	$-8,%r3,%r6		# r6 = h & 7 (scratch use of r6)
244	bicl3	$7,%r3,%r3		# h with its low 3 bits cleared
245
246	addl2	%r6,%r2			# low 3 bits of l now hold h & 7
247
248	rotl	$-3,%r2,%r2		# l = l >> 3
249	rotl	$-3,%r3,%r3		# h = h >> 3
250
251	movl	%r4,%r7			# d' = d
252
253	clrl	%r6			# r' = 0
254	clrl	%r8			# q' = 0
255
256	tstl	%r4			# classify d as zero / positive / top bit set
257	beql	0f			# Uh-oh, the divisor is 0...
258	bgtr	1f			# positive d is used unchanged
259	rotl	$-1,%r4,%r4	# If d is negative, shift it right.
260	bicl2	$0x80000000,%r4	# Since d is then a large number, the
261				# lowest bit is insignificant
262				# (contradict that, and I'll fix the problem!)
2631:
264	ediv	%r4,%r2,%r2,%r3		# Do the actual division
265
266	tstl	%r2
267	bgeq	1f
268	mnegl	%r2,%r2		# if q < 0, negate it
2691:
# Scale the quotient back up.  The rotate brings the bits that fall off the
# top of q round to the bottom, where they are split off into q'.
270	tstl	%r7
271	blss	1f		# original d had its top bit set: scale by 4
272	rotl	$3,%r2,%r2	#   q = q << 3
273	bicl3	$-8,%r2,%r8	#   q' gets the high bits from q
274	bicl3	$7,%r2,%r2	#   clear the wrapped-around bits in q
275	brb	2f
276
2771:				# else
278	rotl	$2,%r2,%r2	#   q = q << 2
279	bicl3	$-4,%r2,%r8	#   q' gets the high bits from q
280	bicl3	$3,%r2,%r2	#   clear the wrapped-around bits in q
2812:
282	rotl	$3,%r3,%r3	# r = r << 3
283	bicl3	$-8,%r3,%r6	# r' gets the high bits from r
284	bicl3	$7,%r3,%r3	# clear the wrapped-around bits in r
285	addl2	%r5,%r3		# r = r + l'
286
# Extra correction taken only when the original d has both its top bit and
# its low bit set; presumably this compensates for the low bit of d that
# the shift before EDIV discarded.
287	tstl	%r7
288	bgeq	5f
289	bitl	$1,%r7
290	beql	5f		# if d' < 0 && d' & 1
291	subl2	%r2,%r3		#   [r',r] = [r',r] - [q',q]
292	sbwc	%r8,%r6
# The branch at 3: tests the sign left on r6 by the sbwc above on entry and
# by the adwc below on every further pass.
2933:
294	bgeq	5f		#   while r < 0
295	decl	%r2		#     [q',q] = [q',q] - 1
296	sbwc	$0,%r8
297	addl2	%r7,%r3		#     [r',r] = [r',r] + d'
298	adwc	$0,%r6
299	brb	3b
300
301# The return points are placed in the middle to keep a short distance from
302# all the branch points
3031:				# normal exit
304#	movl	%r3,%r1
305	movl	%r2,%r0		# return q
306	ret
3070:				# exit for d == 0
308	movl	$-1,%r0
309	ret
# Final correction: while the remainder is not below the original divisor,
# take one divisor out of it and count one more in the quotient.
3105:
311	tstl	%r6
312	bneq	6f		# r' != 0: remainder certainly too large
313	cmpl	%r3,%r7
314	blssu	1b		# while [r',r] >= d'
3156:
316	subl2	%r7,%r3		#   [r',r] = [r',r] - d'
317	sbwc	$0,%r6
318	incl	%r2		#   [q',q] = [q',q] + 1
319	adwc	$0,%r8
320	brb	5b
321	.size  bn_div_words, .-bn_div_words
322
323
324
#	.title	vax_bn_add_words  unsigned add of two arrays
#
# Richard Levitte 20-Nov-2000
#
# ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n)
#
# C equivalent:
#	ULONG c = 0;
#	int i;
#	for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c;
#	return(c);
#
# Arguments:	4(ap) = r[], 8(ap) = a[], 12(ap) = b[], 16(ap) = n
# Returns:	r0 = carry out of the last word, or 0 when n <= 0
#
# The carry between words is kept in the C condition code for the whole
# loop, so nothing inside the loop other than ADWC may modify C.

	.globl	bn_add_words
	.type	bn_add_words,@function
bn_add_words:
	.word	0			# entry mask: no registers saved

	movl	16(%ap),%r5		# n
	movl	12(%ap),%r4		# b
	movl	8(%ap),%r3		# a
	movl	4(%ap),%r2		# r
	clrl	%r0			# result so far: no carry

	tstl	%r5			# TSTL also clears C, seeding the chain
	bleq	.Laddw_done		# nothing to add

.Laddw_next:
	movl	(%r3)+,%r1		# r1 = a[i]; C is left alone
	adwc	(%r4)+,%r1		# r1 += b[i] + C; C = new carry
	movl	%r1,(%r2)+		# r[i] = r1; C is left alone
	sobgtr	%r5,.Laddw_next		# while (--n > 0); C is left alone

	adwc	$0,%r0			# r0 = final carry
.Laddw_done:
	ret
	.size	bn_add_words, .-bn_add_words
359
#
# Richard Levitte 20-Nov-2000
#
# ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n)
#
# C equivalent:
#	ULONG c = 0;
#	int i;
#	for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c;
#	return(c);
#
# Arguments:	4(ap) = r[], 8(ap) = a[], 12(ap) = b[], 16(ap) = n
# Returns:	r0 = borrow out of the last word, or 0 when n <= 0
#
# The borrow between words is kept in the C condition code for the whole
# loop, so nothing inside the loop other than SBWC may modify C.

	.globl	bn_sub_words
	.type	bn_sub_words,@function
bn_sub_words:
	.word	0x40			# entry mask: preserve r6

	movl	16(%ap),%r5		# n
	movl	12(%ap),%r4		# b
	movl	8(%ap),%r3		# a
	movl	4(%ap),%r2		# r
	clrl	%r0			# result so far: no borrow

	tstl	%r5			# TSTL also clears C, seeding the chain
	bleq	.Lsubw_result		# nothing to subtract; C is 0 here

.Lsubw_next:
	movl	(%r3)+,%r6		# r6 = a[i]; C is left alone
	sbwc	(%r4)+,%r6		# r6 -= b[i] + C; C = new borrow
	movl	%r6,(%r2)+		# r[i] = r6; C is left alone
	sobgtr	%r5,.Lsubw_next		# while (--n > 0); C is left alone

.Lsubw_result:
	adwc	$0,%r0			# r0 = final borrow
	ret
	.size	bn_sub_words, .-bn_sub_words
392
#
#	Ragge 20-Sep-2003
#
#	Multiply a vector of 4/8 longwords by another.
#	Uses two loops and 16/64 emuls.
#
# Entry points (all share the loop below):
#	bn_mul_comba4, bn_mul_comba8	4(ap) = r[], 8(ap) = a[], 12(ap) = b[]
#	bn_sqr_comba4, bn_sqr_comba8	4(ap) = r[], 8(ap) = a[]
#
# With n = 4 or 8, each outer pass multiplies all n words of a[] by one
# word of the second vector and adds the row into r[], one word further
# along on every pass; r[] receives 2 * n longwords in total.
#
# Registers:	r2 = multiplier word of the current row
#		r3 = cursor in a[]	r5 = cursor in r[]
#		r7 = cursor in the second vector (b[], or a[] when squaring)
#		r4 = carry between columns
#		r6 = inner count	r8 = outer count	r9 = n
#		r1:r0 = 64-bit partial product

	.globl	bn_mul_comba4
	.type	bn_mul_comba4,@function
bn_mul_comba4:
	.word	0x3c0			# entry mask: preserve r6-r9
	movl	$4,%r9			# 4*4
	brb	.Lcomba_mul_args

	.globl	bn_mul_comba8
	.type	bn_mul_comba8,@function
bn_mul_comba8:
	.word	0x3c0			# entry mask: preserve r6-r9
	movl	$8,%r9			# 8*8

.Lcomba_mul_args:
	movl	8(%ap),%r3		# a[]
	movl	12(%ap),%r7		# b[]
	brb	.Lcomba_common

	.globl	bn_sqr_comba4
	.type	bn_sqr_comba4,@function
bn_sqr_comba4:
	.word	0x3c0			# entry mask: preserve r6-r9
	movl	$4,%r9			# 4*4
	brb	.Lcomba_sqr_args

	.globl	bn_sqr_comba8
	.type	bn_sqr_comba8,@function
bn_sqr_comba8:
	.word	0x3c0			# entry mask: preserve r6-r9
	movl	$8,%r9			# 8*8

.Lcomba_sqr_args:
	movl	8(%ap),%r3		# a[]
	movl	%r3,%r7			# squaring: second vector is a[] as well

.Lcomba_common:
	movl	4(%ap),%r5		# r[]
	movl	%r9,%r8			# outer loop count

	clrq	(%r5)			# clear destination, for add.
	clrq	8(%r5)
	clrq	16(%r5)			# these only needed for comba8
	clrq	24(%r5)

.Lcomba_row:
	clrl	%r4			# carry
	movl	%r9,%r6			# inner loop count
	movl	(%r7)+,%r2		# value to multiply with

.Lcomba_col:
	emul	%r2,(%r3),%r4,%r0	# r1:r0 = r2 * a[j] + sext(carry), signed

	# EMUL is signed: undo the sign extension of the carry, then add the
	# terms a "negative" multiplier or a "negative" a[j] left out of the
	# high word.
	tstl	%r4
	bgeq	.Lcomba_fix_mult
	incl	%r1
.Lcomba_fix_mult:
	tstl	%r2
	bgeq	.Lcomba_fix_word
	addl2	(%r3),%r1
.Lcomba_fix_word:
	tstl	(%r3)
	bgeq	.Lcomba_accum
	addl2	%r2,%r1

.Lcomba_accum:
	addl2	%r0,(%r5)+		# add to destination
	adwc	$0,%r1			# remember carry
	movl	%r1,%r4			# add carry in next emul
	addl2	$4,%r3			# next word of a[]
	sobgtr	%r6,.Lcomba_col

	movl	%r4,(%r5)		# save highest add result

	ashl	$2,%r9,%r4		# r4 = 4 * n, the bytes walked this row
	subl2	%r4,%r3			# rewind a[] to its first word
	subl2	$4,%r4
	subl2	%r4,%r5			# r[] cursor: one word past the row start

	sobgtr	%r8,.Lcomba_row

	ret
	.size	bn_mul_comba4, .-bn_mul_comba4
	.size	bn_mul_comba8, .-bn_mul_comba8
	.size	bn_sqr_comba4, .-bn_sqr_comba4
	.size	bn_sqr_comba8, .-bn_sqr_comba8
473