/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy; he used to assign real
 *    version numbers, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there must be room for fine-tuning.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. That's
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation.
 *
 * Q. How much faster does it get?
 * A. Unfortunately people sitting on x86_64 hardware are prohibited
 *    from disclosing performance numbers, so they (SuSE Labs, to be
 *    specific) wouldn't tell me. However! A very similar coding
 *    technique (reaching out for the 128-bit result of a 64x64-bit
 *    multiplication) yields a >3x performance improvement on MIPS,
 *    and I see no reason why the gain on x86_64 would be much
 *    different:-)
 */
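
/*
 * For reference, the "128-bit result of a 64x64-bit multiplication"
 * technique mentioned above can be expressed in plain C on a compiler
 * that provides unsigned __int128 (GCC does on x86_64). The sketch
 * below is illustrative only and is kept out of the build:
 */
#if 0
static void mul64x64(BN_ULONG a, BN_ULONG b, BN_ULONG *lo, BN_ULONG *hi)
	{
	unsigned __int128 p = (unsigned __int128)a * b;	/* one mulq */
	*lo = (BN_ULONG)p;		/* %rax, low 64 bits  */
	*hi = (BN_ULONG)(p >> 64);	/* %rdx, high 64 bits */
	}
#endif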

#define BN_ULONG unsigned long

/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value of zero;
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)
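
/*
 * What mul_add() computes, written out in portable C for reference
 * (again assuming unsigned __int128; kept out of the build): the
 * double-word product plus the memory word plus the incoming carry can
 * never overflow two words, so the high word becomes the new carry.
 * The mul() macro below is the same operation without the r addend.
 */
#if 0
#define mul_add_ref(r,a,word,carry) do {	\
	unsigned __int128 t = (unsigned __int128)(a)*(word) + (r) + (carry); \
	(r) = (BN_ULONG)t;			\
	(carry) = (BN_ULONG)(t>>64);		\
	} while (0)
#endif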

#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)

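/* (r1,r0) = a*a, i.e. one 64x64->128-bit squaring step */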
#define sqr(r0,r1,a)			\
	asm ("mulq %2"			\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");

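/*
 * rp[i] += ap[i]*w for 0<=i<num, propagating the carry; returns the
 * final carry word. The main loop is unrolled four-fold.
 */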
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul_add(rp[0],ap[0],w,c1);
		mul_add(rp[1],ap[1],w,c1);
		mul_add(rp[2],ap[2],w,c1);
		mul_add(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
		mul_add(rp[2],ap[2],w,c1); return c1;
		}

	return(c1);
	}

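/* rp[i] = ap[i]*w + carry for 0<=i<num; returns the final carry word */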
BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul(rp[0],ap[0],w,c1);
		mul(rp[1],ap[1],w,c1);
		mul(rp[2],ap[2],w,c1);
		mul(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
		mul(rp[2],ap[2],w,c1);
		}
	return(c1);
	}

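/* (r[2*i+1],r[2*i]) = a[i]^2 for 0<=i<n, i.e. n independent squarings */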
void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
	{
	if (n <= 0) return;

	while (n&~3)
		{
		sqr(r[0],r[1],a[0]);
		sqr(r[2],r[3],a[1]);
		sqr(r[4],r[5],a[2]);
		sqr(r[6],r[7],a[3]);
		a+=4; r+=8; n-=4;
		}
	if (n)
		{
		sqr(r[0],r[1],a[0]); if (--n == 0) return;
		sqr(r[2],r[3],a[1]); if (--n == 0) return;
		sqr(r[4],r[5],a[2]);
		}
	}

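/*
 * Returns the 64-bit quotient of the double word (h:l) divided by d.
 * As with divq itself, the caller is expected to ensure h < d, or the
 * quotient overflows and the CPU raises a divide error.
 */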
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{	BN_ULONG ret,waste;

	asm ("divq	%4"
		: "=a"(ret),"=d"(waste)
		: "a"(l),"d"(h),"g"(d)
		: "cc");

	return ret;
}

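/* rp[i] = ap[i]+bp[i] for 0<=i<n with carry propagation; returns the
 * final carry bit */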
BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{ BN_ULONG ret,i;

	if (n <= 0) return 0;

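	/*
	 * The index register i is zeroed with subq, which also clears
	 * the carry flag for the first adcq; leaq bumps the index
	 * without disturbing the flags, and loop decrements the count
	 * in %rcx. The trailing sbbq turns the carry flag into 0 or -1
	 * in ret. bn_sub_words below uses the same scheme with sbbq.
	 */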
	asm (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	adcq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "+a"(ret),"+c"(n),"+r"(i)
		: "r"(rp),"r"(ap),"r"(bp)
		: "cc"
	);

	return ret&1;
}

#ifndef SIMICS
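/* rp[i] = ap[i]-bp[i] for 0<=i<n with borrow propagation; returns the
 * final borrow bit */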
BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{ BN_ULONG ret,i;

	if (n <= 0) return 0;

	asm (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	sbbq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "+a"(ret),"+c"(n),"+r"(i)
		: "r"(rp),"r"(ap),"r"(bp)
		: "cc"
	);

	return ret&1;
}
#else
/* Simics versions prior to 1.4.7 have a buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
	{
	BN_ULONG t1,t2;
	int c=0;

	if (n <= 0) return((BN_ULONG)0);

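	/*
	 * Carry-free borrow computation: when t1 == t2 the result is
	 * -c modulo 2^64 and the incoming borrow simply propagates, so
	 * c is only updated (to t1 < t2) when the words differ.
	 */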
	for (;;)
		{
		t1=a[0]; t2=b[0];
		r[0]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[1]; t2=b[1];
		r[1]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[2]; t2=b[2];
		r[2]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[3]; t2=b[3];
		r[3]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		a+=4;
		b+=4;
		r+=4;
		}
	return(c);
	}
#endif

/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b         for the three-word accumulator c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b       for the three-word accumulator c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2      for the three-word accumulator c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for the three-word accumulator c=(c2,c1,c0) */

#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
#else
#define mul_add_c(a,b,c0,c1,c2)	do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

#define sqr_add_c(a,i,c0,c1,c2)	do {	\
	asm ("mulq %2"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

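/*
 * Comba (column-wise) 8x8-word multiplication: partial products are
 * accumulated one anti-diagonal, i.e. one result word, at a time in
 * the rotating three-word accumulator (c1,c2,c3), so that each r[k]
 * is stored exactly once.
 */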
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}

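/*
 * Comba squaring: by symmetry a[i]*a[j] == a[j]*a[i], so off-diagonal
 * partial products are computed once and doubled via sqr_add_c2,
 * while the diagonal squares go through sqr_add_c.
 */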
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}

void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}

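/*
 * Illustrative usage sketch (kept out of the build, like the reference
 * macros above): multiply two 4-word, i.e. 256-bit, numbers and print
 * the 8-word product, least significant word first.
 */
#if 0
#include <stdio.h>

int main(void)
	{
	BN_ULONG a[4]={1,2,3,4}, b[4]={5,6,7,8}, r[8];
	int i;

	bn_mul_comba4(r,a,b);
	for (i=0;i<8;i++) printf("r[%d]=0x%016lx\n",i,r[i]);
	return 0;
	}
#endif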