bn_asm.c revision 109998
155714Skris/* crypto/bn/bn_asm.c */
255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
355714Skris * All rights reserved.
455714Skris *
555714Skris * This package is an SSL implementation written
655714Skris * by Eric Young (eay@cryptsoft.com).
755714Skris * The implementation was written so as to conform with Netscapes SSL.
855714Skris *
955714Skris * This library is free for commercial and non-commercial use as long as
1055714Skris * the following conditions are aheared to.  The following conditions
1155714Skris * apply to all code found in this distribution, be it the RC4, RSA,
1255714Skris * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
1355714Skris * included with this distribution is covered by the same copyright terms
1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com).
1555714Skris *
1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in
1755714Skris * the code are not to be removed.
1855714Skris * If this package is used in a product, Eric Young should be given attribution
1955714Skris * as the author of the parts of the library used.
2055714Skris * This can be in the form of a textual message at program startup or
2155714Skris * in documentation (online or textual) provided with the package.
2255714Skris *
2355714Skris * Redistribution and use in source and binary forms, with or without
2455714Skris * modification, are permitted provided that the following conditions
2555714Skris * are met:
2655714Skris * 1. Redistributions of source code must retain the copyright
2755714Skris *    notice, this list of conditions and the following disclaimer.
2855714Skris * 2. Redistributions in binary form must reproduce the above copyright
2955714Skris *    notice, this list of conditions and the following disclaimer in the
3055714Skris *    documentation and/or other materials provided with the distribution.
3155714Skris * 3. All advertising materials mentioning features or use of this software
3255714Skris *    must display the following acknowledgement:
3355714Skris *    "This product includes cryptographic software written by
3455714Skris *     Eric Young (eay@cryptsoft.com)"
3555714Skris *    The word 'cryptographic' can be left out if the rouines from the library
3655714Skris *    being used are not cryptographic related :-).
3755714Skris * 4. If you include any Windows specific code (or a derivative thereof) from
3855714Skris *    the apps directory (application code) you must include an acknowledgement:
3955714Skris *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
4055714Skris *
4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4455714Skris * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
5155714Skris * SUCH DAMAGE.
5255714Skris *
5355714Skris * The licence and distribution terms for any publically available version or
5455714Skris * derivative of this code cannot be changed.  i.e. this code cannot simply be
5555714Skris * copied and put under another distribution licence
5655714Skris * [including the GNU Public Licence.]
5755714Skris */
5855714Skris
5959191Skris#ifndef BN_DEBUG
6059191Skris# undef NDEBUG /* avoid conflicting definitions */
6159191Skris# define NDEBUG
6259191Skris#endif
6359191Skris
6455714Skris#include <stdio.h>
6559191Skris#include <assert.h>
6655714Skris#include "cryptlib.h"
6755714Skris#include "bn_lcl.h"
6855714Skris
6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
7055714Skris
71109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
7255714Skris	{
7355714Skris	BN_ULONG c1=0;
7455714Skris
7559191Skris	assert(num >= 0);
7655714Skris	if (num <= 0) return(c1);
7755714Skris
7859191Skris	while (num&~3)
7955714Skris		{
8055714Skris		mul_add(rp[0],ap[0],w,c1);
8155714Skris		mul_add(rp[1],ap[1],w,c1);
8255714Skris		mul_add(rp[2],ap[2],w,c1);
8355714Skris		mul_add(rp[3],ap[3],w,c1);
8459191Skris		ap+=4; rp+=4; num-=4;
8555714Skris		}
8659191Skris	if (num)
8759191Skris		{
8859191Skris		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
8959191Skris		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
9059191Skris		mul_add(rp[2],ap[2],w,c1); return c1;
9159191Skris		}
9255714Skris
9355714Skris	return(c1);
9455714Skris	}
9555714Skris
96109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
9755714Skris	{
9855714Skris	BN_ULONG c1=0;
9955714Skris
10059191Skris	assert(num >= 0);
10155714Skris	if (num <= 0) return(c1);
10255714Skris
10359191Skris	while (num&~3)
10455714Skris		{
10555714Skris		mul(rp[0],ap[0],w,c1);
10655714Skris		mul(rp[1],ap[1],w,c1);
10755714Skris		mul(rp[2],ap[2],w,c1);
10855714Skris		mul(rp[3],ap[3],w,c1);
10959191Skris		ap+=4; rp+=4; num-=4;
11055714Skris		}
11159191Skris	if (num)
11259191Skris		{
11359191Skris		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
11459191Skris		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
11559191Skris		mul(rp[2],ap[2],w,c1);
11659191Skris		}
11755714Skris	return(c1);
11855714Skris	}
11955714Skris
120109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
12155714Skris        {
12259191Skris	assert(n >= 0);
12355714Skris	if (n <= 0) return;
12459191Skris	while (n&~3)
12555714Skris		{
12659191Skris		sqr(r[0],r[1],a[0]);
12759191Skris		sqr(r[2],r[3],a[1]);
12859191Skris		sqr(r[4],r[5],a[2]);
12959191Skris		sqr(r[6],r[7],a[3]);
13059191Skris		a+=4; r+=8; n-=4;
13155714Skris		}
13259191Skris	if (n)
13359191Skris		{
13459191Skris		sqr(r[0],r[1],a[0]); if (--n == 0) return;
13559191Skris		sqr(r[2],r[3],a[1]); if (--n == 0) return;
13659191Skris		sqr(r[4],r[5],a[2]);
13759191Skris		}
13855714Skris	}
13955714Skris
14059191Skris#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
14155714Skris
142109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
14355714Skris	{
14455714Skris	BN_ULONG c=0;
14555714Skris	BN_ULONG bl,bh;
14655714Skris
14759191Skris	assert(num >= 0);
14855714Skris	if (num <= 0) return((BN_ULONG)0);
14955714Skris
15055714Skris	bl=LBITS(w);
15155714Skris	bh=HBITS(w);
15255714Skris
15355714Skris	for (;;)
15455714Skris		{
15555714Skris		mul_add(rp[0],ap[0],bl,bh,c);
15655714Skris		if (--num == 0) break;
15755714Skris		mul_add(rp[1],ap[1],bl,bh,c);
15855714Skris		if (--num == 0) break;
15955714Skris		mul_add(rp[2],ap[2],bl,bh,c);
16055714Skris		if (--num == 0) break;
16155714Skris		mul_add(rp[3],ap[3],bl,bh,c);
16255714Skris		if (--num == 0) break;
16355714Skris		ap+=4;
16455714Skris		rp+=4;
16555714Skris		}
16655714Skris	return(c);
16755714Skris	}
16855714Skris
169109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
17055714Skris	{
17155714Skris	BN_ULONG carry=0;
17255714Skris	BN_ULONG bl,bh;
17355714Skris
17459191Skris	assert(num >= 0);
17555714Skris	if (num <= 0) return((BN_ULONG)0);
17655714Skris
17755714Skris	bl=LBITS(w);
17855714Skris	bh=HBITS(w);
17955714Skris
18055714Skris	for (;;)
18155714Skris		{
18255714Skris		mul(rp[0],ap[0],bl,bh,carry);
18355714Skris		if (--num == 0) break;
18455714Skris		mul(rp[1],ap[1],bl,bh,carry);
18555714Skris		if (--num == 0) break;
18655714Skris		mul(rp[2],ap[2],bl,bh,carry);
18755714Skris		if (--num == 0) break;
18855714Skris		mul(rp[3],ap[3],bl,bh,carry);
18955714Skris		if (--num == 0) break;
19055714Skris		ap+=4;
19155714Skris		rp+=4;
19255714Skris		}
19355714Skris	return(carry);
19455714Skris	}
19555714Skris
196109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
19755714Skris        {
19859191Skris	assert(n >= 0);
19955714Skris	if (n <= 0) return;
20055714Skris	for (;;)
20155714Skris		{
20255714Skris		sqr64(r[0],r[1],a[0]);
20355714Skris		if (--n == 0) break;
20455714Skris
20555714Skris		sqr64(r[2],r[3],a[1]);
20655714Skris		if (--n == 0) break;
20755714Skris
20855714Skris		sqr64(r[4],r[5],a[2]);
20955714Skris		if (--n == 0) break;
21055714Skris
21155714Skris		sqr64(r[6],r[7],a[3]);
21255714Skris		if (--n == 0) break;
21355714Skris
21455714Skris		a+=4;
21555714Skris		r+=8;
21655714Skris		}
21755714Skris	}
21855714Skris
21959191Skris#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
22055714Skris
22155714Skris#if defined(BN_LLONG) && defined(BN_DIV2W)
22255714Skris
22355714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
22455714Skris	{
22555714Skris	return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
22655714Skris	}
22755714Skris
22855714Skris#else
22955714Skris
23068651Skris/* Divide h,l by d and return the result. */
23155714Skris/* I need to test this some more :-( */
23255714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
23355714Skris	{
23455714Skris	BN_ULONG dh,dl,q,ret=0,th,tl,t;
23555714Skris	int i,count=2;
23655714Skris
23755714Skris	if (d == 0) return(BN_MASK2);
23855714Skris
23955714Skris	i=BN_num_bits_word(d);
24068651Skris	assert((i == BN_BITS2) || (h > (BN_ULONG)1<<i));
24168651Skris
24255714Skris	i=BN_BITS2-i;
24355714Skris	if (h >= d) h-=d;
24455714Skris
24555714Skris	if (i)
24655714Skris		{
24755714Skris		d<<=i;
24855714Skris		h=(h<<i)|(l>>(BN_BITS2-i));
24955714Skris		l<<=i;
25055714Skris		}
25155714Skris	dh=(d&BN_MASK2h)>>BN_BITS4;
25255714Skris	dl=(d&BN_MASK2l);
25355714Skris	for (;;)
25455714Skris		{
25555714Skris		if ((h>>BN_BITS4) == dh)
25655714Skris			q=BN_MASK2l;
25755714Skris		else
25855714Skris			q=h/dh;
25955714Skris
26055714Skris		th=q*dh;
26155714Skris		tl=dl*q;
26255714Skris		for (;;)
26355714Skris			{
26455714Skris			t=h-th;
26555714Skris			if ((t&BN_MASK2h) ||
26655714Skris				((tl) <= (
26755714Skris					(t<<BN_BITS4)|
26855714Skris					((l&BN_MASK2h)>>BN_BITS4))))
26955714Skris				break;
27055714Skris			q--;
27155714Skris			th-=dh;
27255714Skris			tl-=dl;
27355714Skris			}
27455714Skris		t=(tl>>BN_BITS4);
27555714Skris		tl=(tl<<BN_BITS4)&BN_MASK2h;
27655714Skris		th+=t;
27755714Skris
27855714Skris		if (l < tl) th++;
27955714Skris		l-=tl;
28055714Skris		if (h < th)
28155714Skris			{
28255714Skris			h+=d;
28355714Skris			q--;
28455714Skris			}
28555714Skris		h-=th;
28655714Skris
28755714Skris		if (--count == 0) break;
28855714Skris
28955714Skris		ret=q<<BN_BITS4;
29055714Skris		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
29155714Skris		l=(l&BN_MASK2l)<<BN_BITS4;
29255714Skris		}
29355714Skris	ret|=q;
29455714Skris	return(ret);
29555714Skris	}
29659191Skris#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
29755714Skris
29855714Skris#ifdef BN_LLONG
299109998SmarkmBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
30055714Skris        {
30155714Skris	BN_ULLONG ll=0;
30255714Skris
30359191Skris	assert(n >= 0);
30455714Skris	if (n <= 0) return((BN_ULONG)0);
30555714Skris
30655714Skris	for (;;)
30755714Skris		{
30855714Skris		ll+=(BN_ULLONG)a[0]+b[0];
30955714Skris		r[0]=(BN_ULONG)ll&BN_MASK2;
31055714Skris		ll>>=BN_BITS2;
31155714Skris		if (--n <= 0) break;
31255714Skris
31355714Skris		ll+=(BN_ULLONG)a[1]+b[1];
31455714Skris		r[1]=(BN_ULONG)ll&BN_MASK2;
31555714Skris		ll>>=BN_BITS2;
31655714Skris		if (--n <= 0) break;
31755714Skris
31855714Skris		ll+=(BN_ULLONG)a[2]+b[2];
31955714Skris		r[2]=(BN_ULONG)ll&BN_MASK2;
32055714Skris		ll>>=BN_BITS2;
32155714Skris		if (--n <= 0) break;
32255714Skris
32355714Skris		ll+=(BN_ULLONG)a[3]+b[3];
32455714Skris		r[3]=(BN_ULONG)ll&BN_MASK2;
32555714Skris		ll>>=BN_BITS2;
32655714Skris		if (--n <= 0) break;
32755714Skris
32855714Skris		a+=4;
32955714Skris		b+=4;
33055714Skris		r+=4;
33155714Skris		}
33255714Skris	return((BN_ULONG)ll);
33355714Skris	}
33459191Skris#else /* !BN_LLONG */
335109998SmarkmBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
33655714Skris        {
33755714Skris	BN_ULONG c,l,t;
33855714Skris
33959191Skris	assert(n >= 0);
34055714Skris	if (n <= 0) return((BN_ULONG)0);
34155714Skris
34255714Skris	c=0;
34355714Skris	for (;;)
34455714Skris		{
34555714Skris		t=a[0];
34655714Skris		t=(t+c)&BN_MASK2;
34755714Skris		c=(t < c);
34855714Skris		l=(t+b[0])&BN_MASK2;
34955714Skris		c+=(l < t);
35055714Skris		r[0]=l;
35155714Skris		if (--n <= 0) break;
35255714Skris
35355714Skris		t=a[1];
35455714Skris		t=(t+c)&BN_MASK2;
35555714Skris		c=(t < c);
35655714Skris		l=(t+b[1])&BN_MASK2;
35755714Skris		c+=(l < t);
35855714Skris		r[1]=l;
35955714Skris		if (--n <= 0) break;
36055714Skris
36155714Skris		t=a[2];
36255714Skris		t=(t+c)&BN_MASK2;
36355714Skris		c=(t < c);
36455714Skris		l=(t+b[2])&BN_MASK2;
36555714Skris		c+=(l < t);
36655714Skris		r[2]=l;
36755714Skris		if (--n <= 0) break;
36855714Skris
36955714Skris		t=a[3];
37055714Skris		t=(t+c)&BN_MASK2;
37155714Skris		c=(t < c);
37255714Skris		l=(t+b[3])&BN_MASK2;
37355714Skris		c+=(l < t);
37455714Skris		r[3]=l;
37555714Skris		if (--n <= 0) break;
37655714Skris
37755714Skris		a+=4;
37855714Skris		b+=4;
37955714Skris		r+=4;
38055714Skris		}
38155714Skris	return((BN_ULONG)c);
38255714Skris	}
38359191Skris#endif /* !BN_LLONG */
38455714Skris
385109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
38655714Skris        {
38755714Skris	BN_ULONG t1,t2;
38855714Skris	int c=0;
38955714Skris
39059191Skris	assert(n >= 0);
39155714Skris	if (n <= 0) return((BN_ULONG)0);
39255714Skris
39355714Skris	for (;;)
39455714Skris		{
39555714Skris		t1=a[0]; t2=b[0];
39655714Skris		r[0]=(t1-t2-c)&BN_MASK2;
39755714Skris		if (t1 != t2) c=(t1 < t2);
39855714Skris		if (--n <= 0) break;
39955714Skris
40055714Skris		t1=a[1]; t2=b[1];
40155714Skris		r[1]=(t1-t2-c)&BN_MASK2;
40255714Skris		if (t1 != t2) c=(t1 < t2);
40355714Skris		if (--n <= 0) break;
40455714Skris
40555714Skris		t1=a[2]; t2=b[2];
40655714Skris		r[2]=(t1-t2-c)&BN_MASK2;
40755714Skris		if (t1 != t2) c=(t1 < t2);
40855714Skris		if (--n <= 0) break;
40955714Skris
41055714Skris		t1=a[3]; t2=b[3];
41155714Skris		r[3]=(t1-t2-c)&BN_MASK2;
41255714Skris		if (t1 != t2) c=(t1 < t2);
41355714Skris		if (--n <= 0) break;
41455714Skris
41555714Skris		a+=4;
41655714Skris		b+=4;
41755714Skris		r+=4;
41855714Skris		}
41955714Skris	return(c);
42055714Skris	}
42155714Skris
42255714Skris#ifdef BN_MUL_COMBA
42355714Skris
42455714Skris#undef bn_mul_comba8
42555714Skris#undef bn_mul_comba4
42655714Skris#undef bn_sqr_comba8
42755714Skris#undef bn_sqr_comba4
42855714Skris
42959191Skris/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
43059191Skris/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
43159191Skris/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
43359191Skris/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
43359191Skris
43455714Skris#ifdef BN_LLONG
/*
 * Column accumulators for builds with a double-width integer type.
 *
 * Every macro adds into the three-word value (c2,c1,c0) and relies on
 * scratch variables declared by the calling function: t (BN_ULLONG),
 * t1 and t2 (BN_ULONG), and additionally tt (BN_ULLONG) for mul_add_c2.
 *
 * Arguments are parenthesised and each body is a single do/while(0)
 * statement, so the macros behave correctly with expression arguments
 * and under an unbraced if/else.
 */

/* (c2,c1,c0) += a*b */
#define mul_add_c(a,b,c0,c1,c2) do { \
	t=(BN_ULLONG)(a)*(b); \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; \
	} while (0)

/* (c2,c1,c0) += 2*a*b; the doubling and the low-word carry can each */
/* overflow, and both overflows are pushed into c2. */
#define mul_add_c2(a,b,c0,c1,c2) do { \
	t=(BN_ULLONG)(a)*(b); \
	tt=(t+t)&BN_MASK; \
	if (tt < t) c2++; \
	t1=(BN_ULONG)Lw(tt); \
	t2=(BN_ULONG)Hw(tt); \
	c0=(c0+t1)&BN_MASK2;  \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; \
	} while (0)

/* (c2,c1,c0) += a[i]*a[i] */
#define sqr_add_c(a,i,c0,c1,c2) do { \
	t=(BN_ULLONG)(a)[i]*(a)[i]; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; \
	} while (0)

/* (c2,c1,c0) += 2*a[i]*a[j] */
#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
46159191Skris
46259191Skris#elif defined(BN_UMULT_HIGH)
46359191Skris
/*
 * Column accumulators built on BN_UMULT_HIGH (assumed to yield the high
 * word of a word-by-word product).  Every macro adds into the three-word
 * value (c2,c1,c0) and uses the calling function's BN_ULONG scratch
 * variables t1 and t2.
 */

/* (c2,c1,c0) += a*b.  The high word of a product is never all-ones, */
/* so adding the carry from c0 into t2 cannot wrap. */
#define mul_add_c(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

/* (c2,c1,c0) += 2*a*b.  After doubling, t2 can be all-ones (high word */
/* 0x7f..f with the top bit of the low word set), so the carry coming */
/* out of c0 may wrap t2 to zero.  That wrap is a carry into c2 and is */
/* counted explicitly; dropping it produced wrong squaring results. */
#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t1 = (c0<t1)?1:0;	\
	t2 += t1; c2 += (t2<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

/* (c2,c1,c0) += a[i]*a[i] */
#define sqr_add_c(a,i,c0,c1,c2)	{	\
	BN_ULONG ta=(a)[i];		\
	t1 = ta * ta;			\
	t2 = BN_UMULT_HIGH(ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

/* (c2,c1,c0) += 2*a[i]*a[j] */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
49259191Skris
49359191Skris#else /* !BN_LLONG && !BN_UMULT_HIGH */
/*
 * Column accumulators for builds with neither a double-width integer type
 * nor BN_UMULT_HIGH.
 *
 * Every macro adds into the three-word value (c2,c1,c0).  They use the
 * calling function's BN_ULONG scratch variables t1, t2, bl and bh, and
 * the mul64()/sqr64() macros defined outside this file (presumably these
 * leave the low and high product words in their first two arguments --
 * confirm against their definitions).
 *
 * Each body is a single do/while(0) statement so that the macro can be
 * used safely under an unbraced if/else.
 */

/* (c2,c1,c0) += a*b */
#define mul_add_c(a,b,c0,c1,c2) do { \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; \
	} while (0)

/* (c2,c1,c0) += 2*a*b; the product is doubled word by word, moving the */
/* top bit of each word (BN_TBIT) into the next word up. */
#define mul_add_c2(a,b,c0,c1,c2) do { \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	if (t2 & BN_TBIT) c2++; \
	t2=(t2+t2)&BN_MASK2; \
	if (t1 & BN_TBIT) t2++; \
	t1=(t1+t1)&BN_MASK2; \
	c0=(c0+t1)&BN_MASK2;  \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; \
	} while (0)

/* (c2,c1,c0) += a[i]*a[i] */
#define sqr_add_c(a,i,c0,c1,c2) do { \
	sqr64(t1,t2,(a)[i]); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; \
	} while (0)

/* (c2,c1,c0) += 2*a[i]*a[j] */
#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
52059191Skris#endif /* !BN_LLONG && !BN_UMULT_HIGH */
52155714Skris
52255714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
52355714Skris	{
52455714Skris#ifdef BN_LLONG
52555714Skris	BN_ULLONG t;
52655714Skris#else
52755714Skris	BN_ULONG bl,bh;
52855714Skris#endif
52955714Skris	BN_ULONG t1,t2;
53055714Skris	BN_ULONG c1,c2,c3;
53155714Skris
53255714Skris	c1=0;
53355714Skris	c2=0;
53455714Skris	c3=0;
53555714Skris	mul_add_c(a[0],b[0],c1,c2,c3);
53655714Skris	r[0]=c1;
53755714Skris	c1=0;
53855714Skris	mul_add_c(a[0],b[1],c2,c3,c1);
53955714Skris	mul_add_c(a[1],b[0],c2,c3,c1);
54055714Skris	r[1]=c2;
54155714Skris	c2=0;
54255714Skris	mul_add_c(a[2],b[0],c3,c1,c2);
54355714Skris	mul_add_c(a[1],b[1],c3,c1,c2);
54455714Skris	mul_add_c(a[0],b[2],c3,c1,c2);
54555714Skris	r[2]=c3;
54655714Skris	c3=0;
54755714Skris	mul_add_c(a[0],b[3],c1,c2,c3);
54855714Skris	mul_add_c(a[1],b[2],c1,c2,c3);
54955714Skris	mul_add_c(a[2],b[1],c1,c2,c3);
55055714Skris	mul_add_c(a[3],b[0],c1,c2,c3);
55155714Skris	r[3]=c1;
55255714Skris	c1=0;
55355714Skris	mul_add_c(a[4],b[0],c2,c3,c1);
55455714Skris	mul_add_c(a[3],b[1],c2,c3,c1);
55555714Skris	mul_add_c(a[2],b[2],c2,c3,c1);
55655714Skris	mul_add_c(a[1],b[3],c2,c3,c1);
55755714Skris	mul_add_c(a[0],b[4],c2,c3,c1);
55855714Skris	r[4]=c2;
55955714Skris	c2=0;
56055714Skris	mul_add_c(a[0],b[5],c3,c1,c2);
56155714Skris	mul_add_c(a[1],b[4],c3,c1,c2);
56255714Skris	mul_add_c(a[2],b[3],c3,c1,c2);
56355714Skris	mul_add_c(a[3],b[2],c3,c1,c2);
56455714Skris	mul_add_c(a[4],b[1],c3,c1,c2);
56555714Skris	mul_add_c(a[5],b[0],c3,c1,c2);
56655714Skris	r[5]=c3;
56755714Skris	c3=0;
56855714Skris	mul_add_c(a[6],b[0],c1,c2,c3);
56955714Skris	mul_add_c(a[5],b[1],c1,c2,c3);
57055714Skris	mul_add_c(a[4],b[2],c1,c2,c3);
57155714Skris	mul_add_c(a[3],b[3],c1,c2,c3);
57255714Skris	mul_add_c(a[2],b[4],c1,c2,c3);
57355714Skris	mul_add_c(a[1],b[5],c1,c2,c3);
57455714Skris	mul_add_c(a[0],b[6],c1,c2,c3);
57555714Skris	r[6]=c1;
57655714Skris	c1=0;
57755714Skris	mul_add_c(a[0],b[7],c2,c3,c1);
57855714Skris	mul_add_c(a[1],b[6],c2,c3,c1);
57955714Skris	mul_add_c(a[2],b[5],c2,c3,c1);
58055714Skris	mul_add_c(a[3],b[4],c2,c3,c1);
58155714Skris	mul_add_c(a[4],b[3],c2,c3,c1);
58255714Skris	mul_add_c(a[5],b[2],c2,c3,c1);
58355714Skris	mul_add_c(a[6],b[1],c2,c3,c1);
58455714Skris	mul_add_c(a[7],b[0],c2,c3,c1);
58555714Skris	r[7]=c2;
58655714Skris	c2=0;
58755714Skris	mul_add_c(a[7],b[1],c3,c1,c2);
58855714Skris	mul_add_c(a[6],b[2],c3,c1,c2);
58955714Skris	mul_add_c(a[5],b[3],c3,c1,c2);
59055714Skris	mul_add_c(a[4],b[4],c3,c1,c2);
59155714Skris	mul_add_c(a[3],b[5],c3,c1,c2);
59255714Skris	mul_add_c(a[2],b[6],c3,c1,c2);
59355714Skris	mul_add_c(a[1],b[7],c3,c1,c2);
59455714Skris	r[8]=c3;
59555714Skris	c3=0;
59655714Skris	mul_add_c(a[2],b[7],c1,c2,c3);
59755714Skris	mul_add_c(a[3],b[6],c1,c2,c3);
59855714Skris	mul_add_c(a[4],b[5],c1,c2,c3);
59955714Skris	mul_add_c(a[5],b[4],c1,c2,c3);
60055714Skris	mul_add_c(a[6],b[3],c1,c2,c3);
60155714Skris	mul_add_c(a[7],b[2],c1,c2,c3);
60255714Skris	r[9]=c1;
60355714Skris	c1=0;
60455714Skris	mul_add_c(a[7],b[3],c2,c3,c1);
60555714Skris	mul_add_c(a[6],b[4],c2,c3,c1);
60655714Skris	mul_add_c(a[5],b[5],c2,c3,c1);
60755714Skris	mul_add_c(a[4],b[6],c2,c3,c1);
60855714Skris	mul_add_c(a[3],b[7],c2,c3,c1);
60955714Skris	r[10]=c2;
61055714Skris	c2=0;
61155714Skris	mul_add_c(a[4],b[7],c3,c1,c2);
61255714Skris	mul_add_c(a[5],b[6],c3,c1,c2);
61355714Skris	mul_add_c(a[6],b[5],c3,c1,c2);
61455714Skris	mul_add_c(a[7],b[4],c3,c1,c2);
61555714Skris	r[11]=c3;
61655714Skris	c3=0;
61755714Skris	mul_add_c(a[7],b[5],c1,c2,c3);
61855714Skris	mul_add_c(a[6],b[6],c1,c2,c3);
61955714Skris	mul_add_c(a[5],b[7],c1,c2,c3);
62055714Skris	r[12]=c1;
62155714Skris	c1=0;
62255714Skris	mul_add_c(a[6],b[7],c2,c3,c1);
62355714Skris	mul_add_c(a[7],b[6],c2,c3,c1);
62455714Skris	r[13]=c2;
62555714Skris	c2=0;
62655714Skris	mul_add_c(a[7],b[7],c3,c1,c2);
62755714Skris	r[14]=c3;
62855714Skris	r[15]=c1;
62955714Skris	}
63055714Skris
63155714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
63255714Skris	{
63355714Skris#ifdef BN_LLONG
63455714Skris	BN_ULLONG t;
63555714Skris#else
63655714Skris	BN_ULONG bl,bh;
63755714Skris#endif
63855714Skris	BN_ULONG t1,t2;
63955714Skris	BN_ULONG c1,c2,c3;
64055714Skris
64155714Skris	c1=0;
64255714Skris	c2=0;
64355714Skris	c3=0;
64455714Skris	mul_add_c(a[0],b[0],c1,c2,c3);
64555714Skris	r[0]=c1;
64655714Skris	c1=0;
64755714Skris	mul_add_c(a[0],b[1],c2,c3,c1);
64855714Skris	mul_add_c(a[1],b[0],c2,c3,c1);
64955714Skris	r[1]=c2;
65055714Skris	c2=0;
65155714Skris	mul_add_c(a[2],b[0],c3,c1,c2);
65255714Skris	mul_add_c(a[1],b[1],c3,c1,c2);
65355714Skris	mul_add_c(a[0],b[2],c3,c1,c2);
65455714Skris	r[2]=c3;
65555714Skris	c3=0;
65655714Skris	mul_add_c(a[0],b[3],c1,c2,c3);
65755714Skris	mul_add_c(a[1],b[2],c1,c2,c3);
65855714Skris	mul_add_c(a[2],b[1],c1,c2,c3);
65955714Skris	mul_add_c(a[3],b[0],c1,c2,c3);
66055714Skris	r[3]=c1;
66155714Skris	c1=0;
66255714Skris	mul_add_c(a[3],b[1],c2,c3,c1);
66355714Skris	mul_add_c(a[2],b[2],c2,c3,c1);
66455714Skris	mul_add_c(a[1],b[3],c2,c3,c1);
66555714Skris	r[4]=c2;
66655714Skris	c2=0;
66755714Skris	mul_add_c(a[2],b[3],c3,c1,c2);
66855714Skris	mul_add_c(a[3],b[2],c3,c1,c2);
66955714Skris	r[5]=c3;
67055714Skris	c3=0;
67155714Skris	mul_add_c(a[3],b[3],c1,c2,c3);
67255714Skris	r[6]=c1;
67355714Skris	r[7]=c2;
67455714Skris	}
67555714Skris
676109998Smarkmvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
67755714Skris	{
67855714Skris#ifdef BN_LLONG
67955714Skris	BN_ULLONG t,tt;
68055714Skris#else
68155714Skris	BN_ULONG bl,bh;
68255714Skris#endif
68355714Skris	BN_ULONG t1,t2;
68455714Skris	BN_ULONG c1,c2,c3;
68555714Skris
68655714Skris	c1=0;
68755714Skris	c2=0;
68855714Skris	c3=0;
68955714Skris	sqr_add_c(a,0,c1,c2,c3);
69055714Skris	r[0]=c1;
69155714Skris	c1=0;
69255714Skris	sqr_add_c2(a,1,0,c2,c3,c1);
69355714Skris	r[1]=c2;
69455714Skris	c2=0;
69555714Skris	sqr_add_c(a,1,c3,c1,c2);
69655714Skris	sqr_add_c2(a,2,0,c3,c1,c2);
69755714Skris	r[2]=c3;
69855714Skris	c3=0;
69955714Skris	sqr_add_c2(a,3,0,c1,c2,c3);
70055714Skris	sqr_add_c2(a,2,1,c1,c2,c3);
70155714Skris	r[3]=c1;
70255714Skris	c1=0;
70355714Skris	sqr_add_c(a,2,c2,c3,c1);
70455714Skris	sqr_add_c2(a,3,1,c2,c3,c1);
70555714Skris	sqr_add_c2(a,4,0,c2,c3,c1);
70655714Skris	r[4]=c2;
70755714Skris	c2=0;
70855714Skris	sqr_add_c2(a,5,0,c3,c1,c2);
70955714Skris	sqr_add_c2(a,4,1,c3,c1,c2);
71055714Skris	sqr_add_c2(a,3,2,c3,c1,c2);
71155714Skris	r[5]=c3;
71255714Skris	c3=0;
71355714Skris	sqr_add_c(a,3,c1,c2,c3);
71455714Skris	sqr_add_c2(a,4,2,c1,c2,c3);
71555714Skris	sqr_add_c2(a,5,1,c1,c2,c3);
71655714Skris	sqr_add_c2(a,6,0,c1,c2,c3);
71755714Skris	r[6]=c1;
71855714Skris	c1=0;
71955714Skris	sqr_add_c2(a,7,0,c2,c3,c1);
72055714Skris	sqr_add_c2(a,6,1,c2,c3,c1);
72155714Skris	sqr_add_c2(a,5,2,c2,c3,c1);
72255714Skris	sqr_add_c2(a,4,3,c2,c3,c1);
72355714Skris	r[7]=c2;
72455714Skris	c2=0;
72555714Skris	sqr_add_c(a,4,c3,c1,c2);
72655714Skris	sqr_add_c2(a,5,3,c3,c1,c2);
72755714Skris	sqr_add_c2(a,6,2,c3,c1,c2);
72855714Skris	sqr_add_c2(a,7,1,c3,c1,c2);
72955714Skris	r[8]=c3;
73055714Skris	c3=0;
73155714Skris	sqr_add_c2(a,7,2,c1,c2,c3);
73255714Skris	sqr_add_c2(a,6,3,c1,c2,c3);
73355714Skris	sqr_add_c2(a,5,4,c1,c2,c3);
73455714Skris	r[9]=c1;
73555714Skris	c1=0;
73655714Skris	sqr_add_c(a,5,c2,c3,c1);
73755714Skris	sqr_add_c2(a,6,4,c2,c3,c1);
73855714Skris	sqr_add_c2(a,7,3,c2,c3,c1);
73955714Skris	r[10]=c2;
74055714Skris	c2=0;
74155714Skris	sqr_add_c2(a,7,4,c3,c1,c2);
74255714Skris	sqr_add_c2(a,6,5,c3,c1,c2);
74355714Skris	r[11]=c3;
74455714Skris	c3=0;
74555714Skris	sqr_add_c(a,6,c1,c2,c3);
74655714Skris	sqr_add_c2(a,7,5,c1,c2,c3);
74755714Skris	r[12]=c1;
74855714Skris	c1=0;
74955714Skris	sqr_add_c2(a,7,6,c2,c3,c1);
75055714Skris	r[13]=c2;
75155714Skris	c2=0;
75255714Skris	sqr_add_c(a,7,c3,c1,c2);
75355714Skris	r[14]=c3;
75455714Skris	r[15]=c1;
75555714Skris	}
75655714Skris
757109998Smarkmvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
75855714Skris	{
75955714Skris#ifdef BN_LLONG
76055714Skris	BN_ULLONG t,tt;
76155714Skris#else
76255714Skris	BN_ULONG bl,bh;
76355714Skris#endif
76455714Skris	BN_ULONG t1,t2;
76555714Skris	BN_ULONG c1,c2,c3;
76655714Skris
76755714Skris	c1=0;
76855714Skris	c2=0;
76955714Skris	c3=0;
77055714Skris	sqr_add_c(a,0,c1,c2,c3);
77155714Skris	r[0]=c1;
77255714Skris	c1=0;
77355714Skris	sqr_add_c2(a,1,0,c2,c3,c1);
77455714Skris	r[1]=c2;
77555714Skris	c2=0;
77655714Skris	sqr_add_c(a,1,c3,c1,c2);
77755714Skris	sqr_add_c2(a,2,0,c3,c1,c2);
77855714Skris	r[2]=c3;
77955714Skris	c3=0;
78055714Skris	sqr_add_c2(a,3,0,c1,c2,c3);
78155714Skris	sqr_add_c2(a,2,1,c1,c2,c3);
78255714Skris	r[3]=c1;
78355714Skris	c1=0;
78455714Skris	sqr_add_c(a,2,c2,c3,c1);
78555714Skris	sqr_add_c2(a,3,1,c2,c3,c1);
78655714Skris	r[4]=c2;
78755714Skris	c2=0;
78855714Skris	sqr_add_c2(a,3,2,c3,c1,c2);
78955714Skris	r[5]=c3;
79055714Skris	c3=0;
79155714Skris	sqr_add_c(a,3,c1,c2,c3);
79255714Skris	r[6]=c1;
79355714Skris	r[7]=c2;
79455714Skris	}
79559191Skris#else /* !BN_MUL_COMBA */
79655714Skris
79755714Skris/* hmm... is it faster just to do a multiply? */
79855714Skris#undef bn_sqr_comba4
79955714Skrisvoid bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
80055714Skris	{
80155714Skris	BN_ULONG t[8];
80255714Skris	bn_sqr_normal(r,a,4,t);
80355714Skris	}
80455714Skris
80555714Skris#undef bn_sqr_comba8
80655714Skrisvoid bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
80755714Skris	{
80855714Skris	BN_ULONG t[16];
80955714Skris	bn_sqr_normal(r,a,8,t);
81055714Skris	}
81155714Skris
81255714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
81355714Skris	{
81455714Skris	r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
81555714Skris	r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
81655714Skris	r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
81755714Skris	r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
81855714Skris	}
81955714Skris
82055714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
82155714Skris	{
82255714Skris	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
82355714Skris	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
82455714Skris	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
82555714Skris	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
82655714Skris	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
82755714Skris	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
82855714Skris	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
82955714Skris	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
83055714Skris	}
83155714Skris
83259191Skris#endif /* !BN_MUL_COMBA */
833