155714Skris/* crypto/bn/bn_asm.c */
255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
355714Skris * All rights reserved.
455714Skris *
555714Skris * This package is an SSL implementation written
655714Skris * by Eric Young (eay@cryptsoft.com).
755714Skris * The implementation was written so as to conform with Netscapes SSL.
855714Skris *
955714Skris * This library is free for commercial and non-commercial use as long as
1055714Skris * the following conditions are aheared to.  The following conditions
1155714Skris * apply to all code found in this distribution, be it the RC4, RSA,
1255714Skris * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
1355714Skris * included with this distribution is covered by the same copyright terms
1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com).
1555714Skris *
1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in
1755714Skris * the code are not to be removed.
1855714Skris * If this package is used in a product, Eric Young should be given attribution
1955714Skris * as the author of the parts of the library used.
2055714Skris * This can be in the form of a textual message at program startup or
2155714Skris * in documentation (online or textual) provided with the package.
2255714Skris *
2355714Skris * Redistribution and use in source and binary forms, with or without
2455714Skris * modification, are permitted provided that the following conditions
2555714Skris * are met:
2655714Skris * 1. Redistributions of source code must retain the copyright
2755714Skris *    notice, this list of conditions and the following disclaimer.
2855714Skris * 2. Redistributions in binary form must reproduce the above copyright
2955714Skris *    notice, this list of conditions and the following disclaimer in the
3055714Skris *    documentation and/or other materials provided with the distribution.
3155714Skris * 3. All advertising materials mentioning features or use of this software
3255714Skris *    must display the following acknowledgement:
3355714Skris *    "This product includes cryptographic software written by
3455714Skris *     Eric Young (eay@cryptsoft.com)"
3555714Skris *    The word 'cryptographic' can be left out if the rouines from the library
3655714Skris *    being used are not cryptographic related :-).
3755714Skris * 4. If you include any Windows specific code (or a derivative thereof) from
3855714Skris *    the apps directory (application code) you must include an acknowledgement:
3955714Skris *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
4055714Skris *
4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4455714Skris * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
5155714Skris * SUCH DAMAGE.
5255714Skris *
5355714Skris * The licence and distribution terms for any publically available version or
5455714Skris * derivative of this code cannot be changed.  i.e. this code cannot simply be
5555714Skris * copied and put under another distribution licence
5655714Skris * [including the GNU Public Licence.]
5755714Skris */
5855714Skris
5959191Skris#ifndef BN_DEBUG
6059191Skris# undef NDEBUG /* avoid conflicting definitions */
6159191Skris# define NDEBUG
6259191Skris#endif
6359191Skris
6455714Skris#include <stdio.h>
6559191Skris#include <assert.h>
6655714Skris#include "cryptlib.h"
6755714Skris#include "bn_lcl.h"
6855714Skris
6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
7055714Skris
71109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
7255714Skris	{
7355714Skris	BN_ULONG c1=0;
7455714Skris
7559191Skris	assert(num >= 0);
7655714Skris	if (num <= 0) return(c1);
7755714Skris
78238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
7959191Skris	while (num&~3)
8055714Skris		{
8155714Skris		mul_add(rp[0],ap[0],w,c1);
8255714Skris		mul_add(rp[1],ap[1],w,c1);
8355714Skris		mul_add(rp[2],ap[2],w,c1);
8455714Skris		mul_add(rp[3],ap[3],w,c1);
8559191Skris		ap+=4; rp+=4; num-=4;
8655714Skris		}
87238405Sjkim#endif
88238405Sjkim	while (num)
8959191Skris		{
90238405Sjkim		mul_add(rp[0],ap[0],w,c1);
91238405Sjkim		ap++; rp++; num--;
9259191Skris		}
9355714Skris
9455714Skris	return(c1);
9555714Skris	}
9655714Skris
97109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
9855714Skris	{
9955714Skris	BN_ULONG c1=0;
10055714Skris
10159191Skris	assert(num >= 0);
10255714Skris	if (num <= 0) return(c1);
10355714Skris
104238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
10559191Skris	while (num&~3)
10655714Skris		{
10755714Skris		mul(rp[0],ap[0],w,c1);
10855714Skris		mul(rp[1],ap[1],w,c1);
10955714Skris		mul(rp[2],ap[2],w,c1);
11055714Skris		mul(rp[3],ap[3],w,c1);
11159191Skris		ap+=4; rp+=4; num-=4;
11255714Skris		}
113238405Sjkim#endif
114238405Sjkim	while (num)
11559191Skris		{
116238405Sjkim		mul(rp[0],ap[0],w,c1);
117238405Sjkim		ap++; rp++; num--;
11859191Skris		}
11955714Skris	return(c1);
12055714Skris	}
12155714Skris
122109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
12355714Skris        {
12459191Skris	assert(n >= 0);
12555714Skris	if (n <= 0) return;
126238405Sjkim
127238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
12859191Skris	while (n&~3)
12955714Skris		{
13059191Skris		sqr(r[0],r[1],a[0]);
13159191Skris		sqr(r[2],r[3],a[1]);
13259191Skris		sqr(r[4],r[5],a[2]);
13359191Skris		sqr(r[6],r[7],a[3]);
13459191Skris		a+=4; r+=8; n-=4;
13555714Skris		}
136238405Sjkim#endif
137238405Sjkim	while (n)
13859191Skris		{
139238405Sjkim		sqr(r[0],r[1],a[0]);
140238405Sjkim		a++; r+=2; n--;
14159191Skris		}
14255714Skris	}
14355714Skris
14459191Skris#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
14555714Skris
146109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
14755714Skris	{
14855714Skris	BN_ULONG c=0;
14955714Skris	BN_ULONG bl,bh;
15055714Skris
15159191Skris	assert(num >= 0);
15255714Skris	if (num <= 0) return((BN_ULONG)0);
15355714Skris
15455714Skris	bl=LBITS(w);
15555714Skris	bh=HBITS(w);
15655714Skris
157238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
158238405Sjkim	while (num&~3)
15955714Skris		{
16055714Skris		mul_add(rp[0],ap[0],bl,bh,c);
16155714Skris		mul_add(rp[1],ap[1],bl,bh,c);
16255714Skris		mul_add(rp[2],ap[2],bl,bh,c);
16355714Skris		mul_add(rp[3],ap[3],bl,bh,c);
164238405Sjkim		ap+=4; rp+=4; num-=4;
16555714Skris		}
166238405Sjkim#endif
167238405Sjkim	while (num)
168238405Sjkim		{
169238405Sjkim		mul_add(rp[0],ap[0],bl,bh,c);
170238405Sjkim		ap++; rp++; num--;
171238405Sjkim		}
17255714Skris	return(c);
17355714Skris	}
17455714Skris
175109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
17655714Skris	{
17755714Skris	BN_ULONG carry=0;
17855714Skris	BN_ULONG bl,bh;
17955714Skris
18059191Skris	assert(num >= 0);
18155714Skris	if (num <= 0) return((BN_ULONG)0);
18255714Skris
18355714Skris	bl=LBITS(w);
18455714Skris	bh=HBITS(w);
18555714Skris
186238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
187238405Sjkim	while (num&~3)
18855714Skris		{
18955714Skris		mul(rp[0],ap[0],bl,bh,carry);
19055714Skris		mul(rp[1],ap[1],bl,bh,carry);
19155714Skris		mul(rp[2],ap[2],bl,bh,carry);
19255714Skris		mul(rp[3],ap[3],bl,bh,carry);
193238405Sjkim		ap+=4; rp+=4; num-=4;
19455714Skris		}
195238405Sjkim#endif
196238405Sjkim	while (num)
197238405Sjkim		{
198238405Sjkim		mul(rp[0],ap[0],bl,bh,carry);
199238405Sjkim		ap++; rp++; num--;
200238405Sjkim		}
20155714Skris	return(carry);
20255714Skris	}
20355714Skris
204109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
20555714Skris        {
20659191Skris	assert(n >= 0);
20755714Skris	if (n <= 0) return;
208238405Sjkim
209238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
210238405Sjkim	while (n&~3)
21155714Skris		{
21255714Skris		sqr64(r[0],r[1],a[0]);
21355714Skris		sqr64(r[2],r[3],a[1]);
21455714Skris		sqr64(r[4],r[5],a[2]);
21555714Skris		sqr64(r[6],r[7],a[3]);
216238405Sjkim		a+=4; r+=8; n-=4;
21755714Skris		}
218238405Sjkim#endif
219238405Sjkim	while (n)
220238405Sjkim		{
221238405Sjkim		sqr64(r[0],r[1],a[0]);
222238405Sjkim		a++; r+=2; n--;
223238405Sjkim		}
22455714Skris	}
22555714Skris
22659191Skris#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
22755714Skris
22855714Skris#if defined(BN_LLONG) && defined(BN_DIV2W)
22955714Skris
23055714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
23155714Skris	{
23255714Skris	return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
23355714Skris	}
23455714Skris
23555714Skris#else
23655714Skris
/*
 * Divide h,l by d and return the result.
 *
 * (h,l) is treated as a double-word dividend, d as a single-word divisor.
 * Returns BN_MASK2 when d is zero.  The quotient is built from two digits:
 * the outer loop runs exactly twice (count goes 2 -> 0), the first digit is
 * stored shifted left by BN_BITS4 and the second is OR-ed into the low
 * part.  BN_BITS4, BN_MASK2h and BN_MASK2l come from the bn headers and are
 * presumably the half-word width and half-word masks -- confirm there.
 */
/* I need to test this some more :-( */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	{
	BN_ULONG dh,dl,q,ret=0,th,tl,t;
	int i,count=2;

	if (d == 0) return(BN_MASK2);

	i=BN_num_bits_word(d);
	/* NOTE(review): '<<' binds tighter than '<=', so this is h <= (1<<i) */
	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));

	/* i is now the left shift applied below to d and to (h,l);
	 * assumes BN_num_bits_word() returned the bit length of d -- confirm */
	i=BN_BITS2-i;
	if (h >= d) h-=d;

	if (i)
		{
		/* shift divisor and dividend left by the same amount; the
		 * top i bits of l move into the bottom of h */
		d<<=i;
		h=(h<<i)|(l>>(BN_BITS2-i));
		l<<=i;
		}
	/* split the (shifted) divisor into its two halves */
	dh=(d&BN_MASK2h)>>BN_BITS4;
	dl=(d&BN_MASK2l);
	for (;;)
		{
		/* first guess for this quotient digit, from the top half of d */
		if ((h>>BN_BITS4) == dh)
			q=BN_MASK2l;
		else
			q=h/dh;

		th=q*dh;
		tl=dl*q;
		/* lower q (and the partial products th, tl) until the test
		 * below holds */
		for (;;)
			{
			t=h-th;
			if ((t&BN_MASK2h) ||
				((tl) <= (
					(t<<BN_BITS4)|
					((l&BN_MASK2h)>>BN_BITS4))))
				break;
			q--;
			th-=dh;
			tl-=dl;
			}
		/* fold q*dl into the two-word value (th,tl) aligned with (h,l) */
		t=(tl>>BN_BITS4);
		tl=(tl<<BN_BITS4)&BN_MASK2h;
		th+=t;

		/* (h,l) -= (th,tl); a borrow out of the low word bumps th */
		if (l < tl) th++;
		l-=tl;
		if (h < th)
			{
			/* would go negative: add d back and drop q by one */
			h+=d;
			q--;
			}
		h-=th;

		if (--count == 0) break;

		/* keep the first digit in the upper part of ret and shift the
		 * remainder (h,l) left by BN_BITS4 for the second pass */
		ret=q<<BN_BITS4;
		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
		l=(l&BN_MASK2l)<<BN_BITS4;
		}
	ret|=q;
	return(ret);
	}
#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
30455714Skris
30555714Skris#ifdef BN_LLONG
306109998SmarkmBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
30755714Skris        {
30855714Skris	BN_ULLONG ll=0;
30955714Skris
31059191Skris	assert(n >= 0);
31155714Skris	if (n <= 0) return((BN_ULONG)0);
31255714Skris
313238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
314238405Sjkim	while (n&~3)
31555714Skris		{
31655714Skris		ll+=(BN_ULLONG)a[0]+b[0];
31755714Skris		r[0]=(BN_ULONG)ll&BN_MASK2;
31855714Skris		ll>>=BN_BITS2;
31955714Skris		ll+=(BN_ULLONG)a[1]+b[1];
32055714Skris		r[1]=(BN_ULONG)ll&BN_MASK2;
32155714Skris		ll>>=BN_BITS2;
32255714Skris		ll+=(BN_ULLONG)a[2]+b[2];
32355714Skris		r[2]=(BN_ULONG)ll&BN_MASK2;
32455714Skris		ll>>=BN_BITS2;
32555714Skris		ll+=(BN_ULLONG)a[3]+b[3];
32655714Skris		r[3]=(BN_ULONG)ll&BN_MASK2;
32755714Skris		ll>>=BN_BITS2;
328238405Sjkim		a+=4; b+=4; r+=4; n-=4;
32955714Skris		}
330238405Sjkim#endif
331238405Sjkim	while (n)
332238405Sjkim		{
333238405Sjkim		ll+=(BN_ULLONG)a[0]+b[0];
334238405Sjkim		r[0]=(BN_ULONG)ll&BN_MASK2;
335238405Sjkim		ll>>=BN_BITS2;
336238405Sjkim		a++; b++; r++; n--;
337238405Sjkim		}
33855714Skris	return((BN_ULONG)ll);
33955714Skris	}
34059191Skris#else /* !BN_LLONG */
341109998SmarkmBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
34255714Skris        {
34355714Skris	BN_ULONG c,l,t;
34455714Skris
34559191Skris	assert(n >= 0);
34655714Skris	if (n <= 0) return((BN_ULONG)0);
34755714Skris
34855714Skris	c=0;
349238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
350238405Sjkim	while (n&~3)
35155714Skris		{
35255714Skris		t=a[0];
35355714Skris		t=(t+c)&BN_MASK2;
35455714Skris		c=(t < c);
35555714Skris		l=(t+b[0])&BN_MASK2;
35655714Skris		c+=(l < t);
35755714Skris		r[0]=l;
35855714Skris		t=a[1];
35955714Skris		t=(t+c)&BN_MASK2;
36055714Skris		c=(t < c);
36155714Skris		l=(t+b[1])&BN_MASK2;
36255714Skris		c+=(l < t);
36355714Skris		r[1]=l;
36455714Skris		t=a[2];
36555714Skris		t=(t+c)&BN_MASK2;
36655714Skris		c=(t < c);
36755714Skris		l=(t+b[2])&BN_MASK2;
36855714Skris		c+=(l < t);
36955714Skris		r[2]=l;
37055714Skris		t=a[3];
37155714Skris		t=(t+c)&BN_MASK2;
37255714Skris		c=(t < c);
37355714Skris		l=(t+b[3])&BN_MASK2;
37455714Skris		c+=(l < t);
37555714Skris		r[3]=l;
376238405Sjkim		a+=4; b+=4; r+=4; n-=4;
37755714Skris		}
378238405Sjkim#endif
379238405Sjkim	while(n)
380238405Sjkim		{
381238405Sjkim		t=a[0];
382238405Sjkim		t=(t+c)&BN_MASK2;
383238405Sjkim		c=(t < c);
384238405Sjkim		l=(t+b[0])&BN_MASK2;
385238405Sjkim		c+=(l < t);
386238405Sjkim		r[0]=l;
387238405Sjkim		a++; b++; r++; n--;
388238405Sjkim		}
38955714Skris	return((BN_ULONG)c);
39055714Skris	}
39159191Skris#endif /* !BN_LLONG */
39255714Skris
393109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
39455714Skris        {
39555714Skris	BN_ULONG t1,t2;
39655714Skris	int c=0;
39755714Skris
39859191Skris	assert(n >= 0);
39955714Skris	if (n <= 0) return((BN_ULONG)0);
40055714Skris
401238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
402238405Sjkim	while (n&~3)
40355714Skris		{
40455714Skris		t1=a[0]; t2=b[0];
40555714Skris		r[0]=(t1-t2-c)&BN_MASK2;
40655714Skris		if (t1 != t2) c=(t1 < t2);
40755714Skris		t1=a[1]; t2=b[1];
40855714Skris		r[1]=(t1-t2-c)&BN_MASK2;
40955714Skris		if (t1 != t2) c=(t1 < t2);
41055714Skris		t1=a[2]; t2=b[2];
41155714Skris		r[2]=(t1-t2-c)&BN_MASK2;
41255714Skris		if (t1 != t2) c=(t1 < t2);
41355714Skris		t1=a[3]; t2=b[3];
41455714Skris		r[3]=(t1-t2-c)&BN_MASK2;
41555714Skris		if (t1 != t2) c=(t1 < t2);
416238405Sjkim		a+=4; b+=4; r+=4; n-=4;
41755714Skris		}
418238405Sjkim#endif
419238405Sjkim	while (n)
420238405Sjkim		{
421238405Sjkim		t1=a[0]; t2=b[0];
422238405Sjkim		r[0]=(t1-t2-c)&BN_MASK2;
423238405Sjkim		if (t1 != t2) c=(t1 < t2);
424238405Sjkim		a++; b++; r++; n--;
425238405Sjkim		}
42655714Skris	return(c);
42755714Skris	}
42855714Skris
429238405Sjkim#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
43055714Skris
43155714Skris#undef bn_mul_comba8
43255714Skris#undef bn_mul_comba4
43355714Skris#undef bn_sqr_comba8
43455714Skris#undef bn_sqr_comba4
43555714Skris
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/*
 * Four implementations follow, selected by BN_LLONG, BN_UMULT_LOHI,
 * BN_UMULT_HIGH, or none of them.  None of the macros is self-contained:
 * they assign to t1 and t2, and additionally to t/tt (BN_LLONG variant) or
 * bl/bh (fallback variant), all of which the calling function must
 * declare.  The BN_LLONG and fallback variants expand to bare statement
 * sequences, not a single braced statement.
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
#ifdef BN_LLONG
#define mul_add_c(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

/* doubles the product in tt first; a bit lost by the doubling goes to c2 */
#define mul_add_c2(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	tt=(t+t)&BN_MASK; \
	if (tt < t) c2++; \
	t1=(BN_ULONG)Lw(tt); \
	t2=(BN_ULONG)Hw(tt); \
	c0=(c0+t1)&BN_MASK2;  \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	t=(BN_ULLONG)a[i]*a[i]; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)

#define mul_add_c(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b);		\
	BN_UMULT_LOHI(t1,t2,ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

/* adds the product (t1,t0) into (c2,c1,c0) twice instead of doubling it */
#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	BN_UMULT_LOHI(t0,t1,ta,tb);	\
	c0 += t0; t2 = t1+((c0<t0)?1:0);\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	c0 += t0; t1 += (c0<t0)?1:0;	\
	c1 += t1; c2 += (c1<t1)?1:0;	\
	}

#define sqr_add_c(a,i,c0,c1,c2)	{	\
	BN_ULONG ta=(a)[i];		\
	BN_UMULT_LOHI(t1,t2,ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)

#define mul_add_c(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

/* adds the product (t1,t0) into (c2,c1,c0) twice instead of doubling it */
#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	c0 += t0; t2 = t1+((c0<t0)?1:0);\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	c0 += t0; t1 += (c0<t0)?1:0;	\
	c1 += t1; c2 += (c1<t1)?1:0;	\
	}

#define sqr_add_c(a,i,c0,c1,c2)	{	\
	BN_ULONG ta=(a)[i];		\
	t1 = ta * ta;			\
	t2 = BN_UMULT_HIGH(ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else /* none of BN_LLONG, BN_UMULT_LOHI, BN_UMULT_HIGH */
#define mul_add_c(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

/* doubles (t2,t1) in place; bits shifted out at BN_TBIT go to t2 / c2 */
#define mul_add_c2(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	if (t2 & BN_TBIT) c2++; \
	t2=(t2+t2)&BN_MASK2; \
	if (t1 & BN_TBIT) t2++; \
	t1=(t1+t1)&BN_MASK2; \
	c0=(c0+t1)&BN_MASK2;  \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	sqr64(t1,t2,(a)[i]); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#endif /* BN_LLONG / BN_UMULT_LOHI / BN_UMULT_HIGH / fallback */
56055714Skris
/*
 * Multiply the 8-word operands a and b and store the 16-word product in r.
 *
 * The product is formed one output word (column) at a time: for column k
 * every a[i]*b[j] with i+j == k is added into a three-word accumulator
 * with mul_add_c(), the lowest accumulator word is stored to r[k], and
 * the roles of c1,c2,c3 rotate so the two upper words carry into the next
 * column.  t, t1, t2, bl and bh are scratch variables used by the
 * mul_add_c() macro expansion.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 */
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6 */
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	/* column 7 (the longest: eight partial products) */
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* column 8 */
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	/* column 9 */
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	/* column 10 */
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	/* column 11 */
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	/* column 12 */
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	/* column 13 */
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	/* column 14; its carry word c1 becomes the top word r[15] */
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
66955714Skris
/*
 * Multiply the 4-word operands a and b and store the 8-word product in r.
 *
 * Same column-by-column scheme as bn_mul_comba8(): for output word k all
 * a[i]*b[j] with i+j == k are added into a three-word accumulator with
 * mul_add_c(), the lowest word is stored to r[k], and c1,c2,c3 rotate.
 * t, t1, t2, bl and bh are scratch variables used by the macro expansion.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 (the longest: four partial products) */
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6; its carry word c2 becomes the top word r[7] */
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
71455714Skris
/*
 * Square the 8-word operand a and store the 16-word result in r.
 *
 * Column-by-column like bn_mul_comba8(), but each off-diagonal product
 * a[i]*a[j] (i != j) is added once, doubled, with sqr_add_c2(), and each
 * diagonal term a[i]^2 is added with sqr_add_c().  After every column the
 * lowest accumulator word is stored to r[k] and c1,c2,c3 rotate.
 * t, tt, t1, t2, bl and bh are scratch variables used by the macros.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 */
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6 */
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	/* column 7 */
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* column 8 */
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	/* column 9 */
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	/* column 10 */
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	/* column 11 */
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	/* column 12 */
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	/* column 13 */
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	/* column 14; its carry word c1 becomes the top word r[15] */
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
79555714Skris
/*
 * Square the 4-word operand a and store the 8-word result in r.
 *
 * Column-by-column: each off-diagonal product a[i]*a[j] (i != j) is added
 * once, doubled, with sqr_add_c2(), and each diagonal term a[i]^2 is
 * added with sqr_add_c().  After every column the lowest accumulator word
 * is stored to r[k] and c1,c2,c3 rotate.
 * t, tt, t1, t2, bl and bh are scratch variables used by the macros.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 */
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6; its carry word c2 becomes the top word r[7] */
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
834238405Sjkim
835238405Sjkim#ifdef OPENSSL_NO_ASM
836238405Sjkim#ifdef OPENSSL_BN_ASM_MONT
837238405Sjkim#include <alloca.h>
/*
 * This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
/*
 * Word-level Montgomery multiplication of the num-word operands ap and
 * bp with modulus np; the num-word result is written to rp.
 *
 * rp   output, num words
 * ap   first operand, num words
 * bp   second operand, num words
 * np   modulus, num words
 * n0p  pointer to the per-modulus constant n0; only *n0p is read
 *      (presumably -np^-1 mod 2^BN_BITS2 -- confirm against the caller)
 * num  word count; NOTE(review): num is not validated here and
 *      tp[num-1]/np[num-1] are read, so callers must pass num >= 1
 *
 * Scratch space of num+2 words is taken from the stack with alloca() and
 * is zeroed through the volatile pointer vp before every return.
 * This implementation always returns 1.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
	{
	BN_ULONG c0,c1,ml,*tp,n0;
#ifdef mul64
	BN_ULONG mh;
#endif
	volatile BN_ULONG *vp;
	int i=0,j;

#if 0	/* template for platform-specific implementation */
	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
	/* vp aliases tp so the final wipe is done through volatile stores */
	vp = tp = alloca((num+2)*sizeof(BN_ULONG));

	n0 = *n0p;

	/* first pass: tp = ap*bp[0] using mul() (no prior tp contents),
	 * with the top carry in tp[num] */
	c0 = 0;
	ml = bp[0];
#ifdef mul64
	mh = HBITS(ml);
	ml = LBITS(ml);
	for (j=0;j<num;++j)
		mul(tp[j],ap[j],ml,mh,c0);
#else
	for (j=0;j<num;++j)
		mul(tp[j],ap[j],ml,c0);
#endif

	tp[num]   = c0;
	tp[num+1] = 0;
	/* jump into the loop body at the reduction step; i is still 0, so
	 * the loop then continues with i=1 after its increment */
	goto enter;

	for(i=0;i<num;i++)
		{
		/* tp += ap*bp[i] */
		c0 = 0;
		ml = bp[i];
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		for (j=0;j<num;++j)
			mul_add(tp[j],ap[j],ml,mh,c0);
#else
		for (j=0;j<num;++j)
			mul_add(tp[j],ap[j],ml,c0);
#endif
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num]   = c1;
		tp[num+1] = (c1<c0?1:0);
	enter:
		/* reduction step: ml = tp[0]*n0, then add ml*np to tp while
		 * storing each word one position lower (tp[j-1]) */
		c1  = tp[0];
		ml = (c1*n0)&BN_MASK2;
		c0 = 0;
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		mul_add(c1,np[0],ml,mh,c0);
#else
		mul_add(c1,ml,np[0],c0);
#endif
		/* the word-0 result in c1 is not stored; only its carry c0
		 * is used by the words that follow */
		for(j=1;j<num;j++)
			{
			c1 = tp[j];
#ifdef mul64
			mul_add(c1,np[j],ml,mh,c0);
#else
			mul_add(c1,ml,np[j],c0);
#endif
			tp[j-1] = c1&BN_MASK2;
			}
		c1        = (tp[num] + c0)&BN_MASK2;
		tp[num-1] = c1;
		tp[num]   = tp[num+1] + (c1<c0?1:0);
		}

	/* final step: subtract np only if there is an overflow word or the
	 * top words suggest tp might be >= np */
	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		{
		c0 = bn_sub_words(rp,tp,np,num);
		/* keep the difference if there was an overflow word or the
		 * subtraction did not borrow */
		if (tp[num]!=0 || c0==0)
			{
			for(i=0;i<num+2;i++)	vp[i] = 0;
			return 1;
			}
		}
	/* otherwise the result is tp itself: copy it out and wipe scratch */
	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
	vp[num]   = 0;
	vp[num+1] = 0;
	return 1;
	}
939238405Sjkim#else
/*
 * A return value of 0 indicates that the multiplication/convolution was
 * not performed, signalling the caller to fall back to the
 * alternative/original code path.
 */
945238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
946238405Sjkim{	return 0;	}
947238405Sjkim#endif /* OPENSSL_BN_ASM_MONT */
948238405Sjkim#endif
949238405Sjkim
95059191Skris#else /* !BN_MUL_COMBA */
95155714Skris
95255714Skris/* hmm... is it faster just to do a multiply? */
95355714Skris#undef bn_sqr_comba4
954238405Sjkimvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
95555714Skris	{
95655714Skris	BN_ULONG t[8];
95755714Skris	bn_sqr_normal(r,a,4,t);
95855714Skris	}
95955714Skris
96055714Skris#undef bn_sqr_comba8
961238405Sjkimvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
96255714Skris	{
96355714Skris	BN_ULONG t[16];
96455714Skris	bn_sqr_normal(r,a,8,t);
96555714Skris	}
96655714Skris
96755714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
96855714Skris	{
96955714Skris	r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
97055714Skris	r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
97155714Skris	r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
97255714Skris	r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
97355714Skris	}
97455714Skris
97555714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
97655714Skris	{
97755714Skris	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
97855714Skris	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
97955714Skris	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
98055714Skris	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
98155714Skris	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
98255714Skris	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
98355714Skris	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
98455714Skris	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
98555714Skris	}
98655714Skris
987238405Sjkim#ifdef OPENSSL_NO_ASM
988238405Sjkim#ifdef OPENSSL_BN_ASM_MONT
989238405Sjkim#include <alloca.h>
990238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
991238405Sjkim	{
992238405Sjkim	BN_ULONG c0,c1,*tp,n0=*n0p;
993238405Sjkim	volatile BN_ULONG *vp;
994238405Sjkim	int i=0,j;
995238405Sjkim
996238405Sjkim	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
997238405Sjkim
998238405Sjkim	for(i=0;i<=num;i++)	tp[i]=0;
999238405Sjkim
1000238405Sjkim	for(i=0;i<num;i++)
1001238405Sjkim		{
1002238405Sjkim		c0         = bn_mul_add_words(tp,ap,num,bp[i]);
1003238405Sjkim		c1         = (tp[num] + c0)&BN_MASK2;
1004238405Sjkim		tp[num]    = c1;
1005238405Sjkim		tp[num+1]  = (c1<c0?1:0);
1006238405Sjkim
1007238405Sjkim		c0         = bn_mul_add_words(tp,np,num,tp[0]*n0);
1008238405Sjkim		c1         = (tp[num] + c0)&BN_MASK2;
1009238405Sjkim		tp[num]    = c1;
1010238405Sjkim		tp[num+1] += (c1<c0?1:0);
1011238405Sjkim		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
1012238405Sjkim		}
1013238405Sjkim
1014238405Sjkim	if (tp[num]!=0 || tp[num-1]>=np[num-1])
1015238405Sjkim		{
1016238405Sjkim		c0 = bn_sub_words(rp,tp,np,num);
1017238405Sjkim		if (tp[num]!=0 || c0==0)
1018238405Sjkim			{
1019238405Sjkim			for(i=0;i<num+2;i++)	vp[i] = 0;
1020238405Sjkim			return 1;
1021238405Sjkim			}
1022238405Sjkim		}
1023238405Sjkim	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
1024238405Sjkim	vp[num]   = 0;
1025238405Sjkim	vp[num+1] = 0;
1026238405Sjkim	return 1;
1027238405Sjkim	}
1028238405Sjkim#else
1029238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
1030238405Sjkim{	return 0;	}
1031238405Sjkim#endif /* OPENSSL_BN_ASM_MONT */
1032238405Sjkim#endif
1033238405Sjkim
103459191Skris#endif /* !BN_MUL_COMBA */
1035