155714Skris/* crypto/bn/bn_asm.c */
255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
355714Skris * All rights reserved.
455714Skris *
555714Skris * This package is an SSL implementation written
655714Skris * by Eric Young (eay@cryptsoft.com).
755714Skris * The implementation was written so as to conform with Netscapes SSL.
8296341Sdelphij *
955714Skris * This library is free for commercial and non-commercial use as long as
1055714Skris * the following conditions are aheared to.  The following conditions
1155714Skris * apply to all code found in this distribution, be it the RC4, RSA,
1255714Skris * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
1355714Skris * included with this distribution is covered by the same copyright terms
1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15296341Sdelphij *
1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in
1755714Skris * the code are not to be removed.
1855714Skris * If this package is used in a product, Eric Young should be given attribution
1955714Skris * as the author of the parts of the library used.
2055714Skris * This can be in the form of a textual message at program startup or
2155714Skris * in documentation (online or textual) provided with the package.
22296341Sdelphij *
2355714Skris * Redistribution and use in source and binary forms, with or without
2455714Skris * modification, are permitted provided that the following conditions
2555714Skris * are met:
2655714Skris * 1. Redistributions of source code must retain the copyright
2755714Skris *    notice, this list of conditions and the following disclaimer.
2855714Skris * 2. Redistributions in binary form must reproduce the above copyright
2955714Skris *    notice, this list of conditions and the following disclaimer in the
3055714Skris *    documentation and/or other materials provided with the distribution.
3155714Skris * 3. All advertising materials mentioning features or use of this software
3255714Skris *    must display the following acknowledgement:
3355714Skris *    "This product includes cryptographic software written by
3455714Skris *     Eric Young (eay@cryptsoft.com)"
3555714Skris *    The word 'cryptographic' can be left out if the rouines from the library
3655714Skris *    being used are not cryptographic related :-).
37296341Sdelphij * 4. If you include any Windows specific code (or a derivative thereof) from
3855714Skris *    the apps directory (application code) you must include an acknowledgement:
3955714Skris *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40296341Sdelphij *
4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4455714Skris * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
5155714Skris * SUCH DAMAGE.
52296341Sdelphij *
5355714Skris * The licence and distribution terms for any publically available version or
5455714Skris * derivative of this code cannot be changed.  i.e. this code cannot simply be
5555714Skris * copied and put under another distribution licence
5655714Skris * [including the GNU Public Licence.]
5755714Skris */
5855714Skris
5959191Skris#ifndef BN_DEBUG
60296341Sdelphij# undef NDEBUG                  /* avoid conflicting definitions */
6159191Skris# define NDEBUG
6259191Skris#endif
6359191Skris
6455714Skris#include <stdio.h>
6559191Skris#include <assert.h>
6655714Skris#include "cryptlib.h"
6755714Skris#include "bn_lcl.h"
6855714Skris
6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
7055714Skris
71296341SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
72296341Sdelphij                          BN_ULONG w)
73296341Sdelphij{
74296341Sdelphij    BN_ULONG c1 = 0;
7555714Skris
76296341Sdelphij    assert(num >= 0);
77296341Sdelphij    if (num <= 0)
78296341Sdelphij        return (c1);
7955714Skris
80296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
81296341Sdelphij    while (num & ~3) {
82296341Sdelphij        mul_add(rp[0], ap[0], w, c1);
83296341Sdelphij        mul_add(rp[1], ap[1], w, c1);
84296341Sdelphij        mul_add(rp[2], ap[2], w, c1);
85296341Sdelphij        mul_add(rp[3], ap[3], w, c1);
86296341Sdelphij        ap += 4;
87296341Sdelphij        rp += 4;
88296341Sdelphij        num -= 4;
89296341Sdelphij    }
90296341Sdelphij# endif
91296341Sdelphij    while (num) {
92296341Sdelphij        mul_add(rp[0], ap[0], w, c1);
93296341Sdelphij        ap++;
94296341Sdelphij        rp++;
95296341Sdelphij        num--;
96296341Sdelphij    }
9755714Skris
98296341Sdelphij    return (c1);
99296341Sdelphij}
100296341Sdelphij
101109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
102296341Sdelphij{
103296341Sdelphij    BN_ULONG c1 = 0;
10455714Skris
105296341Sdelphij    assert(num >= 0);
106296341Sdelphij    if (num <= 0)
107296341Sdelphij        return (c1);
10855714Skris
109296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
110296341Sdelphij    while (num & ~3) {
111296341Sdelphij        mul(rp[0], ap[0], w, c1);
112296341Sdelphij        mul(rp[1], ap[1], w, c1);
113296341Sdelphij        mul(rp[2], ap[2], w, c1);
114296341Sdelphij        mul(rp[3], ap[3], w, c1);
115296341Sdelphij        ap += 4;
116296341Sdelphij        rp += 4;
117296341Sdelphij        num -= 4;
118296341Sdelphij    }
119296341Sdelphij# endif
120296341Sdelphij    while (num) {
121296341Sdelphij        mul(rp[0], ap[0], w, c1);
122296341Sdelphij        ap++;
123296341Sdelphij        rp++;
124296341Sdelphij        num--;
125296341Sdelphij    }
126296341Sdelphij    return (c1);
127296341Sdelphij}
12855714Skris
129109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
130296341Sdelphij{
131296341Sdelphij    assert(n >= 0);
132296341Sdelphij    if (n <= 0)
133296341Sdelphij        return;
134238405Sjkim
135296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
136296341Sdelphij    while (n & ~3) {
137296341Sdelphij        sqr(r[0], r[1], a[0]);
138296341Sdelphij        sqr(r[2], r[3], a[1]);
139296341Sdelphij        sqr(r[4], r[5], a[2]);
140296341Sdelphij        sqr(r[6], r[7], a[3]);
141296341Sdelphij        a += 4;
142296341Sdelphij        r += 8;
143296341Sdelphij        n -= 4;
144296341Sdelphij    }
145296341Sdelphij# endif
146296341Sdelphij    while (n) {
147296341Sdelphij        sqr(r[0], r[1], a[0]);
148296341Sdelphij        a++;
149296341Sdelphij        r += 2;
150296341Sdelphij        n--;
151296341Sdelphij    }
152296341Sdelphij}
15355714Skris
154296341Sdelphij#else                           /* !(defined(BN_LLONG) ||
155296341Sdelphij                                 * defined(BN_UMULT_HIGH)) */
15655714Skris
157296341SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
158296341Sdelphij                          BN_ULONG w)
159296341Sdelphij{
160296341Sdelphij    BN_ULONG c = 0;
161296341Sdelphij    BN_ULONG bl, bh;
16255714Skris
163296341Sdelphij    assert(num >= 0);
164296341Sdelphij    if (num <= 0)
165296341Sdelphij        return ((BN_ULONG)0);
16655714Skris
167296341Sdelphij    bl = LBITS(w);
168296341Sdelphij    bh = HBITS(w);
16955714Skris
170296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
171296341Sdelphij    while (num & ~3) {
172296341Sdelphij        mul_add(rp[0], ap[0], bl, bh, c);
173296341Sdelphij        mul_add(rp[1], ap[1], bl, bh, c);
174296341Sdelphij        mul_add(rp[2], ap[2], bl, bh, c);
175296341Sdelphij        mul_add(rp[3], ap[3], bl, bh, c);
176296341Sdelphij        ap += 4;
177296341Sdelphij        rp += 4;
178296341Sdelphij        num -= 4;
179296341Sdelphij    }
180296341Sdelphij# endif
181296341Sdelphij    while (num) {
182296341Sdelphij        mul_add(rp[0], ap[0], bl, bh, c);
183296341Sdelphij        ap++;
184296341Sdelphij        rp++;
185296341Sdelphij        num--;
186296341Sdelphij    }
187296341Sdelphij    return (c);
188296341Sdelphij}
18955714Skris
190109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
191296341Sdelphij{
192296341Sdelphij    BN_ULONG carry = 0;
193296341Sdelphij    BN_ULONG bl, bh;
19455714Skris
195296341Sdelphij    assert(num >= 0);
196296341Sdelphij    if (num <= 0)
197296341Sdelphij        return ((BN_ULONG)0);
19855714Skris
199296341Sdelphij    bl = LBITS(w);
200296341Sdelphij    bh = HBITS(w);
20155714Skris
202296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
203296341Sdelphij    while (num & ~3) {
204296341Sdelphij        mul(rp[0], ap[0], bl, bh, carry);
205296341Sdelphij        mul(rp[1], ap[1], bl, bh, carry);
206296341Sdelphij        mul(rp[2], ap[2], bl, bh, carry);
207296341Sdelphij        mul(rp[3], ap[3], bl, bh, carry);
208296341Sdelphij        ap += 4;
209296341Sdelphij        rp += 4;
210296341Sdelphij        num -= 4;
211296341Sdelphij    }
212296341Sdelphij# endif
213296341Sdelphij    while (num) {
214296341Sdelphij        mul(rp[0], ap[0], bl, bh, carry);
215296341Sdelphij        ap++;
216296341Sdelphij        rp++;
217296341Sdelphij        num--;
218296341Sdelphij    }
219296341Sdelphij    return (carry);
220296341Sdelphij}
22155714Skris
222109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
223296341Sdelphij{
224296341Sdelphij    assert(n >= 0);
225296341Sdelphij    if (n <= 0)
226296341Sdelphij        return;
227238405Sjkim
228296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
229296341Sdelphij    while (n & ~3) {
230296341Sdelphij        sqr64(r[0], r[1], a[0]);
231296341Sdelphij        sqr64(r[2], r[3], a[1]);
232296341Sdelphij        sqr64(r[4], r[5], a[2]);
233296341Sdelphij        sqr64(r[6], r[7], a[3]);
234296341Sdelphij        a += 4;
235296341Sdelphij        r += 8;
236296341Sdelphij        n -= 4;
237296341Sdelphij    }
238296341Sdelphij# endif
239296341Sdelphij    while (n) {
240296341Sdelphij        sqr64(r[0], r[1], a[0]);
241296341Sdelphij        a++;
242296341Sdelphij        r += 2;
243296341Sdelphij        n--;
244296341Sdelphij    }
245296341Sdelphij}
24655714Skris
247296341Sdelphij#endif                          /* !(defined(BN_LLONG) ||
248296341Sdelphij                                 * defined(BN_UMULT_HIGH)) */
24955714Skris
25055714Skris#if defined(BN_LLONG) && defined(BN_DIV2W)
25155714Skris
25255714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
253296341Sdelphij{
254296341Sdelphij    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
255296341Sdelphij}
25655714Skris
25755714Skris#else
25855714Skris
25968651Skris/* Divide h,l by d and return the result. */
26055714Skris/* I need to test this some more :-( */
26155714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
262296341Sdelphij{
263296341Sdelphij    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
264296341Sdelphij    int i, count = 2;
26555714Skris
266296341Sdelphij    if (d == 0)
267296341Sdelphij        return (BN_MASK2);
26855714Skris
269296341Sdelphij    i = BN_num_bits_word(d);
270296341Sdelphij    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
27168651Skris
272296341Sdelphij    i = BN_BITS2 - i;
273296341Sdelphij    if (h >= d)
274296341Sdelphij        h -= d;
27555714Skris
276296341Sdelphij    if (i) {
277296341Sdelphij        d <<= i;
278296341Sdelphij        h = (h << i) | (l >> (BN_BITS2 - i));
279296341Sdelphij        l <<= i;
280296341Sdelphij    }
281296341Sdelphij    dh = (d & BN_MASK2h) >> BN_BITS4;
282296341Sdelphij    dl = (d & BN_MASK2l);
283296341Sdelphij    for (;;) {
284296341Sdelphij        if ((h >> BN_BITS4) == dh)
285296341Sdelphij            q = BN_MASK2l;
286296341Sdelphij        else
287296341Sdelphij            q = h / dh;
28855714Skris
289296341Sdelphij        th = q * dh;
290296341Sdelphij        tl = dl * q;
291296341Sdelphij        for (;;) {
292296341Sdelphij            t = h - th;
293296341Sdelphij            if ((t & BN_MASK2h) ||
294296341Sdelphij                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
295296341Sdelphij                break;
296296341Sdelphij            q--;
297296341Sdelphij            th -= dh;
298296341Sdelphij            tl -= dl;
299296341Sdelphij        }
300296341Sdelphij        t = (tl >> BN_BITS4);
301296341Sdelphij        tl = (tl << BN_BITS4) & BN_MASK2h;
302296341Sdelphij        th += t;
30355714Skris
304296341Sdelphij        if (l < tl)
305296341Sdelphij            th++;
306296341Sdelphij        l -= tl;
307296341Sdelphij        if (h < th) {
308296341Sdelphij            h += d;
309296341Sdelphij            q--;
310296341Sdelphij        }
311296341Sdelphij        h -= th;
31255714Skris
313296341Sdelphij        if (--count == 0)
314296341Sdelphij            break;
31555714Skris
316296341Sdelphij        ret = q << BN_BITS4;
317296341Sdelphij        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
318296341Sdelphij        l = (l & BN_MASK2l) << BN_BITS4;
319296341Sdelphij    }
320296341Sdelphij    ret |= q;
321296341Sdelphij    return (ret);
322296341Sdelphij}
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
32455714Skris
32555714Skris#ifdef BN_LLONG
326296341SdelphijBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
327296341Sdelphij                      int n)
328296341Sdelphij{
329296341Sdelphij    BN_ULLONG ll = 0;
33055714Skris
331296341Sdelphij    assert(n >= 0);
332296341Sdelphij    if (n <= 0)
333296341Sdelphij        return ((BN_ULONG)0);
33455714Skris
335296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
336296341Sdelphij    while (n & ~3) {
337296341Sdelphij        ll += (BN_ULLONG) a[0] + b[0];
338296341Sdelphij        r[0] = (BN_ULONG)ll & BN_MASK2;
339296341Sdelphij        ll >>= BN_BITS2;
340296341Sdelphij        ll += (BN_ULLONG) a[1] + b[1];
341296341Sdelphij        r[1] = (BN_ULONG)ll & BN_MASK2;
342296341Sdelphij        ll >>= BN_BITS2;
343296341Sdelphij        ll += (BN_ULLONG) a[2] + b[2];
344296341Sdelphij        r[2] = (BN_ULONG)ll & BN_MASK2;
345296341Sdelphij        ll >>= BN_BITS2;
346296341Sdelphij        ll += (BN_ULLONG) a[3] + b[3];
347296341Sdelphij        r[3] = (BN_ULONG)ll & BN_MASK2;
348296341Sdelphij        ll >>= BN_BITS2;
349296341Sdelphij        a += 4;
350296341Sdelphij        b += 4;
351296341Sdelphij        r += 4;
352296341Sdelphij        n -= 4;
353296341Sdelphij    }
354296341Sdelphij# endif
355296341Sdelphij    while (n) {
356296341Sdelphij        ll += (BN_ULLONG) a[0] + b[0];
357296341Sdelphij        r[0] = (BN_ULONG)ll & BN_MASK2;
358296341Sdelphij        ll >>= BN_BITS2;
359296341Sdelphij        a++;
360296341Sdelphij        b++;
361296341Sdelphij        r++;
362296341Sdelphij        n--;
363296341Sdelphij    }
364296341Sdelphij    return ((BN_ULONG)ll);
365296341Sdelphij}
366296341Sdelphij#else                           /* !BN_LLONG */
367296341SdelphijBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
368296341Sdelphij                      int n)
369296341Sdelphij{
370296341Sdelphij    BN_ULONG c, l, t;
37155714Skris
372296341Sdelphij    assert(n >= 0);
373296341Sdelphij    if (n <= 0)
374296341Sdelphij        return ((BN_ULONG)0);
37555714Skris
376296341Sdelphij    c = 0;
377296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT
378296341Sdelphij    while (n & ~3) {
379296341Sdelphij        t = a[0];
380296341Sdelphij        t = (t + c) & BN_MASK2;
381296341Sdelphij        c = (t < c);
382296341Sdelphij        l = (t + b[0]) & BN_MASK2;
383296341Sdelphij        c += (l < t);
384296341Sdelphij        r[0] = l;
385296341Sdelphij        t = a[1];
386296341Sdelphij        t = (t + c) & BN_MASK2;
387296341Sdelphij        c = (t < c);
388296341Sdelphij        l = (t + b[1]) & BN_MASK2;
389296341Sdelphij        c += (l < t);
390296341Sdelphij        r[1] = l;
391296341Sdelphij        t = a[2];
392296341Sdelphij        t = (t + c) & BN_MASK2;
393296341Sdelphij        c = (t < c);
394296341Sdelphij        l = (t + b[2]) & BN_MASK2;
395296341Sdelphij        c += (l < t);
396296341Sdelphij        r[2] = l;
397296341Sdelphij        t = a[3];
398296341Sdelphij        t = (t + c) & BN_MASK2;
399296341Sdelphij        c = (t < c);
400296341Sdelphij        l = (t + b[3]) & BN_MASK2;
401296341Sdelphij        c += (l < t);
402296341Sdelphij        r[3] = l;
403296341Sdelphij        a += 4;
404296341Sdelphij        b += 4;
405296341Sdelphij        r += 4;
406296341Sdelphij        n -= 4;
407296341Sdelphij    }
408296341Sdelphij# endif
409296341Sdelphij    while (n) {
410296341Sdelphij        t = a[0];
411296341Sdelphij        t = (t + c) & BN_MASK2;
412296341Sdelphij        c = (t < c);
413296341Sdelphij        l = (t + b[0]) & BN_MASK2;
414296341Sdelphij        c += (l < t);
415296341Sdelphij        r[0] = l;
416296341Sdelphij        a++;
417296341Sdelphij        b++;
418296341Sdelphij        r++;
419296341Sdelphij        n--;
420296341Sdelphij    }
421296341Sdelphij    return ((BN_ULONG)c);
422296341Sdelphij}
423296341Sdelphij#endif                          /* !BN_LLONG */
42455714Skris
425296341SdelphijBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
426296341Sdelphij                      int n)
427296341Sdelphij{
428296341Sdelphij    BN_ULONG t1, t2;
429296341Sdelphij    int c = 0;
43055714Skris
431296341Sdelphij    assert(n >= 0);
432296341Sdelphij    if (n <= 0)
433296341Sdelphij        return ((BN_ULONG)0);
43455714Skris
435238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
436296341Sdelphij    while (n & ~3) {
437296341Sdelphij        t1 = a[0];
438296341Sdelphij        t2 = b[0];
439296341Sdelphij        r[0] = (t1 - t2 - c) & BN_MASK2;
440296341Sdelphij        if (t1 != t2)
441296341Sdelphij            c = (t1 < t2);
442296341Sdelphij        t1 = a[1];
443296341Sdelphij        t2 = b[1];
444296341Sdelphij        r[1] = (t1 - t2 - c) & BN_MASK2;
445296341Sdelphij        if (t1 != t2)
446296341Sdelphij            c = (t1 < t2);
447296341Sdelphij        t1 = a[2];
448296341Sdelphij        t2 = b[2];
449296341Sdelphij        r[2] = (t1 - t2 - c) & BN_MASK2;
450296341Sdelphij        if (t1 != t2)
451296341Sdelphij            c = (t1 < t2);
452296341Sdelphij        t1 = a[3];
453296341Sdelphij        t2 = b[3];
454296341Sdelphij        r[3] = (t1 - t2 - c) & BN_MASK2;
455296341Sdelphij        if (t1 != t2)
456296341Sdelphij            c = (t1 < t2);
457296341Sdelphij        a += 4;
458296341Sdelphij        b += 4;
459296341Sdelphij        r += 4;
460296341Sdelphij        n -= 4;
461296341Sdelphij    }
462238405Sjkim#endif
463296341Sdelphij    while (n) {
464296341Sdelphij        t1 = a[0];
465296341Sdelphij        t2 = b[0];
466296341Sdelphij        r[0] = (t1 - t2 - c) & BN_MASK2;
467296341Sdelphij        if (t1 != t2)
468296341Sdelphij            c = (t1 < t2);
469296341Sdelphij        a++;
470296341Sdelphij        b++;
471296341Sdelphij        r++;
472296341Sdelphij        n--;
473296341Sdelphij    }
474296341Sdelphij    return (c);
475296341Sdelphij}
47655714Skris
477238405Sjkim#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
47855714Skris
479296341Sdelphij# undef bn_mul_comba8
480296341Sdelphij# undef bn_mul_comba4
481296341Sdelphij# undef bn_sqr_comba8
482296341Sdelphij# undef bn_sqr_comba4
48355714Skris
/*
 * Column-accumulator helpers for the comba routines.  In every macro the
 * triple (c2,c1,c0) is a three-word accumulator, c0 being the least
 * significant word:
 *
 *   mul_add_c(a,b,c0,c1,c2)    -- c += a*b
 *   mul_add_c2(a,b,c0,c1,c2)   -- c += 2*a*b
 *   sqr_add_c(a,i,c0,c1,c2)    -- c += a[i]^2
 *   sqr_add_c2(a,i,j,c0,c1,c2) -- c += 2*a[i]*a[j]
 *
 * The macros use scratch variables that the calling function must declare:
 * t1 and t2 in every variant, plus t (and tt for mul_add_c2) under BN_LLONG,
 * or bl and bh in the plain half-word variant.
 *
 * Every multi-statement macro is wrapped in do { } while (0) so that an
 * invocation followed by ';' is exactly one statement, and every macro
 * argument is parenthesized in the expansion.
 *
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# ifdef BN_LLONG
#  define mul_add_c(a,b,c0,c1,c2) do { \
        t = (BN_ULLONG)(a) * (b); \
        t1 = (BN_ULONG)Lw(t); \
        t2 = (BN_ULONG)Hw(t); \
        (c0) = ((c0) + t1) & BN_MASK2; if ((c0) < t1) t2++; \
        (c1) = ((c1) + t2) & BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        t = (BN_ULLONG)(a) * (b); \
        tt = (t + t) & BN_MASK; \
        if (tt < t) (c2)++; \
        t1 = (BN_ULONG)Lw(tt); \
        t2 = (BN_ULONG)Hw(tt); \
        (c0) = ((c0) + t1) & BN_MASK2; \
        if (((c0) < t1) && (((++t2) & BN_MASK2) == 0)) (c2)++; \
        (c1) = ((c1) + t2) & BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        t = (BN_ULLONG)(a)[i] * (a)[i]; \
        t1 = (BN_ULONG)Lw(t); \
        t2 = (BN_ULONG)Hw(t); \
        (c0) = ((c0) + t1) & BN_MASK2; if ((c0) < t1) t2++; \
        (c1) = ((c1) + t2) & BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)

#  define mul_add_c(a,b,c0,c1,c2) do {          \
        BN_ULONG ta = (a), tb = (b);            \
        BN_UMULT_LOHI(t1,t2,ta,tb);             \
        (c0) += t1; t2 += ((c0) < t1) ? 1 : 0;  \
        (c1) += t2; (c2) += ((c1) < t2) ? 1 : 0; \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do {         \
        BN_ULONG ta = (a), tb = (b), t0;        \
        BN_UMULT_LOHI(t0,t1,ta,tb);             \
        (c0) += t0; t2 = t1 + (((c0) < t0) ? 1 : 0); \
        (c1) += t2; (c2) += ((c1) < t2) ? 1 : 0; \
        (c0) += t0; t1 += ((c0) < t0) ? 1 : 0;  \
        (c1) += t1; (c2) += ((c1) < t1) ? 1 : 0; \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do {          \
        BN_ULONG ta = (a)[i];                   \
        BN_UMULT_LOHI(t1,t2,ta,ta);             \
        (c0) += t1; t2 += ((c0) < t1) ? 1 : 0;  \
        (c1) += t2; (c2) += ((c1) < t2) ? 1 : 0; \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)

#  define mul_add_c(a,b,c0,c1,c2) do {          \
        BN_ULONG ta = (a), tb = (b);            \
        t1 = ta * tb;                           \
        t2 = BN_UMULT_HIGH(ta,tb);              \
        (c0) += t1; t2 += ((c0) < t1) ? 1 : 0;  \
        (c1) += t2; (c2) += ((c1) < t2) ? 1 : 0; \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do {         \
        BN_ULONG ta = (a), tb = (b), t0;        \
        t1 = BN_UMULT_HIGH(ta,tb);              \
        t0 = ta * tb;                           \
        (c0) += t0; t2 = t1 + (((c0) < t0) ? 1 : 0); \
        (c1) += t2; (c2) += ((c1) < t2) ? 1 : 0; \
        (c0) += t0; t1 += ((c0) < t0) ? 1 : 0;  \
        (c1) += t1; (c2) += ((c1) < t1) ? 1 : 0; \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do {          \
        BN_ULONG ta = (a)[i];                   \
        t1 = ta * ta;                           \
        t2 = BN_UMULT_HIGH(ta,ta);              \
        (c0) += t1; t2 += ((c0) < t1) ? 1 : 0;  \
        (c1) += t2; (c2) += ((c1) < t2) ? 1 : 0; \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else                          /* !BN_LLONG && !BN_UMULT_LOHI &&
                                 * !BN_UMULT_HIGH */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        t1 = LBITS(a); t2 = HBITS(a); \
        bl = LBITS(b); bh = HBITS(b); \
        mul64(t1,t2,bl,bh); \
        (c0) = ((c0) + t1) & BN_MASK2; if ((c0) < t1) t2++; \
        (c1) = ((c1) + t2) & BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        t1 = LBITS(a); t2 = HBITS(a); \
        bl = LBITS(b); bh = HBITS(b); \
        mul64(t1,t2,bl,bh); \
        if (t2 & BN_TBIT) (c2)++; \
        t2 = (t2 + t2) & BN_MASK2; \
        if (t1 & BN_TBIT) t2++; \
        t1 = (t1 + t1) & BN_MASK2; \
        (c0) = ((c0) + t1) & BN_MASK2; \
        if (((c0) < t1) && (((++t2) & BN_MASK2) == 0)) (c2)++; \
        (c1) = ((c1) + t2) & BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        sqr64(t1,t2,(a)[i]); \
        (c0) = ((c0) + t1) & BN_MASK2; if ((c0) < t1) t2++; \
        (c1) = ((c1) + t2) & BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* BN_LLONG / BN_UMULT_LOHI / BN_UMULT_HIGH */
61155714Skris
61255714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
613296341Sdelphij{
614296341Sdelphij# ifdef BN_LLONG
615296341Sdelphij    BN_ULLONG t;
616296341Sdelphij# else
617296341Sdelphij    BN_ULONG bl, bh;
618296341Sdelphij# endif
619296341Sdelphij    BN_ULONG t1, t2;
620296341Sdelphij    BN_ULONG c1, c2, c3;
62155714Skris
622296341Sdelphij    c1 = 0;
623296341Sdelphij    c2 = 0;
624296341Sdelphij    c3 = 0;
625296341Sdelphij    mul_add_c(a[0], b[0], c1, c2, c3);
626296341Sdelphij    r[0] = c1;
627296341Sdelphij    c1 = 0;
628296341Sdelphij    mul_add_c(a[0], b[1], c2, c3, c1);
629296341Sdelphij    mul_add_c(a[1], b[0], c2, c3, c1);
630296341Sdelphij    r[1] = c2;
631296341Sdelphij    c2 = 0;
632296341Sdelphij    mul_add_c(a[2], b[0], c3, c1, c2);
633296341Sdelphij    mul_add_c(a[1], b[1], c3, c1, c2);
634296341Sdelphij    mul_add_c(a[0], b[2], c3, c1, c2);
635296341Sdelphij    r[2] = c3;
636296341Sdelphij    c3 = 0;
637296341Sdelphij    mul_add_c(a[0], b[3], c1, c2, c3);
638296341Sdelphij    mul_add_c(a[1], b[2], c1, c2, c3);
639296341Sdelphij    mul_add_c(a[2], b[1], c1, c2, c3);
640296341Sdelphij    mul_add_c(a[3], b[0], c1, c2, c3);
641296341Sdelphij    r[3] = c1;
642296341Sdelphij    c1 = 0;
643296341Sdelphij    mul_add_c(a[4], b[0], c2, c3, c1);
644296341Sdelphij    mul_add_c(a[3], b[1], c2, c3, c1);
645296341Sdelphij    mul_add_c(a[2], b[2], c2, c3, c1);
646296341Sdelphij    mul_add_c(a[1], b[3], c2, c3, c1);
647296341Sdelphij    mul_add_c(a[0], b[4], c2, c3, c1);
648296341Sdelphij    r[4] = c2;
649296341Sdelphij    c2 = 0;
650296341Sdelphij    mul_add_c(a[0], b[5], c3, c1, c2);
651296341Sdelphij    mul_add_c(a[1], b[4], c3, c1, c2);
652296341Sdelphij    mul_add_c(a[2], b[3], c3, c1, c2);
653296341Sdelphij    mul_add_c(a[3], b[2], c3, c1, c2);
654296341Sdelphij    mul_add_c(a[4], b[1], c3, c1, c2);
655296341Sdelphij    mul_add_c(a[5], b[0], c3, c1, c2);
656296341Sdelphij    r[5] = c3;
657296341Sdelphij    c3 = 0;
658296341Sdelphij    mul_add_c(a[6], b[0], c1, c2, c3);
659296341Sdelphij    mul_add_c(a[5], b[1], c1, c2, c3);
660296341Sdelphij    mul_add_c(a[4], b[2], c1, c2, c3);
661296341Sdelphij    mul_add_c(a[3], b[3], c1, c2, c3);
662296341Sdelphij    mul_add_c(a[2], b[4], c1, c2, c3);
663296341Sdelphij    mul_add_c(a[1], b[5], c1, c2, c3);
664296341Sdelphij    mul_add_c(a[0], b[6], c1, c2, c3);
665296341Sdelphij    r[6] = c1;
666296341Sdelphij    c1 = 0;
667296341Sdelphij    mul_add_c(a[0], b[7], c2, c3, c1);
668296341Sdelphij    mul_add_c(a[1], b[6], c2, c3, c1);
669296341Sdelphij    mul_add_c(a[2], b[5], c2, c3, c1);
670296341Sdelphij    mul_add_c(a[3], b[4], c2, c3, c1);
671296341Sdelphij    mul_add_c(a[4], b[3], c2, c3, c1);
672296341Sdelphij    mul_add_c(a[5], b[2], c2, c3, c1);
673296341Sdelphij    mul_add_c(a[6], b[1], c2, c3, c1);
674296341Sdelphij    mul_add_c(a[7], b[0], c2, c3, c1);
675296341Sdelphij    r[7] = c2;
676296341Sdelphij    c2 = 0;
677296341Sdelphij    mul_add_c(a[7], b[1], c3, c1, c2);
678296341Sdelphij    mul_add_c(a[6], b[2], c3, c1, c2);
679296341Sdelphij    mul_add_c(a[5], b[3], c3, c1, c2);
680296341Sdelphij    mul_add_c(a[4], b[4], c3, c1, c2);
681296341Sdelphij    mul_add_c(a[3], b[5], c3, c1, c2);
682296341Sdelphij    mul_add_c(a[2], b[6], c3, c1, c2);
683296341Sdelphij    mul_add_c(a[1], b[7], c3, c1, c2);
684296341Sdelphij    r[8] = c3;
685296341Sdelphij    c3 = 0;
686296341Sdelphij    mul_add_c(a[2], b[7], c1, c2, c3);
687296341Sdelphij    mul_add_c(a[3], b[6], c1, c2, c3);
688296341Sdelphij    mul_add_c(a[4], b[5], c1, c2, c3);
689296341Sdelphij    mul_add_c(a[5], b[4], c1, c2, c3);
690296341Sdelphij    mul_add_c(a[6], b[3], c1, c2, c3);
691296341Sdelphij    mul_add_c(a[7], b[2], c1, c2, c3);
692296341Sdelphij    r[9] = c1;
693296341Sdelphij    c1 = 0;
694296341Sdelphij    mul_add_c(a[7], b[3], c2, c3, c1);
695296341Sdelphij    mul_add_c(a[6], b[4], c2, c3, c1);
696296341Sdelphij    mul_add_c(a[5], b[5], c2, c3, c1);
697296341Sdelphij    mul_add_c(a[4], b[6], c2, c3, c1);
698296341Sdelphij    mul_add_c(a[3], b[7], c2, c3, c1);
699296341Sdelphij    r[10] = c2;
700296341Sdelphij    c2 = 0;
701296341Sdelphij    mul_add_c(a[4], b[7], c3, c1, c2);
702296341Sdelphij    mul_add_c(a[5], b[6], c3, c1, c2);
703296341Sdelphij    mul_add_c(a[6], b[5], c3, c1, c2);
704296341Sdelphij    mul_add_c(a[7], b[4], c3, c1, c2);
705296341Sdelphij    r[11] = c3;
706296341Sdelphij    c3 = 0;
707296341Sdelphij    mul_add_c(a[7], b[5], c1, c2, c3);
708296341Sdelphij    mul_add_c(a[6], b[6], c1, c2, c3);
709296341Sdelphij    mul_add_c(a[5], b[7], c1, c2, c3);
710296341Sdelphij    r[12] = c1;
711296341Sdelphij    c1 = 0;
712296341Sdelphij    mul_add_c(a[6], b[7], c2, c3, c1);
713296341Sdelphij    mul_add_c(a[7], b[6], c2, c3, c1);
714296341Sdelphij    r[13] = c2;
715296341Sdelphij    c2 = 0;
716296341Sdelphij    mul_add_c(a[7], b[7], c3, c1, c2);
717296341Sdelphij    r[14] = c3;
718296341Sdelphij    r[15] = c1;
719296341Sdelphij}
72055714Skris
/*
 * bn_mul_comba4: combine the four BN_ULONG values a[0..3] with the four
 * BN_ULONG values b[0..3] and store eight result values in r[0..7].
 *
 * The result is produced one output index at a time, lowest index first.
 * For output index k, every pair a[i], b[j] with i + j == k is handed to
 * mul_add_c together with the three accumulator variables c1, c2 and c3.
 * After each index the first accumulator argument is stored to r[k] and
 * reset to zero, and the order of the three accumulator arguments is
 * rotated for the next index.
 *
 * NOTE(review): mul_add_c is defined outside this excerpt; presumably it
 * adds the double-width product of its first two arguments into the
 * three-word value formed by its last three arguments (lowest word first)
 * -- confirm against the macro definition.
 */
72155714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
722296341Sdelphij{
    /*
     * t, bl, bh, t1 and t2 are never named in this body; presumably they
     * are scratch variables referenced by the mul_add_c expansion, so
     * their names must stay as they are -- TODO confirm.
     */
723296341Sdelphij# ifdef BN_LLONG
724296341Sdelphij    BN_ULLONG t;
725296341Sdelphij# else
726296341Sdelphij    BN_ULONG bl, bh;
727296341Sdelphij# endif
728296341Sdelphij    BN_ULONG t1, t2;
729296341Sdelphij    BN_ULONG c1, c2, c3;
73055714Skris
731296341Sdelphij    c1 = 0;
732296341Sdelphij    c2 = 0;
733296341Sdelphij    c3 = 0;
    /* output index 0: the single pair with i + j == 0 */
734296341Sdelphij    mul_add_c(a[0], b[0], c1, c2, c3);
735296341Sdelphij    r[0] = c1;
736296341Sdelphij    c1 = 0;
    /* output index 1: accumulator order rotates to (c2, c3, c1) */
737296341Sdelphij    mul_add_c(a[0], b[1], c2, c3, c1);
738296341Sdelphij    mul_add_c(a[1], b[0], c2, c3, c1);
739296341Sdelphij    r[1] = c2;
740296341Sdelphij    c2 = 0;
    /* output index 2 */
741296341Sdelphij    mul_add_c(a[2], b[0], c3, c1, c2);
742296341Sdelphij    mul_add_c(a[1], b[1], c3, c1, c2);
743296341Sdelphij    mul_add_c(a[0], b[2], c3, c1, c2);
744296341Sdelphij    r[2] = c3;
745296341Sdelphij    c3 = 0;
    /* output index 3: the widest column, four pairs */
746296341Sdelphij    mul_add_c(a[0], b[3], c1, c2, c3);
747296341Sdelphij    mul_add_c(a[1], b[2], c1, c2, c3);
748296341Sdelphij    mul_add_c(a[2], b[1], c1, c2, c3);
749296341Sdelphij    mul_add_c(a[3], b[0], c1, c2, c3);
750296341Sdelphij    r[3] = c1;
751296341Sdelphij    c1 = 0;
    /* output index 4 */
752296341Sdelphij    mul_add_c(a[3], b[1], c2, c3, c1);
753296341Sdelphij    mul_add_c(a[2], b[2], c2, c3, c1);
754296341Sdelphij    mul_add_c(a[1], b[3], c2, c3, c1);
755296341Sdelphij    r[4] = c2;
756296341Sdelphij    c2 = 0;
    /* output index 5 */
757296341Sdelphij    mul_add_c(a[2], b[3], c3, c1, c2);
758296341Sdelphij    mul_add_c(a[3], b[2], c3, c1, c2);
759296341Sdelphij    r[5] = c3;
760296341Sdelphij    c3 = 0;
    /*
     * output index 6: last pair.  r[6] takes the first accumulator and
     * r[7] the second; the third (c3) is not stored.
     */
761296341Sdelphij    mul_add_c(a[3], b[3], c1, c2, c3);
762296341Sdelphij    r[6] = c1;
763296341Sdelphij    r[7] = c2;
764296341Sdelphij}
76555714Skris
/*
 * bn_sqr_comba8: compute sixteen result values from the eight BN_ULONG
 * values a[0..7] and store them in r[0..15].
 *
 * Output indices are handled lowest first.  For output index k the body
 * issues sqr_add_c(a, i, ...) when 2 * i == k, and sqr_add_c2(a, i, j, ...)
 * once for each index pair i > j with i + j == k.  The three accumulator
 * variables c1, c2 and c3 are passed in rotating order: after each output
 * index the first accumulator argument is stored to r[k] and reset to zero.
 *
 * NOTE(review): sqr_add_c and sqr_add_c2 are defined outside this excerpt;
 * presumably they add a[i] * a[i] and 2 * a[i] * a[j] respectively into the
 * three-word accumulator given by their last three arguments -- confirm
 * against the macro definitions.
 */
766109998Smarkmvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
767296341Sdelphij{
    /*
     * t, tt, bl, bh, t1 and t2 are never named in this body; presumably
     * they are scratch variables referenced by the macro expansions, so
     * their names must stay as they are -- TODO confirm.
     */
768296341Sdelphij# ifdef BN_LLONG
769296341Sdelphij    BN_ULLONG t, tt;
770296341Sdelphij# else
771296341Sdelphij    BN_ULONG bl, bh;
772296341Sdelphij# endif
773296341Sdelphij    BN_ULONG t1, t2;
774296341Sdelphij    BN_ULONG c1, c2, c3;
77555714Skris
776296341Sdelphij    c1 = 0;
777296341Sdelphij    c2 = 0;
778296341Sdelphij    c3 = 0;
    /* output index 0 */
779296341Sdelphij    sqr_add_c(a, 0, c1, c2, c3);
780296341Sdelphij    r[0] = c1;
781296341Sdelphij    c1 = 0;
    /* output index 1 */
782296341Sdelphij    sqr_add_c2(a, 1, 0, c2, c3, c1);
783296341Sdelphij    r[1] = c2;
784296341Sdelphij    c2 = 0;
    /* output index 2 */
785296341Sdelphij    sqr_add_c(a, 1, c3, c1, c2);
786296341Sdelphij    sqr_add_c2(a, 2, 0, c3, c1, c2);
787296341Sdelphij    r[2] = c3;
788296341Sdelphij    c3 = 0;
    /* output index 3 */
789296341Sdelphij    sqr_add_c2(a, 3, 0, c1, c2, c3);
790296341Sdelphij    sqr_add_c2(a, 2, 1, c1, c2, c3);
791296341Sdelphij    r[3] = c1;
792296341Sdelphij    c1 = 0;
    /* output index 4 */
793296341Sdelphij    sqr_add_c(a, 2, c2, c3, c1);
794296341Sdelphij    sqr_add_c2(a, 3, 1, c2, c3, c1);
795296341Sdelphij    sqr_add_c2(a, 4, 0, c2, c3, c1);
796296341Sdelphij    r[4] = c2;
797296341Sdelphij    c2 = 0;
    /* output index 5 */
798296341Sdelphij    sqr_add_c2(a, 5, 0, c3, c1, c2);
799296341Sdelphij    sqr_add_c2(a, 4, 1, c3, c1, c2);
800296341Sdelphij    sqr_add_c2(a, 3, 2, c3, c1, c2);
801296341Sdelphij    r[5] = c3;
802296341Sdelphij    c3 = 0;
    /* output index 6 */
803296341Sdelphij    sqr_add_c(a, 3, c1, c2, c3);
804296341Sdelphij    sqr_add_c2(a, 4, 2, c1, c2, c3);
805296341Sdelphij    sqr_add_c2(a, 5, 1, c1, c2, c3);
806296341Sdelphij    sqr_add_c2(a, 6, 0, c1, c2, c3);
807296341Sdelphij    r[6] = c1;
808296341Sdelphij    c1 = 0;
    /* output index 7 */
809296341Sdelphij    sqr_add_c2(a, 7, 0, c2, c3, c1);
810296341Sdelphij    sqr_add_c2(a, 6, 1, c2, c3, c1);
811296341Sdelphij    sqr_add_c2(a, 5, 2, c2, c3, c1);
812296341Sdelphij    sqr_add_c2(a, 4, 3, c2, c3, c1);
813296341Sdelphij    r[7] = c2;
814296341Sdelphij    c2 = 0;
    /* output index 8 */
815296341Sdelphij    sqr_add_c(a, 4, c3, c1, c2);
816296341Sdelphij    sqr_add_c2(a, 5, 3, c3, c1, c2);
817296341Sdelphij    sqr_add_c2(a, 6, 2, c3, c1, c2);
818296341Sdelphij    sqr_add_c2(a, 7, 1, c3, c1, c2);
819296341Sdelphij    r[8] = c3;
820296341Sdelphij    c3 = 0;
    /* output index 9 */
821296341Sdelphij    sqr_add_c2(a, 7, 2, c1, c2, c3);
822296341Sdelphij    sqr_add_c2(a, 6, 3, c1, c2, c3);
823296341Sdelphij    sqr_add_c2(a, 5, 4, c1, c2, c3);
824296341Sdelphij    r[9] = c1;
825296341Sdelphij    c1 = 0;
    /* output index 10 */
826296341Sdelphij    sqr_add_c(a, 5, c2, c3, c1);
827296341Sdelphij    sqr_add_c2(a, 6, 4, c2, c3, c1);
828296341Sdelphij    sqr_add_c2(a, 7, 3, c2, c3, c1);
829296341Sdelphij    r[10] = c2;
830296341Sdelphij    c2 = 0;
    /* output index 11 */
831296341Sdelphij    sqr_add_c2(a, 7, 4, c3, c1, c2);
832296341Sdelphij    sqr_add_c2(a, 6, 5, c3, c1, c2);
833296341Sdelphij    r[11] = c3;
834296341Sdelphij    c3 = 0;
    /* output index 12 */
835296341Sdelphij    sqr_add_c(a, 6, c1, c2, c3);
836296341Sdelphij    sqr_add_c2(a, 7, 5, c1, c2, c3);
837296341Sdelphij    r[12] = c1;
838296341Sdelphij    c1 = 0;
    /* output index 13 */
839296341Sdelphij    sqr_add_c2(a, 7, 6, c2, c3, c1);
840296341Sdelphij    r[13] = c2;
841296341Sdelphij    c2 = 0;
    /*
     * output index 14: last term.  r[14] takes the first accumulator and
     * r[15] the second; the third (c2) is not stored.
     */
842296341Sdelphij    sqr_add_c(a, 7, c3, c1, c2);
843296341Sdelphij    r[14] = c3;
844296341Sdelphij    r[15] = c1;
845296341Sdelphij}
84655714Skris
/*
 * bn_sqr_comba4: compute eight result values from the four BN_ULONG values
 * a[0..3] and store them in r[0..7].
 *
 * Output indices are handled lowest first.  For output index k the body
 * issues sqr_add_c(a, i, ...) when 2 * i == k, and sqr_add_c2(a, i, j, ...)
 * once for each index pair i > j with i + j == k.  The three accumulator
 * variables c1, c2 and c3 are passed in rotating order: after each output
 * index the first accumulator argument is stored to r[k] and reset to zero.
 *
 * NOTE(review): sqr_add_c and sqr_add_c2 are defined outside this excerpt;
 * presumably they add a[i] * a[i] and 2 * a[i] * a[j] respectively into the
 * three-word accumulator given by their last three arguments -- confirm
 * against the macro definitions.
 */
847109998Smarkmvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
848296341Sdelphij{
    /*
     * t, tt, bl, bh, t1 and t2 are never named in this body; presumably
     * they are scratch variables referenced by the macro expansions, so
     * their names must stay as they are -- TODO confirm.
     */
849296341Sdelphij# ifdef BN_LLONG
850296341Sdelphij    BN_ULLONG t, tt;
851296341Sdelphij# else
852296341Sdelphij    BN_ULONG bl, bh;
853296341Sdelphij# endif
854296341Sdelphij    BN_ULONG t1, t2;
855296341Sdelphij    BN_ULONG c1, c2, c3;
85655714Skris
857296341Sdelphij    c1 = 0;
858296341Sdelphij    c2 = 0;
859296341Sdelphij    c3 = 0;
    /* output index 0 */
860296341Sdelphij    sqr_add_c(a, 0, c1, c2, c3);
861296341Sdelphij    r[0] = c1;
862296341Sdelphij    c1 = 0;
    /* output index 1 */
863296341Sdelphij    sqr_add_c2(a, 1, 0, c2, c3, c1);
864296341Sdelphij    r[1] = c2;
865296341Sdelphij    c2 = 0;
    /* output index 2 */
866296341Sdelphij    sqr_add_c(a, 1, c3, c1, c2);
867296341Sdelphij    sqr_add_c2(a, 2, 0, c3, c1, c2);
868296341Sdelphij    r[2] = c3;
869296341Sdelphij    c3 = 0;
    /* output index 3 */
870296341Sdelphij    sqr_add_c2(a, 3, 0, c1, c2, c3);
871296341Sdelphij    sqr_add_c2(a, 2, 1, c1, c2, c3);
872296341Sdelphij    r[3] = c1;
873296341Sdelphij    c1 = 0;
    /* output index 4 */
874296341Sdelphij    sqr_add_c(a, 2, c2, c3, c1);
875296341Sdelphij    sqr_add_c2(a, 3, 1, c2, c3, c1);
876296341Sdelphij    r[4] = c2;
877296341Sdelphij    c2 = 0;
    /* output index 5 */
878296341Sdelphij    sqr_add_c2(a, 3, 2, c3, c1, c2);
879296341Sdelphij    r[5] = c3;
880296341Sdelphij    c3 = 0;
    /*
     * output index 6: last term.  r[6] takes the first accumulator and
     * r[7] the second; the third (c3) is not stored.
     */
881296341Sdelphij    sqr_add_c(a, 3, c1, c2, c3);
882296341Sdelphij    r[6] = c1;
883296341Sdelphij    r[7] = c2;
884296341Sdelphij}
885238405Sjkim
886296341Sdelphij# ifdef OPENSSL_NO_ASM
887296341Sdelphij#  ifdef OPENSSL_BN_ASM_MONT
888296341Sdelphij#   include <alloca.h>
889238405Sjkim/*
890238405Sjkim * This is essentially a reference implementation, which may or may not
891238405Sjkim * result in a performance improvement. E.g. on IA-32 this routine was
892238405Sjkim * observed to give 40% faster rsa1024 private key operations and 10%
893238405Sjkim * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
894238405Sjkim * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
895238405Sjkim * reference implementation, one to be used as a starting point for
896238405Sjkim * platform-specific assembler. The numbers mentioned apply to compiler
897238405Sjkim * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
898238405Sjkim * can vary not only from platform to platform, but even between compiler
899238405Sjkim * versions. Assembler vs. assembler improvement coefficients can
900238405Sjkim * [and are known to] differ and are to be documented elsewhere.
901238405Sjkim */
902296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
903296341Sdelphij                const BN_ULONG *np, const BN_ULONG *n0p, int num)
904296341Sdelphij{
    /*
     * Parameters, as used by this body:
     *   rp  - output; indexed 0..num-1
     *   ap  - first operand; indexed 0..num-1
     *   bp  - second operand; one element bp[i] is consumed per round
     *   np  - indexed 0..num-1; combined with tp[] every round and
     *         conditionally subtracted at the end (presumably the
     *         modulus -- confirm against callers)
     *   n0p - points at one BN_ULONG, read once into n0 and multiplied
     *         with tp[0] in every round
     *   num - element count of rp/ap/bp/np
     * Always returns 1.
     *
     * NOTE(review): by name and structure this looks like word-by-word
     * Montgomery multiplication; that reading, and the exact contract of
     * the mul/mul_add macros and of bn_sub_words (all defined outside
     * this excerpt), should be confirmed before relying on it.
     */
905296341Sdelphij    BN_ULONG c0, c1, ml, *tp, n0;
906296341Sdelphij#   ifdef mul64
907296341Sdelphij    BN_ULONG mh;
908296341Sdelphij#   endif
909296341Sdelphij    volatile BN_ULONG *vp;
910296341Sdelphij    int i = 0, j;
911238405Sjkim
912296341Sdelphij#   if 0                        /* template for platform-specific
913296341Sdelphij                                 * implementation */
914296341Sdelphij    if (ap == bp)
915296341Sdelphij        return bn_sqr_mont(rp, ap, np, n0p, num);
916296341Sdelphij#   endif
    /*
     * Scratch buffer of num + 2 elements obtained with alloca.  vp is a
     * volatile alias of the same buffer; it is used at the end to zero
     * the scratch before returning.
     */
917296341Sdelphij    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
918238405Sjkim
919296341Sdelphij    n0 = *n0p;
920238405Sjkim
    /*
     * First row: tp[0..num-1] is filled from ap[] and bp[0] through the
     * mul macro, with c0 as the running carry.  When mul64 is defined the
     * multiplier is first split into mh/ml with HBITS/LBITS.
     */
921296341Sdelphij    c0 = 0;
922296341Sdelphij    ml = bp[0];
923296341Sdelphij#   ifdef mul64
924296341Sdelphij    mh = HBITS(ml);
925296341Sdelphij    ml = LBITS(ml);
926296341Sdelphij    for (j = 0; j < num; ++j)
927296341Sdelphij        mul(tp[j], ap[j], ml, mh, c0);
928296341Sdelphij#   else
929296341Sdelphij    for (j = 0; j < num; ++j)
930296341Sdelphij        mul(tp[j], ap[j], ml, c0);
931296341Sdelphij#   endif
932238405Sjkim
933296341Sdelphij    tp[num] = c0;
934296341Sdelphij    tp[num + 1] = 0;
    /*
     * Jump into the middle of the loop body below.  The for-initializer
     * is skipped, but i is already 0 from its declaration, so the first
     * round bypasses the mul_add row (the row for bp[0] was just produced
     * by mul above) and continues with the np[] step.
     */
935296341Sdelphij    goto enter;
936238405Sjkim
937296341Sdelphij    for (i = 0; i < num; i++) {
        /* Row i: accumulate ap[] and bp[i] into tp[0..num-1]. */
938296341Sdelphij        c0 = 0;
939296341Sdelphij        ml = bp[i];
940296341Sdelphij#   ifdef mul64
941296341Sdelphij        mh = HBITS(ml);
942296341Sdelphij        ml = LBITS(ml);
943296341Sdelphij        for (j = 0; j < num; ++j)
944296341Sdelphij            mul_add(tp[j], ap[j], ml, mh, c0);
945296341Sdelphij#   else
946296341Sdelphij        for (j = 0; j < num; ++j)
947296341Sdelphij            mul_add(tp[j], ap[j], ml, c0);
948296341Sdelphij#   endif
        /*
         * Add the row carry into tp[num]; tp[num + 1] records whether
         * that addition wrapped (c1 < c0).
         */
949296341Sdelphij        c1 = (tp[num] + c0) & BN_MASK2;
950296341Sdelphij        tp[num] = c1;
951296341Sdelphij        tp[num + 1] = (c1 < c0 ? 1 : 0);
952296341Sdelphij enter:
        /*
         * ml = tp[0] * n0, masked with BN_MASK2.  The mul_add with np[0]
         * works on the local copy c1 only: c1 is overwritten before it is
         * ever stored, so only the carry c0 survives this step.
         */
953296341Sdelphij        c1 = tp[0];
954296341Sdelphij        ml = (c1 * n0) & BN_MASK2;
955296341Sdelphij        c0 = 0;
956296341Sdelphij#   ifdef mul64
957296341Sdelphij        mh = HBITS(ml);
958296341Sdelphij        ml = LBITS(ml);
959296341Sdelphij        mul_add(c1, np[0], ml, mh, c0);
960296341Sdelphij#   else
961296341Sdelphij        mul_add(c1, ml, np[0], c0);
962296341Sdelphij#   endif
        /*
         * Remaining elements: each updated value is written one slot
         * lower (tp[j - 1]), which shifts the buffer down by one element.
         */
963296341Sdelphij        for (j = 1; j < num; j++) {
964296341Sdelphij            c1 = tp[j];
965296341Sdelphij#   ifdef mul64
966296341Sdelphij            mul_add(c1, np[j], ml, mh, c0);
967296341Sdelphij#   else
968296341Sdelphij            mul_add(c1, ml, np[j], c0);
969296341Sdelphij#   endif
970296341Sdelphij            tp[j - 1] = c1 & BN_MASK2;
971296341Sdelphij        }
        /*
         * Fold the final carry and the two top slots: tp[num - 1] gets
         * tp[num] + c0, and tp[num] gets tp[num + 1] plus the wrap of
         * that addition.
         */
972296341Sdelphij        c1 = (tp[num] + c0) & BN_MASK2;
973296341Sdelphij        tp[num - 1] = c1;
974296341Sdelphij        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
975296341Sdelphij    }
976238405Sjkim
    /*
     * If the extra top slot is non-zero, or the top element is at least
     * np's top element, write tp - np into rp.  That result is kept when
     * tp[num] != 0 or bn_sub_words returned 0 (presumably "no borrow" --
     * confirm); the scratch is then zeroed through vp and 1 is returned.
     */
977296341Sdelphij    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
978296341Sdelphij        c0 = bn_sub_words(rp, tp, np, num);
979296341Sdelphij        if (tp[num] != 0 || c0 == 0) {
980296341Sdelphij            for (i = 0; i < num + 2; i++)
981296341Sdelphij                vp[i] = 0;
982296341Sdelphij            return 1;
983296341Sdelphij        }
984296341Sdelphij    }
    /*
     * Otherwise copy tp[0..num-1] to rp (overwriting anything the
     * subtraction above may have written), zeroing each scratch slot as
     * it is copied, then zero the two top slots.
     */
985296341Sdelphij    for (i = 0; i < num; i++)
986296341Sdelphij        rp[i] = tp[i], vp[i] = 0;
987296341Sdelphij    vp[num] = 0;
988296341Sdelphij    vp[num + 1] = 0;
989296341Sdelphij    return 1;
990296341Sdelphij}
991296341Sdelphij#  else
992238405Sjkim/*
993238405Sjkim * A return value of 0 indicates that the multiplication/convolution was
994238405Sjkim * not performed, signalling the caller to fall back to the
995238405Sjkim * alternative/original code path.
996238405Sjkim */
997296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
998296341Sdelphij                const BN_ULONG *np, const BN_ULONG *n0, int num)
999296341Sdelphij{
    /*
     * Stub selected when OPENSSL_NO_ASM is defined but
     * OPENSSL_BN_ASM_MONT is not: no argument is read or written and the
     * result is always 0.
     */
1000296341Sdelphij    return 0;
1001296341Sdelphij}
1002296341Sdelphij#  endif                        /* OPENSSL_BN_ASM_MONT */
1003296341Sdelphij# endif
1004238405Sjkim
1005296341Sdelphij#else                           /* !BN_MUL_COMBA */
100655714Skris
100755714Skris/* hmm... is it faster just to do a multiply? */
1008296341Sdelphij# undef bn_sqr_comba4
/*
 * bn_sqr_comba4, variant used in the !BN_MUL_COMBA branch of this file:
 * forwards to bn_sqr_normal with an element count of 4 and a local
 * 8-element BN_ULONG array as the fourth argument.
 *
 * NOTE(review): bn_sqr_normal is defined outside this excerpt; presumably
 * it writes the 8-element result to r and uses t as scratch space --
 * confirm against its definition.
 */
1009238405Sjkimvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1010296341Sdelphij{
1011296341Sdelphij    BN_ULONG t[8];
1012296341Sdelphij    bn_sqr_normal(r, a, 4, t);
1013296341Sdelphij}
101455714Skris
1015296341Sdelphij# undef bn_sqr_comba8
/*
 * bn_sqr_comba8, variant used in the !BN_MUL_COMBA branch of this file:
 * forwards to bn_sqr_normal with an element count of 8 and a local
 * 16-element BN_ULONG array as the fourth argument.
 *
 * NOTE(review): bn_sqr_normal is defined outside this excerpt; presumably
 * it writes the 16-element result to r and uses t as scratch space --
 * confirm against its definition.
 */
1016238405Sjkimvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1017296341Sdelphij{
1018296341Sdelphij    BN_ULONG t[16];
1019296341Sdelphij    bn_sqr_normal(r, a, 8, t);
1020296341Sdelphij}
102155714Skris
/*
 * bn_mul_comba4, variant used in the !BN_MUL_COMBA branch of this file:
 * builds r[0..7] from a[0..3] and b[0..3] one element of b at a time.
 *
 * The first call is bn_mul_words on r[0..]; each later call is
 * bn_mul_add_words on a window of r that starts one element higher than
 * the previous one.  The value returned by every call is stored in the
 * slot just above that call's 4-element window (r[4], r[5], r[6], r[7]).
 *
 * NOTE(review): bn_mul_words / bn_mul_add_words are defined outside this
 * excerpt; presumably they write (respectively add) a[0..3] * b[i] to the
 * window and return the carry word -- confirm against their definitions.
 */
102255714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1023296341Sdelphij{
1024296341Sdelphij    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1025296341Sdelphij    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1026296341Sdelphij    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1027296341Sdelphij    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1028296341Sdelphij}
102955714Skris
/*
 * bn_mul_comba8, variant used in the !BN_MUL_COMBA branch of this file:
 * builds r[0..15] from a[0..7] and b[0..7] one element of b at a time.
 *
 * The first call is bn_mul_words on r[0..]; each later call is
 * bn_mul_add_words on a window of r that starts one element higher than
 * the previous one.  The value returned by every call is stored in the
 * slot just above that call's 8-element window (r[8] through r[15]).
 *
 * NOTE(review): bn_mul_words / bn_mul_add_words are defined outside this
 * excerpt; presumably they write (respectively add) a[0..7] * b[i] to the
 * window and return the carry word -- confirm against their definitions.
 */
103055714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1031296341Sdelphij{
1032296341Sdelphij    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1033296341Sdelphij    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1034296341Sdelphij    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1035296341Sdelphij    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1036296341Sdelphij    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1037296341Sdelphij    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1038296341Sdelphij    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1039296341Sdelphij    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1040296341Sdelphij}
104155714Skris
1042296341Sdelphij# ifdef OPENSSL_NO_ASM
1043296341Sdelphij#  ifdef OPENSSL_BN_ASM_MONT
1044296341Sdelphij#   include <alloca.h>
/*
 * bn_mul_mont, word-loop variant used in the !BN_MUL_COMBA branch when
 * both OPENSSL_NO_ASM and OPENSSL_BN_ASM_MONT are defined.
 *
 *   rp  - output; indexed 0..num-1
 *   ap  - first operand; passed with length num to bn_mul_add_words
 *   bp  - second operand; one element bp[i] is consumed per round
 *   np  - passed with length num to bn_mul_add_words every round and
 *         conditionally subtracted at the end (presumably the modulus --
 *         confirm against callers)
 *   n0p - points at one BN_ULONG, read once into n0 and multiplied with
 *         tp[0] in every round
 *   num - element count of rp/ap/bp/np
 *
 * Always returns 1.
 *
 * NOTE(review): by name and structure this looks like word-by-word
 * Montgomery multiplication; that reading, and the exact contract of
 * bn_mul_add_words and bn_sub_words (defined outside this excerpt),
 * should be confirmed before relying on it.
 */
1045296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1046296341Sdelphij                const BN_ULONG *np, const BN_ULONG *n0p, int num)
1047296341Sdelphij{
1048296341Sdelphij    BN_ULONG c0, c1, *tp, n0 = *n0p;
1049296341Sdelphij    volatile BN_ULONG *vp;
1050296341Sdelphij    int i = 0, j;
1051238405Sjkim
    /*
     * Scratch buffer of num + 2 elements obtained with alloca.  vp is a
     * volatile alias of the same buffer; it is used at the end to zero
     * the scratch before returning.
     */
1052296341Sdelphij    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1053238405Sjkim
    /*
     * Clear tp[0..num].  tp[num + 1] is not cleared here; it is assigned
     * in every round below before it is read.
     */
1054296341Sdelphij    for (i = 0; i <= num; i++)
1055296341Sdelphij        tp[i] = 0;
1056238405Sjkim
1057296341Sdelphij    for (i = 0; i < num; i++) {
        /*
         * Step 1: combine ap[] and bp[i] into tp[0..num-1]; add the
         * returned value c0 into tp[num] and record in tp[num + 1]
         * whether that addition wrapped (c1 < c0).
         */
1058296341Sdelphij        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1059296341Sdelphij        c1 = (tp[num] + c0) & BN_MASK2;
1060296341Sdelphij        tp[num] = c1;
1061296341Sdelphij        tp[num + 1] = (c1 < c0 ? 1 : 0);
1062238405Sjkim
        /*
         * Step 2: same pattern with np[] and the multiplier tp[0] * n0;
         * this time the wrap is added to tp[num + 1] rather than
         * assigned.
         */
1063296341Sdelphij        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1064296341Sdelphij        c1 = (tp[num] + c0) & BN_MASK2;
1065296341Sdelphij        tp[num] = c1;
1066296341Sdelphij        tp[num + 1] += (c1 < c0 ? 1 : 0);
        /* Step 3: shift the whole buffer down by one element. */
1067296341Sdelphij        for (j = 0; j <= num; j++)
1068296341Sdelphij            tp[j] = tp[j + 1];
1069296341Sdelphij    }
1070238405Sjkim
    /*
     * If the extra top slot is non-zero, or the top element is at least
     * np's top element, write tp - np into rp.  That result is kept when
     * tp[num] != 0 or bn_sub_words returned 0 (presumably "no borrow" --
     * confirm); the scratch is then zeroed through vp and 1 is returned.
     */
1071296341Sdelphij    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1072296341Sdelphij        c0 = bn_sub_words(rp, tp, np, num);
1073296341Sdelphij        if (tp[num] != 0 || c0 == 0) {
1074296341Sdelphij            for (i = 0; i < num + 2; i++)
1075296341Sdelphij                vp[i] = 0;
1076296341Sdelphij            return 1;
1077296341Sdelphij        }
1078296341Sdelphij    }
    /*
     * Otherwise copy tp[0..num-1] to rp (overwriting anything the
     * subtraction above may have written), zeroing each scratch slot as
     * it is copied, then zero the two top slots.
     */
1079296341Sdelphij    for (i = 0; i < num; i++)
1080296341Sdelphij        rp[i] = tp[i], vp[i] = 0;
1081296341Sdelphij    vp[num] = 0;
1082296341Sdelphij    vp[num + 1] = 0;
1083296341Sdelphij    return 1;
1084296341Sdelphij}
1085296341Sdelphij#  else
/*
 * Stub bn_mul_mont for the !BN_MUL_COMBA branch, selected when
 * OPENSSL_NO_ASM is defined but OPENSSL_BN_ASM_MONT is not: no argument is
 * read or written and the result is always 0.
 *
 * NOTE(review): the identical stub in the BN_MUL_COMBA branch of this file
 * is commented as returning 0 to tell the caller that no multiplication
 * was performed; presumably the same meaning applies here.
 */
1086296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1087296341Sdelphij                const BN_ULONG *np, const BN_ULONG *n0, int num)
1088296341Sdelphij{
1089296341Sdelphij    return 0;
1090296341Sdelphij}
1091296341Sdelphij#  endif                        /* OPENSSL_BN_ASM_MONT */
1092296341Sdelphij# endif
1093238405Sjkim
1094296341Sdelphij#endif                          /* !BN_MUL_COMBA */
1095