155714Skris/* crypto/bn/bn_asm.c */
255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
355714Skris * All rights reserved.
455714Skris *
555714Skris * This package is an SSL implementation written
655714Skris * by Eric Young (eay@cryptsoft.com).
755714Skris * The implementation was written so as to conform with Netscapes SSL.
8280297Sjkim *
955714Skris * This library is free for commercial and non-commercial use as long as
1055714Skris * the following conditions are aheared to.  The following conditions
1155714Skris * apply to all code found in this distribution, be it the RC4, RSA,
1255714Skris * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
1355714Skris * included with this distribution is covered by the same copyright terms
1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15280297Sjkim *
1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in
1755714Skris * the code are not to be removed.
1855714Skris * If this package is used in a product, Eric Young should be given attribution
1955714Skris * as the author of the parts of the library used.
2055714Skris * This can be in the form of a textual message at program startup or
2155714Skris * in documentation (online or textual) provided with the package.
22280297Sjkim *
2355714Skris * Redistribution and use in source and binary forms, with or without
2455714Skris * modification, are permitted provided that the following conditions
2555714Skris * are met:
2655714Skris * 1. Redistributions of source code must retain the copyright
2755714Skris *    notice, this list of conditions and the following disclaimer.
2855714Skris * 2. Redistributions in binary form must reproduce the above copyright
2955714Skris *    notice, this list of conditions and the following disclaimer in the
3055714Skris *    documentation and/or other materials provided with the distribution.
3155714Skris * 3. All advertising materials mentioning features or use of this software
3255714Skris *    must display the following acknowledgement:
3355714Skris *    "This product includes cryptographic software written by
3455714Skris *     Eric Young (eay@cryptsoft.com)"
3555714Skris *    The word 'cryptographic' can be left out if the rouines from the library
3655714Skris *    being used are not cryptographic related :-).
37280297Sjkim * 4. If you include any Windows specific code (or a derivative thereof) from
3855714Skris *    the apps directory (application code) you must include an acknowledgement:
3955714Skris *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40280297Sjkim *
4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4455714Skris * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
5155714Skris * SUCH DAMAGE.
52280297Sjkim *
5355714Skris * The licence and distribution terms for any publically available version or
5455714Skris * derivative of this code cannot be changed.  i.e. this code cannot simply be
5555714Skris * copied and put under another distribution licence
5655714Skris * [including the GNU Public Licence.]
5755714Skris */
5855714Skris
5959191Skris#ifndef BN_DEBUG
60280297Sjkim# undef NDEBUG                  /* avoid conflicting definitions */
6159191Skris# define NDEBUG
6259191Skris#endif
6359191Skris
6455714Skris#include <stdio.h>
6559191Skris#include <assert.h>
6655714Skris#include "cryptlib.h"
6755714Skris#include "bn_lcl.h"
6855714Skris
6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
7055714Skris
71280297SjkimBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
72280297Sjkim                          BN_ULONG w)
73280297Sjkim{
74280297Sjkim    BN_ULONG c1 = 0;
7555714Skris
76280297Sjkim    assert(num >= 0);
77280297Sjkim    if (num <= 0)
78280297Sjkim        return (c1);
7955714Skris
80280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
81280297Sjkim    while (num & ~3) {
82280297Sjkim        mul_add(rp[0], ap[0], w, c1);
83280297Sjkim        mul_add(rp[1], ap[1], w, c1);
84280297Sjkim        mul_add(rp[2], ap[2], w, c1);
85280297Sjkim        mul_add(rp[3], ap[3], w, c1);
86280297Sjkim        ap += 4;
87280297Sjkim        rp += 4;
88280297Sjkim        num -= 4;
89280297Sjkim    }
90280297Sjkim# endif
91280297Sjkim    while (num) {
92280297Sjkim        mul_add(rp[0], ap[0], w, c1);
93280297Sjkim        ap++;
94280297Sjkim        rp++;
95280297Sjkim        num--;
96280297Sjkim    }
9755714Skris
98280297Sjkim    return (c1);
99280297Sjkim}
100280297Sjkim
101109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
102280297Sjkim{
103280297Sjkim    BN_ULONG c1 = 0;
10455714Skris
105280297Sjkim    assert(num >= 0);
106280297Sjkim    if (num <= 0)
107280297Sjkim        return (c1);
10855714Skris
109280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
110280297Sjkim    while (num & ~3) {
111280297Sjkim        mul(rp[0], ap[0], w, c1);
112280297Sjkim        mul(rp[1], ap[1], w, c1);
113280297Sjkim        mul(rp[2], ap[2], w, c1);
114280297Sjkim        mul(rp[3], ap[3], w, c1);
115280297Sjkim        ap += 4;
116280297Sjkim        rp += 4;
117280297Sjkim        num -= 4;
118280297Sjkim    }
119280297Sjkim# endif
120280297Sjkim    while (num) {
121280297Sjkim        mul(rp[0], ap[0], w, c1);
122280297Sjkim        ap++;
123280297Sjkim        rp++;
124280297Sjkim        num--;
125280297Sjkim    }
126280297Sjkim    return (c1);
127280297Sjkim}
12855714Skris
129109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
130280297Sjkim{
131280297Sjkim    assert(n >= 0);
132280297Sjkim    if (n <= 0)
133280297Sjkim        return;
134238405Sjkim
135280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
136280297Sjkim    while (n & ~3) {
137280297Sjkim        sqr(r[0], r[1], a[0]);
138280297Sjkim        sqr(r[2], r[3], a[1]);
139280297Sjkim        sqr(r[4], r[5], a[2]);
140280297Sjkim        sqr(r[6], r[7], a[3]);
141280297Sjkim        a += 4;
142280297Sjkim        r += 8;
143280297Sjkim        n -= 4;
144280297Sjkim    }
145280297Sjkim# endif
146280297Sjkim    while (n) {
147280297Sjkim        sqr(r[0], r[1], a[0]);
148280297Sjkim        a++;
149280297Sjkim        r += 2;
150280297Sjkim        n--;
151280297Sjkim    }
152280297Sjkim}
15355714Skris
154280297Sjkim#else                           /* !(defined(BN_LLONG) ||
155280297Sjkim                                 * defined(BN_UMULT_HIGH)) */
15655714Skris
157280297SjkimBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
158280297Sjkim                          BN_ULONG w)
159280297Sjkim{
160280297Sjkim    BN_ULONG c = 0;
161280297Sjkim    BN_ULONG bl, bh;
16255714Skris
163280297Sjkim    assert(num >= 0);
164280297Sjkim    if (num <= 0)
165280297Sjkim        return ((BN_ULONG)0);
16655714Skris
167280297Sjkim    bl = LBITS(w);
168280297Sjkim    bh = HBITS(w);
16955714Skris
170280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
171280297Sjkim    while (num & ~3) {
172280297Sjkim        mul_add(rp[0], ap[0], bl, bh, c);
173280297Sjkim        mul_add(rp[1], ap[1], bl, bh, c);
174280297Sjkim        mul_add(rp[2], ap[2], bl, bh, c);
175280297Sjkim        mul_add(rp[3], ap[3], bl, bh, c);
176280297Sjkim        ap += 4;
177280297Sjkim        rp += 4;
178280297Sjkim        num -= 4;
179280297Sjkim    }
180280297Sjkim# endif
181280297Sjkim    while (num) {
182280297Sjkim        mul_add(rp[0], ap[0], bl, bh, c);
183280297Sjkim        ap++;
184280297Sjkim        rp++;
185280297Sjkim        num--;
186280297Sjkim    }
187280297Sjkim    return (c);
188280297Sjkim}
18955714Skris
190109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
191280297Sjkim{
192280297Sjkim    BN_ULONG carry = 0;
193280297Sjkim    BN_ULONG bl, bh;
19455714Skris
195280297Sjkim    assert(num >= 0);
196280297Sjkim    if (num <= 0)
197280297Sjkim        return ((BN_ULONG)0);
19855714Skris
199280297Sjkim    bl = LBITS(w);
200280297Sjkim    bh = HBITS(w);
20155714Skris
202280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
203280297Sjkim    while (num & ~3) {
204280297Sjkim        mul(rp[0], ap[0], bl, bh, carry);
205280297Sjkim        mul(rp[1], ap[1], bl, bh, carry);
206280297Sjkim        mul(rp[2], ap[2], bl, bh, carry);
207280297Sjkim        mul(rp[3], ap[3], bl, bh, carry);
208280297Sjkim        ap += 4;
209280297Sjkim        rp += 4;
210280297Sjkim        num -= 4;
211280297Sjkim    }
212280297Sjkim# endif
213280297Sjkim    while (num) {
214280297Sjkim        mul(rp[0], ap[0], bl, bh, carry);
215280297Sjkim        ap++;
216280297Sjkim        rp++;
217280297Sjkim        num--;
218280297Sjkim    }
219280297Sjkim    return (carry);
220280297Sjkim}
22155714Skris
222109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
223280297Sjkim{
224280297Sjkim    assert(n >= 0);
225280297Sjkim    if (n <= 0)
226280297Sjkim        return;
227238405Sjkim
228280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
229280297Sjkim    while (n & ~3) {
230280297Sjkim        sqr64(r[0], r[1], a[0]);
231280297Sjkim        sqr64(r[2], r[3], a[1]);
232280297Sjkim        sqr64(r[4], r[5], a[2]);
233280297Sjkim        sqr64(r[6], r[7], a[3]);
234280297Sjkim        a += 4;
235280297Sjkim        r += 8;
236280297Sjkim        n -= 4;
237280297Sjkim    }
238280297Sjkim# endif
239280297Sjkim    while (n) {
240280297Sjkim        sqr64(r[0], r[1], a[0]);
241280297Sjkim        a++;
242280297Sjkim        r += 2;
243280297Sjkim        n--;
244280297Sjkim    }
245280297Sjkim}
24655714Skris
247280297Sjkim#endif                          /* !(defined(BN_LLONG) ||
248280297Sjkim                                 * defined(BN_UMULT_HIGH)) */
24955714Skris
25055714Skris#if defined(BN_LLONG) && defined(BN_DIV2W)
25155714Skris
25255714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
253280297Sjkim{
254280297Sjkim    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
255280297Sjkim}
25655714Skris
25755714Skris#else
25855714Skris
25968651Skris/* Divide h,l by d and return the result. */
26055714Skris/* I need to test this some more :-( */
26155714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
262280297Sjkim{
263280297Sjkim    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
264280297Sjkim    int i, count = 2;
26555714Skris
266280297Sjkim    if (d == 0)
267280297Sjkim        return (BN_MASK2);
26855714Skris
269280297Sjkim    i = BN_num_bits_word(d);
270280297Sjkim    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
27168651Skris
272280297Sjkim    i = BN_BITS2 - i;
273280297Sjkim    if (h >= d)
274280297Sjkim        h -= d;
27555714Skris
276280297Sjkim    if (i) {
277280297Sjkim        d <<= i;
278280297Sjkim        h = (h << i) | (l >> (BN_BITS2 - i));
279280297Sjkim        l <<= i;
280280297Sjkim    }
281280297Sjkim    dh = (d & BN_MASK2h) >> BN_BITS4;
282280297Sjkim    dl = (d & BN_MASK2l);
283280297Sjkim    for (;;) {
284280297Sjkim        if ((h >> BN_BITS4) == dh)
285280297Sjkim            q = BN_MASK2l;
286280297Sjkim        else
287280297Sjkim            q = h / dh;
28855714Skris
289280297Sjkim        th = q * dh;
290280297Sjkim        tl = dl * q;
291280297Sjkim        for (;;) {
292280297Sjkim            t = h - th;
293280297Sjkim            if ((t & BN_MASK2h) ||
294280297Sjkim                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
295280297Sjkim                break;
296280297Sjkim            q--;
297280297Sjkim            th -= dh;
298280297Sjkim            tl -= dl;
299280297Sjkim        }
300280297Sjkim        t = (tl >> BN_BITS4);
301280297Sjkim        tl = (tl << BN_BITS4) & BN_MASK2h;
302280297Sjkim        th += t;
30355714Skris
304280297Sjkim        if (l < tl)
305280297Sjkim            th++;
306280297Sjkim        l -= tl;
307280297Sjkim        if (h < th) {
308280297Sjkim            h += d;
309280297Sjkim            q--;
310280297Sjkim        }
311280297Sjkim        h -= th;
31255714Skris
313280297Sjkim        if (--count == 0)
314280297Sjkim            break;
31555714Skris
316280297Sjkim        ret = q << BN_BITS4;
317280297Sjkim        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
318280297Sjkim        l = (l & BN_MASK2l) << BN_BITS4;
319280297Sjkim    }
320280297Sjkim    ret |= q;
321280297Sjkim    return (ret);
322280297Sjkim}
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
32455714Skris
32555714Skris#ifdef BN_LLONG
326280297SjkimBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
327280297Sjkim                      int n)
328280297Sjkim{
329280297Sjkim    BN_ULLONG ll = 0;
33055714Skris
331280297Sjkim    assert(n >= 0);
332280297Sjkim    if (n <= 0)
333280297Sjkim        return ((BN_ULONG)0);
33455714Skris
335280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
336280297Sjkim    while (n & ~3) {
337280297Sjkim        ll += (BN_ULLONG) a[0] + b[0];
338280297Sjkim        r[0] = (BN_ULONG)ll & BN_MASK2;
339280297Sjkim        ll >>= BN_BITS2;
340280297Sjkim        ll += (BN_ULLONG) a[1] + b[1];
341280297Sjkim        r[1] = (BN_ULONG)ll & BN_MASK2;
342280297Sjkim        ll >>= BN_BITS2;
343280297Sjkim        ll += (BN_ULLONG) a[2] + b[2];
344280297Sjkim        r[2] = (BN_ULONG)ll & BN_MASK2;
345280297Sjkim        ll >>= BN_BITS2;
346280297Sjkim        ll += (BN_ULLONG) a[3] + b[3];
347280297Sjkim        r[3] = (BN_ULONG)ll & BN_MASK2;
348280297Sjkim        ll >>= BN_BITS2;
349280297Sjkim        a += 4;
350280297Sjkim        b += 4;
351280297Sjkim        r += 4;
352280297Sjkim        n -= 4;
353280297Sjkim    }
354280297Sjkim# endif
355280297Sjkim    while (n) {
356280297Sjkim        ll += (BN_ULLONG) a[0] + b[0];
357280297Sjkim        r[0] = (BN_ULONG)ll & BN_MASK2;
358280297Sjkim        ll >>= BN_BITS2;
359280297Sjkim        a++;
360280297Sjkim        b++;
361280297Sjkim        r++;
362280297Sjkim        n--;
363280297Sjkim    }
364280297Sjkim    return ((BN_ULONG)ll);
365280297Sjkim}
366280297Sjkim#else                           /* !BN_LLONG */
367280297SjkimBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
368280297Sjkim                      int n)
369280297Sjkim{
370280297Sjkim    BN_ULONG c, l, t;
37155714Skris
372280297Sjkim    assert(n >= 0);
373280297Sjkim    if (n <= 0)
374280297Sjkim        return ((BN_ULONG)0);
37555714Skris
376280297Sjkim    c = 0;
377280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT
378280297Sjkim    while (n & ~3) {
379280297Sjkim        t = a[0];
380280297Sjkim        t = (t + c) & BN_MASK2;
381280297Sjkim        c = (t < c);
382280297Sjkim        l = (t + b[0]) & BN_MASK2;
383280297Sjkim        c += (l < t);
384280297Sjkim        r[0] = l;
385280297Sjkim        t = a[1];
386280297Sjkim        t = (t + c) & BN_MASK2;
387280297Sjkim        c = (t < c);
388280297Sjkim        l = (t + b[1]) & BN_MASK2;
389280297Sjkim        c += (l < t);
390280297Sjkim        r[1] = l;
391280297Sjkim        t = a[2];
392280297Sjkim        t = (t + c) & BN_MASK2;
393280297Sjkim        c = (t < c);
394280297Sjkim        l = (t + b[2]) & BN_MASK2;
395280297Sjkim        c += (l < t);
396280297Sjkim        r[2] = l;
397280297Sjkim        t = a[3];
398280297Sjkim        t = (t + c) & BN_MASK2;
399280297Sjkim        c = (t < c);
400280297Sjkim        l = (t + b[3]) & BN_MASK2;
401280297Sjkim        c += (l < t);
402280297Sjkim        r[3] = l;
403280297Sjkim        a += 4;
404280297Sjkim        b += 4;
405280297Sjkim        r += 4;
406280297Sjkim        n -= 4;
407280297Sjkim    }
408280297Sjkim# endif
409280297Sjkim    while (n) {
410280297Sjkim        t = a[0];
411280297Sjkim        t = (t + c) & BN_MASK2;
412280297Sjkim        c = (t < c);
413280297Sjkim        l = (t + b[0]) & BN_MASK2;
414280297Sjkim        c += (l < t);
415280297Sjkim        r[0] = l;
416280297Sjkim        a++;
417280297Sjkim        b++;
418280297Sjkim        r++;
419280297Sjkim        n--;
420280297Sjkim    }
421280297Sjkim    return ((BN_ULONG)c);
422280297Sjkim}
423280297Sjkim#endif                          /* !BN_LLONG */
42455714Skris
425280297SjkimBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
426280297Sjkim                      int n)
427280297Sjkim{
428280297Sjkim    BN_ULONG t1, t2;
429280297Sjkim    int c = 0;
43055714Skris
431280297Sjkim    assert(n >= 0);
432280297Sjkim    if (n <= 0)
433280297Sjkim        return ((BN_ULONG)0);
43455714Skris
435238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT
436280297Sjkim    while (n & ~3) {
437280297Sjkim        t1 = a[0];
438280297Sjkim        t2 = b[0];
439280297Sjkim        r[0] = (t1 - t2 - c) & BN_MASK2;
440280297Sjkim        if (t1 != t2)
441280297Sjkim            c = (t1 < t2);
442280297Sjkim        t1 = a[1];
443280297Sjkim        t2 = b[1];
444280297Sjkim        r[1] = (t1 - t2 - c) & BN_MASK2;
445280297Sjkim        if (t1 != t2)
446280297Sjkim            c = (t1 < t2);
447280297Sjkim        t1 = a[2];
448280297Sjkim        t2 = b[2];
449280297Sjkim        r[2] = (t1 - t2 - c) & BN_MASK2;
450280297Sjkim        if (t1 != t2)
451280297Sjkim            c = (t1 < t2);
452280297Sjkim        t1 = a[3];
453280297Sjkim        t2 = b[3];
454280297Sjkim        r[3] = (t1 - t2 - c) & BN_MASK2;
455280297Sjkim        if (t1 != t2)
456280297Sjkim            c = (t1 < t2);
457280297Sjkim        a += 4;
458280297Sjkim        b += 4;
459280297Sjkim        r += 4;
460280297Sjkim        n -= 4;
461280297Sjkim    }
462238405Sjkim#endif
463280297Sjkim    while (n) {
464280297Sjkim        t1 = a[0];
465280297Sjkim        t2 = b[0];
466280297Sjkim        r[0] = (t1 - t2 - c) & BN_MASK2;
467280297Sjkim        if (t1 != t2)
468280297Sjkim            c = (t1 < t2);
469280297Sjkim        a++;
470280297Sjkim        b++;
471280297Sjkim        r++;
472280297Sjkim        n--;
473280297Sjkim    }
474280297Sjkim    return (c);
475280297Sjkim}
47655714Skris
477238405Sjkim#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
47855714Skris
479280297Sjkim# undef bn_mul_comba8
480280297Sjkim# undef bn_mul_comba4
481280297Sjkim# undef bn_sqr_comba8
482280297Sjkim# undef bn_sqr_comba4
48355714Skris
48459191Skris/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
48559191Skris/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
48659191Skris/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
49159191Skris
492290207Sjkim# ifdef BN_LLONG
/*
 * BN_LLONG flavour of the comba accumulator macros: each product is formed
 * in a double-width BN_ULLONG and split into words with Lw()/Hw().  c0, c1
 * and c2 must be modifiable BN_ULONG lvalues; together they form the
 * three-word accumulator (c2,c1,c0).
 *
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/*
 * (c2,c1,c0) += a[i]^2.  The array argument is parenthesized so that a
 * pointer expression such as (p + 1) indexes correctly, as in the other
 * flavours of this macro.
 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
53059191Skris
531280297Sjkim# elif defined(BN_UMULT_LOHI)
/*
 * BN_UMULT_LOHI flavour of the comba accumulator macros: BN_UMULT_LOHI()
 * is handed two output variables for the product words.  c0, c1 and c2 must
 * be modifiable BN_ULONG lvalues forming the accumulator (c2,c1,c0).
 *
 * The carry out of the low word can always be added to the high product
 * word without overflow, because the high word of a multiplication result
 * cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG x_ = (a);                      \
        BN_ULONG y_ = (b);                      \
        BN_ULONG p_lo, p_hi;                    \
        BN_UMULT_LOHI(p_lo,p_hi,x_,y_);         \
        c0 += p_lo;                             \
        if (c0 < p_lo)                          \
            p_hi++;                             \
        c1 += p_hi;                             \
        if (c1 < p_hi)                          \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG x_ = (a);                      \
        BN_ULONG y_ = (b);                      \
        BN_ULONG p_lo, p_hi, first_hi;          \
        BN_UMULT_LOHI(p_lo,p_hi,x_,y_);         \
        c0 += p_lo;                             \
        first_hi = p_hi;                        \
        if (c0 < p_lo)                          \
            first_hi++;                         \
        c1 += first_hi;                         \
        if (c1 < first_hi)                      \
            c2++;                               \
        c0 += p_lo;                             \
        if (c0 < p_lo)                          \
            p_hi++;                             \
        c1 += p_hi;                             \
        if (c1 < p_hi)                          \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG x_ = (a)[i];                   \
        BN_ULONG p_lo, p_hi;                    \
        BN_UMULT_LOHI(p_lo,p_hi,x_,x_);         \
        c0 += p_lo;                             \
        if (c0 < p_lo)                          \
            p_hi++;                             \
        c1 += p_hi;                             \
        if (c1 < p_hi)                          \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
564160814Ssimon
565280297Sjkim# elif defined(BN_UMULT_HIGH)
/*
 * BN_UMULT_HIGH flavour of the comba accumulator macros: the low product
 * word comes from an ordinary multiplication and the high word from
 * BN_UMULT_HIGH().  c0, c1 and c2 must be modifiable BN_ULONG lvalues
 * forming the accumulator (c2,c1,c0).
 *
 * The carry out of the low word can always be added to the high product
 * word without overflow, because the high word of a multiplication result
 * cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG x_ = (a);                      \
        BN_ULONG y_ = (b);                      \
        BN_ULONG p_lo = x_ * y_;                \
        BN_ULONG p_hi = BN_UMULT_HIGH(x_,y_);   \
        c0 += p_lo;                             \
        if (c0 < p_lo)                          \
            p_hi++;                             \
        c1 += p_hi;                             \
        if (c1 < p_hi)                          \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG x_ = (a);                      \
        BN_ULONG y_ = (b);                      \
        BN_ULONG first_hi;                      \
        BN_ULONG p_lo = x_ * y_;                \
        BN_ULONG p_hi = BN_UMULT_HIGH(x_,y_);   \
        c0 += p_lo;                             \
        first_hi = p_hi;                        \
        if (c0 < p_lo)                          \
            first_hi++;                         \
        c1 += first_hi;                         \
        if (c1 < first_hi)                      \
            c2++;                               \
        c0 += p_lo;                             \
        if (c0 < p_lo)                          \
            p_hi++;                             \
        c1 += p_hi;                             \
        if (c1 < p_hi)                          \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG x_ = (a)[i];                   \
        BN_ULONG p_lo = x_ * x_;                \
        BN_ULONG p_hi = BN_UMULT_HIGH(x_,x_);   \
        c0 += p_lo;                             \
        if (c0 < p_lo)                          \
            p_hi++;                             \
        c1 += p_hi;                             \
        if (c1 < p_hi)                          \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
59859191Skris
599280297Sjkim# else                          /* !BN_LLONG */
/*
 * Portable flavour of the comba accumulator macros, built on the half-word
 * helpers LBITS()/HBITS(), mul64() and sqr64().  c0, c1 and c2 must be
 * modifiable BN_ULONG lvalues forming the accumulator (c2,c1,c0).
 * NOTE(review): mul64()/sqr64() are defined outside this file; the code
 * below relies on them leaving the low and high product words in lo and hi
 * -- confirm against their definitions.
 *
 * The carry out of the low word can always be added to hi without overflow,
 * because the high word of a multiplication result cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG lo = LBITS(a);                 \
        BN_ULONG hi = HBITS(a);                 \
        BN_ULONG bl = LBITS(b);                 \
        BN_ULONG bh = HBITS(b);                 \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2;                  \
        if (c0 < lo)                            \
            hi++;                               \
        c1 = (c1+hi)&BN_MASK2;                  \
        if (c1 < hi)                            \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG first_hi;                      \
        BN_ULONG lo = LBITS(a);                 \
        BN_ULONG hi = HBITS(a);                 \
        BN_ULONG bl = LBITS(b);                 \
        BN_ULONG bh = HBITS(b);                 \
        mul64(lo,hi,bl,bh);                     \
        first_hi = hi;                          \
        c0 = (c0+lo)&BN_MASK2;                  \
        if (c0 < lo)                            \
            first_hi++;                         \
        c1 = (c1+first_hi)&BN_MASK2;            \
        if (c1 < first_hi)                      \
            c2++;                               \
        c0 = (c0+lo)&BN_MASK2;                  \
        if (c0 < lo)                            \
            hi++;                               \
        c1 = (c1+hi)&BN_MASK2;                  \
        if (c1 < hi)                            \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG lo, hi;                        \
        sqr64(lo,hi,(a)[i]);                    \
        c0 = (c0+lo)&BN_MASK2;                  \
        if (c0 < lo)                            \
            hi++;                               \
        c1 = (c1+hi)&BN_MASK2;                  \
        if (c1 < hi)                            \
            c2++;                               \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
633280297Sjkim# endif                         /* !BN_LLONG */
63455714Skris
63555714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
636280297Sjkim{
637280297Sjkim    BN_ULONG c1, c2, c3;
63855714Skris
639280297Sjkim    c1 = 0;
640280297Sjkim    c2 = 0;
641280297Sjkim    c3 = 0;
642280297Sjkim    mul_add_c(a[0], b[0], c1, c2, c3);
643280297Sjkim    r[0] = c1;
644280297Sjkim    c1 = 0;
645280297Sjkim    mul_add_c(a[0], b[1], c2, c3, c1);
646280297Sjkim    mul_add_c(a[1], b[0], c2, c3, c1);
647280297Sjkim    r[1] = c2;
648280297Sjkim    c2 = 0;
649280297Sjkim    mul_add_c(a[2], b[0], c3, c1, c2);
650280297Sjkim    mul_add_c(a[1], b[1], c3, c1, c2);
651280297Sjkim    mul_add_c(a[0], b[2], c3, c1, c2);
652280297Sjkim    r[2] = c3;
653280297Sjkim    c3 = 0;
654280297Sjkim    mul_add_c(a[0], b[3], c1, c2, c3);
655280297Sjkim    mul_add_c(a[1], b[2], c1, c2, c3);
656280297Sjkim    mul_add_c(a[2], b[1], c1, c2, c3);
657280297Sjkim    mul_add_c(a[3], b[0], c1, c2, c3);
658280297Sjkim    r[3] = c1;
659280297Sjkim    c1 = 0;
660280297Sjkim    mul_add_c(a[4], b[0], c2, c3, c1);
661280297Sjkim    mul_add_c(a[3], b[1], c2, c3, c1);
662280297Sjkim    mul_add_c(a[2], b[2], c2, c3, c1);
663280297Sjkim    mul_add_c(a[1], b[3], c2, c3, c1);
664280297Sjkim    mul_add_c(a[0], b[4], c2, c3, c1);
665280297Sjkim    r[4] = c2;
666280297Sjkim    c2 = 0;
667280297Sjkim    mul_add_c(a[0], b[5], c3, c1, c2);
668280297Sjkim    mul_add_c(a[1], b[4], c3, c1, c2);
669280297Sjkim    mul_add_c(a[2], b[3], c3, c1, c2);
670280297Sjkim    mul_add_c(a[3], b[2], c3, c1, c2);
671280297Sjkim    mul_add_c(a[4], b[1], c3, c1, c2);
672280297Sjkim    mul_add_c(a[5], b[0], c3, c1, c2);
673280297Sjkim    r[5] = c3;
674280297Sjkim    c3 = 0;
675280297Sjkim    mul_add_c(a[6], b[0], c1, c2, c3);
676280297Sjkim    mul_add_c(a[5], b[1], c1, c2, c3);
677280297Sjkim    mul_add_c(a[4], b[2], c1, c2, c3);
678280297Sjkim    mul_add_c(a[3], b[3], c1, c2, c3);
679280297Sjkim    mul_add_c(a[2], b[4], c1, c2, c3);
680280297Sjkim    mul_add_c(a[1], b[5], c1, c2, c3);
681280297Sjkim    mul_add_c(a[0], b[6], c1, c2, c3);
682280297Sjkim    r[6] = c1;
683280297Sjkim    c1 = 0;
684280297Sjkim    mul_add_c(a[0], b[7], c2, c3, c1);
685280297Sjkim    mul_add_c(a[1], b[6], c2, c3, c1);
686280297Sjkim    mul_add_c(a[2], b[5], c2, c3, c1);
687280297Sjkim    mul_add_c(a[3], b[4], c2, c3, c1);
688280297Sjkim    mul_add_c(a[4], b[3], c2, c3, c1);
689280297Sjkim    mul_add_c(a[5], b[2], c2, c3, c1);
690280297Sjkim    mul_add_c(a[6], b[1], c2, c3, c1);
691280297Sjkim    mul_add_c(a[7], b[0], c2, c3, c1);
692280297Sjkim    r[7] = c2;
693280297Sjkim    c2 = 0;
694280297Sjkim    mul_add_c(a[7], b[1], c3, c1, c2);
695280297Sjkim    mul_add_c(a[6], b[2], c3, c1, c2);
696280297Sjkim    mul_add_c(a[5], b[3], c3, c1, c2);
697280297Sjkim    mul_add_c(a[4], b[4], c3, c1, c2);
698280297Sjkim    mul_add_c(a[3], b[5], c3, c1, c2);
699280297Sjkim    mul_add_c(a[2], b[6], c3, c1, c2);
700280297Sjkim    mul_add_c(a[1], b[7], c3, c1, c2);
701280297Sjkim    r[8] = c3;
702280297Sjkim    c3 = 0;
703280297Sjkim    mul_add_c(a[2], b[7], c1, c2, c3);
704280297Sjkim    mul_add_c(a[3], b[6], c1, c2, c3);
705280297Sjkim    mul_add_c(a[4], b[5], c1, c2, c3);
706280297Sjkim    mul_add_c(a[5], b[4], c1, c2, c3);
707280297Sjkim    mul_add_c(a[6], b[3], c1, c2, c3);
708280297Sjkim    mul_add_c(a[7], b[2], c1, c2, c3);
709280297Sjkim    r[9] = c1;
710280297Sjkim    c1 = 0;
711280297Sjkim    mul_add_c(a[7], b[3], c2, c3, c1);
712280297Sjkim    mul_add_c(a[6], b[4], c2, c3, c1);
713280297Sjkim    mul_add_c(a[5], b[5], c2, c3, c1);
714280297Sjkim    mul_add_c(a[4], b[6], c2, c3, c1);
715280297Sjkim    mul_add_c(a[3], b[7], c2, c3, c1);
716280297Sjkim    r[10] = c2;
717280297Sjkim    c2 = 0;
718280297Sjkim    mul_add_c(a[4], b[7], c3, c1, c2);
719280297Sjkim    mul_add_c(a[5], b[6], c3, c1, c2);
720280297Sjkim    mul_add_c(a[6], b[5], c3, c1, c2);
721280297Sjkim    mul_add_c(a[7], b[4], c3, c1, c2);
722280297Sjkim    r[11] = c3;
723280297Sjkim    c3 = 0;
724280297Sjkim    mul_add_c(a[7], b[5], c1, c2, c3);
725280297Sjkim    mul_add_c(a[6], b[6], c1, c2, c3);
726280297Sjkim    mul_add_c(a[5], b[7], c1, c2, c3);
727280297Sjkim    r[12] = c1;
728280297Sjkim    c1 = 0;
729280297Sjkim    mul_add_c(a[6], b[7], c2, c3, c1);
730280297Sjkim    mul_add_c(a[7], b[6], c2, c3, c1);
731280297Sjkim    r[13] = c2;
732280297Sjkim    c2 = 0;
733280297Sjkim    mul_add_c(a[7], b[7], c3, c1, c2);
734280297Sjkim    r[14] = c3;
735280297Sjkim    r[15] = c1;
736280297Sjkim}
73755714Skris
/*
 * bn_mul_comba4 - fully unrolled, column-by-column (comba) product of the
 * four words a[0..3] with the four words b[0..3]; writes r[0..7].
 *
 * For every output index k, the mul_add_c invocations that precede the
 * store to r[k] cover each pair a[i], b[j] with i + j == k.
 *
 * c1, c2 and c3 rotate roles between columns: after a column's low word
 * has been stored, that variable is cleared and then passed as the last
 * (top) accumulator argument for the next column.
 *
 * NOTE(review): mul_add_c is defined outside this section.  It is presumed
 * to add the double-word product of its first two arguments into the
 * three-word accumulator named by its last three arguments (least
 * significant word first) -- confirm against the macro definition.
 */
73855714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
739280297Sjkim{
740280297Sjkim    BN_ULONG c1, c2, c3;
74155714Skris
742280297Sjkim    c1 = 0;
743280297Sjkim    c2 = 0;
744280297Sjkim    c3 = 0;
745280297Sjkim    mul_add_c(a[0], b[0], c1, c2, c3);
746280297Sjkim    r[0] = c1;
747280297Sjkim    c1 = 0;
748280297Sjkim    mul_add_c(a[0], b[1], c2, c3, c1);
749280297Sjkim    mul_add_c(a[1], b[0], c2, c3, c1);
750280297Sjkim    r[1] = c2;
751280297Sjkim    c2 = 0;
752280297Sjkim    mul_add_c(a[2], b[0], c3, c1, c2);
753280297Sjkim    mul_add_c(a[1], b[1], c3, c1, c2);
754280297Sjkim    mul_add_c(a[0], b[2], c3, c1, c2);
755280297Sjkim    r[2] = c3;
756280297Sjkim    c3 = 0;
757280297Sjkim    mul_add_c(a[0], b[3], c1, c2, c3);
758280297Sjkim    mul_add_c(a[1], b[2], c1, c2, c3);
759280297Sjkim    mul_add_c(a[2], b[1], c1, c2, c3);
760280297Sjkim    mul_add_c(a[3], b[0], c1, c2, c3);
761280297Sjkim    r[3] = c1;
762280297Sjkim    c1 = 0;
763280297Sjkim    mul_add_c(a[3], b[1], c2, c3, c1);
764280297Sjkim    mul_add_c(a[2], b[2], c2, c3, c1);
765280297Sjkim    mul_add_c(a[1], b[3], c2, c3, c1);
766280297Sjkim    r[4] = c2;
767280297Sjkim    c2 = 0;
768280297Sjkim    mul_add_c(a[2], b[3], c3, c1, c2);
769280297Sjkim    mul_add_c(a[3], b[2], c3, c1, c2);
770280297Sjkim    r[5] = c3;
771280297Sjkim    c3 = 0;
    /*
     * Last column: the two lower accumulator words become r[6] and r[7];
     * c3, used as the top word here, is not stored.
     */
772280297Sjkim    mul_add_c(a[3], b[3], c1, c2, c3);
773280297Sjkim    r[6] = c1;
774280297Sjkim    r[7] = c2;
775280297Sjkim}
77655714Skris
/*
 * bn_sqr_comba8 - fully unrolled, column-by-column (comba) square of the
 * eight words a[0..7]; writes r[0..15].
 *
 * For every output index k, the invocations that precede the store to
 * r[k] are: sqr_add_c(a, k / 2, ...) when k is even, plus one
 * sqr_add_c2(a, i, j, ...) for each pair i > j with i + j == k.
 *
 * c1, c2 and c3 rotate roles between columns: after a column's low word
 * has been stored, that variable is cleared and then passed as the last
 * (top) accumulator argument for the next column.
 *
 * NOTE(review): sqr_add_c and sqr_add_c2 are defined outside this section.
 * They are presumed to add a[i]*a[i] and 2*a[i]*a[j] respectively into the
 * three-word accumulator named by their last three arguments -- confirm
 * against the macro definitions.
 */
777109998Smarkmvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
778280297Sjkim{
779280297Sjkim    BN_ULONG c1, c2, c3;
78055714Skris
781280297Sjkim    c1 = 0;
782280297Sjkim    c2 = 0;
783280297Sjkim    c3 = 0;
784280297Sjkim    sqr_add_c(a, 0, c1, c2, c3);
785280297Sjkim    r[0] = c1;
786280297Sjkim    c1 = 0;
787280297Sjkim    sqr_add_c2(a, 1, 0, c2, c3, c1);
788280297Sjkim    r[1] = c2;
789280297Sjkim    c2 = 0;
790280297Sjkim    sqr_add_c(a, 1, c3, c1, c2);
791280297Sjkim    sqr_add_c2(a, 2, 0, c3, c1, c2);
792280297Sjkim    r[2] = c3;
793280297Sjkim    c3 = 0;
794280297Sjkim    sqr_add_c2(a, 3, 0, c1, c2, c3);
795280297Sjkim    sqr_add_c2(a, 2, 1, c1, c2, c3);
796280297Sjkim    r[3] = c1;
797280297Sjkim    c1 = 0;
798280297Sjkim    sqr_add_c(a, 2, c2, c3, c1);
799280297Sjkim    sqr_add_c2(a, 3, 1, c2, c3, c1);
800280297Sjkim    sqr_add_c2(a, 4, 0, c2, c3, c1);
801280297Sjkim    r[4] = c2;
802280297Sjkim    c2 = 0;
803280297Sjkim    sqr_add_c2(a, 5, 0, c3, c1, c2);
804280297Sjkim    sqr_add_c2(a, 4, 1, c3, c1, c2);
805280297Sjkim    sqr_add_c2(a, 3, 2, c3, c1, c2);
806280297Sjkim    r[5] = c3;
807280297Sjkim    c3 = 0;
808280297Sjkim    sqr_add_c(a, 3, c1, c2, c3);
809280297Sjkim    sqr_add_c2(a, 4, 2, c1, c2, c3);
810280297Sjkim    sqr_add_c2(a, 5, 1, c1, c2, c3);
811280297Sjkim    sqr_add_c2(a, 6, 0, c1, c2, c3);
812280297Sjkim    r[6] = c1;
813280297Sjkim    c1 = 0;
814280297Sjkim    sqr_add_c2(a, 7, 0, c2, c3, c1);
815280297Sjkim    sqr_add_c2(a, 6, 1, c2, c3, c1);
816280297Sjkim    sqr_add_c2(a, 5, 2, c2, c3, c1);
817280297Sjkim    sqr_add_c2(a, 4, 3, c2, c3, c1);
818280297Sjkim    r[7] = c2;
819280297Sjkim    c2 = 0;
820280297Sjkim    sqr_add_c(a, 4, c3, c1, c2);
821280297Sjkim    sqr_add_c2(a, 5, 3, c3, c1, c2);
822280297Sjkim    sqr_add_c2(a, 6, 2, c3, c1, c2);
823280297Sjkim    sqr_add_c2(a, 7, 1, c3, c1, c2);
824280297Sjkim    r[8] = c3;
825280297Sjkim    c3 = 0;
826280297Sjkim    sqr_add_c2(a, 7, 2, c1, c2, c3);
827280297Sjkim    sqr_add_c2(a, 6, 3, c1, c2, c3);
828280297Sjkim    sqr_add_c2(a, 5, 4, c1, c2, c3);
829280297Sjkim    r[9] = c1;
830280297Sjkim    c1 = 0;
831280297Sjkim    sqr_add_c(a, 5, c2, c3, c1);
832280297Sjkim    sqr_add_c2(a, 6, 4, c2, c3, c1);
833280297Sjkim    sqr_add_c2(a, 7, 3, c2, c3, c1);
834280297Sjkim    r[10] = c2;
835280297Sjkim    c2 = 0;
836280297Sjkim    sqr_add_c2(a, 7, 4, c3, c1, c2);
837280297Sjkim    sqr_add_c2(a, 6, 5, c3, c1, c2);
838280297Sjkim    r[11] = c3;
839280297Sjkim    c3 = 0;
840280297Sjkim    sqr_add_c(a, 6, c1, c2, c3);
841280297Sjkim    sqr_add_c2(a, 7, 5, c1, c2, c3);
842280297Sjkim    r[12] = c1;
843280297Sjkim    c1 = 0;
844280297Sjkim    sqr_add_c2(a, 7, 6, c2, c3, c1);
845280297Sjkim    r[13] = c2;
846280297Sjkim    c2 = 0;
    /*
     * Last column: the two lower accumulator words become r[14] and r[15];
     * c2, used as the top word here, is not stored.
     */
847280297Sjkim    sqr_add_c(a, 7, c3, c1, c2);
848280297Sjkim    r[14] = c3;
849280297Sjkim    r[15] = c1;
850280297Sjkim}
85155714Skris
/*
 * bn_sqr_comba4 - fully unrolled, column-by-column (comba) square of the
 * four words a[0..3]; writes r[0..7].
 *
 * For every output index k, the invocations that precede the store to
 * r[k] are: sqr_add_c(a, k / 2, ...) when k is even, plus one
 * sqr_add_c2(a, i, j, ...) for each pair i > j with i + j == k.
 *
 * c1, c2 and c3 rotate roles between columns: after a column's low word
 * has been stored, that variable is cleared and then passed as the last
 * (top) accumulator argument for the next column.
 *
 * NOTE(review): sqr_add_c and sqr_add_c2 are defined outside this section.
 * They are presumed to add a[i]*a[i] and 2*a[i]*a[j] respectively into the
 * three-word accumulator named by their last three arguments -- confirm
 * against the macro definitions.
 */
852109998Smarkmvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
853280297Sjkim{
854280297Sjkim    BN_ULONG c1, c2, c3;
85555714Skris
856280297Sjkim    c1 = 0;
857280297Sjkim    c2 = 0;
858280297Sjkim    c3 = 0;
859280297Sjkim    sqr_add_c(a, 0, c1, c2, c3);
860280297Sjkim    r[0] = c1;
861280297Sjkim    c1 = 0;
862280297Sjkim    sqr_add_c2(a, 1, 0, c2, c3, c1);
863280297Sjkim    r[1] = c2;
864280297Sjkim    c2 = 0;
865280297Sjkim    sqr_add_c(a, 1, c3, c1, c2);
866280297Sjkim    sqr_add_c2(a, 2, 0, c3, c1, c2);
867280297Sjkim    r[2] = c3;
868280297Sjkim    c3 = 0;
869280297Sjkim    sqr_add_c2(a, 3, 0, c1, c2, c3);
870280297Sjkim    sqr_add_c2(a, 2, 1, c1, c2, c3);
871280297Sjkim    r[3] = c1;
872280297Sjkim    c1 = 0;
873280297Sjkim    sqr_add_c(a, 2, c2, c3, c1);
874280297Sjkim    sqr_add_c2(a, 3, 1, c2, c3, c1);
875280297Sjkim    r[4] = c2;
876280297Sjkim    c2 = 0;
877280297Sjkim    sqr_add_c2(a, 3, 2, c3, c1, c2);
878280297Sjkim    r[5] = c3;
879280297Sjkim    c3 = 0;
    /*
     * Last column: the two lower accumulator words become r[6] and r[7];
     * c3, used as the top word here, is not stored.
     */
880280297Sjkim    sqr_add_c(a, 3, c1, c2, c3);
881280297Sjkim    r[6] = c1;
882280297Sjkim    r[7] = c2;
883280297Sjkim}
884238405Sjkim
885280297Sjkim# ifdef OPENSSL_NO_ASM
886280297Sjkim#  ifdef OPENSSL_BN_ASM_MONT
887280297Sjkim#   include <alloca.h>
888238405Sjkim/*
889238405Sjkim * This is essentially a reference implementation, which may or may not
890238405Sjkim * result in a performance improvement. E.g. on IA-32 this routine was
891238405Sjkim * observed to give 40% faster rsa1024 private key operations and 10%
892238405Sjkim * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
893238405Sjkim * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
894238405Sjkim * reference implementation, one to be used as a starting point for
895238405Sjkim * platform-specific assembler. The numbers mentioned apply to compiler
896238405Sjkim * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
897238405Sjkim * can vary not only from platform to platform, but even between compiler
898238405Sjkim * versions. Assembler vs. assembler improvement coefficients can
899238405Sjkim * [and are known to] differ and are to be documented elsewhere.
900238405Sjkim */
/*
 * bn_mul_mont - word-serial multiply-and-reduce over num-word operands,
 * written with the mul / mul_add macros (defined outside this section).
 *
 *   rp   destination; num words are written.
 *   ap   first operand, num words.
 *   bp   second operand, num words, consumed one word per outer iteration.
 *   np   num-word value that is accumulated during each reduction step and
 *        subtracted in the final correction.
 *   n0p  points to a single word, read once into n0 and multiplied by the
 *        lowest accumulator word to obtain each reduction multiplier.
 *   num  operand length in BN_ULONG words; num + 2 words of scratch are
 *        taken from the stack with alloca().
 *
 * Returns 1 on every path of this implementation.
 *
 * NOTE(review): the structure matches word-by-word Montgomery
 * multiplication, which would make np the modulus and *n0p equal to
 * -np^-1 modulo the word base -- confirm against the callers.
 */
901280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
902280297Sjkim                const BN_ULONG *np, const BN_ULONG *n0p, int num)
903280297Sjkim{
904280297Sjkim    BN_ULONG c0, c1, ml, *tp, n0;
905280297Sjkim#   ifdef mul64
906280297Sjkim    BN_ULONG mh;
907280297Sjkim#   endif
    /*
     * vp aliases the tp scratch buffer; every store that zeroes the scratch
     * before returning goes through this volatile view (presumably so the
     * wipe is not optimised away).
     */
908280297Sjkim    volatile BN_ULONG *vp;
909280297Sjkim    int i = 0, j;
910238405Sjkim
911280297Sjkim#   if 0                        /* template for platform-specific
912280297Sjkim                                 * implementation */
913280297Sjkim    if (ap == bp)
914280297Sjkim        return bn_sqr_mont(rp, ap, np, n0p, num);
915280297Sjkim#   endif
    /* Scratch accumulator: num result words plus two overflow words. */
916280297Sjkim    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
917238405Sjkim
918280297Sjkim    n0 = *n0p;
919238405Sjkim
    /*
     * First pass (i == 0): tp[0..num-1] is initialised from ap[] and bp[0]
     * with mul instead of being accumulated with mul_add; c0 is the running
     * carry and ends up in tp[num].
     */
920280297Sjkim    c0 = 0;
921280297Sjkim    ml = bp[0];
922280297Sjkim#   ifdef mul64
923280297Sjkim    mh = HBITS(ml);
924280297Sjkim    ml = LBITS(ml);
925280297Sjkim    for (j = 0; j < num; ++j)
926280297Sjkim        mul(tp[j], ap[j], ml, mh, c0);
927280297Sjkim#   else
928280297Sjkim    for (j = 0; j < num; ++j)
929280297Sjkim        mul(tp[j], ap[j], ml, c0);
930280297Sjkim#   endif
931238405Sjkim
932280297Sjkim    tp[num] = c0;
933280297Sjkim    tp[num + 1] = 0;
    /*
     * tp already holds the i == 0 product, so jump straight to the
     * reduction half of the loop body; i is still 0 at this point.
     */
934280297Sjkim    goto enter;
935238405Sjkim
936280297Sjkim    for (i = 0; i < num; i++) {
        /* Accumulate ap[] times bp[i] into tp[0..num-1]. */
937280297Sjkim        c0 = 0;
938280297Sjkim        ml = bp[i];
939280297Sjkim#   ifdef mul64
940280297Sjkim        mh = HBITS(ml);
941280297Sjkim        ml = LBITS(ml);
942280297Sjkim        for (j = 0; j < num; ++j)
943280297Sjkim            mul_add(tp[j], ap[j], ml, mh, c0);
944280297Sjkim#   else
945280297Sjkim        for (j = 0; j < num; ++j)
946280297Sjkim            mul_add(tp[j], ap[j], ml, c0);
947280297Sjkim#   endif
        /* Fold the carry into tp[num]; tp[num + 1] records a wrap-around. */
948280297Sjkim        c1 = (tp[num] + c0) & BN_MASK2;
949280297Sjkim        tp[num] = c1;
950280297Sjkim        tp[num + 1] = (c1 < c0 ? 1 : 0);
951280297Sjkim enter:
        /*
         * Reduction step: the multiplier ml is derived from the lowest
         * accumulator word and n0.  np[] times ml is then accumulated into
         * tp[], with each updated word stored one position lower
         * (tp[j - 1]); the updated word 0, left in c1 by the first mul_add,
         * is never stored.
         */
952280297Sjkim        c1 = tp[0];
953280297Sjkim        ml = (c1 * n0) & BN_MASK2;
954280297Sjkim        c0 = 0;
955280297Sjkim#   ifdef mul64
956280297Sjkim        mh = HBITS(ml);
957280297Sjkim        ml = LBITS(ml);
958280297Sjkim        mul_add(c1, np[0], ml, mh, c0);
959280297Sjkim#   else
960280297Sjkim        mul_add(c1, ml, np[0], c0);
961280297Sjkim#   endif
962280297Sjkim        for (j = 1; j < num; j++) {
963280297Sjkim            c1 = tp[j];
964280297Sjkim#   ifdef mul64
965280297Sjkim            mul_add(c1, np[j], ml, mh, c0);
966280297Sjkim#   else
967280297Sjkim            mul_add(c1, ml, np[j], c0);
968280297Sjkim#   endif
969280297Sjkim            tp[j - 1] = c1 & BN_MASK2;
970280297Sjkim        }
        /* Shift the two overflow words down as well, adding the carry. */
971280297Sjkim        c1 = (tp[num] + c0) & BN_MASK2;
972280297Sjkim        tp[num - 1] = c1;
973280297Sjkim        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
974280297Sjkim    }
975238405Sjkim
    /*
     * Final correction.  If the overflow word is set, or the top word of tp
     * is not below the top word of np, tp - np is written to rp.  That
     * result is kept when the overflow word is set or bn_sub_words returned
     * 0 (presumably meaning no borrow); the scratch is then zeroed and the
     * function returns.  Otherwise rp is overwritten with tp below.
     */
976280297Sjkim    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
977280297Sjkim        c0 = bn_sub_words(rp, tp, np, num);
978280297Sjkim        if (tp[num] != 0 || c0 == 0) {
979280297Sjkim            for (i = 0; i < num + 2; i++)
980280297Sjkim                vp[i] = 0;
981280297Sjkim            return 1;
982280297Sjkim        }
983280297Sjkim    }
    /* Copy tp to rp, zeroing each scratch word right after it is read. */
984280297Sjkim    for (i = 0; i < num; i++)
985280297Sjkim        rp[i] = tp[i], vp[i] = 0;
986280297Sjkim    vp[num] = 0;
987280297Sjkim    vp[num + 1] = 0;
988280297Sjkim    return 1;
989280297Sjkim}
990280297Sjkim#  else
991238405Sjkim/*
992238405Sjkim * A return value of 0 indicates that the multiplication/convolution was
993238405Sjkim * not performed, signalling the caller to fall back to the
994238405Sjkim * alternative/original code path.
995238405Sjkim */
/*
 * Stub compiled when OPENSSL_NO_ASM is defined but OPENSSL_BN_ASM_MONT is
 * not: every argument is ignored and 0 is always returned.
 */
996280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
997280297Sjkim                const BN_ULONG *np, const BN_ULONG *n0, int num)
998280297Sjkim{
999280297Sjkim    return 0;
1000280297Sjkim}
1001280297Sjkim#  endif                        /* OPENSSL_BN_ASM_MONT */
1002280297Sjkim# endif
1003238405Sjkim
1004280297Sjkim#else                           /* !BN_MUL_COMBA */
100555714Skris
100655714Skris/* hmm... is it faster just to do a multiply? */
1007280297Sjkim# undef bn_sqr_comba4
/*
 * !BN_MUL_COMBA variant of bn_sqr_comba4: forwards to bn_sqr_normal with
 * a length of 4 and an 8-word scratch array on the stack.
 */
1008238405Sjkimvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1009280297Sjkim{
1010280297Sjkim    BN_ULONG t[8];
1011280297Sjkim    bn_sqr_normal(r, a, 4, t);
1012280297Sjkim}
101355714Skris
1014280297Sjkim# undef bn_sqr_comba8
/*
 * !BN_MUL_COMBA variant of bn_sqr_comba8: forwards to bn_sqr_normal with
 * a length of 8 and a 16-word scratch array on the stack.
 */
1015238405Sjkimvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1016280297Sjkim{
1017280297Sjkim    BN_ULONG t[16];
1018280297Sjkim    bn_sqr_normal(r, a, 8, t);
1019280297Sjkim}
102055714Skris
/*
 * !BN_MUL_COMBA variant of bn_mul_comba4: builds r[0..7] from a[0..3] and
 * b[0..3] one row at a time.
 *
 * The first row goes through bn_mul_words into r[0..3]; each later row k
 * goes through bn_mul_add_words into the window starting at r[k].  The
 * value returned by each call is stored in r[k + 4], the word just past
 * that row's window, so the calls must stay in this order: a later row's
 * window includes the word written by the previous row.
 *
 * NOTE(review): bn_mul_words / bn_mul_add_words are defined elsewhere;
 * their return value is presumed to be the carry out of the top word.
 */
102155714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1022280297Sjkim{
1023280297Sjkim    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1024280297Sjkim    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1025280297Sjkim    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1026280297Sjkim    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1027280297Sjkim}
102855714Skris
/*
 * !BN_MUL_COMBA variant of bn_mul_comba8: builds r[0..15] from a[0..7] and
 * b[0..7] one row at a time.
 *
 * The first row goes through bn_mul_words into r[0..7]; each later row k
 * goes through bn_mul_add_words into the window starting at r[k].  The
 * value returned by each call is stored in r[k + 8], the word just past
 * that row's window, so the calls must stay in this order: a later row's
 * window includes the word written by the previous row.
 *
 * NOTE(review): bn_mul_words / bn_mul_add_words are defined elsewhere;
 * their return value is presumed to be the carry out of the top word.
 */
102955714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1030280297Sjkim{
1031280297Sjkim    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1032280297Sjkim    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1033280297Sjkim    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1034280297Sjkim    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1035280297Sjkim    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1036280297Sjkim    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1037280297Sjkim    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1038280297Sjkim    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1039280297Sjkim}
104055714Skris
1041280297Sjkim# ifdef OPENSSL_NO_ASM
1042280297Sjkim#  ifdef OPENSSL_BN_ASM_MONT
1043280297Sjkim#   include <alloca.h>
/*
 * bn_mul_mont for the !BN_MUL_COMBA branch: word-serial multiply-and-reduce
 * built on bn_mul_add_words and bn_sub_words (both defined elsewhere).
 *
 *   rp   destination; num words are written.
 *   ap   first operand, num words.
 *   bp   second operand, num words, consumed one word per iteration.
 *   np   num-word value that is accumulated during each reduction step and
 *        subtracted in the final correction.
 *   n0p  points to a single word, read once into n0 and multiplied by the
 *        lowest accumulator word to obtain each reduction multiplier.
 *   num  operand length in BN_ULONG words; num + 2 words of scratch are
 *        taken from the stack with alloca().
 *
 * Returns 1 on every path of this implementation.
 *
 * NOTE(review): the structure matches word-by-word Montgomery
 * multiplication, which would make np the modulus and *n0p equal to
 * -np^-1 modulo the word base -- confirm against the callers.
 */
1044280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1045280297Sjkim                const BN_ULONG *np, const BN_ULONG *n0p, int num)
1046280297Sjkim{
1047280297Sjkim    BN_ULONG c0, c1, *tp, n0 = *n0p;
    /*
     * vp aliases the tp scratch buffer; every store that zeroes the scratch
     * before returning goes through this volatile view (presumably so the
     * wipe is not optimised away).
     */
1048280297Sjkim    volatile BN_ULONG *vp;
1049280297Sjkim    int i = 0, j;
1050238405Sjkim
1051280297Sjkim    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1052238405Sjkim
    /*
     * Clear tp[0..num]; tp[num + 1] is assigned inside the loop before it
     * is first read.
     */
1053280297Sjkim    for (i = 0; i <= num; i++)
1054280297Sjkim        tp[i] = 0;
1055238405Sjkim
1056280297Sjkim    for (i = 0; i < num; i++) {
        /*
         * Accumulate ap[] times bp[i]; the returned word is folded into
         * tp[num] and a wrap-around is recorded in tp[num + 1].
         */
1057280297Sjkim        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1058280297Sjkim        c1 = (tp[num] + c0) & BN_MASK2;
1059280297Sjkim        tp[num] = c1;
1060280297Sjkim        tp[num + 1] = (c1 < c0 ? 1 : 0);
1061238405Sjkim
        /*
         * Reduction step: accumulate np[] times (tp[0] * n0), fold the
         * returned word into the two overflow words, then shift the whole
         * accumulator down by one word.
         */
1062280297Sjkim        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1063280297Sjkim        c1 = (tp[num] + c0) & BN_MASK2;
1064280297Sjkim        tp[num] = c1;
1065280297Sjkim        tp[num + 1] += (c1 < c0 ? 1 : 0);
1066280297Sjkim        for (j = 0; j <= num; j++)
1067280297Sjkim            tp[j] = tp[j + 1];
1068280297Sjkim    }
1069238405Sjkim
    /*
     * Final correction.  If the overflow word is set, or the top word of tp
     * is not below the top word of np, tp - np is written to rp.  That
     * result is kept when the overflow word is set or bn_sub_words returned
     * 0 (presumably meaning no borrow); the scratch is then zeroed and the
     * function returns.  Otherwise rp is overwritten with tp below.
     */
1070280297Sjkim    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1071280297Sjkim        c0 = bn_sub_words(rp, tp, np, num);
1072280297Sjkim        if (tp[num] != 0 || c0 == 0) {
1073280297Sjkim            for (i = 0; i < num + 2; i++)
1074280297Sjkim                vp[i] = 0;
1075280297Sjkim            return 1;
1076280297Sjkim        }
1077280297Sjkim    }
    /* Copy tp to rp, zeroing each scratch word right after it is read. */
1078280297Sjkim    for (i = 0; i < num; i++)
1079280297Sjkim        rp[i] = tp[i], vp[i] = 0;
1080280297Sjkim    vp[num] = 0;
1081280297Sjkim    vp[num + 1] = 0;
1082280297Sjkim    return 1;
1083280297Sjkim}
1084280297Sjkim#  else
/*
 * Stub compiled when OPENSSL_NO_ASM is defined but OPENSSL_BN_ASM_MONT is
 * not: every argument is ignored and 0 is always returned.
 */
1085280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1086280297Sjkim                const BN_ULONG *np, const BN_ULONG *n0, int num)
1087280297Sjkim{
1088280297Sjkim    return 0;
1089280297Sjkim}
1090280297Sjkim#  endif                        /* OPENSSL_BN_ASM_MONT */
1091280297Sjkim# endif
1092238405Sjkim
1093280297Sjkim#endif                          /* !BN_MUL_COMBA */
1094