bn_asm.c revision 296465
1/* crypto/bn/bn_asm.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to.  The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 *    notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 *    notice, this list of conditions and the following disclaimer in the
30 *    documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 *    must display the following acknowledgement:
33 *    "This product includes cryptographic software written by
34 *     Eric Young (eay@cryptsoft.com)"
35 *    The word 'cryptographic' can be left out if the rouines from the library
36 *    being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 *    the apps directory (application code) you must include an acknowledgement:
39 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed.  i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG                  /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <stdio.h>
65#include <assert.h>
66#include "cryptlib.h"
67#include "bn_lcl.h"
68
69#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
70
71BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
72                          BN_ULONG w)
73{
74    BN_ULONG c1 = 0;
75
76    assert(num >= 0);
77    if (num <= 0)
78        return (c1);
79
80    while (num & ~3) {
81        mul_add(rp[0], ap[0], w, c1);
82        mul_add(rp[1], ap[1], w, c1);
83        mul_add(rp[2], ap[2], w, c1);
84        mul_add(rp[3], ap[3], w, c1);
85        ap += 4;
86        rp += 4;
87        num -= 4;
88    }
89    if (num) {
90        mul_add(rp[0], ap[0], w, c1);
91        if (--num == 0)
92            return c1;
93        mul_add(rp[1], ap[1], w, c1);
94        if (--num == 0)
95            return c1;
96        mul_add(rp[2], ap[2], w, c1);
97        return c1;
98    }
99
100    return (c1);
101}
102
103BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
104{
105    BN_ULONG c1 = 0;
106
107    assert(num >= 0);
108    if (num <= 0)
109        return (c1);
110
111    while (num & ~3) {
112        mul(rp[0], ap[0], w, c1);
113        mul(rp[1], ap[1], w, c1);
114        mul(rp[2], ap[2], w, c1);
115        mul(rp[3], ap[3], w, c1);
116        ap += 4;
117        rp += 4;
118        num -= 4;
119    }
120    if (num) {
121        mul(rp[0], ap[0], w, c1);
122        if (--num == 0)
123            return c1;
124        mul(rp[1], ap[1], w, c1);
125        if (--num == 0)
126            return c1;
127        mul(rp[2], ap[2], w, c1);
128    }
129    return (c1);
130}
131
132void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
133{
134    assert(n >= 0);
135    if (n <= 0)
136        return;
137    while (n & ~3) {
138        sqr(r[0], r[1], a[0]);
139        sqr(r[2], r[3], a[1]);
140        sqr(r[4], r[5], a[2]);
141        sqr(r[6], r[7], a[3]);
142        a += 4;
143        r += 8;
144        n -= 4;
145    }
146    if (n) {
147        sqr(r[0], r[1], a[0]);
148        if (--n == 0)
149            return;
150        sqr(r[2], r[3], a[1]);
151        if (--n == 0)
152            return;
153        sqr(r[4], r[5], a[2]);
154    }
155}
156
157#else                           /* !(defined(BN_LLONG) ||
158                                 * defined(BN_UMULT_HIGH)) */
159
160BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
161                          BN_ULONG w)
162{
163    BN_ULONG c = 0;
164    BN_ULONG bl, bh;
165
166    assert(num >= 0);
167    if (num <= 0)
168        return ((BN_ULONG)0);
169
170    bl = LBITS(w);
171    bh = HBITS(w);
172
173    for (;;) {
174        mul_add(rp[0], ap[0], bl, bh, c);
175        if (--num == 0)
176            break;
177        mul_add(rp[1], ap[1], bl, bh, c);
178        if (--num == 0)
179            break;
180        mul_add(rp[2], ap[2], bl, bh, c);
181        if (--num == 0)
182            break;
183        mul_add(rp[3], ap[3], bl, bh, c);
184        if (--num == 0)
185            break;
186        ap += 4;
187        rp += 4;
188    }
189    return (c);
190}
191
192BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
193{
194    BN_ULONG carry = 0;
195    BN_ULONG bl, bh;
196
197    assert(num >= 0);
198    if (num <= 0)
199        return ((BN_ULONG)0);
200
201    bl = LBITS(w);
202    bh = HBITS(w);
203
204    for (;;) {
205        mul(rp[0], ap[0], bl, bh, carry);
206        if (--num == 0)
207            break;
208        mul(rp[1], ap[1], bl, bh, carry);
209        if (--num == 0)
210            break;
211        mul(rp[2], ap[2], bl, bh, carry);
212        if (--num == 0)
213            break;
214        mul(rp[3], ap[3], bl, bh, carry);
215        if (--num == 0)
216            break;
217        ap += 4;
218        rp += 4;
219    }
220    return (carry);
221}
222
223void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
224{
225    assert(n >= 0);
226    if (n <= 0)
227        return;
228    for (;;) {
229        sqr64(r[0], r[1], a[0]);
230        if (--n == 0)
231            break;
232
233        sqr64(r[2], r[3], a[1]);
234        if (--n == 0)
235            break;
236
237        sqr64(r[4], r[5], a[2]);
238        if (--n == 0)
239            break;
240
241        sqr64(r[6], r[7], a[3]);
242        if (--n == 0)
243            break;
244
245        a += 4;
246        r += 8;
247    }
248}
249
250#endif                          /* !(defined(BN_LLONG) ||
251                                 * defined(BN_UMULT_HIGH)) */
252
253#if defined(BN_LLONG) && defined(BN_DIV2W)
254
255BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
256{
257    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
258}
259
260#else
261
262/* Divide h,l by d and return the result. */
263/* I need to test this some more :-( */
/*
 * Portable bn_div_words(): divides the two-word value (h, l) by d using
 * only single-word arithmetic.
 *
 * Returns BN_MASK2 when d is 0.  Otherwise the main loop runs exactly twice
 * (count starts at 2); each pass produces one partial quotient q, and the
 * result is (first q << BN_BITS4) | second q.
 */
264BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
265{
266    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
267    int i, count = 2;
268
269    if (d == 0)
270        return (BN_MASK2);
271
    /*
     * BN_num_bits_word() is defined outside this file; presumably it
     * returns the bit length of d.
     */
272    i = BN_num_bits_word(d);
273    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
274
    /* From here on i is the left-shift amount applied to d, h and l. */
275    i = BN_BITS2 - i;
    /* A single conditional subtraction of d from h. */
276    if (h >= d)
277        h -= d;
278
    /*
     * Shift d left by i and shift the (h, l) pair left by the same amount,
     * moving the top i bits of l into the bottom of h.
     */
279    if (i) {
280        d <<= i;
281        h = (h << i) | (l >> (BN_BITS2 - i));
282        l <<= i;
283    }
    /*
     * Split the shifted divisor with the BN_MASK2h / BN_MASK2l masks
     * (presumably its upper and lower halves).
     */
284    dh = (d & BN_MASK2h) >> BN_BITS4;
285    dl = (d & BN_MASK2l);
286    for (;;) {
        /*
         * Initial estimate of q: BN_MASK2l when the upper part of h equals
         * dh, otherwise h / dh.
         */
287        if ((h >> BN_BITS4) == dh)
288            q = BN_MASK2l;
289        else
290            q = h / dh;
291
        /* th and tl track q*dh and q*dl while q is corrected downwards. */
292        th = q * dh;
293        tl = dl * q;
294        for (;;) {
295            t = h - th;
296            if ((t & BN_MASK2h) ||
297                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
298                break;
299            q--;
300            th -= dh;
301            tl -= dl;
302        }
        /*
         * Realign tl by BN_BITS4: the bits shifted out of its top are added
         * to th, the remainder moves into the BN_MASK2h position.
         */
303        t = (tl >> BN_BITS4);
304        tl = (tl << BN_BITS4) & BN_MASK2h;
305        th += t;
306
        /* Subtract (th, tl) from (h, l); a borrow out of l is added to th. */
307        if (l < tl)
308            th++;
309        l -= tl;
        /* If th exceeds h, add d back into h and lower q by one. */
310        if (h < th) {
311            h += d;
312            q--;
313        }
314        h -= th;
315
316        if (--count == 0)
317            break;
318
        /*
         * First pass only: park q in the upper position of the result and
         * shift the (h, l) pair left by BN_BITS4 for the second pass.
         */
319        ret = q << BN_BITS4;
320        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
321        l = (l & BN_MASK2l) << BN_BITS4;
322    }
    /* Merge the second pass's q into the low part of the result. */
323    ret |= q;
324    return (ret);
325}
326#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
327
328#ifdef BN_LLONG
329BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
330                      int n)
331{
332    BN_ULLONG ll = 0;
333
334    assert(n >= 0);
335    if (n <= 0)
336        return ((BN_ULONG)0);
337
338    for (;;) {
339        ll += (BN_ULLONG) a[0] + b[0];
340        r[0] = (BN_ULONG)ll & BN_MASK2;
341        ll >>= BN_BITS2;
342        if (--n <= 0)
343            break;
344
345        ll += (BN_ULLONG) a[1] + b[1];
346        r[1] = (BN_ULONG)ll & BN_MASK2;
347        ll >>= BN_BITS2;
348        if (--n <= 0)
349            break;
350
351        ll += (BN_ULLONG) a[2] + b[2];
352        r[2] = (BN_ULONG)ll & BN_MASK2;
353        ll >>= BN_BITS2;
354        if (--n <= 0)
355            break;
356
357        ll += (BN_ULLONG) a[3] + b[3];
358        r[3] = (BN_ULONG)ll & BN_MASK2;
359        ll >>= BN_BITS2;
360        if (--n <= 0)
361            break;
362
363        a += 4;
364        b += 4;
365        r += 4;
366    }
367    return ((BN_ULONG)ll);
368}
369#else                           /* !BN_LLONG */
370BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
371                      int n)
372{
373    BN_ULONG c, l, t;
374
375    assert(n >= 0);
376    if (n <= 0)
377        return ((BN_ULONG)0);
378
379    c = 0;
380    for (;;) {
381        t = a[0];
382        t = (t + c) & BN_MASK2;
383        c = (t < c);
384        l = (t + b[0]) & BN_MASK2;
385        c += (l < t);
386        r[0] = l;
387        if (--n <= 0)
388            break;
389
390        t = a[1];
391        t = (t + c) & BN_MASK2;
392        c = (t < c);
393        l = (t + b[1]) & BN_MASK2;
394        c += (l < t);
395        r[1] = l;
396        if (--n <= 0)
397            break;
398
399        t = a[2];
400        t = (t + c) & BN_MASK2;
401        c = (t < c);
402        l = (t + b[2]) & BN_MASK2;
403        c += (l < t);
404        r[2] = l;
405        if (--n <= 0)
406            break;
407
408        t = a[3];
409        t = (t + c) & BN_MASK2;
410        c = (t < c);
411        l = (t + b[3]) & BN_MASK2;
412        c += (l < t);
413        r[3] = l;
414        if (--n <= 0)
415            break;
416
417        a += 4;
418        b += 4;
419        r += 4;
420    }
421    return ((BN_ULONG)c);
422}
423#endif                          /* !BN_LLONG */
424
425BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
426                      int n)
427{
428    BN_ULONG t1, t2;
429    int c = 0;
430
431    assert(n >= 0);
432    if (n <= 0)
433        return ((BN_ULONG)0);
434
435    for (;;) {
436        t1 = a[0];
437        t2 = b[0];
438        r[0] = (t1 - t2 - c) & BN_MASK2;
439        if (t1 != t2)
440            c = (t1 < t2);
441        if (--n <= 0)
442            break;
443
444        t1 = a[1];
445        t2 = b[1];
446        r[1] = (t1 - t2 - c) & BN_MASK2;
447        if (t1 != t2)
448            c = (t1 < t2);
449        if (--n <= 0)
450            break;
451
452        t1 = a[2];
453        t2 = b[2];
454        r[2] = (t1 - t2 - c) & BN_MASK2;
455        if (t1 != t2)
456            c = (t1 < t2);
457        if (--n <= 0)
458            break;
459
460        t1 = a[3];
461        t2 = b[3];
462        r[3] = (t1 - t2 - c) & BN_MASK2;
463        if (t1 != t2)
464            c = (t1 < t2);
465        if (--n <= 0)
466            break;
467
468        a += 4;
469        b += 4;
470        r += 4;
471    }
472    return (c);
473}
474
475#ifdef BN_MUL_COMBA
476
477# undef bn_mul_comba8
478# undef bn_mul_comba4
479# undef bn_sqr_comba8
480# undef bn_sqr_comba4
481
/*
 * Column helpers for the comba routines.  (c2,c1,c0) is a three-word
 * accumulator, c0 being the least significant word:
 *
 *   mul_add_c(a,b,c0,c1,c2)      -- c += a*b
 *   mul_add_c2(a,b,c0,c1,c2)     -- c += 2*a*b
 *   sqr_add_c(a,i,c0,c1,c2)      -- c += a[i]^2
 *   sqr_add_c2(a,i,j,c0,c1,c2)   -- c += 2*a[i]*a[j]
 *
 * Every macro expands to a single do { ... } while (0) statement, so it can
 * safely be the sole body of an if/else or a loop.  The macros still rely
 * on scratch variables declared by the calling function: t1 and t2 in all
 * variants, t (plus tt for mul_add_c2) with BN_LLONG, and bl/bh in the
 * plain-C fallback.
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# ifdef BN_LLONG
#  define mul_add_c(a,b,c0,c1,c2) do { \
        t=(BN_ULLONG)(a)*(b); \
        t1=(BN_ULONG)Lw(t); \
        t2=(BN_ULONG)Hw(t); \
        (c0)=((c0)+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        (c1)=((c1)+t2)&BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        t=(BN_ULLONG)(a)*(b); \
        tt=(t+t)&BN_MASK; \
        if (tt < t) (c2)++; \
        t1=(BN_ULONG)Lw(tt); \
        t2=(BN_ULONG)Hw(tt); \
        (c0)=((c0)+t1)&BN_MASK2;  \
        if (((c0) < t1) && (((++t2)&BN_MASK2) == 0)) (c2)++; \
        (c1)=((c1)+t2)&BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        t=(BN_ULLONG)(a)[i]*(a)[i]; \
        t1=(BN_ULONG)Lw(t); \
        t2=(BN_ULONG)Hw(t); \
        (c0)=((c0)+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        (c1)=((c1)+t2)&BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)

#  define mul_add_c(a,b,c0,c1,c2) do {        \
        BN_ULONG ta=(a),tb=(b);             \
        BN_UMULT_LOHI(t1,t2,ta,tb);         \
        (c0) += t1; t2 += ((c0)<t1)?1:0;    \
        (c1) += t2; (c2) += ((c1)<t2)?1:0;  \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do {       \
        BN_ULONG ta=(a),tb=(b),t0;          \
        BN_UMULT_LOHI(t0,t1,ta,tb);         \
        (c0) += t0; t2 = t1+(((c0)<t0)?1:0);\
        (c1) += t2; (c2) += ((c1)<t2)?1:0;  \
        (c0) += t0; t1 += ((c0)<t0)?1:0;    \
        (c1) += t1; (c2) += ((c1)<t1)?1:0;  \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do {        \
        BN_ULONG ta=(a)[i];                 \
        BN_UMULT_LOHI(t1,t2,ta,ta);         \
        (c0) += t1; t2 += ((c0)<t1)?1:0;    \
        (c1) += t2; (c2) += ((c1)<t2)?1:0;  \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)

#  define mul_add_c(a,b,c0,c1,c2) do {        \
        BN_ULONG ta=(a),tb=(b);             \
        t1 = ta * tb;                       \
        t2 = BN_UMULT_HIGH(ta,tb);          \
        (c0) += t1; t2 += ((c0)<t1)?1:0;    \
        (c1) += t2; (c2) += ((c1)<t2)?1:0;  \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do {       \
        BN_ULONG ta=(a),tb=(b),t0;          \
        t1 = BN_UMULT_HIGH(ta,tb);          \
        t0 = ta * tb;                       \
        (c0) += t0; t2 = t1+(((c0)<t0)?1:0);\
        (c1) += t2; (c2) += ((c1)<t2)?1:0;  \
        (c0) += t0; t1 += ((c0)<t0)?1:0;    \
        (c1) += t1; (c2) += ((c1)<t1)?1:0;  \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do {        \
        BN_ULONG ta=(a)[i];                 \
        t1 = ta * ta;                       \
        t2 = BN_UMULT_HIGH(ta,ta);          \
        (c0) += t1; t2 += ((c0)<t1)?1:0;    \
        (c1) += t2; (c2) += ((c1)<t2)?1:0;  \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else                          /* no BN_LLONG, BN_UMULT_LOHI or
                                 * BN_UMULT_HIGH */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        t1=LBITS(a); t2=HBITS(a); \
        bl=LBITS(b); bh=HBITS(b); \
        mul64(t1,t2,bl,bh); \
        (c0)=((c0)+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        (c1)=((c1)+t2)&BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        t1=LBITS(a); t2=HBITS(a); \
        bl=LBITS(b); bh=HBITS(b); \
        mul64(t1,t2,bl,bh); \
        if (t2 & BN_TBIT) (c2)++; \
        t2=(t2+t2)&BN_MASK2; \
        if (t1 & BN_TBIT) t2++; \
        t1=(t1+t1)&BN_MASK2; \
        (c0)=((c0)+t1)&BN_MASK2;  \
        if (((c0) < t1) && (((++t2)&BN_MASK2) == 0)) (c2)++; \
        (c1)=((c1)+t2)&BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        sqr64(t1,t2,(a)[i]); \
        (c0)=((c0)+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        (c1)=((c1)+t2)&BN_MASK2; if ((c1) < t2) (c2)++; \
        } while (0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* BN_LLONG / BN_UMULT_LOHI /
                                 * BN_UMULT_HIGH / plain C */
609
610void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
611{
612# ifdef BN_LLONG
613    BN_ULLONG t;
614# else
615    BN_ULONG bl, bh;
616# endif
617    BN_ULONG t1, t2;
618    BN_ULONG c1, c2, c3;
619
620    c1 = 0;
621    c2 = 0;
622    c3 = 0;
623    mul_add_c(a[0], b[0], c1, c2, c3);
624    r[0] = c1;
625    c1 = 0;
626    mul_add_c(a[0], b[1], c2, c3, c1);
627    mul_add_c(a[1], b[0], c2, c3, c1);
628    r[1] = c2;
629    c2 = 0;
630    mul_add_c(a[2], b[0], c3, c1, c2);
631    mul_add_c(a[1], b[1], c3, c1, c2);
632    mul_add_c(a[0], b[2], c3, c1, c2);
633    r[2] = c3;
634    c3 = 0;
635    mul_add_c(a[0], b[3], c1, c2, c3);
636    mul_add_c(a[1], b[2], c1, c2, c3);
637    mul_add_c(a[2], b[1], c1, c2, c3);
638    mul_add_c(a[3], b[0], c1, c2, c3);
639    r[3] = c1;
640    c1 = 0;
641    mul_add_c(a[4], b[0], c2, c3, c1);
642    mul_add_c(a[3], b[1], c2, c3, c1);
643    mul_add_c(a[2], b[2], c2, c3, c1);
644    mul_add_c(a[1], b[3], c2, c3, c1);
645    mul_add_c(a[0], b[4], c2, c3, c1);
646    r[4] = c2;
647    c2 = 0;
648    mul_add_c(a[0], b[5], c3, c1, c2);
649    mul_add_c(a[1], b[4], c3, c1, c2);
650    mul_add_c(a[2], b[3], c3, c1, c2);
651    mul_add_c(a[3], b[2], c3, c1, c2);
652    mul_add_c(a[4], b[1], c3, c1, c2);
653    mul_add_c(a[5], b[0], c3, c1, c2);
654    r[5] = c3;
655    c3 = 0;
656    mul_add_c(a[6], b[0], c1, c2, c3);
657    mul_add_c(a[5], b[1], c1, c2, c3);
658    mul_add_c(a[4], b[2], c1, c2, c3);
659    mul_add_c(a[3], b[3], c1, c2, c3);
660    mul_add_c(a[2], b[4], c1, c2, c3);
661    mul_add_c(a[1], b[5], c1, c2, c3);
662    mul_add_c(a[0], b[6], c1, c2, c3);
663    r[6] = c1;
664    c1 = 0;
665    mul_add_c(a[0], b[7], c2, c3, c1);
666    mul_add_c(a[1], b[6], c2, c3, c1);
667    mul_add_c(a[2], b[5], c2, c3, c1);
668    mul_add_c(a[3], b[4], c2, c3, c1);
669    mul_add_c(a[4], b[3], c2, c3, c1);
670    mul_add_c(a[5], b[2], c2, c3, c1);
671    mul_add_c(a[6], b[1], c2, c3, c1);
672    mul_add_c(a[7], b[0], c2, c3, c1);
673    r[7] = c2;
674    c2 = 0;
675    mul_add_c(a[7], b[1], c3, c1, c2);
676    mul_add_c(a[6], b[2], c3, c1, c2);
677    mul_add_c(a[5], b[3], c3, c1, c2);
678    mul_add_c(a[4], b[4], c3, c1, c2);
679    mul_add_c(a[3], b[5], c3, c1, c2);
680    mul_add_c(a[2], b[6], c3, c1, c2);
681    mul_add_c(a[1], b[7], c3, c1, c2);
682    r[8] = c3;
683    c3 = 0;
684    mul_add_c(a[2], b[7], c1, c2, c3);
685    mul_add_c(a[3], b[6], c1, c2, c3);
686    mul_add_c(a[4], b[5], c1, c2, c3);
687    mul_add_c(a[5], b[4], c1, c2, c3);
688    mul_add_c(a[6], b[3], c1, c2, c3);
689    mul_add_c(a[7], b[2], c1, c2, c3);
690    r[9] = c1;
691    c1 = 0;
692    mul_add_c(a[7], b[3], c2, c3, c1);
693    mul_add_c(a[6], b[4], c2, c3, c1);
694    mul_add_c(a[5], b[5], c2, c3, c1);
695    mul_add_c(a[4], b[6], c2, c3, c1);
696    mul_add_c(a[3], b[7], c2, c3, c1);
697    r[10] = c2;
698    c2 = 0;
699    mul_add_c(a[4], b[7], c3, c1, c2);
700    mul_add_c(a[5], b[6], c3, c1, c2);
701    mul_add_c(a[6], b[5], c3, c1, c2);
702    mul_add_c(a[7], b[4], c3, c1, c2);
703    r[11] = c3;
704    c3 = 0;
705    mul_add_c(a[7], b[5], c1, c2, c3);
706    mul_add_c(a[6], b[6], c1, c2, c3);
707    mul_add_c(a[5], b[7], c1, c2, c3);
708    r[12] = c1;
709    c1 = 0;
710    mul_add_c(a[6], b[7], c2, c3, c1);
711    mul_add_c(a[7], b[6], c2, c3, c1);
712    r[13] = c2;
713    c2 = 0;
714    mul_add_c(a[7], b[7], c3, c1, c2);
715    r[14] = c3;
716    r[15] = c1;
717}
718
719void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
720{
721# ifdef BN_LLONG
722    BN_ULLONG t;
723# else
724    BN_ULONG bl, bh;
725# endif
726    BN_ULONG t1, t2;
727    BN_ULONG c1, c2, c3;
728
729    c1 = 0;
730    c2 = 0;
731    c3 = 0;
732    mul_add_c(a[0], b[0], c1, c2, c3);
733    r[0] = c1;
734    c1 = 0;
735    mul_add_c(a[0], b[1], c2, c3, c1);
736    mul_add_c(a[1], b[0], c2, c3, c1);
737    r[1] = c2;
738    c2 = 0;
739    mul_add_c(a[2], b[0], c3, c1, c2);
740    mul_add_c(a[1], b[1], c3, c1, c2);
741    mul_add_c(a[0], b[2], c3, c1, c2);
742    r[2] = c3;
743    c3 = 0;
744    mul_add_c(a[0], b[3], c1, c2, c3);
745    mul_add_c(a[1], b[2], c1, c2, c3);
746    mul_add_c(a[2], b[1], c1, c2, c3);
747    mul_add_c(a[3], b[0], c1, c2, c3);
748    r[3] = c1;
749    c1 = 0;
750    mul_add_c(a[3], b[1], c2, c3, c1);
751    mul_add_c(a[2], b[2], c2, c3, c1);
752    mul_add_c(a[1], b[3], c2, c3, c1);
753    r[4] = c2;
754    c2 = 0;
755    mul_add_c(a[2], b[3], c3, c1, c2);
756    mul_add_c(a[3], b[2], c3, c1, c2);
757    r[5] = c3;
758    c3 = 0;
759    mul_add_c(a[3], b[3], c1, c2, c3);
760    r[6] = c1;
761    r[7] = c2;
762}
763
764void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
765{
766# ifdef BN_LLONG
767    BN_ULLONG t, tt;
768# else
769    BN_ULONG bl, bh;
770# endif
771    BN_ULONG t1, t2;
772    BN_ULONG c1, c2, c3;
773
774    c1 = 0;
775    c2 = 0;
776    c3 = 0;
777    sqr_add_c(a, 0, c1, c2, c3);
778    r[0] = c1;
779    c1 = 0;
780    sqr_add_c2(a, 1, 0, c2, c3, c1);
781    r[1] = c2;
782    c2 = 0;
783    sqr_add_c(a, 1, c3, c1, c2);
784    sqr_add_c2(a, 2, 0, c3, c1, c2);
785    r[2] = c3;
786    c3 = 0;
787    sqr_add_c2(a, 3, 0, c1, c2, c3);
788    sqr_add_c2(a, 2, 1, c1, c2, c3);
789    r[3] = c1;
790    c1 = 0;
791    sqr_add_c(a, 2, c2, c3, c1);
792    sqr_add_c2(a, 3, 1, c2, c3, c1);
793    sqr_add_c2(a, 4, 0, c2, c3, c1);
794    r[4] = c2;
795    c2 = 0;
796    sqr_add_c2(a, 5, 0, c3, c1, c2);
797    sqr_add_c2(a, 4, 1, c3, c1, c2);
798    sqr_add_c2(a, 3, 2, c3, c1, c2);
799    r[5] = c3;
800    c3 = 0;
801    sqr_add_c(a, 3, c1, c2, c3);
802    sqr_add_c2(a, 4, 2, c1, c2, c3);
803    sqr_add_c2(a, 5, 1, c1, c2, c3);
804    sqr_add_c2(a, 6, 0, c1, c2, c3);
805    r[6] = c1;
806    c1 = 0;
807    sqr_add_c2(a, 7, 0, c2, c3, c1);
808    sqr_add_c2(a, 6, 1, c2, c3, c1);
809    sqr_add_c2(a, 5, 2, c2, c3, c1);
810    sqr_add_c2(a, 4, 3, c2, c3, c1);
811    r[7] = c2;
812    c2 = 0;
813    sqr_add_c(a, 4, c3, c1, c2);
814    sqr_add_c2(a, 5, 3, c3, c1, c2);
815    sqr_add_c2(a, 6, 2, c3, c1, c2);
816    sqr_add_c2(a, 7, 1, c3, c1, c2);
817    r[8] = c3;
818    c3 = 0;
819    sqr_add_c2(a, 7, 2, c1, c2, c3);
820    sqr_add_c2(a, 6, 3, c1, c2, c3);
821    sqr_add_c2(a, 5, 4, c1, c2, c3);
822    r[9] = c1;
823    c1 = 0;
824    sqr_add_c(a, 5, c2, c3, c1);
825    sqr_add_c2(a, 6, 4, c2, c3, c1);
826    sqr_add_c2(a, 7, 3, c2, c3, c1);
827    r[10] = c2;
828    c2 = 0;
829    sqr_add_c2(a, 7, 4, c3, c1, c2);
830    sqr_add_c2(a, 6, 5, c3, c1, c2);
831    r[11] = c3;
832    c3 = 0;
833    sqr_add_c(a, 6, c1, c2, c3);
834    sqr_add_c2(a, 7, 5, c1, c2, c3);
835    r[12] = c1;
836    c1 = 0;
837    sqr_add_c2(a, 7, 6, c2, c3, c1);
838    r[13] = c2;
839    c2 = 0;
840    sqr_add_c(a, 7, c3, c1, c2);
841    r[14] = c3;
842    r[15] = c1;
843}
844
845void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
846{
847# ifdef BN_LLONG
848    BN_ULLONG t, tt;
849# else
850    BN_ULONG bl, bh;
851# endif
852    BN_ULONG t1, t2;
853    BN_ULONG c1, c2, c3;
854
855    c1 = 0;
856    c2 = 0;
857    c3 = 0;
858    sqr_add_c(a, 0, c1, c2, c3);
859    r[0] = c1;
860    c1 = 0;
861    sqr_add_c2(a, 1, 0, c2, c3, c1);
862    r[1] = c2;
863    c2 = 0;
864    sqr_add_c(a, 1, c3, c1, c2);
865    sqr_add_c2(a, 2, 0, c3, c1, c2);
866    r[2] = c3;
867    c3 = 0;
868    sqr_add_c2(a, 3, 0, c1, c2, c3);
869    sqr_add_c2(a, 2, 1, c1, c2, c3);
870    r[3] = c1;
871    c1 = 0;
872    sqr_add_c(a, 2, c2, c3, c1);
873    sqr_add_c2(a, 3, 1, c2, c3, c1);
874    r[4] = c2;
875    c2 = 0;
876    sqr_add_c2(a, 3, 2, c3, c1, c2);
877    r[5] = c3;
878    c3 = 0;
879    sqr_add_c(a, 3, c1, c2, c3);
880    r[6] = c1;
881    r[7] = c2;
882}
883#else                           /* !BN_MUL_COMBA */
884
885/* hmm... is it faster just to do a multiply? */
886# undef bn_sqr_comba4
887void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
888{
889    BN_ULONG t[8];
890    bn_sqr_normal(r, a, 4, t);
891}
892
893# undef bn_sqr_comba8
894void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
895{
896    BN_ULONG t[16];
897    bn_sqr_normal(r, a, 8, t);
898}
899
900void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
901{
902    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
903    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
904    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
905    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
906}
907
908void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
909{
910    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
911    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
912    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
913    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
914    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
915    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
916    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
917    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
918}
919
920#endif                          /* !BN_MUL_COMBA */
921