x86_64-gcc.c revision 296465
1#include "../bn_lcl.h"
2#ifdef __SUNPRO_C
3# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
4#else
5/*-
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 *    versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 *    proof-of-concept hack. As you can see it's implemented with
19 *    inline assembler, which means that you're bound to GCC and that
20 *    there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
24 *    why I decided to let the compiler take care of subroutine
25 *    prologue/epilogue as well as register allocation. For reference.
26 *    Win64 implements different ABI for AMD64, different from Linux.
27 *
28 * Q. How much faster does it get?
29 * A. 'apps/openssl speed rsa dsa' output with no-asm:
30 *
31 *                        sign    verify    sign/s verify/s
32 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
33 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
34 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
35 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
36 *                        sign    verify    sign/s verify/s
37 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
38 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
39 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
40 *
41 *    'apps/openssl speed rsa dsa' output with this module:
42 *
43 *                        sign    verify    sign/s verify/s
44 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
45 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
46 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
47 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
48 *                        sign    verify    sign/s verify/s
49 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
50 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
51 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
52 *
53 *    For the reference. IA-32 assembler implementation performs
54 *    very much like 64-bit code compiled with no-asm on the same
55 *    machine.
56 */
57
58# define BN_ULONG unsigned long
59
60# undef mul
61# undef mul_add
62# undef sqr
63
64/*-
65 * "m"(a), "+m"(r)      is the way to favor DirectPath �-code;
66 * "g"(0)               let the compiler to decide where does it
67 *                      want to keep the value of zero;
68 */
69# define mul_add(r,a,word,carry) do {   \
70        register BN_ULONG high,low;     \
71        asm ("mulq %3"                  \
72                : "=a"(low),"=d"(high)  \
73                : "a"(word),"m"(a)      \
74                : "cc");                \
75        asm ("addq %2,%0; adcq %3,%1"   \
76                : "+r"(carry),"+d"(high)\
77                : "a"(low),"g"(0)       \
78                : "cc");                \
79        asm ("addq %2,%0; adcq %3,%1"   \
80                : "+m"(r),"+d"(high)    \
81                : "r"(carry),"g"(0)     \
82                : "cc");                \
83        carry=high;                     \
84        } while (0)
85
86# define mul(r,a,word,carry) do {       \
87        register BN_ULONG high,low;     \
88        asm ("mulq %3"                  \
89                : "=a"(low),"=d"(high)  \
90                : "a"(word),"g"(a)      \
91                : "cc");                \
92        asm ("addq %2,%0; adcq %3,%1"   \
93                : "+r"(carry),"+d"(high)\
94                : "a"(low),"g"(0)       \
95                : "cc");                \
96        (r)=carry, carry=high;          \
97        } while (0)
98
99# define sqr(r0,r1,a)                    \
100        asm ("mulq %2"                  \
101                : "=a"(r0),"=d"(r1)     \
102                : "a"(a)                \
103                : "cc");
104
105BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
106                          BN_ULONG w)
107{
108    BN_ULONG c1 = 0;
109
110    if (num <= 0)
111        return (c1);
112
113    while (num & ~3) {
114        mul_add(rp[0], ap[0], w, c1);
115        mul_add(rp[1], ap[1], w, c1);
116        mul_add(rp[2], ap[2], w, c1);
117        mul_add(rp[3], ap[3], w, c1);
118        ap += 4;
119        rp += 4;
120        num -= 4;
121    }
122    if (num) {
123        mul_add(rp[0], ap[0], w, c1);
124        if (--num == 0)
125            return c1;
126        mul_add(rp[1], ap[1], w, c1);
127        if (--num == 0)
128            return c1;
129        mul_add(rp[2], ap[2], w, c1);
130        return c1;
131    }
132
133    return (c1);
134}
135
136BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
137{
138    BN_ULONG c1 = 0;
139
140    if (num <= 0)
141        return (c1);
142
143    while (num & ~3) {
144        mul(rp[0], ap[0], w, c1);
145        mul(rp[1], ap[1], w, c1);
146        mul(rp[2], ap[2], w, c1);
147        mul(rp[3], ap[3], w, c1);
148        ap += 4;
149        rp += 4;
150        num -= 4;
151    }
152    if (num) {
153        mul(rp[0], ap[0], w, c1);
154        if (--num == 0)
155            return c1;
156        mul(rp[1], ap[1], w, c1);
157        if (--num == 0)
158            return c1;
159        mul(rp[2], ap[2], w, c1);
160    }
161    return (c1);
162}
163
164void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
165{
166    if (n <= 0)
167        return;
168
169    while (n & ~3) {
170        sqr(r[0], r[1], a[0]);
171        sqr(r[2], r[3], a[1]);
172        sqr(r[4], r[5], a[2]);
173        sqr(r[6], r[7], a[3]);
174        a += 4;
175        r += 8;
176        n -= 4;
177    }
178    if (n) {
179        sqr(r[0], r[1], a[0]);
180        if (--n == 0)
181            return;
182        sqr(r[2], r[3], a[1]);
183        if (--n == 0)
184            return;
185        sqr(r[4], r[5], a[2]);
186    }
187}
188
189BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
190{
191    BN_ULONG ret, waste;
192
193 asm("divq      %4":"=a"(ret), "=d"(waste)
194 :     "a"(l), "d"(h), "g"(d)
195 :     "cc");
196
197    return ret;
198}
199
200BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
201                      int n)
202{
203    BN_ULONG ret = 0, i = 0;
204
205    if (n <= 0)
206        return 0;
207
208    asm volatile ("       subq    %2,%2           \n"
209                  ".align 16                      \n"
210                  "1:     movq    (%4,%2,8),%0    \n"
211                  "       adcq    (%5,%2,8),%0    \n"
212                  "       movq    %0,(%3,%2,8)    \n"
213                  "       leaq    1(%2),%2        \n"
214                  "       loop    1b              \n"
215                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
216                  "=&r"(i)
217                  :"r"(rp), "r"(ap), "r"(bp)
218                  :"cc", "memory");
219
220    return ret & 1;
221}
222
223# ifndef SIMICS
224BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
225                      int n)
226{
227    BN_ULONG ret = 0, i = 0;
228
229    if (n <= 0)
230        return 0;
231
232    asm volatile ("       subq    %2,%2           \n"
233                  ".align 16                      \n"
234                  "1:     movq    (%4,%2,8),%0    \n"
235                  "       sbbq    (%5,%2,8),%0    \n"
236                  "       movq    %0,(%3,%2,8)    \n"
237                  "       leaq    1(%2),%2        \n"
238                  "       loop    1b              \n"
239                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
240                  "=&r"(i)
241                  :"r"(rp), "r"(ap), "r"(bp)
242                  :"cc", "memory");
243
244    return ret & 1;
245}
246# else
247/* Simics 1.4<7 has buggy sbbq:-( */
248#  define BN_MASK2 0xffffffffffffffffL
249BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
250{
251    BN_ULONG t1, t2;
252    int c = 0;
253
254    if (n <= 0)
255        return ((BN_ULONG)0);
256
257    for (;;) {
258        t1 = a[0];
259        t2 = b[0];
260        r[0] = (t1 - t2 - c) & BN_MASK2;
261        if (t1 != t2)
262            c = (t1 < t2);
263        if (--n <= 0)
264            break;
265
266        t1 = a[1];
267        t2 = b[1];
268        r[1] = (t1 - t2 - c) & BN_MASK2;
269        if (t1 != t2)
270            c = (t1 < t2);
271        if (--n <= 0)
272            break;
273
274        t1 = a[2];
275        t2 = b[2];
276        r[2] = (t1 - t2 - c) & BN_MASK2;
277        if (t1 != t2)
278            c = (t1 < t2);
279        if (--n <= 0)
280            break;
281
282        t1 = a[3];
283        t2 = b[3];
284        r[3] = (t1 - t2 - c) & BN_MASK2;
285        if (t1 != t2)
286            c = (t1 < t2);
287        if (--n <= 0)
288            break;
289
290        a += 4;
291        b += 4;
292        r += 4;
293    }
294    return (c);
295}
296# endif
297
298/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
299/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
300/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
301/*
302 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
303 * c=(c2,c1,c0)
304 */
305
306/*
307 * Keep in mind that carrying into high part of multiplication result
308 * can not overflow, because it cannot be all-ones.
309 */
310# if 0
311/* original macros are kept for reference purposes */
312#  define mul_add_c(a,b,c0,c1,c2) {       \
313        BN_ULONG ta=(a),tb=(b);         \
314        t1 = ta * tb;                   \
315        t2 = BN_UMULT_HIGH(ta,tb);      \
316        c0 += t1; t2 += (c0<t1)?1:0;    \
317        c1 += t2; c2 += (c1<t2)?1:0;    \
318        }
319
320#  define mul_add_c2(a,b,c0,c1,c2) {      \
321        BN_ULONG ta=(a),tb=(b),t0;      \
322        t1 = BN_UMULT_HIGH(ta,tb);      \
323        t0 = ta * tb;                   \
324        c0 += t0; t2 = t1+((c0<t0)?1:0);\
325        c1 += t2; c2 += (c1<t2)?1:0;    \
326        c0 += t0; t1 += (c0<t0)?1:0;    \
327        c1 += t1; c2 += (c1<t1)?1:0;    \
328        }
329# else
330#  define mul_add_c(a,b,c0,c1,c2) do {    \
331        asm ("mulq %3"                  \
332                : "=a"(t1),"=d"(t2)     \
333                : "a"(a),"m"(b)         \
334                : "cc");                \
335        asm ("addq %2,%0; adcq %3,%1"   \
336                : "+r"(c0),"+d"(t2)     \
337                : "a"(t1),"g"(0)        \
338                : "cc");                \
339        asm ("addq %2,%0; adcq %3,%1"   \
340                : "+r"(c1),"+r"(c2)     \
341                : "d"(t2),"g"(0)        \
342                : "cc");                \
343        } while (0)
344
345#  define sqr_add_c(a,i,c0,c1,c2) do {    \
346        asm ("mulq %2"                  \
347                : "=a"(t1),"=d"(t2)     \
348                : "a"(a[i])             \
349                : "cc");                \
350        asm ("addq %2,%0; adcq %3,%1"   \
351                : "+r"(c0),"+d"(t2)     \
352                : "a"(t1),"g"(0)        \
353                : "cc");                \
354        asm ("addq %2,%0; adcq %3,%1"   \
355                : "+r"(c1),"+r"(c2)     \
356                : "d"(t2),"g"(0)        \
357                : "cc");                \
358        } while (0)
359
360#  define mul_add_c2(a,b,c0,c1,c2) do {   \
361        asm ("mulq %3"                  \
362                : "=a"(t1),"=d"(t2)     \
363                : "a"(a),"m"(b)         \
364                : "cc");                \
365        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
366                : "+r"(c0),"+r"(c1),"+r"(c2)            \
367                : "r"(t1),"r"(t2),"g"(0)                \
368                : "cc");                                \
369        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
370                : "+r"(c0),"+r"(c1),"+r"(c2)            \
371                : "r"(t1),"r"(t2),"g"(0)                \
372                : "cc");                                \
373        } while (0)
374# endif
375
376# define sqr_add_c2(a,i,j,c0,c1,c2)      \
377        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
378
379void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
380{
381    BN_ULONG t1, t2;
382    BN_ULONG c1, c2, c3;
383
384    c1 = 0;
385    c2 = 0;
386    c3 = 0;
387    mul_add_c(a[0], b[0], c1, c2, c3);
388    r[0] = c1;
389    c1 = 0;
390    mul_add_c(a[0], b[1], c2, c3, c1);
391    mul_add_c(a[1], b[0], c2, c3, c1);
392    r[1] = c2;
393    c2 = 0;
394    mul_add_c(a[2], b[0], c3, c1, c2);
395    mul_add_c(a[1], b[1], c3, c1, c2);
396    mul_add_c(a[0], b[2], c3, c1, c2);
397    r[2] = c3;
398    c3 = 0;
399    mul_add_c(a[0], b[3], c1, c2, c3);
400    mul_add_c(a[1], b[2], c1, c2, c3);
401    mul_add_c(a[2], b[1], c1, c2, c3);
402    mul_add_c(a[3], b[0], c1, c2, c3);
403    r[3] = c1;
404    c1 = 0;
405    mul_add_c(a[4], b[0], c2, c3, c1);
406    mul_add_c(a[3], b[1], c2, c3, c1);
407    mul_add_c(a[2], b[2], c2, c3, c1);
408    mul_add_c(a[1], b[3], c2, c3, c1);
409    mul_add_c(a[0], b[4], c2, c3, c1);
410    r[4] = c2;
411    c2 = 0;
412    mul_add_c(a[0], b[5], c3, c1, c2);
413    mul_add_c(a[1], b[4], c3, c1, c2);
414    mul_add_c(a[2], b[3], c3, c1, c2);
415    mul_add_c(a[3], b[2], c3, c1, c2);
416    mul_add_c(a[4], b[1], c3, c1, c2);
417    mul_add_c(a[5], b[0], c3, c1, c2);
418    r[5] = c3;
419    c3 = 0;
420    mul_add_c(a[6], b[0], c1, c2, c3);
421    mul_add_c(a[5], b[1], c1, c2, c3);
422    mul_add_c(a[4], b[2], c1, c2, c3);
423    mul_add_c(a[3], b[3], c1, c2, c3);
424    mul_add_c(a[2], b[4], c1, c2, c3);
425    mul_add_c(a[1], b[5], c1, c2, c3);
426    mul_add_c(a[0], b[6], c1, c2, c3);
427    r[6] = c1;
428    c1 = 0;
429    mul_add_c(a[0], b[7], c2, c3, c1);
430    mul_add_c(a[1], b[6], c2, c3, c1);
431    mul_add_c(a[2], b[5], c2, c3, c1);
432    mul_add_c(a[3], b[4], c2, c3, c1);
433    mul_add_c(a[4], b[3], c2, c3, c1);
434    mul_add_c(a[5], b[2], c2, c3, c1);
435    mul_add_c(a[6], b[1], c2, c3, c1);
436    mul_add_c(a[7], b[0], c2, c3, c1);
437    r[7] = c2;
438    c2 = 0;
439    mul_add_c(a[7], b[1], c3, c1, c2);
440    mul_add_c(a[6], b[2], c3, c1, c2);
441    mul_add_c(a[5], b[3], c3, c1, c2);
442    mul_add_c(a[4], b[4], c3, c1, c2);
443    mul_add_c(a[3], b[5], c3, c1, c2);
444    mul_add_c(a[2], b[6], c3, c1, c2);
445    mul_add_c(a[1], b[7], c3, c1, c2);
446    r[8] = c3;
447    c3 = 0;
448    mul_add_c(a[2], b[7], c1, c2, c3);
449    mul_add_c(a[3], b[6], c1, c2, c3);
450    mul_add_c(a[4], b[5], c1, c2, c3);
451    mul_add_c(a[5], b[4], c1, c2, c3);
452    mul_add_c(a[6], b[3], c1, c2, c3);
453    mul_add_c(a[7], b[2], c1, c2, c3);
454    r[9] = c1;
455    c1 = 0;
456    mul_add_c(a[7], b[3], c2, c3, c1);
457    mul_add_c(a[6], b[4], c2, c3, c1);
458    mul_add_c(a[5], b[5], c2, c3, c1);
459    mul_add_c(a[4], b[6], c2, c3, c1);
460    mul_add_c(a[3], b[7], c2, c3, c1);
461    r[10] = c2;
462    c2 = 0;
463    mul_add_c(a[4], b[7], c3, c1, c2);
464    mul_add_c(a[5], b[6], c3, c1, c2);
465    mul_add_c(a[6], b[5], c3, c1, c2);
466    mul_add_c(a[7], b[4], c3, c1, c2);
467    r[11] = c3;
468    c3 = 0;
469    mul_add_c(a[7], b[5], c1, c2, c3);
470    mul_add_c(a[6], b[6], c1, c2, c3);
471    mul_add_c(a[5], b[7], c1, c2, c3);
472    r[12] = c1;
473    c1 = 0;
474    mul_add_c(a[6], b[7], c2, c3, c1);
475    mul_add_c(a[7], b[6], c2, c3, c1);
476    r[13] = c2;
477    c2 = 0;
478    mul_add_c(a[7], b[7], c3, c1, c2);
479    r[14] = c3;
480    r[15] = c1;
481}
482
483void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
484{
485    BN_ULONG t1, t2;
486    BN_ULONG c1, c2, c3;
487
488    c1 = 0;
489    c2 = 0;
490    c3 = 0;
491    mul_add_c(a[0], b[0], c1, c2, c3);
492    r[0] = c1;
493    c1 = 0;
494    mul_add_c(a[0], b[1], c2, c3, c1);
495    mul_add_c(a[1], b[0], c2, c3, c1);
496    r[1] = c2;
497    c2 = 0;
498    mul_add_c(a[2], b[0], c3, c1, c2);
499    mul_add_c(a[1], b[1], c3, c1, c2);
500    mul_add_c(a[0], b[2], c3, c1, c2);
501    r[2] = c3;
502    c3 = 0;
503    mul_add_c(a[0], b[3], c1, c2, c3);
504    mul_add_c(a[1], b[2], c1, c2, c3);
505    mul_add_c(a[2], b[1], c1, c2, c3);
506    mul_add_c(a[3], b[0], c1, c2, c3);
507    r[3] = c1;
508    c1 = 0;
509    mul_add_c(a[3], b[1], c2, c3, c1);
510    mul_add_c(a[2], b[2], c2, c3, c1);
511    mul_add_c(a[1], b[3], c2, c3, c1);
512    r[4] = c2;
513    c2 = 0;
514    mul_add_c(a[2], b[3], c3, c1, c2);
515    mul_add_c(a[3], b[2], c3, c1, c2);
516    r[5] = c3;
517    c3 = 0;
518    mul_add_c(a[3], b[3], c1, c2, c3);
519    r[6] = c1;
520    r[7] = c2;
521}
522
523void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
524{
525    BN_ULONG t1, t2;
526    BN_ULONG c1, c2, c3;
527
528    c1 = 0;
529    c2 = 0;
530    c3 = 0;
531    sqr_add_c(a, 0, c1, c2, c3);
532    r[0] = c1;
533    c1 = 0;
534    sqr_add_c2(a, 1, 0, c2, c3, c1);
535    r[1] = c2;
536    c2 = 0;
537    sqr_add_c(a, 1, c3, c1, c2);
538    sqr_add_c2(a, 2, 0, c3, c1, c2);
539    r[2] = c3;
540    c3 = 0;
541    sqr_add_c2(a, 3, 0, c1, c2, c3);
542    sqr_add_c2(a, 2, 1, c1, c2, c3);
543    r[3] = c1;
544    c1 = 0;
545    sqr_add_c(a, 2, c2, c3, c1);
546    sqr_add_c2(a, 3, 1, c2, c3, c1);
547    sqr_add_c2(a, 4, 0, c2, c3, c1);
548    r[4] = c2;
549    c2 = 0;
550    sqr_add_c2(a, 5, 0, c3, c1, c2);
551    sqr_add_c2(a, 4, 1, c3, c1, c2);
552    sqr_add_c2(a, 3, 2, c3, c1, c2);
553    r[5] = c3;
554    c3 = 0;
555    sqr_add_c(a, 3, c1, c2, c3);
556    sqr_add_c2(a, 4, 2, c1, c2, c3);
557    sqr_add_c2(a, 5, 1, c1, c2, c3);
558    sqr_add_c2(a, 6, 0, c1, c2, c3);
559    r[6] = c1;
560    c1 = 0;
561    sqr_add_c2(a, 7, 0, c2, c3, c1);
562    sqr_add_c2(a, 6, 1, c2, c3, c1);
563    sqr_add_c2(a, 5, 2, c2, c3, c1);
564    sqr_add_c2(a, 4, 3, c2, c3, c1);
565    r[7] = c2;
566    c2 = 0;
567    sqr_add_c(a, 4, c3, c1, c2);
568    sqr_add_c2(a, 5, 3, c3, c1, c2);
569    sqr_add_c2(a, 6, 2, c3, c1, c2);
570    sqr_add_c2(a, 7, 1, c3, c1, c2);
571    r[8] = c3;
572    c3 = 0;
573    sqr_add_c2(a, 7, 2, c1, c2, c3);
574    sqr_add_c2(a, 6, 3, c1, c2, c3);
575    sqr_add_c2(a, 5, 4, c1, c2, c3);
576    r[9] = c1;
577    c1 = 0;
578    sqr_add_c(a, 5, c2, c3, c1);
579    sqr_add_c2(a, 6, 4, c2, c3, c1);
580    sqr_add_c2(a, 7, 3, c2, c3, c1);
581    r[10] = c2;
582    c2 = 0;
583    sqr_add_c2(a, 7, 4, c3, c1, c2);
584    sqr_add_c2(a, 6, 5, c3, c1, c2);
585    r[11] = c3;
586    c3 = 0;
587    sqr_add_c(a, 6, c1, c2, c3);
588    sqr_add_c2(a, 7, 5, c1, c2, c3);
589    r[12] = c1;
590    c1 = 0;
591    sqr_add_c2(a, 7, 6, c2, c3, c1);
592    r[13] = c2;
593    c2 = 0;
594    sqr_add_c(a, 7, c3, c1, c2);
595    r[14] = c3;
596    r[15] = c1;
597}
598
599void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
600{
601    BN_ULONG t1, t2;
602    BN_ULONG c1, c2, c3;
603
604    c1 = 0;
605    c2 = 0;
606    c3 = 0;
607    sqr_add_c(a, 0, c1, c2, c3);
608    r[0] = c1;
609    c1 = 0;
610    sqr_add_c2(a, 1, 0, c2, c3, c1);
611    r[1] = c2;
612    c2 = 0;
613    sqr_add_c(a, 1, c3, c1, c2);
614    sqr_add_c2(a, 2, 0, c3, c1, c2);
615    r[2] = c3;
616    c3 = 0;
617    sqr_add_c2(a, 3, 0, c1, c2, c3);
618    sqr_add_c2(a, 2, 1, c1, c2, c3);
619    r[3] = c1;
620    c1 = 0;
621    sqr_add_c(a, 2, c2, c3, c1);
622    sqr_add_c2(a, 3, 1, c2, c3, c1);
623    r[4] = c2;
624    c2 = 0;
625    sqr_add_c2(a, 3, 2, c3, c1, c2);
626    r[5] = c3;
627    c3 = 0;
628    sqr_add_c(a, 3, c1, c2, c3);
629    r[6] = c1;
630    r[7] = c2;
631}
632#endif
633