/*
 * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include "../bn_local.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
#else
/*-
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? That doesn't sound like Andy; he used to assign real
 *    version numbers, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there is likely room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs very
 *    much like 64-bit code compiled with no-asm on the same machine.
 */

# undef mul
# undef mul_add

/*-
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
 * "g"(0)               lets the compiler decide where it wants to
 *                      keep the value of zero;
 */
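/* mul_add(r,a,word,carry): {carry,r} = (r) + (a)*(word) + (carry) */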
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)

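/* mul(r,a,word,carry): {carry,r} = (a)*(word) + (carry) */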
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
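/* sqr(r0,r1,a): {r1,r0} = (a)*(a) */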
# define sqr(r0,r1,a)                   \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");

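/* rp[i] += ap[i]*w for 0 <= i < num; returns the final carry word */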
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return c1;
}

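/* rp[] = ap[]*w over num words; returns the top (carry) word */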
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return c1;
}

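/* r[2*i], r[2*i+1] = low and high words of a[i]^2 for 0 <= i < n */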
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}

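/*
 * Returns the 64-bit quotient of the 128-bit value (h:l) divided by d.
 * As with the underlying divq instruction, the caller is expected to
 * ensure d != 0 and h < d, otherwise the quotient overflows and divq
 * faults.
 */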
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    asm ("divq %4"
         : "=a"(ret), "=d"(waste)
         : "a"(l), "d"(h), "r"(d)
         : "cc");

    return ret;
}

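/* rp[] = ap[] + bp[] over n words; returns the carry out (0 or 1) */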
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %0,%0           \n" /* clear carry */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       adcq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       lea     1(%2),%2        \n"
                  "       dec     %1              \n"
                  "       jnz     1b              \n"
                  "       sbbq    %0,%0           \n"
                  :"=&r" (ret), "+c"(n), "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}

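/*
 * rp[] = ap[] - bp[] over n words; returns the borrow (0 or 1).
 * The portable version further below is only used for SIMICS builds,
 * where sbbq is buggy.
 */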
# ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %0,%0           \n" /* clear borrow */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       sbbq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       lea     1(%2),%2        \n"
                  "       dec     %1              \n"
                  "       jnz     1b              \n"
                  "       sbbq    %0,%0           \n"
                  :"=&r" (ret), "+c"(n), "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
#  define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0)
        return (BN_ULONG)0;

    for (;;) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        a += 4;
        b += 4;
        r += 4;
    }
    return c;
}
# endif

/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for the three-word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for the three-word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for the three-word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for the three-word number
 * c=(c2,c1,c0)
 */

/*
 * Keep in mind that carrying into the high part of a multiplication
 * result cannot overflow, because the high part can never be all-ones.
 */
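/*
 * Concretely: a and b are at most 2^64-1, so a*b <= (2^64-1)^2 =
 * 2^128 - 2^65 + 1, i.e. the high word of the product is at most
 * 2^64-2 and can absorb a carry of 1 without wrapping.
 */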
# if 0
/* original macros are kept for reference purposes */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
# else
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
# endif

# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

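/* Comba (column-wise) multiplication: r[0..15] = a[0..7] * b[0..7] */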
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

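/* r[0..7] = a[0..3] * b[0..3] */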
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

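/* r[0..15] = a[0..7]^2, using sqr_add_c2 for the symmetric cross terms */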
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

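/* r[0..7] = a[0..3]^2 */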
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
#endif