1205128Ssimon#include "../bn_lcl.h"
2162911Ssimon#ifdef __SUNPRO_C
3296465Sdelphij# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
4162911Ssimon#else
5296465Sdelphij/*-
6109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002.
7109998Smarkm *
8109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9109998Smarkm * project.
10109998Smarkm *
11109998Smarkm * Rights for redistribution and usage in source and binary forms are
12109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is
13109998Smarkm * disclaimed.
14109998Smarkm *
15109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16109998Smarkm *    versions, like 1.0...
17109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty
18109998Smarkm *    proof-of-concept hack. As you can see it's implemented with
19109998Smarkm *    inline assembler, which means that you're bound to GCC and that
20160814Ssimon *    there might be enough room for further improvement.
21109998Smarkm *
22109998Smarkm * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
27109998Smarkm *
28109998Smarkm * Q. How much faster does it get?
29160814Ssimon * A. 'apps/openssl speed rsa dsa' output with no-asm:
30160814Ssimon *
31296465Sdelphij *                        sign    verify    sign/s verify/s
32296465Sdelphij *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
33296465Sdelphij *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
34296465Sdelphij *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
35296465Sdelphij *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
36296465Sdelphij *                        sign    verify    sign/s verify/s
37296465Sdelphij *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
38296465Sdelphij *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
39296465Sdelphij *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
40160814Ssimon *
41160814Ssimon *    'apps/openssl speed rsa dsa' output with this module:
42160814Ssimon *
43296465Sdelphij *                        sign    verify    sign/s verify/s
44296465Sdelphij *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
45296465Sdelphij *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
46296465Sdelphij *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
47296465Sdelphij *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
48296465Sdelphij *                        sign    verify    sign/s verify/s
49296465Sdelphij *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
50296465Sdelphij *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
51296465Sdelphij *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
52160814Ssimon *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
56109998Smarkm */
57109998Smarkm
/*
 * Word type operated on by every routine in this file.  The inline
 * assembly below uses 64-bit ("q"-suffixed) instructions on values of
 * this type.
 */
58296465Sdelphij# define BN_ULONG unsigned long
59109998Smarkm
/*
 * Discard any earlier definitions of these helper names (presumably
 * coming from the included bn_lcl.h -- TODO confirm) so that the inline
 * assembler versions defined below are the ones in effect.
 */
60296465Sdelphij# undef mul
61296465Sdelphij# undef mul_add
62296465Sdelphij# undef sqr
63205128Ssimon
/*-
 * "m"(a), "+m"(r)      is the way to favor DirectPath micro-code;
 * "g"(0)               lets the compiler decide where it wants
 *                      to keep the value of zero;
 */
/*
 * mul_add(r, a, word, carry)
 *
 * Computes r + a * word + carry.  The low word of the result is written
 * back to r and the high word is left in carry.  r must be a memory
 * lvalue, a a memory operand and carry a modifiable BN_ULONG.
 */
# define mul_add(r,a,word,carry) do {                   \
        BN_ULONG mhi, mlo;                              \
        /* mhi:mlo = a * word */                        \
        asm ("mulq %3"                                  \
                : "=a"(mlo), "=d"(mhi)                  \
                : "a"(word), "m"(a)                     \
                : "cc");                                \
        /* carry += mlo, carry-out folded into mhi */   \
        asm ("addq %2,%0; adcq %3,%1"                   \
                : "+r"(carry), "+d"(mhi)                \
                : "a"(mlo), "g"(0)                      \
                : "cc");                                \
        /* r += carry, carry-out folded into mhi */     \
        asm ("addq %2,%0; adcq %3,%1"                   \
                : "+m"(r), "+d"(mhi)                    \
                : "r"(carry), "g"(0)                    \
                : "cc");                                \
        carry = mhi;                                    \
        } while (0)
85109998Smarkm
/*
 * mul(r, a, word, carry)
 *
 * Computes a * word + carry.  The low word of the result is stored in r
 * and the high word becomes the new carry.
 *
 * The multiplicand uses the "rm" constraint rather than "g": "g" would
 * also allow an immediate operand, which mulq cannot encode.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        /* high:low = a * word */       \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"rm"(a)     \
                : "cc");                \
        /* carry += low; high += CF */  \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
98109998Smarkm
/*
 * sqr(r0, r1, a)
 *
 * Squares the word a: the low word of a*a goes to r0, the high word to r1.
 *
 * Wrapped in do { } while (0) with no trailing semicolon so that the macro
 * expands to exactly one statement and is safe in unbraced if/else bodies.
 */
# define sqr(r0,r1,a) do {              \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");                \
        } while (0)
104109998Smarkm
105296465SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
106296465Sdelphij                          BN_ULONG w)
107296465Sdelphij{
108296465Sdelphij    BN_ULONG c1 = 0;
109109998Smarkm
110296465Sdelphij    if (num <= 0)
111296465Sdelphij        return (c1);
112109998Smarkm
113296465Sdelphij    while (num & ~3) {
114296465Sdelphij        mul_add(rp[0], ap[0], w, c1);
115296465Sdelphij        mul_add(rp[1], ap[1], w, c1);
116296465Sdelphij        mul_add(rp[2], ap[2], w, c1);
117296465Sdelphij        mul_add(rp[3], ap[3], w, c1);
118296465Sdelphij        ap += 4;
119296465Sdelphij        rp += 4;
120296465Sdelphij        num -= 4;
121296465Sdelphij    }
122296465Sdelphij    if (num) {
123296465Sdelphij        mul_add(rp[0], ap[0], w, c1);
124296465Sdelphij        if (--num == 0)
125296465Sdelphij            return c1;
126296465Sdelphij        mul_add(rp[1], ap[1], w, c1);
127296465Sdelphij        if (--num == 0)
128296465Sdelphij            return c1;
129296465Sdelphij        mul_add(rp[2], ap[2], w, c1);
130296465Sdelphij        return c1;
131296465Sdelphij    }
132109998Smarkm
133296465Sdelphij    return (c1);
134296465Sdelphij}
135296465Sdelphij
136205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
137296465Sdelphij{
138296465Sdelphij    BN_ULONG c1 = 0;
139109998Smarkm
140296465Sdelphij    if (num <= 0)
141296465Sdelphij        return (c1);
142109998Smarkm
143296465Sdelphij    while (num & ~3) {
144296465Sdelphij        mul(rp[0], ap[0], w, c1);
145296465Sdelphij        mul(rp[1], ap[1], w, c1);
146296465Sdelphij        mul(rp[2], ap[2], w, c1);
147296465Sdelphij        mul(rp[3], ap[3], w, c1);
148296465Sdelphij        ap += 4;
149296465Sdelphij        rp += 4;
150296465Sdelphij        num -= 4;
151296465Sdelphij    }
152296465Sdelphij    if (num) {
153296465Sdelphij        mul(rp[0], ap[0], w, c1);
154296465Sdelphij        if (--num == 0)
155296465Sdelphij            return c1;
156296465Sdelphij        mul(rp[1], ap[1], w, c1);
157296465Sdelphij        if (--num == 0)
158296465Sdelphij            return c1;
159296465Sdelphij        mul(rp[2], ap[2], w, c1);
160296465Sdelphij    }
161296465Sdelphij    return (c1);
162296465Sdelphij}
163109998Smarkm
164205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
165296465Sdelphij{
166296465Sdelphij    if (n <= 0)
167296465Sdelphij        return;
168109998Smarkm
169296465Sdelphij    while (n & ~3) {
170296465Sdelphij        sqr(r[0], r[1], a[0]);
171296465Sdelphij        sqr(r[2], r[3], a[1]);
172296465Sdelphij        sqr(r[4], r[5], a[2]);
173296465Sdelphij        sqr(r[6], r[7], a[3]);
174296465Sdelphij        a += 4;
175296465Sdelphij        r += 8;
176296465Sdelphij        n -= 4;
177296465Sdelphij    }
178296465Sdelphij    if (n) {
179296465Sdelphij        sqr(r[0], r[1], a[0]);
180296465Sdelphij        if (--n == 0)
181296465Sdelphij            return;
182296465Sdelphij        sqr(r[2], r[3], a[1]);
183296465Sdelphij        if (--n == 0)
184296465Sdelphij            return;
185296465Sdelphij        sqr(r[4], r[5], a[2]);
186296465Sdelphij    }
187296465Sdelphij}
188109998Smarkm
189109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
190296465Sdelphij{
191296465Sdelphij    BN_ULONG ret, waste;
192109998Smarkm
193296465Sdelphij asm("divq      %4":"=a"(ret), "=d"(waste)
194296465Sdelphij :     "a"(l), "d"(h), "g"(d)
195296465Sdelphij :     "cc");
196109998Smarkm
197296465Sdelphij    return ret;
198109998Smarkm}
199109998Smarkm
200296465SdelphijBN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
201296465Sdelphij                      int n)
202296465Sdelphij{
203296465Sdelphij    BN_ULONG ret = 0, i = 0;
204109998Smarkm
205296465Sdelphij    if (n <= 0)
206296465Sdelphij        return 0;
207109998Smarkm
208296465Sdelphij    asm volatile ("       subq    %2,%2           \n"
209296465Sdelphij                  ".align 16                      \n"
210296465Sdelphij                  "1:     movq    (%4,%2,8),%0    \n"
211296465Sdelphij                  "       adcq    (%5,%2,8),%0    \n"
212296465Sdelphij                  "       movq    %0,(%3,%2,8)    \n"
213296465Sdelphij                  "       leaq    1(%2),%2        \n"
214296465Sdelphij                  "       loop    1b              \n"
215296465Sdelphij                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
216296465Sdelphij                  "=&r"(i)
217296465Sdelphij                  :"r"(rp), "r"(ap), "r"(bp)
218296465Sdelphij                  :"cc", "memory");
219109998Smarkm
220296465Sdelphij    return ret & 1;
221109998Smarkm}
222109998Smarkm
223296465Sdelphij# ifndef SIMICS
224296465SdelphijBN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
225296465Sdelphij                      int n)
226296465Sdelphij{
227296465Sdelphij    BN_ULONG ret = 0, i = 0;
228109998Smarkm
229296465Sdelphij    if (n <= 0)
230296465Sdelphij        return 0;
231109998Smarkm
232296465Sdelphij    asm volatile ("       subq    %2,%2           \n"
233296465Sdelphij                  ".align 16                      \n"
234296465Sdelphij                  "1:     movq    (%4,%2,8),%0    \n"
235296465Sdelphij                  "       sbbq    (%5,%2,8),%0    \n"
236296465Sdelphij                  "       movq    %0,(%3,%2,8)    \n"
237296465Sdelphij                  "       leaq    1(%2),%2        \n"
238296465Sdelphij                  "       loop    1b              \n"
239296465Sdelphij                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
240296465Sdelphij                  "=&r"(i)
241296465Sdelphij                  :"r"(rp), "r"(ap), "r"(bp)
242296465Sdelphij                  :"cc", "memory");
243109998Smarkm
244296465Sdelphij    return ret & 1;
245109998Smarkm}
246296465Sdelphij# else
247109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */
248296465Sdelphij#  define BN_MASK2 0xffffffffffffffffL
249109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
250296465Sdelphij{
251296465Sdelphij    BN_ULONG t1, t2;
252296465Sdelphij    int c = 0;
253109998Smarkm
254296465Sdelphij    if (n <= 0)
255296465Sdelphij        return ((BN_ULONG)0);
256109998Smarkm
257296465Sdelphij    for (;;) {
258296465Sdelphij        t1 = a[0];
259296465Sdelphij        t2 = b[0];
260296465Sdelphij        r[0] = (t1 - t2 - c) & BN_MASK2;
261296465Sdelphij        if (t1 != t2)
262296465Sdelphij            c = (t1 < t2);
263296465Sdelphij        if (--n <= 0)
264296465Sdelphij            break;
265109998Smarkm
266296465Sdelphij        t1 = a[1];
267296465Sdelphij        t2 = b[1];
268296465Sdelphij        r[1] = (t1 - t2 - c) & BN_MASK2;
269296465Sdelphij        if (t1 != t2)
270296465Sdelphij            c = (t1 < t2);
271296465Sdelphij        if (--n <= 0)
272296465Sdelphij            break;
273109998Smarkm
274296465Sdelphij        t1 = a[2];
275296465Sdelphij        t2 = b[2];
276296465Sdelphij        r[2] = (t1 - t2 - c) & BN_MASK2;
277296465Sdelphij        if (t1 != t2)
278296465Sdelphij            c = (t1 < t2);
279296465Sdelphij        if (--n <= 0)
280296465Sdelphij            break;
281109998Smarkm
282296465Sdelphij        t1 = a[3];
283296465Sdelphij        t2 = b[3];
284296465Sdelphij        r[3] = (t1 - t2 - c) & BN_MASK2;
285296465Sdelphij        if (t1 != t2)
286296465Sdelphij            c = (t1 < t2);
287296465Sdelphij        if (--n <= 0)
288296465Sdelphij            break;
289109998Smarkm
290296465Sdelphij        a += 4;
291296465Sdelphij        b += 4;
292296465Sdelphij        r += 4;
293296465Sdelphij    }
294296465Sdelphij    return (c);
295296465Sdelphij}
296296465Sdelphij# endif
297109998Smarkm
298109998Smarkm/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
299109998Smarkm/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
300109998Smarkm/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
305109998Smarkm
306277195Sdelphij/*
307277195Sdelphij * Keep in mind that carrying into high part of multiplication result
308277195Sdelphij * can not overflow, because it cannot be all-ones.
309277195Sdelphij */
# if 0
/* original macros are kept for reference purposes */
#  define mul_add_c(a,b,c0,c1,c2) {       \
        BN_ULONG ta=(a),tb=(b);         \
        t1 = ta * tb;                   \
        t2 = BN_UMULT_HIGH(ta,tb);      \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }

#  define mul_add_c2(a,b,c0,c1,c2) {      \
        BN_ULONG ta=(a),tb=(b),t0;      \
        t1 = BN_UMULT_HIGH(ta,tb);      \
        t0 = ta * tb;                   \
        c0 += t0; t2 = t1+((c0<t0)?1:0);\
        c1 += t2; c2 += (c1<t2)?1:0;    \
        c0 += t0; t1 += (c0<t0)?1:0;    \
        c1 += t1; c2 += (c1<t1)?1:0;    \
        }
# else
/*
 * All macros in this branch use the caller's local BN_ULONG variables
 * t1 and t2 as scratch: t1 receives the low word of a product and t2 the
 * high word.
 */

/* t2:t1 = x * y, with x loaded into rax and y taken from memory. */
#  define comba_mul_t(x,y)                      \
        asm ("mulq %3"                          \
                : "=a"(t1),"=d"(t2)             \
                : "a"(x),"m"(y)                 \
                : "cc")

/*
 * (c2,c1,c0) += t2:t1, done in two steps: the carry out of c0 is first
 * folded into t2, then t2 is added to c1 and that carry goes to c2.
 */
#  define comba_acc_t(c0,c1,c2) do {            \
        asm ("addq %2,%0; adcq %3,%1"           \
                : "+r"(c0),"+d"(t2)             \
                : "a"(t1),"g"(0)                \
                : "cc");                        \
        asm ("addq %2,%0; adcq %3,%1"           \
                : "+r"(c1),"+r"(c2)             \
                : "d"(t2),"g"(0)                \
                : "cc");                        \
        } while (0)

/* (c2,c1,c0) += t2:t1 as one add/adc/adc chain; t1 and t2 are preserved. */
#  define comba_acc_chain_t(c0,c1,c2)                   \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc")

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2) do {          \
        comba_mul_t(a,b);                       \
        comba_acc_t(c0,c1,c2);                  \
        } while (0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2) do {          \
        asm ("mulq %2"                          \
                : "=a"(t1),"=d"(t2)             \
                : "a"(a[i])                     \
                : "cc");                        \
        comba_acc_t(c0,c1,c2);                  \
        } while (0)

/*
 * (c2,c1,c0) += 2 * a * b.  The product is added twice rather than being
 * doubled first.
 */
#  define mul_add_c2(a,b,c0,c1,c2) do {         \
        comba_mul_t(a,b);                       \
        comba_acc_chain_t(c0,c1,c2);            \
        comba_acc_chain_t(c0,c1,c2);            \
        } while (0)
# endif

/* (c2,c1,c0) += 2 * a[i] * a[j] */
# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
378109998Smarkm
379109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
380296465Sdelphij{
381296465Sdelphij    BN_ULONG t1, t2;
382296465Sdelphij    BN_ULONG c1, c2, c3;
383109998Smarkm
384296465Sdelphij    c1 = 0;
385296465Sdelphij    c2 = 0;
386296465Sdelphij    c3 = 0;
387296465Sdelphij    mul_add_c(a[0], b[0], c1, c2, c3);
388296465Sdelphij    r[0] = c1;
389296465Sdelphij    c1 = 0;
390296465Sdelphij    mul_add_c(a[0], b[1], c2, c3, c1);
391296465Sdelphij    mul_add_c(a[1], b[0], c2, c3, c1);
392296465Sdelphij    r[1] = c2;
393296465Sdelphij    c2 = 0;
394296465Sdelphij    mul_add_c(a[2], b[0], c3, c1, c2);
395296465Sdelphij    mul_add_c(a[1], b[1], c3, c1, c2);
396296465Sdelphij    mul_add_c(a[0], b[2], c3, c1, c2);
397296465Sdelphij    r[2] = c3;
398296465Sdelphij    c3 = 0;
399296465Sdelphij    mul_add_c(a[0], b[3], c1, c2, c3);
400296465Sdelphij    mul_add_c(a[1], b[2], c1, c2, c3);
401296465Sdelphij    mul_add_c(a[2], b[1], c1, c2, c3);
402296465Sdelphij    mul_add_c(a[3], b[0], c1, c2, c3);
403296465Sdelphij    r[3] = c1;
404296465Sdelphij    c1 = 0;
405296465Sdelphij    mul_add_c(a[4], b[0], c2, c3, c1);
406296465Sdelphij    mul_add_c(a[3], b[1], c2, c3, c1);
407296465Sdelphij    mul_add_c(a[2], b[2], c2, c3, c1);
408296465Sdelphij    mul_add_c(a[1], b[3], c2, c3, c1);
409296465Sdelphij    mul_add_c(a[0], b[4], c2, c3, c1);
410296465Sdelphij    r[4] = c2;
411296465Sdelphij    c2 = 0;
412296465Sdelphij    mul_add_c(a[0], b[5], c3, c1, c2);
413296465Sdelphij    mul_add_c(a[1], b[4], c3, c1, c2);
414296465Sdelphij    mul_add_c(a[2], b[3], c3, c1, c2);
415296465Sdelphij    mul_add_c(a[3], b[2], c3, c1, c2);
416296465Sdelphij    mul_add_c(a[4], b[1], c3, c1, c2);
417296465Sdelphij    mul_add_c(a[5], b[0], c3, c1, c2);
418296465Sdelphij    r[5] = c3;
419296465Sdelphij    c3 = 0;
420296465Sdelphij    mul_add_c(a[6], b[0], c1, c2, c3);
421296465Sdelphij    mul_add_c(a[5], b[1], c1, c2, c3);
422296465Sdelphij    mul_add_c(a[4], b[2], c1, c2, c3);
423296465Sdelphij    mul_add_c(a[3], b[3], c1, c2, c3);
424296465Sdelphij    mul_add_c(a[2], b[4], c1, c2, c3);
425296465Sdelphij    mul_add_c(a[1], b[5], c1, c2, c3);
426296465Sdelphij    mul_add_c(a[0], b[6], c1, c2, c3);
427296465Sdelphij    r[6] = c1;
428296465Sdelphij    c1 = 0;
429296465Sdelphij    mul_add_c(a[0], b[7], c2, c3, c1);
430296465Sdelphij    mul_add_c(a[1], b[6], c2, c3, c1);
431296465Sdelphij    mul_add_c(a[2], b[5], c2, c3, c1);
432296465Sdelphij    mul_add_c(a[3], b[4], c2, c3, c1);
433296465Sdelphij    mul_add_c(a[4], b[3], c2, c3, c1);
434296465Sdelphij    mul_add_c(a[5], b[2], c2, c3, c1);
435296465Sdelphij    mul_add_c(a[6], b[1], c2, c3, c1);
436296465Sdelphij    mul_add_c(a[7], b[0], c2, c3, c1);
437296465Sdelphij    r[7] = c2;
438296465Sdelphij    c2 = 0;
439296465Sdelphij    mul_add_c(a[7], b[1], c3, c1, c2);
440296465Sdelphij    mul_add_c(a[6], b[2], c3, c1, c2);
441296465Sdelphij    mul_add_c(a[5], b[3], c3, c1, c2);
442296465Sdelphij    mul_add_c(a[4], b[4], c3, c1, c2);
443296465Sdelphij    mul_add_c(a[3], b[5], c3, c1, c2);
444296465Sdelphij    mul_add_c(a[2], b[6], c3, c1, c2);
445296465Sdelphij    mul_add_c(a[1], b[7], c3, c1, c2);
446296465Sdelphij    r[8] = c3;
447296465Sdelphij    c3 = 0;
448296465Sdelphij    mul_add_c(a[2], b[7], c1, c2, c3);
449296465Sdelphij    mul_add_c(a[3], b[6], c1, c2, c3);
450296465Sdelphij    mul_add_c(a[4], b[5], c1, c2, c3);
451296465Sdelphij    mul_add_c(a[5], b[4], c1, c2, c3);
452296465Sdelphij    mul_add_c(a[6], b[3], c1, c2, c3);
453296465Sdelphij    mul_add_c(a[7], b[2], c1, c2, c3);
454296465Sdelphij    r[9] = c1;
455296465Sdelphij    c1 = 0;
456296465Sdelphij    mul_add_c(a[7], b[3], c2, c3, c1);
457296465Sdelphij    mul_add_c(a[6], b[4], c2, c3, c1);
458296465Sdelphij    mul_add_c(a[5], b[5], c2, c3, c1);
459296465Sdelphij    mul_add_c(a[4], b[6], c2, c3, c1);
460296465Sdelphij    mul_add_c(a[3], b[7], c2, c3, c1);
461296465Sdelphij    r[10] = c2;
462296465Sdelphij    c2 = 0;
463296465Sdelphij    mul_add_c(a[4], b[7], c3, c1, c2);
464296465Sdelphij    mul_add_c(a[5], b[6], c3, c1, c2);
465296465Sdelphij    mul_add_c(a[6], b[5], c3, c1, c2);
466296465Sdelphij    mul_add_c(a[7], b[4], c3, c1, c2);
467296465Sdelphij    r[11] = c3;
468296465Sdelphij    c3 = 0;
469296465Sdelphij    mul_add_c(a[7], b[5], c1, c2, c3);
470296465Sdelphij    mul_add_c(a[6], b[6], c1, c2, c3);
471296465Sdelphij    mul_add_c(a[5], b[7], c1, c2, c3);
472296465Sdelphij    r[12] = c1;
473296465Sdelphij    c1 = 0;
474296465Sdelphij    mul_add_c(a[6], b[7], c2, c3, c1);
475296465Sdelphij    mul_add_c(a[7], b[6], c2, c3, c1);
476296465Sdelphij    r[13] = c2;
477296465Sdelphij    c2 = 0;
478296465Sdelphij    mul_add_c(a[7], b[7], c3, c1, c2);
479296465Sdelphij    r[14] = c3;
480296465Sdelphij    r[15] = c1;
481296465Sdelphij}
482109998Smarkm
483109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
484296465Sdelphij{
485296465Sdelphij    BN_ULONG t1, t2;
486296465Sdelphij    BN_ULONG c1, c2, c3;
487109998Smarkm
488296465Sdelphij    c1 = 0;
489296465Sdelphij    c2 = 0;
490296465Sdelphij    c3 = 0;
491296465Sdelphij    mul_add_c(a[0], b[0], c1, c2, c3);
492296465Sdelphij    r[0] = c1;
493296465Sdelphij    c1 = 0;
494296465Sdelphij    mul_add_c(a[0], b[1], c2, c3, c1);
495296465Sdelphij    mul_add_c(a[1], b[0], c2, c3, c1);
496296465Sdelphij    r[1] = c2;
497296465Sdelphij    c2 = 0;
498296465Sdelphij    mul_add_c(a[2], b[0], c3, c1, c2);
499296465Sdelphij    mul_add_c(a[1], b[1], c3, c1, c2);
500296465Sdelphij    mul_add_c(a[0], b[2], c3, c1, c2);
501296465Sdelphij    r[2] = c3;
502296465Sdelphij    c3 = 0;
503296465Sdelphij    mul_add_c(a[0], b[3], c1, c2, c3);
504296465Sdelphij    mul_add_c(a[1], b[2], c1, c2, c3);
505296465Sdelphij    mul_add_c(a[2], b[1], c1, c2, c3);
506296465Sdelphij    mul_add_c(a[3], b[0], c1, c2, c3);
507296465Sdelphij    r[3] = c1;
508296465Sdelphij    c1 = 0;
509296465Sdelphij    mul_add_c(a[3], b[1], c2, c3, c1);
510296465Sdelphij    mul_add_c(a[2], b[2], c2, c3, c1);
511296465Sdelphij    mul_add_c(a[1], b[3], c2, c3, c1);
512296465Sdelphij    r[4] = c2;
513296465Sdelphij    c2 = 0;
514296465Sdelphij    mul_add_c(a[2], b[3], c3, c1, c2);
515296465Sdelphij    mul_add_c(a[3], b[2], c3, c1, c2);
516296465Sdelphij    r[5] = c3;
517296465Sdelphij    c3 = 0;
518296465Sdelphij    mul_add_c(a[3], b[3], c1, c2, c3);
519296465Sdelphij    r[6] = c1;
520296465Sdelphij    r[7] = c2;
521296465Sdelphij}
522109998Smarkm
523205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
524296465Sdelphij{
525296465Sdelphij    BN_ULONG t1, t2;
526296465Sdelphij    BN_ULONG c1, c2, c3;
527109998Smarkm
528296465Sdelphij    c1 = 0;
529296465Sdelphij    c2 = 0;
530296465Sdelphij    c3 = 0;
531296465Sdelphij    sqr_add_c(a, 0, c1, c2, c3);
532296465Sdelphij    r[0] = c1;
533296465Sdelphij    c1 = 0;
534296465Sdelphij    sqr_add_c2(a, 1, 0, c2, c3, c1);
535296465Sdelphij    r[1] = c2;
536296465Sdelphij    c2 = 0;
537296465Sdelphij    sqr_add_c(a, 1, c3, c1, c2);
538296465Sdelphij    sqr_add_c2(a, 2, 0, c3, c1, c2);
539296465Sdelphij    r[2] = c3;
540296465Sdelphij    c3 = 0;
541296465Sdelphij    sqr_add_c2(a, 3, 0, c1, c2, c3);
542296465Sdelphij    sqr_add_c2(a, 2, 1, c1, c2, c3);
543296465Sdelphij    r[3] = c1;
544296465Sdelphij    c1 = 0;
545296465Sdelphij    sqr_add_c(a, 2, c2, c3, c1);
546296465Sdelphij    sqr_add_c2(a, 3, 1, c2, c3, c1);
547296465Sdelphij    sqr_add_c2(a, 4, 0, c2, c3, c1);
548296465Sdelphij    r[4] = c2;
549296465Sdelphij    c2 = 0;
550296465Sdelphij    sqr_add_c2(a, 5, 0, c3, c1, c2);
551296465Sdelphij    sqr_add_c2(a, 4, 1, c3, c1, c2);
552296465Sdelphij    sqr_add_c2(a, 3, 2, c3, c1, c2);
553296465Sdelphij    r[5] = c3;
554296465Sdelphij    c3 = 0;
555296465Sdelphij    sqr_add_c(a, 3, c1, c2, c3);
556296465Sdelphij    sqr_add_c2(a, 4, 2, c1, c2, c3);
557296465Sdelphij    sqr_add_c2(a, 5, 1, c1, c2, c3);
558296465Sdelphij    sqr_add_c2(a, 6, 0, c1, c2, c3);
559296465Sdelphij    r[6] = c1;
560296465Sdelphij    c1 = 0;
561296465Sdelphij    sqr_add_c2(a, 7, 0, c2, c3, c1);
562296465Sdelphij    sqr_add_c2(a, 6, 1, c2, c3, c1);
563296465Sdelphij    sqr_add_c2(a, 5, 2, c2, c3, c1);
564296465Sdelphij    sqr_add_c2(a, 4, 3, c2, c3, c1);
565296465Sdelphij    r[7] = c2;
566296465Sdelphij    c2 = 0;
567296465Sdelphij    sqr_add_c(a, 4, c3, c1, c2);
568296465Sdelphij    sqr_add_c2(a, 5, 3, c3, c1, c2);
569296465Sdelphij    sqr_add_c2(a, 6, 2, c3, c1, c2);
570296465Sdelphij    sqr_add_c2(a, 7, 1, c3, c1, c2);
571296465Sdelphij    r[8] = c3;
572296465Sdelphij    c3 = 0;
573296465Sdelphij    sqr_add_c2(a, 7, 2, c1, c2, c3);
574296465Sdelphij    sqr_add_c2(a, 6, 3, c1, c2, c3);
575296465Sdelphij    sqr_add_c2(a, 5, 4, c1, c2, c3);
576296465Sdelphij    r[9] = c1;
577296465Sdelphij    c1 = 0;
578296465Sdelphij    sqr_add_c(a, 5, c2, c3, c1);
579296465Sdelphij    sqr_add_c2(a, 6, 4, c2, c3, c1);
580296465Sdelphij    sqr_add_c2(a, 7, 3, c2, c3, c1);
581296465Sdelphij    r[10] = c2;
582296465Sdelphij    c2 = 0;
583296465Sdelphij    sqr_add_c2(a, 7, 4, c3, c1, c2);
584296465Sdelphij    sqr_add_c2(a, 6, 5, c3, c1, c2);
585296465Sdelphij    r[11] = c3;
586296465Sdelphij    c3 = 0;
587296465Sdelphij    sqr_add_c(a, 6, c1, c2, c3);
588296465Sdelphij    sqr_add_c2(a, 7, 5, c1, c2, c3);
589296465Sdelphij    r[12] = c1;
590296465Sdelphij    c1 = 0;
591296465Sdelphij    sqr_add_c2(a, 7, 6, c2, c3, c1);
592296465Sdelphij    r[13] = c2;
593296465Sdelphij    c2 = 0;
594296465Sdelphij    sqr_add_c(a, 7, c3, c1, c2);
595296465Sdelphij    r[14] = c3;
596296465Sdelphij    r[15] = c1;
597296465Sdelphij}
598109998Smarkm
599205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
600296465Sdelphij{
601296465Sdelphij    BN_ULONG t1, t2;
602296465Sdelphij    BN_ULONG c1, c2, c3;
603109998Smarkm
604296465Sdelphij    c1 = 0;
605296465Sdelphij    c2 = 0;
606296465Sdelphij    c3 = 0;
607296465Sdelphij    sqr_add_c(a, 0, c1, c2, c3);
608296465Sdelphij    r[0] = c1;
609296465Sdelphij    c1 = 0;
610296465Sdelphij    sqr_add_c2(a, 1, 0, c2, c3, c1);
611296465Sdelphij    r[1] = c2;
612296465Sdelphij    c2 = 0;
613296465Sdelphij    sqr_add_c(a, 1, c3, c1, c2);
614296465Sdelphij    sqr_add_c2(a, 2, 0, c3, c1, c2);
615296465Sdelphij    r[2] = c3;
616296465Sdelphij    c3 = 0;
617296465Sdelphij    sqr_add_c2(a, 3, 0, c1, c2, c3);
618296465Sdelphij    sqr_add_c2(a, 2, 1, c1, c2, c3);
619296465Sdelphij    r[3] = c1;
620296465Sdelphij    c1 = 0;
621296465Sdelphij    sqr_add_c(a, 2, c2, c3, c1);
622296465Sdelphij    sqr_add_c2(a, 3, 1, c2, c3, c1);
623296465Sdelphij    r[4] = c2;
624296465Sdelphij    c2 = 0;
625296465Sdelphij    sqr_add_c2(a, 3, 2, c3, c1, c2);
626296465Sdelphij    r[5] = c3;
627296465Sdelphij    c3 = 0;
628296465Sdelphij    sqr_add_c(a, 3, c1, c2, c3);
629296465Sdelphij    r[6] = c1;
630296465Sdelphij    r[7] = c2;
631296465Sdelphij}
632162911Ssimon#endif
633