ecp_nistp521.c revision 325337
1/* crypto/ec/ecp_nistp521.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41
42# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45                                 * platforms */
46# else
47#  error "Need GCC 3.1 or later to define type uint128_t"
48# endif
49
50typedef uint8_t u8;
51typedef uint64_t u64;
52typedef int64_t s64;
53
54/*
55 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56 * element of this field into 66 bytes where the most significant byte
57 * contains only a single bit. We call this an felem_bytearray.
58 */
59
60typedef u8 felem_bytearray[66];
61
62/*
63 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64 * These values are big-endian.
65 */
66static const felem_bytearray nistp521_curve_params[5] = {
67    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75     0xff, 0xff},
76    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84     0xff, 0xfc},
85    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93     0x3f, 0x00},
94    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102     0xbd, 0x66},
103    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111     0x66, 0x50}
112};
113
114/*-
115 * The representation of field elements.
116 * ------------------------------------
117 *
118 * We represent field elements with nine values. These values are either 64 or
119 * 128 bits and the field element represented is:
120 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122 * 58 bits apart, but are greater than 58 bits in length, the most significant
123 * bits of each limb overlap with the least significant bits of the next.
124 *
125 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126 * 'largefelem' */
127
128# define NLIMBS 9
129
130typedef uint64_t limb;
131typedef limb felem[NLIMBS];
132typedef uint128_t largefelem[NLIMBS];
133
134static const limb bottom57bits = 0x1ffffffffffffff;
135static const limb bottom58bits = 0x3ffffffffffffff;
136
137/*
138 * bin66_to_felem takes a little-endian byte array and converts it into felem
139 * form. This assumes that the CPU is little-endian.
140 */
141static void bin66_to_felem(felem out, const u8 in[66])
142{
143    out[0] = (*((limb *) & in[0])) & bottom58bits;
144    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147    out[4] = (*((limb *) & in[29])) & bottom58bits;
148    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151    out[8] = (*((limb *) & in[58])) & bottom57bits;
152}
153
154/*
155 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156 * array. This assumes that the CPU is little-endian.
157 */
158static void felem_to_bin66(u8 out[66], const felem in)
159{
160    memset(out, 0, 66);
161    (*((limb *) & out[0])) = in[0];
162    (*((limb *) & out[7])) |= in[1] << 2;
163    (*((limb *) & out[14])) |= in[2] << 4;
164    (*((limb *) & out[21])) |= in[3] << 6;
165    (*((limb *) & out[29])) = in[4];
166    (*((limb *) & out[36])) |= in[5] << 2;
167    (*((limb *) & out[43])) |= in[6] << 4;
168    (*((limb *) & out[50])) |= in[7] << 6;
169    (*((limb *) & out[58])) = in[8];
170}
171
172/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
173static void flip_endian(u8 *out, const u8 *in, unsigned len)
174{
175    unsigned i;
176    for (i = 0; i < len; ++i)
177        out[i] = in[len - 1 - i];
178}
179
180/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
181static int BN_to_felem(felem out, const BIGNUM *bn)
182{
183    felem_bytearray b_in;
184    felem_bytearray b_out;
185    unsigned num_bytes;
186
187    /* BN_bn2bin eats leading zeroes */
188    memset(b_out, 0, sizeof b_out);
189    num_bytes = BN_num_bytes(bn);
190    if (num_bytes > sizeof b_out) {
191        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
192        return 0;
193    }
194    if (BN_is_negative(bn)) {
195        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
196        return 0;
197    }
198    num_bytes = BN_bn2bin(bn, b_in);
199    flip_endian(b_out, b_in, num_bytes);
200    bin66_to_felem(out, b_out);
201    return 1;
202}
203
204/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
205static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
206{
207    felem_bytearray b_in, b_out;
208    felem_to_bin66(b_in, in);
209    flip_endian(b_out, b_in, sizeof b_out);
210    return BN_bin2bn(b_out, sizeof b_out, out);
211}
212
213/*-
214 * Field operations
215 * ----------------
216 */
217
218static void felem_one(felem out)
219{
220    out[0] = 1;
221    out[1] = 0;
222    out[2] = 0;
223    out[3] = 0;
224    out[4] = 0;
225    out[5] = 0;
226    out[6] = 0;
227    out[7] = 0;
228    out[8] = 0;
229}
230
231static void felem_assign(felem out, const felem in)
232{
233    out[0] = in[0];
234    out[1] = in[1];
235    out[2] = in[2];
236    out[3] = in[3];
237    out[4] = in[4];
238    out[5] = in[5];
239    out[6] = in[6];
240    out[7] = in[7];
241    out[8] = in[8];
242}
243
244/* felem_sum64 sets out = out + in. */
245static void felem_sum64(felem out, const felem in)
246{
247    out[0] += in[0];
248    out[1] += in[1];
249    out[2] += in[2];
250    out[3] += in[3];
251    out[4] += in[4];
252    out[5] += in[5];
253    out[6] += in[6];
254    out[7] += in[7];
255    out[8] += in[8];
256}
257
258/* felem_scalar sets out = in * scalar */
259static void felem_scalar(felem out, const felem in, limb scalar)
260{
261    out[0] = in[0] * scalar;
262    out[1] = in[1] * scalar;
263    out[2] = in[2] * scalar;
264    out[3] = in[3] * scalar;
265    out[4] = in[4] * scalar;
266    out[5] = in[5] * scalar;
267    out[6] = in[6] * scalar;
268    out[7] = in[7] * scalar;
269    out[8] = in[8] * scalar;
270}
271
272/* felem_scalar64 sets out = out * scalar */
273static void felem_scalar64(felem out, limb scalar)
274{
275    out[0] *= scalar;
276    out[1] *= scalar;
277    out[2] *= scalar;
278    out[3] *= scalar;
279    out[4] *= scalar;
280    out[5] *= scalar;
281    out[6] *= scalar;
282    out[7] *= scalar;
283    out[8] *= scalar;
284}
285
286/* felem_scalar128 sets out = out * scalar */
287static void felem_scalar128(largefelem out, limb scalar)
288{
289    out[0] *= scalar;
290    out[1] *= scalar;
291    out[2] *= scalar;
292    out[3] *= scalar;
293    out[4] *= scalar;
294    out[5] *= scalar;
295    out[6] *= scalar;
296    out[7] *= scalar;
297    out[8] *= scalar;
298}
299
300/*-
301 * felem_neg sets |out| to |-in|
302 * On entry:
303 *   in[i] < 2^59 + 2^14
304 * On exit:
305 *   out[i] < 2^62
306 */
307static void felem_neg(felem out, const felem in)
308{
309    /* In order to prevent underflow, we subtract from 0 mod p. */
310    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
311    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
312
313    out[0] = two62m3 - in[0];
314    out[1] = two62m2 - in[1];
315    out[2] = two62m2 - in[2];
316    out[3] = two62m2 - in[3];
317    out[4] = two62m2 - in[4];
318    out[5] = two62m2 - in[5];
319    out[6] = two62m2 - in[6];
320    out[7] = two62m2 - in[7];
321    out[8] = two62m2 - in[8];
322}
323
324/*-
325 * felem_diff64 subtracts |in| from |out|
326 * On entry:
327 *   in[i] < 2^59 + 2^14
328 * On exit:
329 *   out[i] < out[i] + 2^62
330 */
331static void felem_diff64(felem out, const felem in)
332{
333    /*
334     * In order to prevent underflow, we add 0 mod p before subtracting.
335     */
336    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
337    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
338
339    out[0] += two62m3 - in[0];
340    out[1] += two62m2 - in[1];
341    out[2] += two62m2 - in[2];
342    out[3] += two62m2 - in[3];
343    out[4] += two62m2 - in[4];
344    out[5] += two62m2 - in[5];
345    out[6] += two62m2 - in[6];
346    out[7] += two62m2 - in[7];
347    out[8] += two62m2 - in[8];
348}
349
350/*-
351 * felem_diff_128_64 subtracts |in| from |out|
352 * On entry:
353 *   in[i] < 2^62 + 2^17
354 * On exit:
355 *   out[i] < out[i] + 2^63
356 */
357static void felem_diff_128_64(largefelem out, const felem in)
358{
359    /*
360     * In order to prevent underflow, we add 0 mod p before subtracting.
361     */
362    static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
363    static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
364
365    out[0] += two63m6 - in[0];
366    out[1] += two63m5 - in[1];
367    out[2] += two63m5 - in[2];
368    out[3] += two63m5 - in[3];
369    out[4] += two63m5 - in[4];
370    out[5] += two63m5 - in[5];
371    out[6] += two63m5 - in[6];
372    out[7] += two63m5 - in[7];
373    out[8] += two63m5 - in[8];
374}
375
376/*-
377 * felem_diff_128_64 subtracts |in| from |out|
378 * On entry:
379 *   in[i] < 2^126
380 * On exit:
381 *   out[i] < out[i] + 2^127 - 2^69
382 */
383static void felem_diff128(largefelem out, const largefelem in)
384{
385    /*
386     * In order to prevent underflow, we add 0 mod p before subtracting.
387     */
388    static const uint128_t two127m70 =
389        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
390    static const uint128_t two127m69 =
391        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
392
393    out[0] += (two127m70 - in[0]);
394    out[1] += (two127m69 - in[1]);
395    out[2] += (two127m69 - in[2]);
396    out[3] += (two127m69 - in[3]);
397    out[4] += (two127m69 - in[4]);
398    out[5] += (two127m69 - in[5]);
399    out[6] += (two127m69 - in[6]);
400    out[7] += (two127m69 - in[7]);
401    out[8] += (two127m69 - in[8]);
402}
403
404/*-
405 * felem_square sets |out| = |in|^2
406 * On entry:
407 *   in[i] < 2^62
408 * On exit:
409 *   out[i] < 17 * max(in[i]) * max(in[i])
410 */
411static void felem_square(largefelem out, const felem in)
412{
413    felem inx2, inx4;
414    felem_scalar(inx2, in, 2);
415    felem_scalar(inx4, in, 4);
416
417    /*-
418     * We have many cases were we want to do
419     *   in[x] * in[y] +
420     *   in[y] * in[x]
421     * This is obviously just
422     *   2 * in[x] * in[y]
423     * However, rather than do the doubling on the 128 bit result, we
424     * double one of the inputs to the multiplication by reading from
425     * |inx2|
426     */
427
428    out[0] = ((uint128_t) in[0]) * in[0];
429    out[1] = ((uint128_t) in[0]) * inx2[1];
430    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
431    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
432    out[4] = ((uint128_t) in[0]) * inx2[4] +
433        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
434    out[5] = ((uint128_t) in[0]) * inx2[5] +
435        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
436    out[6] = ((uint128_t) in[0]) * inx2[6] +
437        ((uint128_t) in[1]) * inx2[5] +
438        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
439    out[7] = ((uint128_t) in[0]) * inx2[7] +
440        ((uint128_t) in[1]) * inx2[6] +
441        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
442    out[8] = ((uint128_t) in[0]) * inx2[8] +
443        ((uint128_t) in[1]) * inx2[7] +
444        ((uint128_t) in[2]) * inx2[6] +
445        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
446
447    /*
448     * The remaining limbs fall above 2^521, with the first falling at 2^522.
449     * They correspond to locations one bit up from the limbs produced above
450     * so we would have to multiply by two to align them. Again, rather than
451     * operate on the 128-bit result, we double one of the inputs to the
452     * multiplication. If we want to double for both this reason, and the
453     * reason above, then we end up multiplying by four.
454     */
455
456    /* 9 */
457    out[0] += ((uint128_t) in[1]) * inx4[8] +
458        ((uint128_t) in[2]) * inx4[7] +
459        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
460
461    /* 10 */
462    out[1] += ((uint128_t) in[2]) * inx4[8] +
463        ((uint128_t) in[3]) * inx4[7] +
464        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
465
466    /* 11 */
467    out[2] += ((uint128_t) in[3]) * inx4[8] +
468        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
469
470    /* 12 */
471    out[3] += ((uint128_t) in[4]) * inx4[8] +
472        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
473
474    /* 13 */
475    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
476
477    /* 14 */
478    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
479
480    /* 15 */
481    out[6] += ((uint128_t) in[7]) * inx4[8];
482
483    /* 16 */
484    out[7] += ((uint128_t) in[8]) * inx2[8];
485}
486
487/*-
488 * felem_mul sets |out| = |in1| * |in2|
489 * On entry:
490 *   in1[i] < 2^64
491 *   in2[i] < 2^63
492 * On exit:
493 *   out[i] < 17 * max(in1[i]) * max(in2[i])
494 */
495static void felem_mul(largefelem out, const felem in1, const felem in2)
496{
497    felem in2x2;
498    felem_scalar(in2x2, in2, 2);
499
500    out[0] = ((uint128_t) in1[0]) * in2[0];
501
502    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
503
504    out[2] = ((uint128_t) in1[0]) * in2[2] +
505        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
506
507    out[3] = ((uint128_t) in1[0]) * in2[3] +
508        ((uint128_t) in1[1]) * in2[2] +
509        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
510
511    out[4] = ((uint128_t) in1[0]) * in2[4] +
512        ((uint128_t) in1[1]) * in2[3] +
513        ((uint128_t) in1[2]) * in2[2] +
514        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
515
516    out[5] = ((uint128_t) in1[0]) * in2[5] +
517        ((uint128_t) in1[1]) * in2[4] +
518        ((uint128_t) in1[2]) * in2[3] +
519        ((uint128_t) in1[3]) * in2[2] +
520        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
521
522    out[6] = ((uint128_t) in1[0]) * in2[6] +
523        ((uint128_t) in1[1]) * in2[5] +
524        ((uint128_t) in1[2]) * in2[4] +
525        ((uint128_t) in1[3]) * in2[3] +
526        ((uint128_t) in1[4]) * in2[2] +
527        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
528
529    out[7] = ((uint128_t) in1[0]) * in2[7] +
530        ((uint128_t) in1[1]) * in2[6] +
531        ((uint128_t) in1[2]) * in2[5] +
532        ((uint128_t) in1[3]) * in2[4] +
533        ((uint128_t) in1[4]) * in2[3] +
534        ((uint128_t) in1[5]) * in2[2] +
535        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
536
537    out[8] = ((uint128_t) in1[0]) * in2[8] +
538        ((uint128_t) in1[1]) * in2[7] +
539        ((uint128_t) in1[2]) * in2[6] +
540        ((uint128_t) in1[3]) * in2[5] +
541        ((uint128_t) in1[4]) * in2[4] +
542        ((uint128_t) in1[5]) * in2[3] +
543        ((uint128_t) in1[6]) * in2[2] +
544        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
545
546    /* See comment in felem_square about the use of in2x2 here */
547
548    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
549        ((uint128_t) in1[2]) * in2x2[7] +
550        ((uint128_t) in1[3]) * in2x2[6] +
551        ((uint128_t) in1[4]) * in2x2[5] +
552        ((uint128_t) in1[5]) * in2x2[4] +
553        ((uint128_t) in1[6]) * in2x2[3] +
554        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
555
556    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
557        ((uint128_t) in1[3]) * in2x2[7] +
558        ((uint128_t) in1[4]) * in2x2[6] +
559        ((uint128_t) in1[5]) * in2x2[5] +
560        ((uint128_t) in1[6]) * in2x2[4] +
561        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
562
563    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
564        ((uint128_t) in1[4]) * in2x2[7] +
565        ((uint128_t) in1[5]) * in2x2[6] +
566        ((uint128_t) in1[6]) * in2x2[5] +
567        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
568
569    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
570        ((uint128_t) in1[5]) * in2x2[7] +
571        ((uint128_t) in1[6]) * in2x2[6] +
572        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
573
574    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
575        ((uint128_t) in1[6]) * in2x2[7] +
576        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
577
578    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
579        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
580
581    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
582        ((uint128_t) in1[8]) * in2x2[7];
583
584    out[7] += ((uint128_t) in1[8]) * in2x2[8];
585}
586
587static const limb bottom52bits = 0xfffffffffffff;
588
589/*-
590 * felem_reduce converts a largefelem to an felem.
591 * On entry:
592 *   in[i] < 2^128
593 * On exit:
594 *   out[i] < 2^59 + 2^14
595 */
596static void felem_reduce(felem out, const largefelem in)
597{
598    u64 overflow1, overflow2;
599
600    out[0] = ((limb) in[0]) & bottom58bits;
601    out[1] = ((limb) in[1]) & bottom58bits;
602    out[2] = ((limb) in[2]) & bottom58bits;
603    out[3] = ((limb) in[3]) & bottom58bits;
604    out[4] = ((limb) in[4]) & bottom58bits;
605    out[5] = ((limb) in[5]) & bottom58bits;
606    out[6] = ((limb) in[6]) & bottom58bits;
607    out[7] = ((limb) in[7]) & bottom58bits;
608    out[8] = ((limb) in[8]) & bottom58bits;
609
610    /* out[i] < 2^58 */
611
612    out[1] += ((limb) in[0]) >> 58;
613    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
614    /*-
615     * out[1] < 2^58 + 2^6 + 2^58
616     *        = 2^59 + 2^6
617     */
618    out[2] += ((limb) (in[0] >> 64)) >> 52;
619
620    out[2] += ((limb) in[1]) >> 58;
621    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
622    out[3] += ((limb) (in[1] >> 64)) >> 52;
623
624    out[3] += ((limb) in[2]) >> 58;
625    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
626    out[4] += ((limb) (in[2] >> 64)) >> 52;
627
628    out[4] += ((limb) in[3]) >> 58;
629    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
630    out[5] += ((limb) (in[3] >> 64)) >> 52;
631
632    out[5] += ((limb) in[4]) >> 58;
633    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
634    out[6] += ((limb) (in[4] >> 64)) >> 52;
635
636    out[6] += ((limb) in[5]) >> 58;
637    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
638    out[7] += ((limb) (in[5] >> 64)) >> 52;
639
640    out[7] += ((limb) in[6]) >> 58;
641    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
642    out[8] += ((limb) (in[6] >> 64)) >> 52;
643
644    out[8] += ((limb) in[7]) >> 58;
645    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
646    /*-
647     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
648     *            < 2^59 + 2^13
649     */
650    overflow1 = ((limb) (in[7] >> 64)) >> 52;
651
652    overflow1 += ((limb) in[8]) >> 58;
653    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
654    overflow2 = ((limb) (in[8] >> 64)) >> 52;
655
656    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
657    overflow2 <<= 1;            /* overflow2 < 2^13 */
658
659    out[0] += overflow1;        /* out[0] < 2^60 */
660    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
661
662    out[1] += out[0] >> 58;
663    out[0] &= bottom58bits;
664    /*-
665     * out[0] < 2^58
666     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
667     *        < 2^59 + 2^14
668     */
669}
670
671static void felem_square_reduce(felem out, const felem in)
672{
673    largefelem tmp;
674    felem_square(tmp, in);
675    felem_reduce(out, tmp);
676}
677
678static void felem_mul_reduce(felem out, const felem in1, const felem in2)
679{
680    largefelem tmp;
681    felem_mul(tmp, in1, in2);
682    felem_reduce(out, tmp);
683}
684
685/*-
686 * felem_inv calculates |out| = |in|^{-1}
687 *
688 * Based on Fermat's Little Theorem:
689 *   a^p = a (mod p)
690 *   a^{p-1} = 1 (mod p)
691 *   a^{p-2} = a^{-1} (mod p)
692 */
693static void felem_inv(felem out, const felem in)
694{
695    felem ftmp, ftmp2, ftmp3, ftmp4;
696    largefelem tmp;
697    unsigned i;
698
699    felem_square(tmp, in);
700    felem_reduce(ftmp, tmp);    /* 2^1 */
701    felem_mul(tmp, in, ftmp);
702    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
703    felem_assign(ftmp2, ftmp);
704    felem_square(tmp, ftmp);
705    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
706    felem_mul(tmp, in, ftmp);
707    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
708    felem_square(tmp, ftmp);
709    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
710
711    felem_square(tmp, ftmp2);
712    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
713    felem_square(tmp, ftmp3);
714    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
715    felem_mul(tmp, ftmp3, ftmp2);
716    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
717
718    felem_assign(ftmp2, ftmp3);
719    felem_square(tmp, ftmp3);
720    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
721    felem_square(tmp, ftmp3);
722    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
723    felem_square(tmp, ftmp3);
724    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
725    felem_square(tmp, ftmp3);
726    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
727    felem_assign(ftmp4, ftmp3);
728    felem_mul(tmp, ftmp3, ftmp);
729    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
730    felem_square(tmp, ftmp4);
731    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
732    felem_mul(tmp, ftmp3, ftmp2);
733    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
734    felem_assign(ftmp2, ftmp3);
735
736    for (i = 0; i < 8; i++) {
737        felem_square(tmp, ftmp3);
738        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
739    }
740    felem_mul(tmp, ftmp3, ftmp2);
741    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
742    felem_assign(ftmp2, ftmp3);
743
744    for (i = 0; i < 16; i++) {
745        felem_square(tmp, ftmp3);
746        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
747    }
748    felem_mul(tmp, ftmp3, ftmp2);
749    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
750    felem_assign(ftmp2, ftmp3);
751
752    for (i = 0; i < 32; i++) {
753        felem_square(tmp, ftmp3);
754        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
755    }
756    felem_mul(tmp, ftmp3, ftmp2);
757    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
758    felem_assign(ftmp2, ftmp3);
759
760    for (i = 0; i < 64; i++) {
761        felem_square(tmp, ftmp3);
762        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
763    }
764    felem_mul(tmp, ftmp3, ftmp2);
765    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
766    felem_assign(ftmp2, ftmp3);
767
768    for (i = 0; i < 128; i++) {
769        felem_square(tmp, ftmp3);
770        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
771    }
772    felem_mul(tmp, ftmp3, ftmp2);
773    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
774    felem_assign(ftmp2, ftmp3);
775
776    for (i = 0; i < 256; i++) {
777        felem_square(tmp, ftmp3);
778        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
779    }
780    felem_mul(tmp, ftmp3, ftmp2);
781    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
782
783    for (i = 0; i < 9; i++) {
784        felem_square(tmp, ftmp3);
785        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
786    }
787    felem_mul(tmp, ftmp3, ftmp4);
788    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
789    felem_mul(tmp, ftmp3, in);
790    felem_reduce(out, tmp);     /* 2^512 - 3 */
791}
792
793/* This is 2^521-1, expressed as an felem */
794static const felem kPrime = {
795    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
796    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
797    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
798};
799
800/*-
801 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
802 * otherwise.
803 * On entry:
804 *   in[i] < 2^59 + 2^14
805 */
806static limb felem_is_zero(const felem in)
807{
808    felem ftmp;
809    limb is_zero, is_p;
810    felem_assign(ftmp, in);
811
812    ftmp[0] += ftmp[8] >> 57;
813    ftmp[8] &= bottom57bits;
814    /* ftmp[8] < 2^57 */
815    ftmp[1] += ftmp[0] >> 58;
816    ftmp[0] &= bottom58bits;
817    ftmp[2] += ftmp[1] >> 58;
818    ftmp[1] &= bottom58bits;
819    ftmp[3] += ftmp[2] >> 58;
820    ftmp[2] &= bottom58bits;
821    ftmp[4] += ftmp[3] >> 58;
822    ftmp[3] &= bottom58bits;
823    ftmp[5] += ftmp[4] >> 58;
824    ftmp[4] &= bottom58bits;
825    ftmp[6] += ftmp[5] >> 58;
826    ftmp[5] &= bottom58bits;
827    ftmp[7] += ftmp[6] >> 58;
828    ftmp[6] &= bottom58bits;
829    ftmp[8] += ftmp[7] >> 58;
830    ftmp[7] &= bottom58bits;
831    /* ftmp[8] < 2^57 + 4 */
832
833    /*
834     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
835     * than our bound for ftmp[8]. Therefore we only have to check if the
836     * zero is zero or 2^521-1.
837     */
838
839    is_zero = 0;
840    is_zero |= ftmp[0];
841    is_zero |= ftmp[1];
842    is_zero |= ftmp[2];
843    is_zero |= ftmp[3];
844    is_zero |= ftmp[4];
845    is_zero |= ftmp[5];
846    is_zero |= ftmp[6];
847    is_zero |= ftmp[7];
848    is_zero |= ftmp[8];
849
850    is_zero--;
851    /*
852     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
853     * can be set is if is_zero was 0 before the decrement.
854     */
855    is_zero = ((s64) is_zero) >> 63;
856
857    is_p = ftmp[0] ^ kPrime[0];
858    is_p |= ftmp[1] ^ kPrime[1];
859    is_p |= ftmp[2] ^ kPrime[2];
860    is_p |= ftmp[3] ^ kPrime[3];
861    is_p |= ftmp[4] ^ kPrime[4];
862    is_p |= ftmp[5] ^ kPrime[5];
863    is_p |= ftmp[6] ^ kPrime[6];
864    is_p |= ftmp[7] ^ kPrime[7];
865    is_p |= ftmp[8] ^ kPrime[8];
866
867    is_p--;
868    is_p = ((s64) is_p) >> 63;
869
870    is_zero |= is_p;
871    return is_zero;
872}
873
874static int felem_is_zero_int(const void *in)
875{
876    return (int)(felem_is_zero(in) & ((limb) 1));
877}
878
879/*-
880 * felem_contract converts |in| to its unique, minimal representation.
881 * On entry:
882 *   in[i] < 2^59 + 2^14
883 */
884static void felem_contract(felem out, const felem in)
885{
886    limb is_p, is_greater, sign;
887    static const limb two58 = ((limb) 1) << 58;
888
889    felem_assign(out, in);
890
891    out[0] += out[8] >> 57;
892    out[8] &= bottom57bits;
893    /* out[8] < 2^57 */
894    out[1] += out[0] >> 58;
895    out[0] &= bottom58bits;
896    out[2] += out[1] >> 58;
897    out[1] &= bottom58bits;
898    out[3] += out[2] >> 58;
899    out[2] &= bottom58bits;
900    out[4] += out[3] >> 58;
901    out[3] &= bottom58bits;
902    out[5] += out[4] >> 58;
903    out[4] &= bottom58bits;
904    out[6] += out[5] >> 58;
905    out[5] &= bottom58bits;
906    out[7] += out[6] >> 58;
907    out[6] &= bottom58bits;
908    out[8] += out[7] >> 58;
909    out[7] &= bottom58bits;
910    /* out[8] < 2^57 + 4 */
911
912    /*
913     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
914     * out. See the comments in felem_is_zero regarding why we don't test for
915     * other multiples of the prime.
916     */
917
918    /*
919     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
920     */
921
922    is_p = out[0] ^ kPrime[0];
923    is_p |= out[1] ^ kPrime[1];
924    is_p |= out[2] ^ kPrime[2];
925    is_p |= out[3] ^ kPrime[3];
926    is_p |= out[4] ^ kPrime[4];
927    is_p |= out[5] ^ kPrime[5];
928    is_p |= out[6] ^ kPrime[6];
929    is_p |= out[7] ^ kPrime[7];
930    is_p |= out[8] ^ kPrime[8];
931
932    is_p--;
933    is_p &= is_p << 32;
934    is_p &= is_p << 16;
935    is_p &= is_p << 8;
936    is_p &= is_p << 4;
937    is_p &= is_p << 2;
938    is_p &= is_p << 1;
939    is_p = ((s64) is_p) >> 63;
940    is_p = ~is_p;
941
942    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
943
944    out[0] &= is_p;
945    out[1] &= is_p;
946    out[2] &= is_p;
947    out[3] &= is_p;
948    out[4] &= is_p;
949    out[5] &= is_p;
950    out[6] &= is_p;
951    out[7] &= is_p;
952    out[8] &= is_p;
953
954    /*
955     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
956     * 57 is greater than zero as (2^521-1) + x >= 2^522
957     */
958    is_greater = out[8] >> 57;
959    is_greater |= is_greater << 32;
960    is_greater |= is_greater << 16;
961    is_greater |= is_greater << 8;
962    is_greater |= is_greater << 4;
963    is_greater |= is_greater << 2;
964    is_greater |= is_greater << 1;
965    is_greater = ((s64) is_greater) >> 63;
966
967    out[0] -= kPrime[0] & is_greater;
968    out[1] -= kPrime[1] & is_greater;
969    out[2] -= kPrime[2] & is_greater;
970    out[3] -= kPrime[3] & is_greater;
971    out[4] -= kPrime[4] & is_greater;
972    out[5] -= kPrime[5] & is_greater;
973    out[6] -= kPrime[6] & is_greater;
974    out[7] -= kPrime[7] & is_greater;
975    out[8] -= kPrime[8] & is_greater;
976
977    /* Eliminate negative coefficients */
978    sign = -(out[0] >> 63);
979    out[0] += (two58 & sign);
980    out[1] -= (1 & sign);
981    sign = -(out[1] >> 63);
982    out[1] += (two58 & sign);
983    out[2] -= (1 & sign);
984    sign = -(out[2] >> 63);
985    out[2] += (two58 & sign);
986    out[3] -= (1 & sign);
987    sign = -(out[3] >> 63);
988    out[3] += (two58 & sign);
989    out[4] -= (1 & sign);
990    sign = -(out[4] >> 63);
991    out[4] += (two58 & sign);
992    out[5] -= (1 & sign);
993    sign = -(out[0] >> 63);
994    out[5] += (two58 & sign);
995    out[6] -= (1 & sign);
996    sign = -(out[6] >> 63);
997    out[6] += (two58 & sign);
998    out[7] -= (1 & sign);
999    sign = -(out[7] >> 63);
1000    out[7] += (two58 & sign);
1001    out[8] -= (1 & sign);
1002    sign = -(out[5] >> 63);
1003    out[5] += (two58 & sign);
1004    out[6] -= (1 & sign);
1005    sign = -(out[6] >> 63);
1006    out[6] += (two58 & sign);
1007    out[7] -= (1 & sign);
1008    sign = -(out[7] >> 63);
1009    out[7] += (two58 & sign);
1010    out[8] -= (1 & sign);
1011}
1012
1013/*-
1014 * Group operations
1015 * ----------------
1016 *
1017 * Building on top of the field operations we have the operations on the
1018 * elliptic curve group itself. Points on the curve are represented in Jacobian
1019 * coordinates */
1020
1021/*-
1022 * point_double calcuates 2*(x_in, y_in, z_in)
1023 *
1024 * The method is taken from:
1025 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1026 *
1027 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1028 * while x_out == y_in is not (maybe this works, but it's not tested). */
1029static void
1030point_double(felem x_out, felem y_out, felem z_out,
1031             const felem x_in, const felem y_in, const felem z_in)
1032{
1033    largefelem tmp, tmp2;
1034    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1035
1036    felem_assign(ftmp, x_in);
1037    felem_assign(ftmp2, x_in);
1038
1039    /* delta = z^2 */
1040    felem_square(tmp, z_in);
1041    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1042
1043    /* gamma = y^2 */
1044    felem_square(tmp, y_in);
1045    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1046
1047    /* beta = x*gamma */
1048    felem_mul(tmp, x_in, gamma);
1049    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1050
1051    /* alpha = 3*(x-delta)*(x+delta) */
1052    felem_diff64(ftmp, delta);
1053    /* ftmp[i] < 2^61 */
1054    felem_sum64(ftmp2, delta);
1055    /* ftmp2[i] < 2^60 + 2^15 */
1056    felem_scalar64(ftmp2, 3);
1057    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1058    felem_mul(tmp, ftmp, ftmp2);
1059    /*-
1060     * tmp[i] < 17(3*2^121 + 3*2^76)
1061     *        = 61*2^121 + 61*2^76
1062     *        < 64*2^121 + 64*2^76
1063     *        = 2^127 + 2^82
1064     *        < 2^128
1065     */
1066    felem_reduce(alpha, tmp);
1067
1068    /* x' = alpha^2 - 8*beta */
1069    felem_square(tmp, alpha);
1070    /*
1071     * tmp[i] < 17*2^120 < 2^125
1072     */
1073    felem_assign(ftmp, beta);
1074    felem_scalar64(ftmp, 8);
1075    /* ftmp[i] < 2^62 + 2^17 */
1076    felem_diff_128_64(tmp, ftmp);
1077    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1078    felem_reduce(x_out, tmp);
1079
1080    /* z' = (y + z)^2 - gamma - delta */
1081    felem_sum64(delta, gamma);
1082    /* delta[i] < 2^60 + 2^15 */
1083    felem_assign(ftmp, y_in);
1084    felem_sum64(ftmp, z_in);
1085    /* ftmp[i] < 2^60 + 2^15 */
1086    felem_square(tmp, ftmp);
1087    /*
1088     * tmp[i] < 17(2^122) < 2^127
1089     */
1090    felem_diff_128_64(tmp, delta);
1091    /* tmp[i] < 2^127 + 2^63 */
1092    felem_reduce(z_out, tmp);
1093
1094    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1095    felem_scalar64(beta, 4);
1096    /* beta[i] < 2^61 + 2^16 */
1097    felem_diff64(beta, x_out);
1098    /* beta[i] < 2^61 + 2^60 + 2^16 */
1099    felem_mul(tmp, alpha, beta);
1100    /*-
1101     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1102     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1103     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1104     *        < 2^128
1105     */
1106    felem_square(tmp2, gamma);
1107    /*-
1108     * tmp2[i] < 17*(2^59 + 2^14)^2
1109     *         = 17*(2^118 + 2^74 + 2^28)
1110     */
1111    felem_scalar128(tmp2, 8);
1112    /*-
1113     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1114     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1115     *         < 2^126
1116     */
1117    felem_diff128(tmp, tmp2);
1118    /*-
1119     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1120     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1121     *          2^74 + 2^69 + 2^34 + 2^30
1122     *        < 2^128
1123     */
1124    felem_reduce(y_out, tmp);
1125}
1126
1127/* copy_conditional copies in to out iff mask is all ones. */
1128static void copy_conditional(felem out, const felem in, limb mask)
1129{
1130    unsigned i;
1131    for (i = 0; i < NLIMBS; ++i) {
1132        const limb tmp = mask & (in[i] ^ out[i]);
1133        out[i] ^= tmp;
1134    }
1135}
1136
1137/*-
1138 * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1139 *
1140 * The method is taken from
1141 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1142 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1143 *
1144 * This function includes a branch for checking whether the two input points
1145 * are equal (while not equal to the point at infinity). This case never
1146 * happens during single point multiplication, so there is no timing leak for
1147 * ECDH or ECDSA signing. */
1148static void point_add(felem x3, felem y3, felem z3,
1149                      const felem x1, const felem y1, const felem z1,
1150                      const int mixed, const felem x2, const felem y2,
1151                      const felem z2)
1152{
1153    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1154    largefelem tmp, tmp2;
1155    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1156
1157    z1_is_zero = felem_is_zero(z1);
1158    z2_is_zero = felem_is_zero(z2);
1159
1160    /* ftmp = z1z1 = z1**2 */
1161    felem_square(tmp, z1);
1162    felem_reduce(ftmp, tmp);
1163
1164    if (!mixed) {
1165        /* ftmp2 = z2z2 = z2**2 */
1166        felem_square(tmp, z2);
1167        felem_reduce(ftmp2, tmp);
1168
1169        /* u1 = ftmp3 = x1*z2z2 */
1170        felem_mul(tmp, x1, ftmp2);
1171        felem_reduce(ftmp3, tmp);
1172
1173        /* ftmp5 = z1 + z2 */
1174        felem_assign(ftmp5, z1);
1175        felem_sum64(ftmp5, z2);
1176        /* ftmp5[i] < 2^61 */
1177
1178        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1179        felem_square(tmp, ftmp5);
1180        /* tmp[i] < 17*2^122 */
1181        felem_diff_128_64(tmp, ftmp);
1182        /* tmp[i] < 17*2^122 + 2^63 */
1183        felem_diff_128_64(tmp, ftmp2);
1184        /* tmp[i] < 17*2^122 + 2^64 */
1185        felem_reduce(ftmp5, tmp);
1186
1187        /* ftmp2 = z2 * z2z2 */
1188        felem_mul(tmp, ftmp2, z2);
1189        felem_reduce(ftmp2, tmp);
1190
1191        /* s1 = ftmp6 = y1 * z2**3 */
1192        felem_mul(tmp, y1, ftmp2);
1193        felem_reduce(ftmp6, tmp);
1194    } else {
1195        /*
1196         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1197         */
1198
1199        /* u1 = ftmp3 = x1*z2z2 */
1200        felem_assign(ftmp3, x1);
1201
1202        /* ftmp5 = 2*z1z2 */
1203        felem_scalar(ftmp5, z1, 2);
1204
1205        /* s1 = ftmp6 = y1 * z2**3 */
1206        felem_assign(ftmp6, y1);
1207    }
1208
1209    /* u2 = x2*z1z1 */
1210    felem_mul(tmp, x2, ftmp);
1211    /* tmp[i] < 17*2^120 */
1212
1213    /* h = ftmp4 = u2 - u1 */
1214    felem_diff_128_64(tmp, ftmp3);
1215    /* tmp[i] < 17*2^120 + 2^63 */
1216    felem_reduce(ftmp4, tmp);
1217
1218    x_equal = felem_is_zero(ftmp4);
1219
1220    /* z_out = ftmp5 * h */
1221    felem_mul(tmp, ftmp5, ftmp4);
1222    felem_reduce(z_out, tmp);
1223
1224    /* ftmp = z1 * z1z1 */
1225    felem_mul(tmp, ftmp, z1);
1226    felem_reduce(ftmp, tmp);
1227
1228    /* s2 = tmp = y2 * z1**3 */
1229    felem_mul(tmp, y2, ftmp);
1230    /* tmp[i] < 17*2^120 */
1231
1232    /* r = ftmp5 = (s2 - s1)*2 */
1233    felem_diff_128_64(tmp, ftmp6);
1234    /* tmp[i] < 17*2^120 + 2^63 */
1235    felem_reduce(ftmp5, tmp);
1236    y_equal = felem_is_zero(ftmp5);
1237    felem_scalar64(ftmp5, 2);
1238    /* ftmp5[i] < 2^61 */
1239
1240    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1241        point_double(x3, y3, z3, x1, y1, z1);
1242        return;
1243    }
1244
1245    /* I = ftmp = (2h)**2 */
1246    felem_assign(ftmp, ftmp4);
1247    felem_scalar64(ftmp, 2);
1248    /* ftmp[i] < 2^61 */
1249    felem_square(tmp, ftmp);
1250    /* tmp[i] < 17*2^122 */
1251    felem_reduce(ftmp, tmp);
1252
1253    /* J = ftmp2 = h * I */
1254    felem_mul(tmp, ftmp4, ftmp);
1255    felem_reduce(ftmp2, tmp);
1256
1257    /* V = ftmp4 = U1 * I */
1258    felem_mul(tmp, ftmp3, ftmp);
1259    felem_reduce(ftmp4, tmp);
1260
1261    /* x_out = r**2 - J - 2V */
1262    felem_square(tmp, ftmp5);
1263    /* tmp[i] < 17*2^122 */
1264    felem_diff_128_64(tmp, ftmp2);
1265    /* tmp[i] < 17*2^122 + 2^63 */
1266    felem_assign(ftmp3, ftmp4);
1267    felem_scalar64(ftmp4, 2);
1268    /* ftmp4[i] < 2^61 */
1269    felem_diff_128_64(tmp, ftmp4);
1270    /* tmp[i] < 17*2^122 + 2^64 */
1271    felem_reduce(x_out, tmp);
1272
1273    /* y_out = r(V-x_out) - 2 * s1 * J */
1274    felem_diff64(ftmp3, x_out);
1275    /*
1276     * ftmp3[i] < 2^60 + 2^60 = 2^61
1277     */
1278    felem_mul(tmp, ftmp5, ftmp3);
1279    /* tmp[i] < 17*2^122 */
1280    felem_mul(tmp2, ftmp6, ftmp2);
1281    /* tmp2[i] < 17*2^120 */
1282    felem_scalar128(tmp2, 2);
1283    /* tmp2[i] < 17*2^121 */
1284    felem_diff128(tmp, tmp2);
1285        /*-
1286         * tmp[i] < 2^127 - 2^69 + 17*2^122
1287         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1288         *        < 2^127
1289         */
1290    felem_reduce(y_out, tmp);
1291
1292    copy_conditional(x_out, x2, z1_is_zero);
1293    copy_conditional(x_out, x1, z2_is_zero);
1294    copy_conditional(y_out, y2, z1_is_zero);
1295    copy_conditional(y_out, y1, z2_is_zero);
1296    copy_conditional(z_out, z2, z1_is_zero);
1297    copy_conditional(z_out, z1, z2_is_zero);
1298    felem_assign(x3, x_out);
1299    felem_assign(y3, y_out);
1300    felem_assign(z3, z_out);
1301}
1302
1303/*-
1304 * Base point pre computation
1305 * --------------------------
1306 *
1307 * Two different sorts of precomputed tables are used in the following code.
1308 * Each contain various points on the curve, where each point is three field
1309 * elements (x, y, z).
1310 *
1311 * For the base point table, z is usually 1 (0 for the point at infinity).
1312 * This table has 16 elements:
1313 * index | bits    | point
1314 * ------+---------+------------------------------
1315 *     0 | 0 0 0 0 | 0G
1316 *     1 | 0 0 0 1 | 1G
1317 *     2 | 0 0 1 0 | 2^130G
1318 *     3 | 0 0 1 1 | (2^130 + 1)G
1319 *     4 | 0 1 0 0 | 2^260G
1320 *     5 | 0 1 0 1 | (2^260 + 1)G
1321 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1322 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1323 *     8 | 1 0 0 0 | 2^390G
1324 *     9 | 1 0 0 1 | (2^390 + 1)G
1325 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1326 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1327 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1328 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1329 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1330 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1331 *
1332 * The reason for this is so that we can clock bits into four different
1333 * locations when doing simple scalar multiplies against the base point.
1334 *
1335 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1336
1337/* gmul is the table of precomputed base points */
1338static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1339                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
1340                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1341{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1342  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1343  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1344 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1345  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1346  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1347 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1348{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1349  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1350  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1351 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1352  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1353  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1354 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1355{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1356  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1357  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1358 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1359  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1360  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1361 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1362{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1363  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1364  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1365 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1366  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1367  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1368 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1369{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1370  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1371  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1372 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1373  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1374  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1375 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1376{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1377  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1378  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1379 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1380  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1381  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1382 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1383{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1384  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1385  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1386 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1387  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1388  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1389 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1390{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1391  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1392  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1393 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1394  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1395  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1396 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1397{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1398  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1399  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1400 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1401  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1402  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1403 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1404{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1405  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1406  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1407 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1408  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1409  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1410 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1411{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1412  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1413  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1414 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1415  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1416  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1417 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1418{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1419  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1420  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1421 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1422  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1423  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1424 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1425{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1426  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1427  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1428 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1429  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1430  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1431 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1432{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1433  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1434  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1435 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1436  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1437  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1438 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1439{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1440  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1441  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1442 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1443  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1444  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1445 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1446};
1447
1448/*
1449 * select_point selects the |idx|th point from a precomputation table and
1450 * copies it to out.
1451 */
1452 /* pre_comp below is of the size provided in |size| */
1453static void select_point(const limb idx, unsigned int size,
1454                         const felem pre_comp[][3], felem out[3])
1455{
1456    unsigned i, j;
1457    limb *outlimbs = &out[0][0];
1458    memset(outlimbs, 0, 3 * sizeof(felem));
1459
1460    for (i = 0; i < size; i++) {
1461        const limb *inlimbs = &pre_comp[i][0][0];
1462        limb mask = i ^ idx;
1463        mask |= mask >> 4;
1464        mask |= mask >> 2;
1465        mask |= mask >> 1;
1466        mask &= 1;
1467        mask--;
1468        for (j = 0; j < NLIMBS * 3; j++)
1469            outlimbs[j] |= inlimbs[j] & mask;
1470    }
1471}
1472
1473/* get_bit returns the |i|th bit in |in| */
1474static char get_bit(const felem_bytearray in, int i)
1475{
1476    if (i < 0)
1477        return 0;
1478    return (in[i >> 3] >> (i & 7)) & 1;
1479}
1480
1481/*
1482 * Interleaved point multiplication using precomputed point multiples: The
1483 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1484 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1485 * generator, using certain (large) precomputed multiples in g_pre_comp.
1486 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1487 */
1488static void batch_mul(felem x_out, felem y_out, felem z_out,
1489                      const felem_bytearray scalars[],
1490                      const unsigned num_points, const u8 *g_scalar,
1491                      const int mixed, const felem pre_comp[][17][3],
1492                      const felem g_pre_comp[16][3])
1493{
1494    int i, skip;
1495    unsigned num, gen_mul = (g_scalar != NULL);
1496    felem nq[3], tmp[4];
1497    limb bits;
1498    u8 sign, digit;
1499
1500    /* set nq to the point at infinity */
1501    memset(nq, 0, 3 * sizeof(felem));
1502
1503    /*
1504     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1505     * of the generator (last quarter of rounds) and additions of other
1506     * points multiples (every 5th round).
1507     */
1508    skip = 1;                   /* save two point operations in the first
1509                                 * round */
1510    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1511        /* double */
1512        if (!skip)
1513            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1514
1515        /* add multiples of the generator */
1516        if (gen_mul && (i <= 130)) {
1517            bits = get_bit(g_scalar, i + 390) << 3;
1518            if (i < 130) {
1519                bits |= get_bit(g_scalar, i + 260) << 2;
1520                bits |= get_bit(g_scalar, i + 130) << 1;
1521                bits |= get_bit(g_scalar, i);
1522            }
1523            /* select the point to add, in constant time */
1524            select_point(bits, 16, g_pre_comp, tmp);
1525            if (!skip) {
1526                /* The 1 argument below is for "mixed" */
1527                point_add(nq[0], nq[1], nq[2],
1528                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1529            } else {
1530                memcpy(nq, tmp, 3 * sizeof(felem));
1531                skip = 0;
1532            }
1533        }
1534
1535        /* do other additions every 5 doublings */
1536        if (num_points && (i % 5 == 0)) {
1537            /* loop over all scalars */
1538            for (num = 0; num < num_points; ++num) {
1539                bits = get_bit(scalars[num], i + 4) << 5;
1540                bits |= get_bit(scalars[num], i + 3) << 4;
1541                bits |= get_bit(scalars[num], i + 2) << 3;
1542                bits |= get_bit(scalars[num], i + 1) << 2;
1543                bits |= get_bit(scalars[num], i) << 1;
1544                bits |= get_bit(scalars[num], i - 1);
1545                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1546
1547                /*
1548                 * select the point to add or subtract, in constant time
1549                 */
1550                select_point(digit, 17, pre_comp[num], tmp);
1551                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1552                                            * point */
1553                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1554
1555                if (!skip) {
1556                    point_add(nq[0], nq[1], nq[2],
1557                              nq[0], nq[1], nq[2],
1558                              mixed, tmp[0], tmp[1], tmp[2]);
1559                } else {
1560                    memcpy(nq, tmp, 3 * sizeof(felem));
1561                    skip = 0;
1562                }
1563            }
1564        }
1565    }
1566    felem_assign(x_out, nq[0]);
1567    felem_assign(y_out, nq[1]);
1568    felem_assign(z_out, nq[2]);
1569}
1570
1571/* Precomputation for the group generator. */
1572typedef struct {
1573    felem g_pre_comp[16][3];
1574    int references;
1575} NISTP521_PRE_COMP;
1576
1577const EC_METHOD *EC_GFp_nistp521_method(void)
1578{
1579    static const EC_METHOD ret = {
1580        EC_FLAGS_DEFAULT_OCT,
1581        NID_X9_62_prime_field,
1582        ec_GFp_nistp521_group_init,
1583        ec_GFp_simple_group_finish,
1584        ec_GFp_simple_group_clear_finish,
1585        ec_GFp_nist_group_copy,
1586        ec_GFp_nistp521_group_set_curve,
1587        ec_GFp_simple_group_get_curve,
1588        ec_GFp_simple_group_get_degree,
1589        ec_GFp_simple_group_check_discriminant,
1590        ec_GFp_simple_point_init,
1591        ec_GFp_simple_point_finish,
1592        ec_GFp_simple_point_clear_finish,
1593        ec_GFp_simple_point_copy,
1594        ec_GFp_simple_point_set_to_infinity,
1595        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1596        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1597        ec_GFp_simple_point_set_affine_coordinates,
1598        ec_GFp_nistp521_point_get_affine_coordinates,
1599        0 /* point_set_compressed_coordinates */ ,
1600        0 /* point2oct */ ,
1601        0 /* oct2point */ ,
1602        ec_GFp_simple_add,
1603        ec_GFp_simple_dbl,
1604        ec_GFp_simple_invert,
1605        ec_GFp_simple_is_at_infinity,
1606        ec_GFp_simple_is_on_curve,
1607        ec_GFp_simple_cmp,
1608        ec_GFp_simple_make_affine,
1609        ec_GFp_simple_points_make_affine,
1610        ec_GFp_nistp521_points_mul,
1611        ec_GFp_nistp521_precompute_mult,
1612        ec_GFp_nistp521_have_precompute_mult,
1613        ec_GFp_nist_field_mul,
1614        ec_GFp_nist_field_sqr,
1615        0 /* field_div */ ,
1616        0 /* field_encode */ ,
1617        0 /* field_decode */ ,
1618        0                       /* field_set_to_one */
1619    };
1620
1621    return &ret;
1622}
1623
1624/******************************************************************************/
1625/*
1626 * FUNCTIONS TO MANAGE PRECOMPUTATION
1627 */
1628
1629static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1630{
1631    NISTP521_PRE_COMP *ret = NULL;
1632    ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1633    if (!ret) {
1634        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1635        return ret;
1636    }
1637    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1638    ret->references = 1;
1639    return ret;
1640}
1641
1642static void *nistp521_pre_comp_dup(void *src_)
1643{
1644    NISTP521_PRE_COMP *src = src_;
1645
1646    /* no need to actually copy, these objects never change! */
1647    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1648
1649    return src_;
1650}
1651
1652static void nistp521_pre_comp_free(void *pre_)
1653{
1654    int i;
1655    NISTP521_PRE_COMP *pre = pre_;
1656
1657    if (!pre)
1658        return;
1659
1660    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1661    if (i > 0)
1662        return;
1663
1664    OPENSSL_free(pre);
1665}
1666
1667static void nistp521_pre_comp_clear_free(void *pre_)
1668{
1669    int i;
1670    NISTP521_PRE_COMP *pre = pre_;
1671
1672    if (!pre)
1673        return;
1674
1675    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1676    if (i > 0)
1677        return;
1678
1679    OPENSSL_cleanse(pre, sizeof(*pre));
1680    OPENSSL_free(pre);
1681}
1682
1683/******************************************************************************/
1684/*
1685 * OPENSSL EC_METHOD FUNCTIONS
1686 */
1687
1688int ec_GFp_nistp521_group_init(EC_GROUP *group)
1689{
1690    int ret;
1691    ret = ec_GFp_simple_group_init(group);
1692    group->a_is_minus3 = 1;
1693    return ret;
1694}
1695
1696int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1697                                    const BIGNUM *a, const BIGNUM *b,
1698                                    BN_CTX *ctx)
1699{
1700    int ret = 0;
1701    BN_CTX *new_ctx = NULL;
1702    BIGNUM *curve_p, *curve_a, *curve_b;
1703
1704    if (ctx == NULL)
1705        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1706            return 0;
1707    BN_CTX_start(ctx);
1708    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1709        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1710        ((curve_b = BN_CTX_get(ctx)) == NULL))
1711        goto err;
1712    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1713    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1714    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1715    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1716        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1717              EC_R_WRONG_CURVE_PARAMETERS);
1718        goto err;
1719    }
1720    group->field_mod_func = BN_nist_mod_521;
1721    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1722 err:
1723    BN_CTX_end(ctx);
1724    if (new_ctx != NULL)
1725        BN_CTX_free(new_ctx);
1726    return ret;
1727}
1728
1729/*
1730 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1731 * (X/Z^2, Y/Z^3)
1732 */
1733int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1734                                                 const EC_POINT *point,
1735                                                 BIGNUM *x, BIGNUM *y,
1736                                                 BN_CTX *ctx)
1737{
1738    felem z1, z2, x_in, y_in, x_out, y_out;
1739    largefelem tmp;
1740
1741    if (EC_POINT_is_at_infinity(group, point)) {
1742        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1743              EC_R_POINT_AT_INFINITY);
1744        return 0;
1745    }
1746    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1747        (!BN_to_felem(z1, &point->Z)))
1748        return 0;
1749    felem_inv(z2, z1);
1750    felem_square(tmp, z2);
1751    felem_reduce(z1, tmp);
1752    felem_mul(tmp, x_in, z1);
1753    felem_reduce(x_in, tmp);
1754    felem_contract(x_out, x_in);
1755    if (x != NULL) {
1756        if (!felem_to_BN(x, x_out)) {
1757            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1758                  ERR_R_BN_LIB);
1759            return 0;
1760        }
1761    }
1762    felem_mul(tmp, z1, z2);
1763    felem_reduce(z1, tmp);
1764    felem_mul(tmp, y_in, z1);
1765    felem_reduce(y_in, tmp);
1766    felem_contract(y_out, y_in);
1767    if (y != NULL) {
1768        if (!felem_to_BN(y, y_out)) {
1769            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1770                  ERR_R_BN_LIB);
1771            return 0;
1772        }
1773    }
1774    return 1;
1775}
1776
1777/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1778static void make_points_affine(size_t num, felem points[][3],
1779                               felem tmp_felems[])
1780{
1781    /*
1782     * Runs in constant time, unless an input is the point at infinity (which
1783     * normally shouldn't happen).
1784     */
1785    ec_GFp_nistp_points_make_affine_internal(num,
1786                                             points,
1787                                             sizeof(felem),
1788                                             tmp_felems,
1789                                             (void (*)(void *))felem_one,
1790                                             felem_is_zero_int,
1791                                             (void (*)(void *, const void *))
1792                                             felem_assign,
1793                                             (void (*)(void *, const void *))
1794                                             felem_square_reduce, (void (*)
1795                                                                   (void *,
1796                                                                    const void
1797                                                                    *,
1798                                                                    const void
1799                                                                    *))
1800                                             felem_mul_reduce,
1801                                             (void (*)(void *, const void *))
1802                                             felem_inv,
1803                                             (void (*)(void *, const void *))
1804                                             felem_contract);
1805}
1806
1807/*
1808 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1809 * values Result is stored in r (r can equal one of the inputs).
1810 */
1811int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1812                               const BIGNUM *scalar, size_t num,
1813                               const EC_POINT *points[],
1814                               const BIGNUM *scalars[], BN_CTX *ctx)
1815{
1816    int ret = 0;
1817    int j;
1818    int mixed = 0;
1819    BN_CTX *new_ctx = NULL;
1820    BIGNUM *x, *y, *z, *tmp_scalar;
1821    felem_bytearray g_secret;
1822    felem_bytearray *secrets = NULL;
1823    felem(*pre_comp)[17][3] = NULL;
1824    felem *tmp_felems = NULL;
1825    felem_bytearray tmp;
1826    unsigned i, num_bytes;
1827    int have_pre_comp = 0;
1828    size_t num_points = num;
1829    felem x_in, y_in, z_in, x_out, y_out, z_out;
1830    NISTP521_PRE_COMP *pre = NULL;
1831    felem(*g_pre_comp)[3] = NULL;
1832    EC_POINT *generator = NULL;
1833    const EC_POINT *p = NULL;
1834    const BIGNUM *p_scalar = NULL;
1835
1836    if (ctx == NULL)
1837        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1838            return 0;
1839    BN_CTX_start(ctx);
1840    if (((x = BN_CTX_get(ctx)) == NULL) ||
1841        ((y = BN_CTX_get(ctx)) == NULL) ||
1842        ((z = BN_CTX_get(ctx)) == NULL) ||
1843        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1844        goto err;
1845
1846    if (scalar != NULL) {
1847        pre = EC_EX_DATA_get_data(group->extra_data,
1848                                  nistp521_pre_comp_dup,
1849                                  nistp521_pre_comp_free,
1850                                  nistp521_pre_comp_clear_free);
1851        if (pre)
1852            /* we have precomputation, try to use it */
1853            g_pre_comp = &pre->g_pre_comp[0];
1854        else
1855            /* try to use the standard precomputation */
1856            g_pre_comp = (felem(*)[3]) gmul;
1857        generator = EC_POINT_new(group);
1858        if (generator == NULL)
1859            goto err;
1860        /* get the generator from precomputation */
1861        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1862            !felem_to_BN(y, g_pre_comp[1][1]) ||
1863            !felem_to_BN(z, g_pre_comp[1][2])) {
1864            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1865            goto err;
1866        }
1867        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1868                                                      generator, x, y, z,
1869                                                      ctx))
1870            goto err;
1871        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1872            /* precomputation matches generator */
1873            have_pre_comp = 1;
1874        else
1875            /*
1876             * we don't have valid precomputation: treat the generator as a
1877             * random point
1878             */
1879            num_points++;
1880    }
1881
1882    if (num_points > 0) {
1883        if (num_points >= 2) {
1884            /*
1885             * unless we precompute multiples for just one point, converting
1886             * those into affine form is time well spent
1887             */
1888            mixed = 1;
1889        }
1890        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1891        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1892        if (mixed)
1893            tmp_felems =
1894                OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1895        if ((secrets == NULL) || (pre_comp == NULL)
1896            || (mixed && (tmp_felems == NULL))) {
1897            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1898            goto err;
1899        }
1900
1901        /*
1902         * we treat NULL scalars as 0, and NULL points as points at infinity,
1903         * i.e., they contribute nothing to the linear combination
1904         */
1905        memset(secrets, 0, num_points * sizeof(felem_bytearray));
1906        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1907        for (i = 0; i < num_points; ++i) {
1908            if (i == num)
1909                /*
1910                 * we didn't have a valid precomputation, so we pick the
1911                 * generator
1912                 */
1913            {
1914                p = EC_GROUP_get0_generator(group);
1915                p_scalar = scalar;
1916            } else
1917                /* the i^th point */
1918            {
1919                p = points[i];
1920                p_scalar = scalars[i];
1921            }
1922            if ((p_scalar != NULL) && (p != NULL)) {
1923                /* reduce scalar to 0 <= scalar < 2^521 */
1924                if ((BN_num_bits(p_scalar) > 521)
1925                    || (BN_is_negative(p_scalar))) {
1926                    /*
1927                     * this is an unusual input, and we don't guarantee
1928                     * constant-timeness
1929                     */
1930                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1931                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1932                        goto err;
1933                    }
1934                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
1935                } else
1936                    num_bytes = BN_bn2bin(p_scalar, tmp);
1937                flip_endian(secrets[i], tmp, num_bytes);
1938                /* precompute multiples */
1939                if ((!BN_to_felem(x_out, &p->X)) ||
1940                    (!BN_to_felem(y_out, &p->Y)) ||
1941                    (!BN_to_felem(z_out, &p->Z)))
1942                    goto err;
1943                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1944                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1945                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1946                for (j = 2; j <= 16; ++j) {
1947                    if (j & 1) {
1948                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1949                                  pre_comp[i][j][2], pre_comp[i][1][0],
1950                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1951                                  pre_comp[i][j - 1][0],
1952                                  pre_comp[i][j - 1][1],
1953                                  pre_comp[i][j - 1][2]);
1954                    } else {
1955                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1956                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
1957                                     pre_comp[i][j / 2][1],
1958                                     pre_comp[i][j / 2][2]);
1959                    }
1960                }
1961            }
1962        }
1963        if (mixed)
1964            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1965    }
1966
1967    /* the scalar for the generator */
1968    if ((scalar != NULL) && (have_pre_comp)) {
1969        memset(g_secret, 0, sizeof(g_secret));
1970        /* reduce scalar to 0 <= scalar < 2^521 */
1971        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1972            /*
1973             * this is an unusual input, and we don't guarantee
1974             * constant-timeness
1975             */
1976            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1977                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1978                goto err;
1979            }
1980            num_bytes = BN_bn2bin(tmp_scalar, tmp);
1981        } else
1982            num_bytes = BN_bn2bin(scalar, tmp);
1983        flip_endian(g_secret, tmp, num_bytes);
1984        /* do the multiplication with generator precomputation */
1985        batch_mul(x_out, y_out, z_out,
1986                  (const felem_bytearray(*))secrets, num_points,
1987                  g_secret,
1988                  mixed, (const felem(*)[17][3])pre_comp,
1989                  (const felem(*)[3])g_pre_comp);
1990    } else
1991        /* do the multiplication without generator precomputation */
1992        batch_mul(x_out, y_out, z_out,
1993                  (const felem_bytearray(*))secrets, num_points,
1994                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1995    /* reduce the output to its unique minimal representation */
1996    felem_contract(x_in, x_out);
1997    felem_contract(y_in, y_out);
1998    felem_contract(z_in, z_out);
1999    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2000        (!felem_to_BN(z, z_in))) {
2001        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2002        goto err;
2003    }
2004    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2005
2006 err:
2007    BN_CTX_end(ctx);
2008    if (generator != NULL)
2009        EC_POINT_free(generator);
2010    if (new_ctx != NULL)
2011        BN_CTX_free(new_ctx);
2012    if (secrets != NULL)
2013        OPENSSL_free(secrets);
2014    if (pre_comp != NULL)
2015        OPENSSL_free(pre_comp);
2016    if (tmp_felems != NULL)
2017        OPENSSL_free(tmp_felems);
2018    return ret;
2019}
2020
2021int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2022{
2023    int ret = 0;
2024    NISTP521_PRE_COMP *pre = NULL;
2025    int i, j;
2026    BN_CTX *new_ctx = NULL;
2027    BIGNUM *x, *y;
2028    EC_POINT *generator = NULL;
2029    felem tmp_felems[16];
2030
2031    /* throw away old precomputation */
2032    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2033                         nistp521_pre_comp_free,
2034                         nistp521_pre_comp_clear_free);
2035    if (ctx == NULL)
2036        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2037            return 0;
2038    BN_CTX_start(ctx);
2039    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2040        goto err;
2041    /* get the generator */
2042    if (group->generator == NULL)
2043        goto err;
2044    generator = EC_POINT_new(group);
2045    if (generator == NULL)
2046        goto err;
2047    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2048    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2049    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2050        goto err;
2051    if ((pre = nistp521_pre_comp_new()) == NULL)
2052        goto err;
2053    /*
2054     * if the generator is the standard one, use built-in precomputation
2055     */
2056    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2057        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2058        goto done;
2059    }
2060    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2061        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2062        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2063        goto err;
2064    /* compute 2^130*G, 2^260*G, 2^390*G */
2065    for (i = 1; i <= 4; i <<= 1) {
2066        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2067                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2068                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2069        for (j = 0; j < 129; ++j) {
2070            point_double(pre->g_pre_comp[2 * i][0],
2071                         pre->g_pre_comp[2 * i][1],
2072                         pre->g_pre_comp[2 * i][2],
2073                         pre->g_pre_comp[2 * i][0],
2074                         pre->g_pre_comp[2 * i][1],
2075                         pre->g_pre_comp[2 * i][2]);
2076        }
2077    }
2078    /* g_pre_comp[0] is the point at infinity */
2079    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2080    /* the remaining multiples */
2081    /* 2^130*G + 2^260*G */
2082    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2083              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2084              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2085              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2086              pre->g_pre_comp[2][2]);
2087    /* 2^130*G + 2^390*G */
2088    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2089              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2090              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2091              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2092              pre->g_pre_comp[2][2]);
2093    /* 2^260*G + 2^390*G */
2094    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2095              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2096              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2097              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2098              pre->g_pre_comp[4][2]);
2099    /* 2^130*G + 2^260*G + 2^390*G */
2100    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2101              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2102              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2103              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2104              pre->g_pre_comp[2][2]);
2105    for (i = 1; i < 8; ++i) {
2106        /* odd multiples: add G */
2107        point_add(pre->g_pre_comp[2 * i + 1][0],
2108                  pre->g_pre_comp[2 * i + 1][1],
2109                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2110                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2111                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2112                  pre->g_pre_comp[1][2]);
2113    }
2114    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2115
2116 done:
2117    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2118                             nistp521_pre_comp_free,
2119                             nistp521_pre_comp_clear_free))
2120        goto err;
2121    ret = 1;
2122    pre = NULL;
2123 err:
2124    BN_CTX_end(ctx);
2125    if (generator != NULL)
2126        EC_POINT_free(generator);
2127    if (new_ctx != NULL)
2128        BN_CTX_free(new_ctx);
2129    if (pre)
2130        nistp521_pre_comp_free(pre);
2131    return ret;
2132}
2133
2134int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2135{
2136    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2137                            nistp521_pre_comp_free,
2138                            nistp521_pre_comp_clear_free)
2139        != NULL)
2140        return 1;
2141    else
2142        return 0;
2143}
2144
2145#else
2146static void *dummy = &dummy;
2147#endif
2148