ecp_nistp521.c revision 348343
1/* crypto/ec/ecp_nistp521.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41
42# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45                                 * platforms */
46# else
47#  error "Need GCC 3.1 or later to define type uint128_t"
48# endif
49
50typedef uint8_t u8;
51typedef uint64_t u64;
52
53/*
54 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
55 * element of this field into 66 bytes where the most significant byte
56 * contains only a single bit. We call this an felem_bytearray.
57 */
58
59typedef u8 felem_bytearray[66];
60
61/*
62 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
63 * These values are big-endian.
64 */
65static const felem_bytearray nistp521_curve_params[5] = {
66    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
67     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74     0xff, 0xff},
75    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
76     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83     0xff, 0xfc},
84    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
85     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
86     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
87     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
88     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
89     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
90     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
91     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
92     0x3f, 0x00},
93    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
94     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
95     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
96     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
97     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
98     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
99     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
100     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
101     0xbd, 0x66},
102    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
103     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
104     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
105     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
106     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
107     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
108     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
109     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
110     0x66, 0x50}
111};
112
113/*-
114 * The representation of field elements.
115 * ------------------------------------
116 *
117 * We represent field elements with nine values. These values are either 64 or
118 * 128 bits and the field element represented is:
119 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
120 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
121 * 58 bits apart, but are greater than 58 bits in length, the most significant
122 * bits of each limb overlap with the least significant bits of the next.
123 *
124 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
125 * 'largefelem' */
126
127# define NLIMBS 9
128
129typedef uint64_t limb;
130typedef limb felem[NLIMBS];
131typedef uint128_t largefelem[NLIMBS];
132
133static const limb bottom57bits = 0x1ffffffffffffff;
134static const limb bottom58bits = 0x3ffffffffffffff;
135
136/*
137 * bin66_to_felem takes a little-endian byte array and converts it into felem
138 * form. This assumes that the CPU is little-endian.
139 */
140static void bin66_to_felem(felem out, const u8 in[66])
141{
142    out[0] = (*((limb *) & in[0])) & bottom58bits;
143    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
144    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
145    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
146    out[4] = (*((limb *) & in[29])) & bottom58bits;
147    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
148    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
149    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
150    out[8] = (*((limb *) & in[58])) & bottom57bits;
151}
152
153/*
154 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
155 * array. This assumes that the CPU is little-endian.
156 */
157static void felem_to_bin66(u8 out[66], const felem in)
158{
159    memset(out, 0, 66);
160    (*((limb *) & out[0])) = in[0];
161    (*((limb *) & out[7])) |= in[1] << 2;
162    (*((limb *) & out[14])) |= in[2] << 4;
163    (*((limb *) & out[21])) |= in[3] << 6;
164    (*((limb *) & out[29])) = in[4];
165    (*((limb *) & out[36])) |= in[5] << 2;
166    (*((limb *) & out[43])) |= in[6] << 4;
167    (*((limb *) & out[50])) |= in[7] << 6;
168    (*((limb *) & out[58])) = in[8];
169}
170
171/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
172static void flip_endian(u8 *out, const u8 *in, unsigned len)
173{
174    unsigned i;
175    for (i = 0; i < len; ++i)
176        out[i] = in[len - 1 - i];
177}
178
179/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
180static int BN_to_felem(felem out, const BIGNUM *bn)
181{
182    felem_bytearray b_in;
183    felem_bytearray b_out;
184    unsigned num_bytes;
185
186    /* BN_bn2bin eats leading zeroes */
187    memset(b_out, 0, sizeof(b_out));
188    num_bytes = BN_num_bytes(bn);
189    if (num_bytes > sizeof(b_out)) {
190        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
191        return 0;
192    }
193    if (BN_is_negative(bn)) {
194        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
195        return 0;
196    }
197    num_bytes = BN_bn2bin(bn, b_in);
198    flip_endian(b_out, b_in, num_bytes);
199    bin66_to_felem(out, b_out);
200    return 1;
201}
202
203/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
204static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
205{
206    felem_bytearray b_in, b_out;
207    felem_to_bin66(b_in, in);
208    flip_endian(b_out, b_in, sizeof(b_out));
209    return BN_bin2bn(b_out, sizeof(b_out), out);
210}
211
212/*-
213 * Field operations
214 * ----------------
215 */
216
217static void felem_one(felem out)
218{
219    out[0] = 1;
220    out[1] = 0;
221    out[2] = 0;
222    out[3] = 0;
223    out[4] = 0;
224    out[5] = 0;
225    out[6] = 0;
226    out[7] = 0;
227    out[8] = 0;
228}
229
230static void felem_assign(felem out, const felem in)
231{
232    out[0] = in[0];
233    out[1] = in[1];
234    out[2] = in[2];
235    out[3] = in[3];
236    out[4] = in[4];
237    out[5] = in[5];
238    out[6] = in[6];
239    out[7] = in[7];
240    out[8] = in[8];
241}
242
243/* felem_sum64 sets out = out + in. */
244static void felem_sum64(felem out, const felem in)
245{
246    out[0] += in[0];
247    out[1] += in[1];
248    out[2] += in[2];
249    out[3] += in[3];
250    out[4] += in[4];
251    out[5] += in[5];
252    out[6] += in[6];
253    out[7] += in[7];
254    out[8] += in[8];
255}
256
257/* felem_scalar sets out = in * scalar */
258static void felem_scalar(felem out, const felem in, limb scalar)
259{
260    out[0] = in[0] * scalar;
261    out[1] = in[1] * scalar;
262    out[2] = in[2] * scalar;
263    out[3] = in[3] * scalar;
264    out[4] = in[4] * scalar;
265    out[5] = in[5] * scalar;
266    out[6] = in[6] * scalar;
267    out[7] = in[7] * scalar;
268    out[8] = in[8] * scalar;
269}
270
271/* felem_scalar64 sets out = out * scalar */
272static void felem_scalar64(felem out, limb scalar)
273{
274    out[0] *= scalar;
275    out[1] *= scalar;
276    out[2] *= scalar;
277    out[3] *= scalar;
278    out[4] *= scalar;
279    out[5] *= scalar;
280    out[6] *= scalar;
281    out[7] *= scalar;
282    out[8] *= scalar;
283}
284
285/* felem_scalar128 sets out = out * scalar */
286static void felem_scalar128(largefelem out, limb scalar)
287{
288    out[0] *= scalar;
289    out[1] *= scalar;
290    out[2] *= scalar;
291    out[3] *= scalar;
292    out[4] *= scalar;
293    out[5] *= scalar;
294    out[6] *= scalar;
295    out[7] *= scalar;
296    out[8] *= scalar;
297}
298
299/*-
300 * felem_neg sets |out| to |-in|
301 * On entry:
302 *   in[i] < 2^59 + 2^14
303 * On exit:
304 *   out[i] < 2^62
305 */
306static void felem_neg(felem out, const felem in)
307{
308    /* In order to prevent underflow, we subtract from 0 mod p. */
309    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
310    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
311
312    out[0] = two62m3 - in[0];
313    out[1] = two62m2 - in[1];
314    out[2] = two62m2 - in[2];
315    out[3] = two62m2 - in[3];
316    out[4] = two62m2 - in[4];
317    out[5] = two62m2 - in[5];
318    out[6] = two62m2 - in[6];
319    out[7] = two62m2 - in[7];
320    out[8] = two62m2 - in[8];
321}
322
323/*-
324 * felem_diff64 subtracts |in| from |out|
325 * On entry:
326 *   in[i] < 2^59 + 2^14
327 * On exit:
328 *   out[i] < out[i] + 2^62
329 */
330static void felem_diff64(felem out, const felem in)
331{
332    /*
333     * In order to prevent underflow, we add 0 mod p before subtracting.
334     */
335    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
336    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
337
338    out[0] += two62m3 - in[0];
339    out[1] += two62m2 - in[1];
340    out[2] += two62m2 - in[2];
341    out[3] += two62m2 - in[3];
342    out[4] += two62m2 - in[4];
343    out[5] += two62m2 - in[5];
344    out[6] += two62m2 - in[6];
345    out[7] += two62m2 - in[7];
346    out[8] += two62m2 - in[8];
347}
348
349/*-
350 * felem_diff_128_64 subtracts |in| from |out|
351 * On entry:
352 *   in[i] < 2^62 + 2^17
353 * On exit:
354 *   out[i] < out[i] + 2^63
355 */
356static void felem_diff_128_64(largefelem out, const felem in)
357{
358    /*
359     * In order to prevent underflow, we add 64p mod p (which is equivalent
360     * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
361     * digit number with all bits set to 1. See "The representation of field
362     * elements" comment above for a description of how limbs are used to
363     * represent a number. 64p is represented with 8 limbs containing a number
364     * with 58 bits set and one limb with a number with 57 bits set.
365     */
366    static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
367    static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
368
369    out[0] += two63m6 - in[0];
370    out[1] += two63m5 - in[1];
371    out[2] += two63m5 - in[2];
372    out[3] += two63m5 - in[3];
373    out[4] += two63m5 - in[4];
374    out[5] += two63m5 - in[5];
375    out[6] += two63m5 - in[6];
376    out[7] += two63m5 - in[7];
377    out[8] += two63m5 - in[8];
378}
379
380/*-
381 * felem_diff_128_64 subtracts |in| from |out|
382 * On entry:
383 *   in[i] < 2^126
384 * On exit:
385 *   out[i] < out[i] + 2^127 - 2^69
386 */
387static void felem_diff128(largefelem out, const largefelem in)
388{
389    /*
390     * In order to prevent underflow, we add 0 mod p before subtracting.
391     */
392    static const uint128_t two127m70 =
393        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
394    static const uint128_t two127m69 =
395        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
396
397    out[0] += (two127m70 - in[0]);
398    out[1] += (two127m69 - in[1]);
399    out[2] += (two127m69 - in[2]);
400    out[3] += (two127m69 - in[3]);
401    out[4] += (two127m69 - in[4]);
402    out[5] += (two127m69 - in[5]);
403    out[6] += (two127m69 - in[6]);
404    out[7] += (two127m69 - in[7]);
405    out[8] += (two127m69 - in[8]);
406}
407
408/*-
409 * felem_square sets |out| = |in|^2
410 * On entry:
411 *   in[i] < 2^62
412 * On exit:
413 *   out[i] < 17 * max(in[i]) * max(in[i])
414 */
415static void felem_square(largefelem out, const felem in)
416{
417    felem inx2, inx4;
418    felem_scalar(inx2, in, 2);
419    felem_scalar(inx4, in, 4);
420
421    /*-
422     * We have many cases were we want to do
423     *   in[x] * in[y] +
424     *   in[y] * in[x]
425     * This is obviously just
426     *   2 * in[x] * in[y]
427     * However, rather than do the doubling on the 128 bit result, we
428     * double one of the inputs to the multiplication by reading from
429     * |inx2|
430     */
431
432    out[0] = ((uint128_t) in[0]) * in[0];
433    out[1] = ((uint128_t) in[0]) * inx2[1];
434    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
435    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
436    out[4] = ((uint128_t) in[0]) * inx2[4] +
437        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
438    out[5] = ((uint128_t) in[0]) * inx2[5] +
439        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
440    out[6] = ((uint128_t) in[0]) * inx2[6] +
441        ((uint128_t) in[1]) * inx2[5] +
442        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
443    out[7] = ((uint128_t) in[0]) * inx2[7] +
444        ((uint128_t) in[1]) * inx2[6] +
445        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
446    out[8] = ((uint128_t) in[0]) * inx2[8] +
447        ((uint128_t) in[1]) * inx2[7] +
448        ((uint128_t) in[2]) * inx2[6] +
449        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
450
451    /*
452     * The remaining limbs fall above 2^521, with the first falling at 2^522.
453     * They correspond to locations one bit up from the limbs produced above
454     * so we would have to multiply by two to align them. Again, rather than
455     * operate on the 128-bit result, we double one of the inputs to the
456     * multiplication. If we want to double for both this reason, and the
457     * reason above, then we end up multiplying by four.
458     */
459
460    /* 9 */
461    out[0] += ((uint128_t) in[1]) * inx4[8] +
462        ((uint128_t) in[2]) * inx4[7] +
463        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
464
465    /* 10 */
466    out[1] += ((uint128_t) in[2]) * inx4[8] +
467        ((uint128_t) in[3]) * inx4[7] +
468        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
469
470    /* 11 */
471    out[2] += ((uint128_t) in[3]) * inx4[8] +
472        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
473
474    /* 12 */
475    out[3] += ((uint128_t) in[4]) * inx4[8] +
476        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
477
478    /* 13 */
479    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
480
481    /* 14 */
482    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
483
484    /* 15 */
485    out[6] += ((uint128_t) in[7]) * inx4[8];
486
487    /* 16 */
488    out[7] += ((uint128_t) in[8]) * inx2[8];
489}
490
491/*-
492 * felem_mul sets |out| = |in1| * |in2|
493 * On entry:
494 *   in1[i] < 2^64
495 *   in2[i] < 2^63
496 * On exit:
497 *   out[i] < 17 * max(in1[i]) * max(in2[i])
498 */
499static void felem_mul(largefelem out, const felem in1, const felem in2)
500{
501    felem in2x2;
502    felem_scalar(in2x2, in2, 2);
503
504    out[0] = ((uint128_t) in1[0]) * in2[0];
505
506    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
507
508    out[2] = ((uint128_t) in1[0]) * in2[2] +
509        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
510
511    out[3] = ((uint128_t) in1[0]) * in2[3] +
512        ((uint128_t) in1[1]) * in2[2] +
513        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
514
515    out[4] = ((uint128_t) in1[0]) * in2[4] +
516        ((uint128_t) in1[1]) * in2[3] +
517        ((uint128_t) in1[2]) * in2[2] +
518        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
519
520    out[5] = ((uint128_t) in1[0]) * in2[5] +
521        ((uint128_t) in1[1]) * in2[4] +
522        ((uint128_t) in1[2]) * in2[3] +
523        ((uint128_t) in1[3]) * in2[2] +
524        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
525
526    out[6] = ((uint128_t) in1[0]) * in2[6] +
527        ((uint128_t) in1[1]) * in2[5] +
528        ((uint128_t) in1[2]) * in2[4] +
529        ((uint128_t) in1[3]) * in2[3] +
530        ((uint128_t) in1[4]) * in2[2] +
531        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
532
533    out[7] = ((uint128_t) in1[0]) * in2[7] +
534        ((uint128_t) in1[1]) * in2[6] +
535        ((uint128_t) in1[2]) * in2[5] +
536        ((uint128_t) in1[3]) * in2[4] +
537        ((uint128_t) in1[4]) * in2[3] +
538        ((uint128_t) in1[5]) * in2[2] +
539        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
540
541    out[8] = ((uint128_t) in1[0]) * in2[8] +
542        ((uint128_t) in1[1]) * in2[7] +
543        ((uint128_t) in1[2]) * in2[6] +
544        ((uint128_t) in1[3]) * in2[5] +
545        ((uint128_t) in1[4]) * in2[4] +
546        ((uint128_t) in1[5]) * in2[3] +
547        ((uint128_t) in1[6]) * in2[2] +
548        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
549
550    /* See comment in felem_square about the use of in2x2 here */
551
552    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
553        ((uint128_t) in1[2]) * in2x2[7] +
554        ((uint128_t) in1[3]) * in2x2[6] +
555        ((uint128_t) in1[4]) * in2x2[5] +
556        ((uint128_t) in1[5]) * in2x2[4] +
557        ((uint128_t) in1[6]) * in2x2[3] +
558        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
559
560    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
561        ((uint128_t) in1[3]) * in2x2[7] +
562        ((uint128_t) in1[4]) * in2x2[6] +
563        ((uint128_t) in1[5]) * in2x2[5] +
564        ((uint128_t) in1[6]) * in2x2[4] +
565        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
566
567    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
568        ((uint128_t) in1[4]) * in2x2[7] +
569        ((uint128_t) in1[5]) * in2x2[6] +
570        ((uint128_t) in1[6]) * in2x2[5] +
571        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
572
573    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
574        ((uint128_t) in1[5]) * in2x2[7] +
575        ((uint128_t) in1[6]) * in2x2[6] +
576        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
577
578    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
579        ((uint128_t) in1[6]) * in2x2[7] +
580        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
581
582    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
583        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
584
585    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
586        ((uint128_t) in1[8]) * in2x2[7];
587
588    out[7] += ((uint128_t) in1[8]) * in2x2[8];
589}
590
591static const limb bottom52bits = 0xfffffffffffff;
592
593/*-
594 * felem_reduce converts a largefelem to an felem.
595 * On entry:
596 *   in[i] < 2^128
597 * On exit:
598 *   out[i] < 2^59 + 2^14
599 */
600static void felem_reduce(felem out, const largefelem in)
601{
602    u64 overflow1, overflow2;
603
604    out[0] = ((limb) in[0]) & bottom58bits;
605    out[1] = ((limb) in[1]) & bottom58bits;
606    out[2] = ((limb) in[2]) & bottom58bits;
607    out[3] = ((limb) in[3]) & bottom58bits;
608    out[4] = ((limb) in[4]) & bottom58bits;
609    out[5] = ((limb) in[5]) & bottom58bits;
610    out[6] = ((limb) in[6]) & bottom58bits;
611    out[7] = ((limb) in[7]) & bottom58bits;
612    out[8] = ((limb) in[8]) & bottom58bits;
613
614    /* out[i] < 2^58 */
615
616    out[1] += ((limb) in[0]) >> 58;
617    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
618    /*-
619     * out[1] < 2^58 + 2^6 + 2^58
620     *        = 2^59 + 2^6
621     */
622    out[2] += ((limb) (in[0] >> 64)) >> 52;
623
624    out[2] += ((limb) in[1]) >> 58;
625    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
626    out[3] += ((limb) (in[1] >> 64)) >> 52;
627
628    out[3] += ((limb) in[2]) >> 58;
629    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
630    out[4] += ((limb) (in[2] >> 64)) >> 52;
631
632    out[4] += ((limb) in[3]) >> 58;
633    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
634    out[5] += ((limb) (in[3] >> 64)) >> 52;
635
636    out[5] += ((limb) in[4]) >> 58;
637    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
638    out[6] += ((limb) (in[4] >> 64)) >> 52;
639
640    out[6] += ((limb) in[5]) >> 58;
641    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
642    out[7] += ((limb) (in[5] >> 64)) >> 52;
643
644    out[7] += ((limb) in[6]) >> 58;
645    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
646    out[8] += ((limb) (in[6] >> 64)) >> 52;
647
648    out[8] += ((limb) in[7]) >> 58;
649    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
650    /*-
651     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
652     *            < 2^59 + 2^13
653     */
654    overflow1 = ((limb) (in[7] >> 64)) >> 52;
655
656    overflow1 += ((limb) in[8]) >> 58;
657    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
658    overflow2 = ((limb) (in[8] >> 64)) >> 52;
659
660    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
661    overflow2 <<= 1;            /* overflow2 < 2^13 */
662
663    out[0] += overflow1;        /* out[0] < 2^60 */
664    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
665
666    out[1] += out[0] >> 58;
667    out[0] &= bottom58bits;
668    /*-
669     * out[0] < 2^58
670     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
671     *        < 2^59 + 2^14
672     */
673}
674
675static void felem_square_reduce(felem out, const felem in)
676{
677    largefelem tmp;
678    felem_square(tmp, in);
679    felem_reduce(out, tmp);
680}
681
682static void felem_mul_reduce(felem out, const felem in1, const felem in2)
683{
684    largefelem tmp;
685    felem_mul(tmp, in1, in2);
686    felem_reduce(out, tmp);
687}
688
689/*-
690 * felem_inv calculates |out| = |in|^{-1}
691 *
692 * Based on Fermat's Little Theorem:
693 *   a^p = a (mod p)
694 *   a^{p-1} = 1 (mod p)
695 *   a^{p-2} = a^{-1} (mod p)
696 */
697static void felem_inv(felem out, const felem in)
698{
699    felem ftmp, ftmp2, ftmp3, ftmp4;
700    largefelem tmp;
701    unsigned i;
702
703    felem_square(tmp, in);
704    felem_reduce(ftmp, tmp);    /* 2^1 */
705    felem_mul(tmp, in, ftmp);
706    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
707    felem_assign(ftmp2, ftmp);
708    felem_square(tmp, ftmp);
709    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
710    felem_mul(tmp, in, ftmp);
711    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
712    felem_square(tmp, ftmp);
713    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
714
715    felem_square(tmp, ftmp2);
716    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
717    felem_square(tmp, ftmp3);
718    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
719    felem_mul(tmp, ftmp3, ftmp2);
720    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
721
722    felem_assign(ftmp2, ftmp3);
723    felem_square(tmp, ftmp3);
724    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
725    felem_square(tmp, ftmp3);
726    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
727    felem_square(tmp, ftmp3);
728    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
729    felem_square(tmp, ftmp3);
730    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
731    felem_assign(ftmp4, ftmp3);
732    felem_mul(tmp, ftmp3, ftmp);
733    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
734    felem_square(tmp, ftmp4);
735    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
736    felem_mul(tmp, ftmp3, ftmp2);
737    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
738    felem_assign(ftmp2, ftmp3);
739
740    for (i = 0; i < 8; i++) {
741        felem_square(tmp, ftmp3);
742        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
743    }
744    felem_mul(tmp, ftmp3, ftmp2);
745    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
746    felem_assign(ftmp2, ftmp3);
747
748    for (i = 0; i < 16; i++) {
749        felem_square(tmp, ftmp3);
750        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
751    }
752    felem_mul(tmp, ftmp3, ftmp2);
753    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
754    felem_assign(ftmp2, ftmp3);
755
756    for (i = 0; i < 32; i++) {
757        felem_square(tmp, ftmp3);
758        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
759    }
760    felem_mul(tmp, ftmp3, ftmp2);
761    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
762    felem_assign(ftmp2, ftmp3);
763
764    for (i = 0; i < 64; i++) {
765        felem_square(tmp, ftmp3);
766        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
767    }
768    felem_mul(tmp, ftmp3, ftmp2);
769    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
770    felem_assign(ftmp2, ftmp3);
771
772    for (i = 0; i < 128; i++) {
773        felem_square(tmp, ftmp3);
774        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
775    }
776    felem_mul(tmp, ftmp3, ftmp2);
777    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
778    felem_assign(ftmp2, ftmp3);
779
780    for (i = 0; i < 256; i++) {
781        felem_square(tmp, ftmp3);
782        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
783    }
784    felem_mul(tmp, ftmp3, ftmp2);
785    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
786
787    for (i = 0; i < 9; i++) {
788        felem_square(tmp, ftmp3);
789        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
790    }
791    felem_mul(tmp, ftmp3, ftmp4);
792    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
793    felem_mul(tmp, ftmp3, in);
794    felem_reduce(out, tmp);     /* 2^512 - 3 */
795}
796
797/* This is 2^521-1, expressed as an felem */
798static const felem kPrime = {
799    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
800    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
801    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
802};
803
804/*-
805 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
806 * otherwise.
807 * On entry:
808 *   in[i] < 2^59 + 2^14
809 */
810static limb felem_is_zero(const felem in)
811{
812    felem ftmp;
813    limb is_zero, is_p;
814    felem_assign(ftmp, in);
815
816    ftmp[0] += ftmp[8] >> 57;
817    ftmp[8] &= bottom57bits;
818    /* ftmp[8] < 2^57 */
819    ftmp[1] += ftmp[0] >> 58;
820    ftmp[0] &= bottom58bits;
821    ftmp[2] += ftmp[1] >> 58;
822    ftmp[1] &= bottom58bits;
823    ftmp[3] += ftmp[2] >> 58;
824    ftmp[2] &= bottom58bits;
825    ftmp[4] += ftmp[3] >> 58;
826    ftmp[3] &= bottom58bits;
827    ftmp[5] += ftmp[4] >> 58;
828    ftmp[4] &= bottom58bits;
829    ftmp[6] += ftmp[5] >> 58;
830    ftmp[5] &= bottom58bits;
831    ftmp[7] += ftmp[6] >> 58;
832    ftmp[6] &= bottom58bits;
833    ftmp[8] += ftmp[7] >> 58;
834    ftmp[7] &= bottom58bits;
835    /* ftmp[8] < 2^57 + 4 */
836
837    /*
838     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
839     * than our bound for ftmp[8]. Therefore we only have to check if the
840     * zero is zero or 2^521-1.
841     */
842
843    is_zero = 0;
844    is_zero |= ftmp[0];
845    is_zero |= ftmp[1];
846    is_zero |= ftmp[2];
847    is_zero |= ftmp[3];
848    is_zero |= ftmp[4];
849    is_zero |= ftmp[5];
850    is_zero |= ftmp[6];
851    is_zero |= ftmp[7];
852    is_zero |= ftmp[8];
853
854    is_zero--;
855    /*
856     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
857     * can be set is if is_zero was 0 before the decrement.
858     */
859    is_zero = 0 - (is_zero >> 63);
860
861    is_p = ftmp[0] ^ kPrime[0];
862    is_p |= ftmp[1] ^ kPrime[1];
863    is_p |= ftmp[2] ^ kPrime[2];
864    is_p |= ftmp[3] ^ kPrime[3];
865    is_p |= ftmp[4] ^ kPrime[4];
866    is_p |= ftmp[5] ^ kPrime[5];
867    is_p |= ftmp[6] ^ kPrime[6];
868    is_p |= ftmp[7] ^ kPrime[7];
869    is_p |= ftmp[8] ^ kPrime[8];
870
871    is_p--;
872    is_p = 0 - (is_p >> 63);
873
874    is_zero |= is_p;
875    return is_zero;
876}
877
878static int felem_is_zero_int(const void *in)
879{
880    return (int)(felem_is_zero(in) & ((limb) 1));
881}
882
883/*-
884 * felem_contract converts |in| to its unique, minimal representation.
885 * On entry:
886 *   in[i] < 2^59 + 2^14
887 */
888static void felem_contract(felem out, const felem in)
889{
890    limb is_p, is_greater, sign;
891    static const limb two58 = ((limb) 1) << 58;
892
893    felem_assign(out, in);
894
895    out[0] += out[8] >> 57;
896    out[8] &= bottom57bits;
897    /* out[8] < 2^57 */
898    out[1] += out[0] >> 58;
899    out[0] &= bottom58bits;
900    out[2] += out[1] >> 58;
901    out[1] &= bottom58bits;
902    out[3] += out[2] >> 58;
903    out[2] &= bottom58bits;
904    out[4] += out[3] >> 58;
905    out[3] &= bottom58bits;
906    out[5] += out[4] >> 58;
907    out[4] &= bottom58bits;
908    out[6] += out[5] >> 58;
909    out[5] &= bottom58bits;
910    out[7] += out[6] >> 58;
911    out[6] &= bottom58bits;
912    out[8] += out[7] >> 58;
913    out[7] &= bottom58bits;
914    /* out[8] < 2^57 + 4 */
915
916    /*
917     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
918     * out. See the comments in felem_is_zero regarding why we don't test for
919     * other multiples of the prime.
920     */
921
922    /*
923     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
924     */
925
926    is_p = out[0] ^ kPrime[0];
927    is_p |= out[1] ^ kPrime[1];
928    is_p |= out[2] ^ kPrime[2];
929    is_p |= out[3] ^ kPrime[3];
930    is_p |= out[4] ^ kPrime[4];
931    is_p |= out[5] ^ kPrime[5];
932    is_p |= out[6] ^ kPrime[6];
933    is_p |= out[7] ^ kPrime[7];
934    is_p |= out[8] ^ kPrime[8];
935
936    is_p--;
937    is_p &= is_p << 32;
938    is_p &= is_p << 16;
939    is_p &= is_p << 8;
940    is_p &= is_p << 4;
941    is_p &= is_p << 2;
942    is_p &= is_p << 1;
943    is_p = 0 - (is_p >> 63);
944    is_p = ~is_p;
945
946    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
947
948    out[0] &= is_p;
949    out[1] &= is_p;
950    out[2] &= is_p;
951    out[3] &= is_p;
952    out[4] &= is_p;
953    out[5] &= is_p;
954    out[6] &= is_p;
955    out[7] &= is_p;
956    out[8] &= is_p;
957
958    /*
959     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
960     * 57 is greater than zero as (2^521-1) + x >= 2^522
961     */
962    is_greater = out[8] >> 57;
963    is_greater |= is_greater << 32;
964    is_greater |= is_greater << 16;
965    is_greater |= is_greater << 8;
966    is_greater |= is_greater << 4;
967    is_greater |= is_greater << 2;
968    is_greater |= is_greater << 1;
969    is_greater = 0 - (is_greater >> 63);
970
971    out[0] -= kPrime[0] & is_greater;
972    out[1] -= kPrime[1] & is_greater;
973    out[2] -= kPrime[2] & is_greater;
974    out[3] -= kPrime[3] & is_greater;
975    out[4] -= kPrime[4] & is_greater;
976    out[5] -= kPrime[5] & is_greater;
977    out[6] -= kPrime[6] & is_greater;
978    out[7] -= kPrime[7] & is_greater;
979    out[8] -= kPrime[8] & is_greater;
980
981    /* Eliminate negative coefficients */
982    sign = -(out[0] >> 63);
983    out[0] += (two58 & sign);
984    out[1] -= (1 & sign);
985    sign = -(out[1] >> 63);
986    out[1] += (two58 & sign);
987    out[2] -= (1 & sign);
988    sign = -(out[2] >> 63);
989    out[2] += (two58 & sign);
990    out[3] -= (1 & sign);
991    sign = -(out[3] >> 63);
992    out[3] += (two58 & sign);
993    out[4] -= (1 & sign);
994    sign = -(out[4] >> 63);
995    out[4] += (two58 & sign);
996    out[5] -= (1 & sign);
997    sign = -(out[0] >> 63);
998    out[5] += (two58 & sign);
999    out[6] -= (1 & sign);
1000    sign = -(out[6] >> 63);
1001    out[6] += (two58 & sign);
1002    out[7] -= (1 & sign);
1003    sign = -(out[7] >> 63);
1004    out[7] += (two58 & sign);
1005    out[8] -= (1 & sign);
1006    sign = -(out[5] >> 63);
1007    out[5] += (two58 & sign);
1008    out[6] -= (1 & sign);
1009    sign = -(out[6] >> 63);
1010    out[6] += (two58 & sign);
1011    out[7] -= (1 & sign);
1012    sign = -(out[7] >> 63);
1013    out[7] += (two58 & sign);
1014    out[8] -= (1 & sign);
1015}
1016
1017/*-
1018 * Group operations
1019 * ----------------
1020 *
1021 * Building on top of the field operations we have the operations on the
1022 * elliptic curve group itself. Points on the curve are represented in Jacobian
1023 * coordinates */
1024
1025/*-
1026 * point_double calcuates 2*(x_in, y_in, z_in)
1027 *
1028 * The method is taken from:
1029 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1030 *
1031 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1032 * while x_out == y_in is not (maybe this works, but it's not tested). */
1033static void
1034point_double(felem x_out, felem y_out, felem z_out,
1035             const felem x_in, const felem y_in, const felem z_in)
1036{
1037    largefelem tmp, tmp2;
1038    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1039
1040    felem_assign(ftmp, x_in);
1041    felem_assign(ftmp2, x_in);
1042
1043    /* delta = z^2 */
1044    felem_square(tmp, z_in);
1045    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1046
1047    /* gamma = y^2 */
1048    felem_square(tmp, y_in);
1049    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1050
1051    /* beta = x*gamma */
1052    felem_mul(tmp, x_in, gamma);
1053    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1054
1055    /* alpha = 3*(x-delta)*(x+delta) */
1056    felem_diff64(ftmp, delta);
1057    /* ftmp[i] < 2^61 */
1058    felem_sum64(ftmp2, delta);
1059    /* ftmp2[i] < 2^60 + 2^15 */
1060    felem_scalar64(ftmp2, 3);
1061    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1062    felem_mul(tmp, ftmp, ftmp2);
1063    /*-
1064     * tmp[i] < 17(3*2^121 + 3*2^76)
1065     *        = 61*2^121 + 61*2^76
1066     *        < 64*2^121 + 64*2^76
1067     *        = 2^127 + 2^82
1068     *        < 2^128
1069     */
1070    felem_reduce(alpha, tmp);
1071
1072    /* x' = alpha^2 - 8*beta */
1073    felem_square(tmp, alpha);
1074    /*
1075     * tmp[i] < 17*2^120 < 2^125
1076     */
1077    felem_assign(ftmp, beta);
1078    felem_scalar64(ftmp, 8);
1079    /* ftmp[i] < 2^62 + 2^17 */
1080    felem_diff_128_64(tmp, ftmp);
1081    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1082    felem_reduce(x_out, tmp);
1083
1084    /* z' = (y + z)^2 - gamma - delta */
1085    felem_sum64(delta, gamma);
1086    /* delta[i] < 2^60 + 2^15 */
1087    felem_assign(ftmp, y_in);
1088    felem_sum64(ftmp, z_in);
1089    /* ftmp[i] < 2^60 + 2^15 */
1090    felem_square(tmp, ftmp);
1091    /*
1092     * tmp[i] < 17(2^122) < 2^127
1093     */
1094    felem_diff_128_64(tmp, delta);
1095    /* tmp[i] < 2^127 + 2^63 */
1096    felem_reduce(z_out, tmp);
1097
1098    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1099    felem_scalar64(beta, 4);
1100    /* beta[i] < 2^61 + 2^16 */
1101    felem_diff64(beta, x_out);
1102    /* beta[i] < 2^61 + 2^60 + 2^16 */
1103    felem_mul(tmp, alpha, beta);
1104    /*-
1105     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1106     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1107     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1108     *        < 2^128
1109     */
1110    felem_square(tmp2, gamma);
1111    /*-
1112     * tmp2[i] < 17*(2^59 + 2^14)^2
1113     *         = 17*(2^118 + 2^74 + 2^28)
1114     */
1115    felem_scalar128(tmp2, 8);
1116    /*-
1117     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1118     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1119     *         < 2^126
1120     */
1121    felem_diff128(tmp, tmp2);
1122    /*-
1123     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1124     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1125     *          2^74 + 2^69 + 2^34 + 2^30
1126     *        < 2^128
1127     */
1128    felem_reduce(y_out, tmp);
1129}
1130
1131/* copy_conditional copies in to out iff mask is all ones. */
1132static void copy_conditional(felem out, const felem in, limb mask)
1133{
1134    unsigned i;
1135    for (i = 0; i < NLIMBS; ++i) {
1136        const limb tmp = mask & (in[i] ^ out[i]);
1137        out[i] ^= tmp;
1138    }
1139}
1140
1141/*-
1142 * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1143 *
1144 * The method is taken from
1145 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1146 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1147 *
1148 * This function includes a branch for checking whether the two input points
1149 * are equal (while not equal to the point at infinity). This case never
1150 * happens during single point multiplication, so there is no timing leak for
1151 * ECDH or ECDSA signing. */
1152static void point_add(felem x3, felem y3, felem z3,
1153                      const felem x1, const felem y1, const felem z1,
1154                      const int mixed, const felem x2, const felem y2,
1155                      const felem z2)
1156{
1157    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1158    largefelem tmp, tmp2;
1159    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1160
1161    z1_is_zero = felem_is_zero(z1);
1162    z2_is_zero = felem_is_zero(z2);
1163
1164    /* ftmp = z1z1 = z1**2 */
1165    felem_square(tmp, z1);
1166    felem_reduce(ftmp, tmp);
1167
1168    if (!mixed) {
1169        /* ftmp2 = z2z2 = z2**2 */
1170        felem_square(tmp, z2);
1171        felem_reduce(ftmp2, tmp);
1172
1173        /* u1 = ftmp3 = x1*z2z2 */
1174        felem_mul(tmp, x1, ftmp2);
1175        felem_reduce(ftmp3, tmp);
1176
1177        /* ftmp5 = z1 + z2 */
1178        felem_assign(ftmp5, z1);
1179        felem_sum64(ftmp5, z2);
1180        /* ftmp5[i] < 2^61 */
1181
1182        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1183        felem_square(tmp, ftmp5);
1184        /* tmp[i] < 17*2^122 */
1185        felem_diff_128_64(tmp, ftmp);
1186        /* tmp[i] < 17*2^122 + 2^63 */
1187        felem_diff_128_64(tmp, ftmp2);
1188        /* tmp[i] < 17*2^122 + 2^64 */
1189        felem_reduce(ftmp5, tmp);
1190
1191        /* ftmp2 = z2 * z2z2 */
1192        felem_mul(tmp, ftmp2, z2);
1193        felem_reduce(ftmp2, tmp);
1194
1195        /* s1 = ftmp6 = y1 * z2**3 */
1196        felem_mul(tmp, y1, ftmp2);
1197        felem_reduce(ftmp6, tmp);
1198    } else {
1199        /*
1200         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1201         */
1202
1203        /* u1 = ftmp3 = x1*z2z2 */
1204        felem_assign(ftmp3, x1);
1205
1206        /* ftmp5 = 2*z1z2 */
1207        felem_scalar(ftmp5, z1, 2);
1208
1209        /* s1 = ftmp6 = y1 * z2**3 */
1210        felem_assign(ftmp6, y1);
1211    }
1212
1213    /* u2 = x2*z1z1 */
1214    felem_mul(tmp, x2, ftmp);
1215    /* tmp[i] < 17*2^120 */
1216
1217    /* h = ftmp4 = u2 - u1 */
1218    felem_diff_128_64(tmp, ftmp3);
1219    /* tmp[i] < 17*2^120 + 2^63 */
1220    felem_reduce(ftmp4, tmp);
1221
1222    x_equal = felem_is_zero(ftmp4);
1223
1224    /* z_out = ftmp5 * h */
1225    felem_mul(tmp, ftmp5, ftmp4);
1226    felem_reduce(z_out, tmp);
1227
1228    /* ftmp = z1 * z1z1 */
1229    felem_mul(tmp, ftmp, z1);
1230    felem_reduce(ftmp, tmp);
1231
1232    /* s2 = tmp = y2 * z1**3 */
1233    felem_mul(tmp, y2, ftmp);
1234    /* tmp[i] < 17*2^120 */
1235
1236    /* r = ftmp5 = (s2 - s1)*2 */
1237    felem_diff_128_64(tmp, ftmp6);
1238    /* tmp[i] < 17*2^120 + 2^63 */
1239    felem_reduce(ftmp5, tmp);
1240    y_equal = felem_is_zero(ftmp5);
1241    felem_scalar64(ftmp5, 2);
1242    /* ftmp5[i] < 2^61 */
1243
1244    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1245        point_double(x3, y3, z3, x1, y1, z1);
1246        return;
1247    }
1248
1249    /* I = ftmp = (2h)**2 */
1250    felem_assign(ftmp, ftmp4);
1251    felem_scalar64(ftmp, 2);
1252    /* ftmp[i] < 2^61 */
1253    felem_square(tmp, ftmp);
1254    /* tmp[i] < 17*2^122 */
1255    felem_reduce(ftmp, tmp);
1256
1257    /* J = ftmp2 = h * I */
1258    felem_mul(tmp, ftmp4, ftmp);
1259    felem_reduce(ftmp2, tmp);
1260
1261    /* V = ftmp4 = U1 * I */
1262    felem_mul(tmp, ftmp3, ftmp);
1263    felem_reduce(ftmp4, tmp);
1264
1265    /* x_out = r**2 - J - 2V */
1266    felem_square(tmp, ftmp5);
1267    /* tmp[i] < 17*2^122 */
1268    felem_diff_128_64(tmp, ftmp2);
1269    /* tmp[i] < 17*2^122 + 2^63 */
1270    felem_assign(ftmp3, ftmp4);
1271    felem_scalar64(ftmp4, 2);
1272    /* ftmp4[i] < 2^61 */
1273    felem_diff_128_64(tmp, ftmp4);
1274    /* tmp[i] < 17*2^122 + 2^64 */
1275    felem_reduce(x_out, tmp);
1276
1277    /* y_out = r(V-x_out) - 2 * s1 * J */
1278    felem_diff64(ftmp3, x_out);
1279    /*
1280     * ftmp3[i] < 2^60 + 2^60 = 2^61
1281     */
1282    felem_mul(tmp, ftmp5, ftmp3);
1283    /* tmp[i] < 17*2^122 */
1284    felem_mul(tmp2, ftmp6, ftmp2);
1285    /* tmp2[i] < 17*2^120 */
1286    felem_scalar128(tmp2, 2);
1287    /* tmp2[i] < 17*2^121 */
1288    felem_diff128(tmp, tmp2);
1289        /*-
1290         * tmp[i] < 2^127 - 2^69 + 17*2^122
1291         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1292         *        < 2^127
1293         */
1294    felem_reduce(y_out, tmp);
1295
1296    copy_conditional(x_out, x2, z1_is_zero);
1297    copy_conditional(x_out, x1, z2_is_zero);
1298    copy_conditional(y_out, y2, z1_is_zero);
1299    copy_conditional(y_out, y1, z2_is_zero);
1300    copy_conditional(z_out, z2, z1_is_zero);
1301    copy_conditional(z_out, z1, z2_is_zero);
1302    felem_assign(x3, x_out);
1303    felem_assign(y3, y_out);
1304    felem_assign(z3, z_out);
1305}
1306
1307/*-
1308 * Base point pre computation
1309 * --------------------------
1310 *
1311 * Two different sorts of precomputed tables are used in the following code.
1312 * Each contain various points on the curve, where each point is three field
1313 * elements (x, y, z).
1314 *
1315 * For the base point table, z is usually 1 (0 for the point at infinity).
1316 * This table has 16 elements:
1317 * index | bits    | point
1318 * ------+---------+------------------------------
1319 *     0 | 0 0 0 0 | 0G
1320 *     1 | 0 0 0 1 | 1G
1321 *     2 | 0 0 1 0 | 2^130G
1322 *     3 | 0 0 1 1 | (2^130 + 1)G
1323 *     4 | 0 1 0 0 | 2^260G
1324 *     5 | 0 1 0 1 | (2^260 + 1)G
1325 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1326 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1327 *     8 | 1 0 0 0 | 2^390G
1328 *     9 | 1 0 0 1 | (2^390 + 1)G
1329 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1330 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1331 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1332 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1333 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1334 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1335 *
1336 * The reason for this is so that we can clock bits into four different
1337 * locations when doing simple scalar multiplies against the base point.
1338 *
1339 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1340
1341/* gmul is the table of precomputed base points */
1342static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1343                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
1344                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1345{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1346  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1347  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1348 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1349  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1350  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1351 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1352{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1353  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1354  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1355 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1356  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1357  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1358 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1359{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1360  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1361  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1362 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1363  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1364  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1365 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1366{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1367  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1368  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1369 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1370  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1371  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1372 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1373{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1374  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1375  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1376 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1377  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1378  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1379 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1380{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1381  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1382  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1383 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1384  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1385  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1386 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1387{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1388  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1389  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1390 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1391  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1392  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1393 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1394{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1395  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1396  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1397 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1398  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1399  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1400 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1401{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1402  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1403  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1404 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1405  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1406  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1407 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1408{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1409  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1410  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1411 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1412  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1413  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1414 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1415{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1416  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1417  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1418 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1419  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1420  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1421 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1422{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1423  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1424  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1425 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1426  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1427  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1428 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1429{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1430  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1431  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1432 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1433  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1434  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1435 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1436{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1437  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1438  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1439 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1440  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1441  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1442 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1443{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1444  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1445  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1446 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1447  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1448  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1449 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1450};
1451
1452/*
1453 * select_point selects the |idx|th point from a precomputation table and
1454 * copies it to out.
1455 */
1456 /* pre_comp below is of the size provided in |size| */
1457static void select_point(const limb idx, unsigned int size,
1458                         const felem pre_comp[][3], felem out[3])
1459{
1460    unsigned i, j;
1461    limb *outlimbs = &out[0][0];
1462    memset(outlimbs, 0, 3 * sizeof(felem));
1463
1464    for (i = 0; i < size; i++) {
1465        const limb *inlimbs = &pre_comp[i][0][0];
1466        limb mask = i ^ idx;
1467        mask |= mask >> 4;
1468        mask |= mask >> 2;
1469        mask |= mask >> 1;
1470        mask &= 1;
1471        mask--;
1472        for (j = 0; j < NLIMBS * 3; j++)
1473            outlimbs[j] |= inlimbs[j] & mask;
1474    }
1475}
1476
1477/* get_bit returns the |i|th bit in |in| */
1478static char get_bit(const felem_bytearray in, int i)
1479{
1480    if (i < 0)
1481        return 0;
1482    return (in[i >> 3] >> (i & 7)) & 1;
1483}
1484
1485/*
1486 * Interleaved point multiplication using precomputed point multiples: The
1487 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1488 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1489 * generator, using certain (large) precomputed multiples in g_pre_comp.
1490 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1491 */
1492static void batch_mul(felem x_out, felem y_out, felem z_out,
1493                      const felem_bytearray scalars[],
1494                      const unsigned num_points, const u8 *g_scalar,
1495                      const int mixed, const felem pre_comp[][17][3],
1496                      const felem g_pre_comp[16][3])
1497{
1498    int i, skip;
1499    unsigned num, gen_mul = (g_scalar != NULL);
1500    felem nq[3], tmp[4];
1501    limb bits;
1502    u8 sign, digit;
1503
1504    /* set nq to the point at infinity */
1505    memset(nq, 0, 3 * sizeof(felem));
1506
1507    /*
1508     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1509     * of the generator (last quarter of rounds) and additions of other
1510     * points multiples (every 5th round).
1511     */
1512    skip = 1;                   /* save two point operations in the first
1513                                 * round */
1514    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1515        /* double */
1516        if (!skip)
1517            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1518
1519        /* add multiples of the generator */
1520        if (gen_mul && (i <= 130)) {
1521            bits = get_bit(g_scalar, i + 390) << 3;
1522            if (i < 130) {
1523                bits |= get_bit(g_scalar, i + 260) << 2;
1524                bits |= get_bit(g_scalar, i + 130) << 1;
1525                bits |= get_bit(g_scalar, i);
1526            }
1527            /* select the point to add, in constant time */
1528            select_point(bits, 16, g_pre_comp, tmp);
1529            if (!skip) {
1530                /* The 1 argument below is for "mixed" */
1531                point_add(nq[0], nq[1], nq[2],
1532                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1533            } else {
1534                memcpy(nq, tmp, 3 * sizeof(felem));
1535                skip = 0;
1536            }
1537        }
1538
1539        /* do other additions every 5 doublings */
1540        if (num_points && (i % 5 == 0)) {
1541            /* loop over all scalars */
1542            for (num = 0; num < num_points; ++num) {
1543                bits = get_bit(scalars[num], i + 4) << 5;
1544                bits |= get_bit(scalars[num], i + 3) << 4;
1545                bits |= get_bit(scalars[num], i + 2) << 3;
1546                bits |= get_bit(scalars[num], i + 1) << 2;
1547                bits |= get_bit(scalars[num], i) << 1;
1548                bits |= get_bit(scalars[num], i - 1);
1549                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1550
1551                /*
1552                 * select the point to add or subtract, in constant time
1553                 */
1554                select_point(digit, 17, pre_comp[num], tmp);
1555                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1556                                            * point */
1557                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1558
1559                if (!skip) {
1560                    point_add(nq[0], nq[1], nq[2],
1561                              nq[0], nq[1], nq[2],
1562                              mixed, tmp[0], tmp[1], tmp[2]);
1563                } else {
1564                    memcpy(nq, tmp, 3 * sizeof(felem));
1565                    skip = 0;
1566                }
1567            }
1568        }
1569    }
1570    felem_assign(x_out, nq[0]);
1571    felem_assign(y_out, nq[1]);
1572    felem_assign(z_out, nq[2]);
1573}
1574
1575/* Precomputation for the group generator. */
1576typedef struct {
1577    felem g_pre_comp[16][3];
1578    int references;
1579} NISTP521_PRE_COMP;
1580
1581const EC_METHOD *EC_GFp_nistp521_method(void)
1582{
1583    static const EC_METHOD ret = {
1584        EC_FLAGS_DEFAULT_OCT,
1585        NID_X9_62_prime_field,
1586        ec_GFp_nistp521_group_init,
1587        ec_GFp_simple_group_finish,
1588        ec_GFp_simple_group_clear_finish,
1589        ec_GFp_nist_group_copy,
1590        ec_GFp_nistp521_group_set_curve,
1591        ec_GFp_simple_group_get_curve,
1592        ec_GFp_simple_group_get_degree,
1593        ec_GFp_simple_group_check_discriminant,
1594        ec_GFp_simple_point_init,
1595        ec_GFp_simple_point_finish,
1596        ec_GFp_simple_point_clear_finish,
1597        ec_GFp_simple_point_copy,
1598        ec_GFp_simple_point_set_to_infinity,
1599        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1600        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1601        ec_GFp_simple_point_set_affine_coordinates,
1602        ec_GFp_nistp521_point_get_affine_coordinates,
1603        0 /* point_set_compressed_coordinates */ ,
1604        0 /* point2oct */ ,
1605        0 /* oct2point */ ,
1606        ec_GFp_simple_add,
1607        ec_GFp_simple_dbl,
1608        ec_GFp_simple_invert,
1609        ec_GFp_simple_is_at_infinity,
1610        ec_GFp_simple_is_on_curve,
1611        ec_GFp_simple_cmp,
1612        ec_GFp_simple_make_affine,
1613        ec_GFp_simple_points_make_affine,
1614        ec_GFp_nistp521_points_mul,
1615        ec_GFp_nistp521_precompute_mult,
1616        ec_GFp_nistp521_have_precompute_mult,
1617        ec_GFp_nist_field_mul,
1618        ec_GFp_nist_field_sqr,
1619        0 /* field_div */ ,
1620        0 /* field_encode */ ,
1621        0 /* field_decode */ ,
1622        0                       /* field_set_to_one */
1623    };
1624
1625    return &ret;
1626}
1627
1628/******************************************************************************/
1629/*
1630 * FUNCTIONS TO MANAGE PRECOMPUTATION
1631 */
1632
1633static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1634{
1635    NISTP521_PRE_COMP *ret = NULL;
1636    ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1637    if (!ret) {
1638        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1639        return ret;
1640    }
1641    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1642    ret->references = 1;
1643    return ret;
1644}
1645
1646static void *nistp521_pre_comp_dup(void *src_)
1647{
1648    NISTP521_PRE_COMP *src = src_;
1649
1650    /* no need to actually copy, these objects never change! */
1651    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1652
1653    return src_;
1654}
1655
1656static void nistp521_pre_comp_free(void *pre_)
1657{
1658    int i;
1659    NISTP521_PRE_COMP *pre = pre_;
1660
1661    if (!pre)
1662        return;
1663
1664    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1665    if (i > 0)
1666        return;
1667
1668    OPENSSL_free(pre);
1669}
1670
1671static void nistp521_pre_comp_clear_free(void *pre_)
1672{
1673    int i;
1674    NISTP521_PRE_COMP *pre = pre_;
1675
1676    if (!pre)
1677        return;
1678
1679    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1680    if (i > 0)
1681        return;
1682
1683    OPENSSL_cleanse(pre, sizeof(*pre));
1684    OPENSSL_free(pre);
1685}
1686
1687/******************************************************************************/
1688/*
1689 * OPENSSL EC_METHOD FUNCTIONS
1690 */
1691
1692int ec_GFp_nistp521_group_init(EC_GROUP *group)
1693{
1694    int ret;
1695    ret = ec_GFp_simple_group_init(group);
1696    group->a_is_minus3 = 1;
1697    return ret;
1698}
1699
1700int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1701                                    const BIGNUM *a, const BIGNUM *b,
1702                                    BN_CTX *ctx)
1703{
1704    int ret = 0;
1705    BN_CTX *new_ctx = NULL;
1706    BIGNUM *curve_p, *curve_a, *curve_b;
1707
1708    if (ctx == NULL)
1709        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1710            return 0;
1711    BN_CTX_start(ctx);
1712    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1713        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1714        ((curve_b = BN_CTX_get(ctx)) == NULL))
1715        goto err;
1716    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1717    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1718    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1719    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1720        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1721              EC_R_WRONG_CURVE_PARAMETERS);
1722        goto err;
1723    }
1724    group->field_mod_func = BN_nist_mod_521;
1725    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1726 err:
1727    BN_CTX_end(ctx);
1728    if (new_ctx != NULL)
1729        BN_CTX_free(new_ctx);
1730    return ret;
1731}
1732
1733/*
1734 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1735 * (X/Z^2, Y/Z^3)
1736 */
1737int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1738                                                 const EC_POINT *point,
1739                                                 BIGNUM *x, BIGNUM *y,
1740                                                 BN_CTX *ctx)
1741{
1742    felem z1, z2, x_in, y_in, x_out, y_out;
1743    largefelem tmp;
1744
1745    if (EC_POINT_is_at_infinity(group, point)) {
1746        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1747              EC_R_POINT_AT_INFINITY);
1748        return 0;
1749    }
1750    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1751        (!BN_to_felem(z1, &point->Z)))
1752        return 0;
1753    felem_inv(z2, z1);
1754    felem_square(tmp, z2);
1755    felem_reduce(z1, tmp);
1756    felem_mul(tmp, x_in, z1);
1757    felem_reduce(x_in, tmp);
1758    felem_contract(x_out, x_in);
1759    if (x != NULL) {
1760        if (!felem_to_BN(x, x_out)) {
1761            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1762                  ERR_R_BN_LIB);
1763            return 0;
1764        }
1765    }
1766    felem_mul(tmp, z1, z2);
1767    felem_reduce(z1, tmp);
1768    felem_mul(tmp, y_in, z1);
1769    felem_reduce(y_in, tmp);
1770    felem_contract(y_out, y_in);
1771    if (y != NULL) {
1772        if (!felem_to_BN(y, y_out)) {
1773            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1774                  ERR_R_BN_LIB);
1775            return 0;
1776        }
1777    }
1778    return 1;
1779}
1780
1781/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1782static void make_points_affine(size_t num, felem points[][3],
1783                               felem tmp_felems[])
1784{
1785    /*
1786     * Runs in constant time, unless an input is the point at infinity (which
1787     * normally shouldn't happen).
1788     */
1789    ec_GFp_nistp_points_make_affine_internal(num,
1790                                             points,
1791                                             sizeof(felem),
1792                                             tmp_felems,
1793                                             (void (*)(void *))felem_one,
1794                                             felem_is_zero_int,
1795                                             (void (*)(void *, const void *))
1796                                             felem_assign,
1797                                             (void (*)(void *, const void *))
1798                                             felem_square_reduce, (void (*)
1799                                                                   (void *,
1800                                                                    const void
1801                                                                    *,
1802                                                                    const void
1803                                                                    *))
1804                                             felem_mul_reduce,
1805                                             (void (*)(void *, const void *))
1806                                             felem_inv,
1807                                             (void (*)(void *, const void *))
1808                                             felem_contract);
1809}
1810
1811/*
1812 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1813 * values Result is stored in r (r can equal one of the inputs).
1814 */
1815int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1816                               const BIGNUM *scalar, size_t num,
1817                               const EC_POINT *points[],
1818                               const BIGNUM *scalars[], BN_CTX *ctx)
1819{
1820    int ret = 0;
1821    int j;
1822    int mixed = 0;
1823    BN_CTX *new_ctx = NULL;
1824    BIGNUM *x, *y, *z, *tmp_scalar;
1825    felem_bytearray g_secret;
1826    felem_bytearray *secrets = NULL;
1827    felem(*pre_comp)[17][3] = NULL;
1828    felem *tmp_felems = NULL;
1829    felem_bytearray tmp;
1830    unsigned i, num_bytes;
1831    int have_pre_comp = 0;
1832    size_t num_points = num;
1833    felem x_in, y_in, z_in, x_out, y_out, z_out;
1834    NISTP521_PRE_COMP *pre = NULL;
1835    felem(*g_pre_comp)[3] = NULL;
1836    EC_POINT *generator = NULL;
1837    const EC_POINT *p = NULL;
1838    const BIGNUM *p_scalar = NULL;
1839
1840    if (ctx == NULL)
1841        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1842            return 0;
1843    BN_CTX_start(ctx);
1844    if (((x = BN_CTX_get(ctx)) == NULL) ||
1845        ((y = BN_CTX_get(ctx)) == NULL) ||
1846        ((z = BN_CTX_get(ctx)) == NULL) ||
1847        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1848        goto err;
1849
1850    if (scalar != NULL) {
1851        pre = EC_EX_DATA_get_data(group->extra_data,
1852                                  nistp521_pre_comp_dup,
1853                                  nistp521_pre_comp_free,
1854                                  nistp521_pre_comp_clear_free);
1855        if (pre)
1856            /* we have precomputation, try to use it */
1857            g_pre_comp = &pre->g_pre_comp[0];
1858        else
1859            /* try to use the standard precomputation */
1860            g_pre_comp = (felem(*)[3]) gmul;
1861        generator = EC_POINT_new(group);
1862        if (generator == NULL)
1863            goto err;
1864        /* get the generator from precomputation */
1865        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1866            !felem_to_BN(y, g_pre_comp[1][1]) ||
1867            !felem_to_BN(z, g_pre_comp[1][2])) {
1868            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1869            goto err;
1870        }
1871        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1872                                                      generator, x, y, z,
1873                                                      ctx))
1874            goto err;
1875        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1876            /* precomputation matches generator */
1877            have_pre_comp = 1;
1878        else
1879            /*
1880             * we don't have valid precomputation: treat the generator as a
1881             * random point
1882             */
1883            num_points++;
1884    }
1885
1886    if (num_points > 0) {
1887        if (num_points >= 2) {
1888            /*
1889             * unless we precompute multiples for just one point, converting
1890             * those into affine form is time well spent
1891             */
1892            mixed = 1;
1893        }
1894        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1895        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1896        if (mixed)
1897            tmp_felems =
1898                OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1899        if ((secrets == NULL) || (pre_comp == NULL)
1900            || (mixed && (tmp_felems == NULL))) {
1901            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1902            goto err;
1903        }
1904
1905        /*
1906         * we treat NULL scalars as 0, and NULL points as points at infinity,
1907         * i.e., they contribute nothing to the linear combination
1908         */
1909        memset(secrets, 0, num_points * sizeof(felem_bytearray));
1910        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1911        for (i = 0; i < num_points; ++i) {
1912            if (i == num)
1913                /*
1914                 * we didn't have a valid precomputation, so we pick the
1915                 * generator
1916                 */
1917            {
1918                p = EC_GROUP_get0_generator(group);
1919                p_scalar = scalar;
1920            } else
1921                /* the i^th point */
1922            {
1923                p = points[i];
1924                p_scalar = scalars[i];
1925            }
1926            if ((p_scalar != NULL) && (p != NULL)) {
1927                /* reduce scalar to 0 <= scalar < 2^521 */
1928                if ((BN_num_bits(p_scalar) > 521)
1929                    || (BN_is_negative(p_scalar))) {
1930                    /*
1931                     * this is an unusual input, and we don't guarantee
1932                     * constant-timeness
1933                     */
1934                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1935                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1936                        goto err;
1937                    }
1938                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
1939                } else
1940                    num_bytes = BN_bn2bin(p_scalar, tmp);
1941                flip_endian(secrets[i], tmp, num_bytes);
1942                /* precompute multiples */
1943                if ((!BN_to_felem(x_out, &p->X)) ||
1944                    (!BN_to_felem(y_out, &p->Y)) ||
1945                    (!BN_to_felem(z_out, &p->Z)))
1946                    goto err;
1947                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1948                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1949                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1950                for (j = 2; j <= 16; ++j) {
1951                    if (j & 1) {
1952                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1953                                  pre_comp[i][j][2], pre_comp[i][1][0],
1954                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1955                                  pre_comp[i][j - 1][0],
1956                                  pre_comp[i][j - 1][1],
1957                                  pre_comp[i][j - 1][2]);
1958                    } else {
1959                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1960                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
1961                                     pre_comp[i][j / 2][1],
1962                                     pre_comp[i][j / 2][2]);
1963                    }
1964                }
1965            }
1966        }
1967        if (mixed)
1968            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1969    }
1970
1971    /* the scalar for the generator */
1972    if ((scalar != NULL) && (have_pre_comp)) {
1973        memset(g_secret, 0, sizeof(g_secret));
1974        /* reduce scalar to 0 <= scalar < 2^521 */
1975        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1976            /*
1977             * this is an unusual input, and we don't guarantee
1978             * constant-timeness
1979             */
1980            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1981                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1982                goto err;
1983            }
1984            num_bytes = BN_bn2bin(tmp_scalar, tmp);
1985        } else
1986            num_bytes = BN_bn2bin(scalar, tmp);
1987        flip_endian(g_secret, tmp, num_bytes);
1988        /* do the multiplication with generator precomputation */
1989        batch_mul(x_out, y_out, z_out,
1990                  (const felem_bytearray(*))secrets, num_points,
1991                  g_secret,
1992                  mixed, (const felem(*)[17][3])pre_comp,
1993                  (const felem(*)[3])g_pre_comp);
1994    } else
1995        /* do the multiplication without generator precomputation */
1996        batch_mul(x_out, y_out, z_out,
1997                  (const felem_bytearray(*))secrets, num_points,
1998                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1999    /* reduce the output to its unique minimal representation */
2000    felem_contract(x_in, x_out);
2001    felem_contract(y_in, y_out);
2002    felem_contract(z_in, z_out);
2003    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2004        (!felem_to_BN(z, z_in))) {
2005        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2006        goto err;
2007    }
2008    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2009
2010 err:
2011    BN_CTX_end(ctx);
2012    if (generator != NULL)
2013        EC_POINT_free(generator);
2014    if (new_ctx != NULL)
2015        BN_CTX_free(new_ctx);
2016    if (secrets != NULL)
2017        OPENSSL_free(secrets);
2018    if (pre_comp != NULL)
2019        OPENSSL_free(pre_comp);
2020    if (tmp_felems != NULL)
2021        OPENSSL_free(tmp_felems);
2022    return ret;
2023}
2024
2025int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2026{
2027    int ret = 0;
2028    NISTP521_PRE_COMP *pre = NULL;
2029    int i, j;
2030    BN_CTX *new_ctx = NULL;
2031    BIGNUM *x, *y;
2032    EC_POINT *generator = NULL;
2033    felem tmp_felems[16];
2034
2035    /* throw away old precomputation */
2036    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2037                         nistp521_pre_comp_free,
2038                         nistp521_pre_comp_clear_free);
2039    if (ctx == NULL)
2040        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2041            return 0;
2042    BN_CTX_start(ctx);
2043    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2044        goto err;
2045    /* get the generator */
2046    if (group->generator == NULL)
2047        goto err;
2048    generator = EC_POINT_new(group);
2049    if (generator == NULL)
2050        goto err;
2051    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2052    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2053    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2054        goto err;
2055    if ((pre = nistp521_pre_comp_new()) == NULL)
2056        goto err;
2057    /*
2058     * if the generator is the standard one, use built-in precomputation
2059     */
2060    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2061        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2062        goto done;
2063    }
2064    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2065        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2066        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2067        goto err;
2068    /* compute 2^130*G, 2^260*G, 2^390*G */
2069    for (i = 1; i <= 4; i <<= 1) {
2070        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2071                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2072                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2073        for (j = 0; j < 129; ++j) {
2074            point_double(pre->g_pre_comp[2 * i][0],
2075                         pre->g_pre_comp[2 * i][1],
2076                         pre->g_pre_comp[2 * i][2],
2077                         pre->g_pre_comp[2 * i][0],
2078                         pre->g_pre_comp[2 * i][1],
2079                         pre->g_pre_comp[2 * i][2]);
2080        }
2081    }
2082    /* g_pre_comp[0] is the point at infinity */
2083    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2084    /* the remaining multiples */
2085    /* 2^130*G + 2^260*G */
2086    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2087              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2088              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2089              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2090              pre->g_pre_comp[2][2]);
2091    /* 2^130*G + 2^390*G */
2092    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2093              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2094              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2095              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2096              pre->g_pre_comp[2][2]);
2097    /* 2^260*G + 2^390*G */
2098    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2099              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2100              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2101              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2102              pre->g_pre_comp[4][2]);
2103    /* 2^130*G + 2^260*G + 2^390*G */
2104    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2105              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2106              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2107              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2108              pre->g_pre_comp[2][2]);
2109    for (i = 1; i < 8; ++i) {
2110        /* odd multiples: add G */
2111        point_add(pre->g_pre_comp[2 * i + 1][0],
2112                  pre->g_pre_comp[2 * i + 1][1],
2113                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2114                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2115                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2116                  pre->g_pre_comp[1][2]);
2117    }
2118    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2119
2120 done:
2121    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2122                             nistp521_pre_comp_free,
2123                             nistp521_pre_comp_clear_free))
2124        goto err;
2125    ret = 1;
2126    pre = NULL;
2127 err:
2128    BN_CTX_end(ctx);
2129    if (generator != NULL)
2130        EC_POINT_free(generator);
2131    if (new_ctx != NULL)
2132        BN_CTX_free(new_ctx);
2133    if (pre)
2134        nistp521_pre_comp_free(pre);
2135    return ret;
2136}
2137
2138int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2139{
2140    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2141                            nistp521_pre_comp_free,
2142                            nistp521_pre_comp_clear_free)
2143        != NULL)
2144        return 1;
2145    else
2146        return 0;
2147}
2148
2149#else
2150static void *dummy = &dummy;
2151#endif
2152