ecp_nistp521.c revision 331638
1/* crypto/ec/ecp_nistp521.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41
42# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45                                 * platforms */
46# else
47#  error "Need GCC 3.1 or later to define type uint128_t"
48# endif
49
50typedef uint8_t u8;
51typedef uint64_t u64;
52
53/*
54 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
55 * element of this field into 66 bytes where the most significant byte
56 * contains only a single bit. We call this an felem_bytearray.
57 */
58
59typedef u8 felem_bytearray[66];
60
61/*
62 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
63 * These values are big-endian.
64 */
65static const felem_bytearray nistp521_curve_params[5] = {
66    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
67     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74     0xff, 0xff},
75    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
76     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83     0xff, 0xfc},
84    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
85     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
86     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
87     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
88     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
89     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
90     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
91     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
92     0x3f, 0x00},
93    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
94     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
95     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
96     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
97     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
98     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
99     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
100     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
101     0xbd, 0x66},
102    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
103     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
104     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
105     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
106     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
107     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
108     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
109     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
110     0x66, 0x50}
111};
112
113/*-
114 * The representation of field elements.
115 * ------------------------------------
116 *
117 * We represent field elements with nine values. These values are either 64 or
118 * 128 bits and the field element represented is:
119 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
120 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
121 * 58 bits apart, but are greater than 58 bits in length, the most significant
122 * bits of each limb overlap with the least significant bits of the next.
123 *
124 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
125 * 'largefelem' */
126
127# define NLIMBS 9
128
129typedef uint64_t limb;
130typedef limb felem[NLIMBS];
131typedef uint128_t largefelem[NLIMBS];
132
133static const limb bottom57bits = 0x1ffffffffffffff;
134static const limb bottom58bits = 0x3ffffffffffffff;
135
136/*
137 * bin66_to_felem takes a little-endian byte array and converts it into felem
138 * form. This assumes that the CPU is little-endian.
139 */
140static void bin66_to_felem(felem out, const u8 in[66])
141{
142    out[0] = (*((limb *) & in[0])) & bottom58bits;
143    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
144    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
145    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
146    out[4] = (*((limb *) & in[29])) & bottom58bits;
147    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
148    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
149    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
150    out[8] = (*((limb *) & in[58])) & bottom57bits;
151}
152
153/*
154 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
155 * array. This assumes that the CPU is little-endian.
156 */
157static void felem_to_bin66(u8 out[66], const felem in)
158{
159    memset(out, 0, 66);
160    (*((limb *) & out[0])) = in[0];
161    (*((limb *) & out[7])) |= in[1] << 2;
162    (*((limb *) & out[14])) |= in[2] << 4;
163    (*((limb *) & out[21])) |= in[3] << 6;
164    (*((limb *) & out[29])) = in[4];
165    (*((limb *) & out[36])) |= in[5] << 2;
166    (*((limb *) & out[43])) |= in[6] << 4;
167    (*((limb *) & out[50])) |= in[7] << 6;
168    (*((limb *) & out[58])) = in[8];
169}
170
171/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
172static void flip_endian(u8 *out, const u8 *in, unsigned len)
173{
174    unsigned i;
175    for (i = 0; i < len; ++i)
176        out[i] = in[len - 1 - i];
177}
178
179/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
180static int BN_to_felem(felem out, const BIGNUM *bn)
181{
182    felem_bytearray b_in;
183    felem_bytearray b_out;
184    unsigned num_bytes;
185
186    /* BN_bn2bin eats leading zeroes */
187    memset(b_out, 0, sizeof(b_out));
188    num_bytes = BN_num_bytes(bn);
189    if (num_bytes > sizeof(b_out)) {
190        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
191        return 0;
192    }
193    if (BN_is_negative(bn)) {
194        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
195        return 0;
196    }
197    num_bytes = BN_bn2bin(bn, b_in);
198    flip_endian(b_out, b_in, num_bytes);
199    bin66_to_felem(out, b_out);
200    return 1;
201}
202
203/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
204static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
205{
206    felem_bytearray b_in, b_out;
207    felem_to_bin66(b_in, in);
208    flip_endian(b_out, b_in, sizeof(b_out));
209    return BN_bin2bn(b_out, sizeof(b_out), out);
210}
211
212/*-
213 * Field operations
214 * ----------------
215 */
216
217static void felem_one(felem out)
218{
219    out[0] = 1;
220    out[1] = 0;
221    out[2] = 0;
222    out[3] = 0;
223    out[4] = 0;
224    out[5] = 0;
225    out[6] = 0;
226    out[7] = 0;
227    out[8] = 0;
228}
229
230static void felem_assign(felem out, const felem in)
231{
232    out[0] = in[0];
233    out[1] = in[1];
234    out[2] = in[2];
235    out[3] = in[3];
236    out[4] = in[4];
237    out[5] = in[5];
238    out[6] = in[6];
239    out[7] = in[7];
240    out[8] = in[8];
241}
242
243/* felem_sum64 sets out = out + in. */
244static void felem_sum64(felem out, const felem in)
245{
246    out[0] += in[0];
247    out[1] += in[1];
248    out[2] += in[2];
249    out[3] += in[3];
250    out[4] += in[4];
251    out[5] += in[5];
252    out[6] += in[6];
253    out[7] += in[7];
254    out[8] += in[8];
255}
256
257/* felem_scalar sets out = in * scalar */
258static void felem_scalar(felem out, const felem in, limb scalar)
259{
260    out[0] = in[0] * scalar;
261    out[1] = in[1] * scalar;
262    out[2] = in[2] * scalar;
263    out[3] = in[3] * scalar;
264    out[4] = in[4] * scalar;
265    out[5] = in[5] * scalar;
266    out[6] = in[6] * scalar;
267    out[7] = in[7] * scalar;
268    out[8] = in[8] * scalar;
269}
270
271/* felem_scalar64 sets out = out * scalar */
272static void felem_scalar64(felem out, limb scalar)
273{
274    out[0] *= scalar;
275    out[1] *= scalar;
276    out[2] *= scalar;
277    out[3] *= scalar;
278    out[4] *= scalar;
279    out[5] *= scalar;
280    out[6] *= scalar;
281    out[7] *= scalar;
282    out[8] *= scalar;
283}
284
285/* felem_scalar128 sets out = out * scalar */
286static void felem_scalar128(largefelem out, limb scalar)
287{
288    out[0] *= scalar;
289    out[1] *= scalar;
290    out[2] *= scalar;
291    out[3] *= scalar;
292    out[4] *= scalar;
293    out[5] *= scalar;
294    out[6] *= scalar;
295    out[7] *= scalar;
296    out[8] *= scalar;
297}
298
299/*-
300 * felem_neg sets |out| to |-in|
301 * On entry:
302 *   in[i] < 2^59 + 2^14
303 * On exit:
304 *   out[i] < 2^62
305 */
306static void felem_neg(felem out, const felem in)
307{
308    /* In order to prevent underflow, we subtract from 0 mod p. */
309    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
310    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
311
312    out[0] = two62m3 - in[0];
313    out[1] = two62m2 - in[1];
314    out[2] = two62m2 - in[2];
315    out[3] = two62m2 - in[3];
316    out[4] = two62m2 - in[4];
317    out[5] = two62m2 - in[5];
318    out[6] = two62m2 - in[6];
319    out[7] = two62m2 - in[7];
320    out[8] = two62m2 - in[8];
321}
322
323/*-
324 * felem_diff64 subtracts |in| from |out|
325 * On entry:
326 *   in[i] < 2^59 + 2^14
327 * On exit:
328 *   out[i] < out[i] + 2^62
329 */
330static void felem_diff64(felem out, const felem in)
331{
332    /*
333     * In order to prevent underflow, we add 0 mod p before subtracting.
334     */
335    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
336    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
337
338    out[0] += two62m3 - in[0];
339    out[1] += two62m2 - in[1];
340    out[2] += two62m2 - in[2];
341    out[3] += two62m2 - in[3];
342    out[4] += two62m2 - in[4];
343    out[5] += two62m2 - in[5];
344    out[6] += two62m2 - in[6];
345    out[7] += two62m2 - in[7];
346    out[8] += two62m2 - in[8];
347}
348
349/*-
350 * felem_diff_128_64 subtracts |in| from |out|
351 * On entry:
352 *   in[i] < 2^62 + 2^17
353 * On exit:
354 *   out[i] < out[i] + 2^63
355 */
356static void felem_diff_128_64(largefelem out, const felem in)
357{
358    /*
359     * In order to prevent underflow, we add 0 mod p before subtracting.
360     */
361    static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
362    static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
363
364    out[0] += two63m6 - in[0];
365    out[1] += two63m5 - in[1];
366    out[2] += two63m5 - in[2];
367    out[3] += two63m5 - in[3];
368    out[4] += two63m5 - in[4];
369    out[5] += two63m5 - in[5];
370    out[6] += two63m5 - in[6];
371    out[7] += two63m5 - in[7];
372    out[8] += two63m5 - in[8];
373}
374
375/*-
376 * felem_diff_128_64 subtracts |in| from |out|
377 * On entry:
378 *   in[i] < 2^126
379 * On exit:
380 *   out[i] < out[i] + 2^127 - 2^69
381 */
382static void felem_diff128(largefelem out, const largefelem in)
383{
384    /*
385     * In order to prevent underflow, we add 0 mod p before subtracting.
386     */
387    static const uint128_t two127m70 =
388        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
389    static const uint128_t two127m69 =
390        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
391
392    out[0] += (two127m70 - in[0]);
393    out[1] += (two127m69 - in[1]);
394    out[2] += (two127m69 - in[2]);
395    out[3] += (two127m69 - in[3]);
396    out[4] += (two127m69 - in[4]);
397    out[5] += (two127m69 - in[5]);
398    out[6] += (two127m69 - in[6]);
399    out[7] += (two127m69 - in[7]);
400    out[8] += (two127m69 - in[8]);
401}
402
403/*-
404 * felem_square sets |out| = |in|^2
405 * On entry:
406 *   in[i] < 2^62
407 * On exit:
408 *   out[i] < 17 * max(in[i]) * max(in[i])
409 */
410static void felem_square(largefelem out, const felem in)
411{
412    felem inx2, inx4;
413    felem_scalar(inx2, in, 2);
414    felem_scalar(inx4, in, 4);
415
416    /*-
417     * We have many cases were we want to do
418     *   in[x] * in[y] +
419     *   in[y] * in[x]
420     * This is obviously just
421     *   2 * in[x] * in[y]
422     * However, rather than do the doubling on the 128 bit result, we
423     * double one of the inputs to the multiplication by reading from
424     * |inx2|
425     */
426
427    out[0] = ((uint128_t) in[0]) * in[0];
428    out[1] = ((uint128_t) in[0]) * inx2[1];
429    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
430    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
431    out[4] = ((uint128_t) in[0]) * inx2[4] +
432        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
433    out[5] = ((uint128_t) in[0]) * inx2[5] +
434        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
435    out[6] = ((uint128_t) in[0]) * inx2[6] +
436        ((uint128_t) in[1]) * inx2[5] +
437        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
438    out[7] = ((uint128_t) in[0]) * inx2[7] +
439        ((uint128_t) in[1]) * inx2[6] +
440        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
441    out[8] = ((uint128_t) in[0]) * inx2[8] +
442        ((uint128_t) in[1]) * inx2[7] +
443        ((uint128_t) in[2]) * inx2[6] +
444        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
445
446    /*
447     * The remaining limbs fall above 2^521, with the first falling at 2^522.
448     * They correspond to locations one bit up from the limbs produced above
449     * so we would have to multiply by two to align them. Again, rather than
450     * operate on the 128-bit result, we double one of the inputs to the
451     * multiplication. If we want to double for both this reason, and the
452     * reason above, then we end up multiplying by four.
453     */
454
455    /* 9 */
456    out[0] += ((uint128_t) in[1]) * inx4[8] +
457        ((uint128_t) in[2]) * inx4[7] +
458        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
459
460    /* 10 */
461    out[1] += ((uint128_t) in[2]) * inx4[8] +
462        ((uint128_t) in[3]) * inx4[7] +
463        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
464
465    /* 11 */
466    out[2] += ((uint128_t) in[3]) * inx4[8] +
467        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
468
469    /* 12 */
470    out[3] += ((uint128_t) in[4]) * inx4[8] +
471        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
472
473    /* 13 */
474    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
475
476    /* 14 */
477    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
478
479    /* 15 */
480    out[6] += ((uint128_t) in[7]) * inx4[8];
481
482    /* 16 */
483    out[7] += ((uint128_t) in[8]) * inx2[8];
484}
485
486/*-
487 * felem_mul sets |out| = |in1| * |in2|
488 * On entry:
489 *   in1[i] < 2^64
490 *   in2[i] < 2^63
491 * On exit:
492 *   out[i] < 17 * max(in1[i]) * max(in2[i])
493 */
494static void felem_mul(largefelem out, const felem in1, const felem in2)
495{
496    felem in2x2;
497    felem_scalar(in2x2, in2, 2);
498
499    out[0] = ((uint128_t) in1[0]) * in2[0];
500
501    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
502
503    out[2] = ((uint128_t) in1[0]) * in2[2] +
504        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
505
506    out[3] = ((uint128_t) in1[0]) * in2[3] +
507        ((uint128_t) in1[1]) * in2[2] +
508        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
509
510    out[4] = ((uint128_t) in1[0]) * in2[4] +
511        ((uint128_t) in1[1]) * in2[3] +
512        ((uint128_t) in1[2]) * in2[2] +
513        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
514
515    out[5] = ((uint128_t) in1[0]) * in2[5] +
516        ((uint128_t) in1[1]) * in2[4] +
517        ((uint128_t) in1[2]) * in2[3] +
518        ((uint128_t) in1[3]) * in2[2] +
519        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
520
521    out[6] = ((uint128_t) in1[0]) * in2[6] +
522        ((uint128_t) in1[1]) * in2[5] +
523        ((uint128_t) in1[2]) * in2[4] +
524        ((uint128_t) in1[3]) * in2[3] +
525        ((uint128_t) in1[4]) * in2[2] +
526        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
527
528    out[7] = ((uint128_t) in1[0]) * in2[7] +
529        ((uint128_t) in1[1]) * in2[6] +
530        ((uint128_t) in1[2]) * in2[5] +
531        ((uint128_t) in1[3]) * in2[4] +
532        ((uint128_t) in1[4]) * in2[3] +
533        ((uint128_t) in1[5]) * in2[2] +
534        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
535
536    out[8] = ((uint128_t) in1[0]) * in2[8] +
537        ((uint128_t) in1[1]) * in2[7] +
538        ((uint128_t) in1[2]) * in2[6] +
539        ((uint128_t) in1[3]) * in2[5] +
540        ((uint128_t) in1[4]) * in2[4] +
541        ((uint128_t) in1[5]) * in2[3] +
542        ((uint128_t) in1[6]) * in2[2] +
543        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
544
545    /* See comment in felem_square about the use of in2x2 here */
546
547    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
548        ((uint128_t) in1[2]) * in2x2[7] +
549        ((uint128_t) in1[3]) * in2x2[6] +
550        ((uint128_t) in1[4]) * in2x2[5] +
551        ((uint128_t) in1[5]) * in2x2[4] +
552        ((uint128_t) in1[6]) * in2x2[3] +
553        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
554
555    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
556        ((uint128_t) in1[3]) * in2x2[7] +
557        ((uint128_t) in1[4]) * in2x2[6] +
558        ((uint128_t) in1[5]) * in2x2[5] +
559        ((uint128_t) in1[6]) * in2x2[4] +
560        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
561
562    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
563        ((uint128_t) in1[4]) * in2x2[7] +
564        ((uint128_t) in1[5]) * in2x2[6] +
565        ((uint128_t) in1[6]) * in2x2[5] +
566        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
567
568    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
569        ((uint128_t) in1[5]) * in2x2[7] +
570        ((uint128_t) in1[6]) * in2x2[6] +
571        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
572
573    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
574        ((uint128_t) in1[6]) * in2x2[7] +
575        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
576
577    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
578        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
579
580    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
581        ((uint128_t) in1[8]) * in2x2[7];
582
583    out[7] += ((uint128_t) in1[8]) * in2x2[8];
584}
585
586static const limb bottom52bits = 0xfffffffffffff;
587
588/*-
589 * felem_reduce converts a largefelem to an felem.
590 * On entry:
591 *   in[i] < 2^128
592 * On exit:
593 *   out[i] < 2^59 + 2^14
594 */
595static void felem_reduce(felem out, const largefelem in)
596{
597    u64 overflow1, overflow2;
598
599    out[0] = ((limb) in[0]) & bottom58bits;
600    out[1] = ((limb) in[1]) & bottom58bits;
601    out[2] = ((limb) in[2]) & bottom58bits;
602    out[3] = ((limb) in[3]) & bottom58bits;
603    out[4] = ((limb) in[4]) & bottom58bits;
604    out[5] = ((limb) in[5]) & bottom58bits;
605    out[6] = ((limb) in[6]) & bottom58bits;
606    out[7] = ((limb) in[7]) & bottom58bits;
607    out[8] = ((limb) in[8]) & bottom58bits;
608
609    /* out[i] < 2^58 */
610
611    out[1] += ((limb) in[0]) >> 58;
612    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
613    /*-
614     * out[1] < 2^58 + 2^6 + 2^58
615     *        = 2^59 + 2^6
616     */
617    out[2] += ((limb) (in[0] >> 64)) >> 52;
618
619    out[2] += ((limb) in[1]) >> 58;
620    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
621    out[3] += ((limb) (in[1] >> 64)) >> 52;
622
623    out[3] += ((limb) in[2]) >> 58;
624    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
625    out[4] += ((limb) (in[2] >> 64)) >> 52;
626
627    out[4] += ((limb) in[3]) >> 58;
628    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
629    out[5] += ((limb) (in[3] >> 64)) >> 52;
630
631    out[5] += ((limb) in[4]) >> 58;
632    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
633    out[6] += ((limb) (in[4] >> 64)) >> 52;
634
635    out[6] += ((limb) in[5]) >> 58;
636    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
637    out[7] += ((limb) (in[5] >> 64)) >> 52;
638
639    out[7] += ((limb) in[6]) >> 58;
640    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
641    out[8] += ((limb) (in[6] >> 64)) >> 52;
642
643    out[8] += ((limb) in[7]) >> 58;
644    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
645    /*-
646     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
647     *            < 2^59 + 2^13
648     */
649    overflow1 = ((limb) (in[7] >> 64)) >> 52;
650
651    overflow1 += ((limb) in[8]) >> 58;
652    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
653    overflow2 = ((limb) (in[8] >> 64)) >> 52;
654
655    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
656    overflow2 <<= 1;            /* overflow2 < 2^13 */
657
658    out[0] += overflow1;        /* out[0] < 2^60 */
659    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
660
661    out[1] += out[0] >> 58;
662    out[0] &= bottom58bits;
663    /*-
664     * out[0] < 2^58
665     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
666     *        < 2^59 + 2^14
667     */
668}
669
670static void felem_square_reduce(felem out, const felem in)
671{
672    largefelem tmp;
673    felem_square(tmp, in);
674    felem_reduce(out, tmp);
675}
676
677static void felem_mul_reduce(felem out, const felem in1, const felem in2)
678{
679    largefelem tmp;
680    felem_mul(tmp, in1, in2);
681    felem_reduce(out, tmp);
682}
683
684/*-
685 * felem_inv calculates |out| = |in|^{-1}
686 *
687 * Based on Fermat's Little Theorem:
688 *   a^p = a (mod p)
689 *   a^{p-1} = 1 (mod p)
690 *   a^{p-2} = a^{-1} (mod p)
691 */
692static void felem_inv(felem out, const felem in)
693{
694    felem ftmp, ftmp2, ftmp3, ftmp4;
695    largefelem tmp;
696    unsigned i;
697
698    felem_square(tmp, in);
699    felem_reduce(ftmp, tmp);    /* 2^1 */
700    felem_mul(tmp, in, ftmp);
701    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
702    felem_assign(ftmp2, ftmp);
703    felem_square(tmp, ftmp);
704    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
705    felem_mul(tmp, in, ftmp);
706    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
707    felem_square(tmp, ftmp);
708    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
709
710    felem_square(tmp, ftmp2);
711    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
712    felem_square(tmp, ftmp3);
713    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
714    felem_mul(tmp, ftmp3, ftmp2);
715    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
716
717    felem_assign(ftmp2, ftmp3);
718    felem_square(tmp, ftmp3);
719    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
720    felem_square(tmp, ftmp3);
721    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
722    felem_square(tmp, ftmp3);
723    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
724    felem_square(tmp, ftmp3);
725    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
726    felem_assign(ftmp4, ftmp3);
727    felem_mul(tmp, ftmp3, ftmp);
728    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
729    felem_square(tmp, ftmp4);
730    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
731    felem_mul(tmp, ftmp3, ftmp2);
732    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
733    felem_assign(ftmp2, ftmp3);
734
735    for (i = 0; i < 8; i++) {
736        felem_square(tmp, ftmp3);
737        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
738    }
739    felem_mul(tmp, ftmp3, ftmp2);
740    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
741    felem_assign(ftmp2, ftmp3);
742
743    for (i = 0; i < 16; i++) {
744        felem_square(tmp, ftmp3);
745        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
746    }
747    felem_mul(tmp, ftmp3, ftmp2);
748    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
749    felem_assign(ftmp2, ftmp3);
750
751    for (i = 0; i < 32; i++) {
752        felem_square(tmp, ftmp3);
753        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
754    }
755    felem_mul(tmp, ftmp3, ftmp2);
756    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
757    felem_assign(ftmp2, ftmp3);
758
759    for (i = 0; i < 64; i++) {
760        felem_square(tmp, ftmp3);
761        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
762    }
763    felem_mul(tmp, ftmp3, ftmp2);
764    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
765    felem_assign(ftmp2, ftmp3);
766
767    for (i = 0; i < 128; i++) {
768        felem_square(tmp, ftmp3);
769        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
770    }
771    felem_mul(tmp, ftmp3, ftmp2);
772    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
773    felem_assign(ftmp2, ftmp3);
774
775    for (i = 0; i < 256; i++) {
776        felem_square(tmp, ftmp3);
777        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
778    }
779    felem_mul(tmp, ftmp3, ftmp2);
780    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
781
782    for (i = 0; i < 9; i++) {
783        felem_square(tmp, ftmp3);
784        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
785    }
786    felem_mul(tmp, ftmp3, ftmp4);
787    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
788    felem_mul(tmp, ftmp3, in);
789    felem_reduce(out, tmp);     /* 2^512 - 3 */
790}
791
792/* This is 2^521-1, expressed as an felem */
793static const felem kPrime = {
794    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
795    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
796    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
797};
798
799/*-
800 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
801 * otherwise.
802 * On entry:
803 *   in[i] < 2^59 + 2^14
804 */
805static limb felem_is_zero(const felem in)
806{
807    felem ftmp;
808    limb is_zero, is_p;
809    felem_assign(ftmp, in);
810
811    ftmp[0] += ftmp[8] >> 57;
812    ftmp[8] &= bottom57bits;
813    /* ftmp[8] < 2^57 */
814    ftmp[1] += ftmp[0] >> 58;
815    ftmp[0] &= bottom58bits;
816    ftmp[2] += ftmp[1] >> 58;
817    ftmp[1] &= bottom58bits;
818    ftmp[3] += ftmp[2] >> 58;
819    ftmp[2] &= bottom58bits;
820    ftmp[4] += ftmp[3] >> 58;
821    ftmp[3] &= bottom58bits;
822    ftmp[5] += ftmp[4] >> 58;
823    ftmp[4] &= bottom58bits;
824    ftmp[6] += ftmp[5] >> 58;
825    ftmp[5] &= bottom58bits;
826    ftmp[7] += ftmp[6] >> 58;
827    ftmp[6] &= bottom58bits;
828    ftmp[8] += ftmp[7] >> 58;
829    ftmp[7] &= bottom58bits;
830    /* ftmp[8] < 2^57 + 4 */
831
832    /*
833     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
834     * than our bound for ftmp[8]. Therefore we only have to check if the
835     * zero is zero or 2^521-1.
836     */
837
838    is_zero = 0;
839    is_zero |= ftmp[0];
840    is_zero |= ftmp[1];
841    is_zero |= ftmp[2];
842    is_zero |= ftmp[3];
843    is_zero |= ftmp[4];
844    is_zero |= ftmp[5];
845    is_zero |= ftmp[6];
846    is_zero |= ftmp[7];
847    is_zero |= ftmp[8];
848
849    is_zero--;
850    /*
851     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
852     * can be set is if is_zero was 0 before the decrement.
853     */
854    is_zero = 0 - (is_zero >> 63);
855
856    is_p = ftmp[0] ^ kPrime[0];
857    is_p |= ftmp[1] ^ kPrime[1];
858    is_p |= ftmp[2] ^ kPrime[2];
859    is_p |= ftmp[3] ^ kPrime[3];
860    is_p |= ftmp[4] ^ kPrime[4];
861    is_p |= ftmp[5] ^ kPrime[5];
862    is_p |= ftmp[6] ^ kPrime[6];
863    is_p |= ftmp[7] ^ kPrime[7];
864    is_p |= ftmp[8] ^ kPrime[8];
865
866    is_p--;
867    is_p = 0 - (is_p >> 63);
868
869    is_zero |= is_p;
870    return is_zero;
871}
872
873static int felem_is_zero_int(const void *in)
874{
875    return (int)(felem_is_zero(in) & ((limb) 1));
876}
877
878/*-
879 * felem_contract converts |in| to its unique, minimal representation.
880 * On entry:
881 *   in[i] < 2^59 + 2^14
882 */
883static void felem_contract(felem out, const felem in)
884{
885    limb is_p, is_greater, sign;
886    static const limb two58 = ((limb) 1) << 58;
887
888    felem_assign(out, in);
889
890    out[0] += out[8] >> 57;
891    out[8] &= bottom57bits;
892    /* out[8] < 2^57 */
893    out[1] += out[0] >> 58;
894    out[0] &= bottom58bits;
895    out[2] += out[1] >> 58;
896    out[1] &= bottom58bits;
897    out[3] += out[2] >> 58;
898    out[2] &= bottom58bits;
899    out[4] += out[3] >> 58;
900    out[3] &= bottom58bits;
901    out[5] += out[4] >> 58;
902    out[4] &= bottom58bits;
903    out[6] += out[5] >> 58;
904    out[5] &= bottom58bits;
905    out[7] += out[6] >> 58;
906    out[6] &= bottom58bits;
907    out[8] += out[7] >> 58;
908    out[7] &= bottom58bits;
909    /* out[8] < 2^57 + 4 */
910
911    /*
912     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
913     * out. See the comments in felem_is_zero regarding why we don't test for
914     * other multiples of the prime.
915     */
916
917    /*
918     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
919     */
920
921    is_p = out[0] ^ kPrime[0];
922    is_p |= out[1] ^ kPrime[1];
923    is_p |= out[2] ^ kPrime[2];
924    is_p |= out[3] ^ kPrime[3];
925    is_p |= out[4] ^ kPrime[4];
926    is_p |= out[5] ^ kPrime[5];
927    is_p |= out[6] ^ kPrime[6];
928    is_p |= out[7] ^ kPrime[7];
929    is_p |= out[8] ^ kPrime[8];
930
931    is_p--;
932    is_p &= is_p << 32;
933    is_p &= is_p << 16;
934    is_p &= is_p << 8;
935    is_p &= is_p << 4;
936    is_p &= is_p << 2;
937    is_p &= is_p << 1;
938    is_p = 0 - (is_p >> 63);
939    is_p = ~is_p;
940
941    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
942
943    out[0] &= is_p;
944    out[1] &= is_p;
945    out[2] &= is_p;
946    out[3] &= is_p;
947    out[4] &= is_p;
948    out[5] &= is_p;
949    out[6] &= is_p;
950    out[7] &= is_p;
951    out[8] &= is_p;
952
953    /*
954     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
955     * 57 is greater than zero as (2^521-1) + x >= 2^522
956     */
957    is_greater = out[8] >> 57;
958    is_greater |= is_greater << 32;
959    is_greater |= is_greater << 16;
960    is_greater |= is_greater << 8;
961    is_greater |= is_greater << 4;
962    is_greater |= is_greater << 2;
963    is_greater |= is_greater << 1;
964    is_greater = 0 - (is_greater >> 63);
965
966    out[0] -= kPrime[0] & is_greater;
967    out[1] -= kPrime[1] & is_greater;
968    out[2] -= kPrime[2] & is_greater;
969    out[3] -= kPrime[3] & is_greater;
970    out[4] -= kPrime[4] & is_greater;
971    out[5] -= kPrime[5] & is_greater;
972    out[6] -= kPrime[6] & is_greater;
973    out[7] -= kPrime[7] & is_greater;
974    out[8] -= kPrime[8] & is_greater;
975
976    /* Eliminate negative coefficients */
977    sign = -(out[0] >> 63);
978    out[0] += (two58 & sign);
979    out[1] -= (1 & sign);
980    sign = -(out[1] >> 63);
981    out[1] += (two58 & sign);
982    out[2] -= (1 & sign);
983    sign = -(out[2] >> 63);
984    out[2] += (two58 & sign);
985    out[3] -= (1 & sign);
986    sign = -(out[3] >> 63);
987    out[3] += (two58 & sign);
988    out[4] -= (1 & sign);
989    sign = -(out[4] >> 63);
990    out[4] += (two58 & sign);
991    out[5] -= (1 & sign);
992    sign = -(out[0] >> 63);
993    out[5] += (two58 & sign);
994    out[6] -= (1 & sign);
995    sign = -(out[6] >> 63);
996    out[6] += (two58 & sign);
997    out[7] -= (1 & sign);
998    sign = -(out[7] >> 63);
999    out[7] += (two58 & sign);
1000    out[8] -= (1 & sign);
1001    sign = -(out[5] >> 63);
1002    out[5] += (two58 & sign);
1003    out[6] -= (1 & sign);
1004    sign = -(out[6] >> 63);
1005    out[6] += (two58 & sign);
1006    out[7] -= (1 & sign);
1007    sign = -(out[7] >> 63);
1008    out[7] += (two58 & sign);
1009    out[8] -= (1 & sign);
1010}
1011
1012/*-
1013 * Group operations
1014 * ----------------
1015 *
1016 * Building on top of the field operations we have the operations on the
1017 * elliptic curve group itself. Points on the curve are represented in Jacobian
1018 * coordinates */
1019
1020/*-
1021 * point_double calcuates 2*(x_in, y_in, z_in)
1022 *
1023 * The method is taken from:
1024 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1025 *
1026 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1027 * while x_out == y_in is not (maybe this works, but it's not tested). */
1028static void
1029point_double(felem x_out, felem y_out, felem z_out,
1030             const felem x_in, const felem y_in, const felem z_in)
1031{
1032    largefelem tmp, tmp2;
1033    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1034
1035    felem_assign(ftmp, x_in);
1036    felem_assign(ftmp2, x_in);
1037
1038    /* delta = z^2 */
1039    felem_square(tmp, z_in);
1040    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1041
1042    /* gamma = y^2 */
1043    felem_square(tmp, y_in);
1044    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1045
1046    /* beta = x*gamma */
1047    felem_mul(tmp, x_in, gamma);
1048    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1049
1050    /* alpha = 3*(x-delta)*(x+delta) */
1051    felem_diff64(ftmp, delta);
1052    /* ftmp[i] < 2^61 */
1053    felem_sum64(ftmp2, delta);
1054    /* ftmp2[i] < 2^60 + 2^15 */
1055    felem_scalar64(ftmp2, 3);
1056    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1057    felem_mul(tmp, ftmp, ftmp2);
1058    /*-
1059     * tmp[i] < 17(3*2^121 + 3*2^76)
1060     *        = 61*2^121 + 61*2^76
1061     *        < 64*2^121 + 64*2^76
1062     *        = 2^127 + 2^82
1063     *        < 2^128
1064     */
1065    felem_reduce(alpha, tmp);
1066
1067    /* x' = alpha^2 - 8*beta */
1068    felem_square(tmp, alpha);
1069    /*
1070     * tmp[i] < 17*2^120 < 2^125
1071     */
1072    felem_assign(ftmp, beta);
1073    felem_scalar64(ftmp, 8);
1074    /* ftmp[i] < 2^62 + 2^17 */
1075    felem_diff_128_64(tmp, ftmp);
1076    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1077    felem_reduce(x_out, tmp);
1078
1079    /* z' = (y + z)^2 - gamma - delta */
1080    felem_sum64(delta, gamma);
1081    /* delta[i] < 2^60 + 2^15 */
1082    felem_assign(ftmp, y_in);
1083    felem_sum64(ftmp, z_in);
1084    /* ftmp[i] < 2^60 + 2^15 */
1085    felem_square(tmp, ftmp);
1086    /*
1087     * tmp[i] < 17(2^122) < 2^127
1088     */
1089    felem_diff_128_64(tmp, delta);
1090    /* tmp[i] < 2^127 + 2^63 */
1091    felem_reduce(z_out, tmp);
1092
1093    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1094    felem_scalar64(beta, 4);
1095    /* beta[i] < 2^61 + 2^16 */
1096    felem_diff64(beta, x_out);
1097    /* beta[i] < 2^61 + 2^60 + 2^16 */
1098    felem_mul(tmp, alpha, beta);
1099    /*-
1100     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1101     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1102     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1103     *        < 2^128
1104     */
1105    felem_square(tmp2, gamma);
1106    /*-
1107     * tmp2[i] < 17*(2^59 + 2^14)^2
1108     *         = 17*(2^118 + 2^74 + 2^28)
1109     */
1110    felem_scalar128(tmp2, 8);
1111    /*-
1112     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1113     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1114     *         < 2^126
1115     */
1116    felem_diff128(tmp, tmp2);
1117    /*-
1118     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1119     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1120     *          2^74 + 2^69 + 2^34 + 2^30
1121     *        < 2^128
1122     */
1123    felem_reduce(y_out, tmp);
1124}
1125
1126/* copy_conditional copies in to out iff mask is all ones. */
1127static void copy_conditional(felem out, const felem in, limb mask)
1128{
1129    unsigned i;
1130    for (i = 0; i < NLIMBS; ++i) {
1131        const limb tmp = mask & (in[i] ^ out[i]);
1132        out[i] ^= tmp;
1133    }
1134}
1135
1136/*-
1137 * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1138 *
1139 * The method is taken from
1140 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1141 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1142 *
1143 * This function includes a branch for checking whether the two input points
1144 * are equal (while not equal to the point at infinity). This case never
1145 * happens during single point multiplication, so there is no timing leak for
1146 * ECDH or ECDSA signing. */
1147static void point_add(felem x3, felem y3, felem z3,
1148                      const felem x1, const felem y1, const felem z1,
1149                      const int mixed, const felem x2, const felem y2,
1150                      const felem z2)
1151{
1152    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1153    largefelem tmp, tmp2;
1154    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1155
1156    z1_is_zero = felem_is_zero(z1);
1157    z2_is_zero = felem_is_zero(z2);
1158
1159    /* ftmp = z1z1 = z1**2 */
1160    felem_square(tmp, z1);
1161    felem_reduce(ftmp, tmp);
1162
1163    if (!mixed) {
1164        /* ftmp2 = z2z2 = z2**2 */
1165        felem_square(tmp, z2);
1166        felem_reduce(ftmp2, tmp);
1167
1168        /* u1 = ftmp3 = x1*z2z2 */
1169        felem_mul(tmp, x1, ftmp2);
1170        felem_reduce(ftmp3, tmp);
1171
1172        /* ftmp5 = z1 + z2 */
1173        felem_assign(ftmp5, z1);
1174        felem_sum64(ftmp5, z2);
1175        /* ftmp5[i] < 2^61 */
1176
1177        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1178        felem_square(tmp, ftmp5);
1179        /* tmp[i] < 17*2^122 */
1180        felem_diff_128_64(tmp, ftmp);
1181        /* tmp[i] < 17*2^122 + 2^63 */
1182        felem_diff_128_64(tmp, ftmp2);
1183        /* tmp[i] < 17*2^122 + 2^64 */
1184        felem_reduce(ftmp5, tmp);
1185
1186        /* ftmp2 = z2 * z2z2 */
1187        felem_mul(tmp, ftmp2, z2);
1188        felem_reduce(ftmp2, tmp);
1189
1190        /* s1 = ftmp6 = y1 * z2**3 */
1191        felem_mul(tmp, y1, ftmp2);
1192        felem_reduce(ftmp6, tmp);
1193    } else {
1194        /*
1195         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1196         */
1197
1198        /* u1 = ftmp3 = x1*z2z2 */
1199        felem_assign(ftmp3, x1);
1200
1201        /* ftmp5 = 2*z1z2 */
1202        felem_scalar(ftmp5, z1, 2);
1203
1204        /* s1 = ftmp6 = y1 * z2**3 */
1205        felem_assign(ftmp6, y1);
1206    }
1207
1208    /* u2 = x2*z1z1 */
1209    felem_mul(tmp, x2, ftmp);
1210    /* tmp[i] < 17*2^120 */
1211
1212    /* h = ftmp4 = u2 - u1 */
1213    felem_diff_128_64(tmp, ftmp3);
1214    /* tmp[i] < 17*2^120 + 2^63 */
1215    felem_reduce(ftmp4, tmp);
1216
1217    x_equal = felem_is_zero(ftmp4);
1218
1219    /* z_out = ftmp5 * h */
1220    felem_mul(tmp, ftmp5, ftmp4);
1221    felem_reduce(z_out, tmp);
1222
1223    /* ftmp = z1 * z1z1 */
1224    felem_mul(tmp, ftmp, z1);
1225    felem_reduce(ftmp, tmp);
1226
1227    /* s2 = tmp = y2 * z1**3 */
1228    felem_mul(tmp, y2, ftmp);
1229    /* tmp[i] < 17*2^120 */
1230
1231    /* r = ftmp5 = (s2 - s1)*2 */
1232    felem_diff_128_64(tmp, ftmp6);
1233    /* tmp[i] < 17*2^120 + 2^63 */
1234    felem_reduce(ftmp5, tmp);
1235    y_equal = felem_is_zero(ftmp5);
1236    felem_scalar64(ftmp5, 2);
1237    /* ftmp5[i] < 2^61 */
1238
1239    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1240        point_double(x3, y3, z3, x1, y1, z1);
1241        return;
1242    }
1243
1244    /* I = ftmp = (2h)**2 */
1245    felem_assign(ftmp, ftmp4);
1246    felem_scalar64(ftmp, 2);
1247    /* ftmp[i] < 2^61 */
1248    felem_square(tmp, ftmp);
1249    /* tmp[i] < 17*2^122 */
1250    felem_reduce(ftmp, tmp);
1251
1252    /* J = ftmp2 = h * I */
1253    felem_mul(tmp, ftmp4, ftmp);
1254    felem_reduce(ftmp2, tmp);
1255
1256    /* V = ftmp4 = U1 * I */
1257    felem_mul(tmp, ftmp3, ftmp);
1258    felem_reduce(ftmp4, tmp);
1259
1260    /* x_out = r**2 - J - 2V */
1261    felem_square(tmp, ftmp5);
1262    /* tmp[i] < 17*2^122 */
1263    felem_diff_128_64(tmp, ftmp2);
1264    /* tmp[i] < 17*2^122 + 2^63 */
1265    felem_assign(ftmp3, ftmp4);
1266    felem_scalar64(ftmp4, 2);
1267    /* ftmp4[i] < 2^61 */
1268    felem_diff_128_64(tmp, ftmp4);
1269    /* tmp[i] < 17*2^122 + 2^64 */
1270    felem_reduce(x_out, tmp);
1271
1272    /* y_out = r(V-x_out) - 2 * s1 * J */
1273    felem_diff64(ftmp3, x_out);
1274    /*
1275     * ftmp3[i] < 2^60 + 2^60 = 2^61
1276     */
1277    felem_mul(tmp, ftmp5, ftmp3);
1278    /* tmp[i] < 17*2^122 */
1279    felem_mul(tmp2, ftmp6, ftmp2);
1280    /* tmp2[i] < 17*2^120 */
1281    felem_scalar128(tmp2, 2);
1282    /* tmp2[i] < 17*2^121 */
1283    felem_diff128(tmp, tmp2);
1284        /*-
1285         * tmp[i] < 2^127 - 2^69 + 17*2^122
1286         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1287         *        < 2^127
1288         */
1289    felem_reduce(y_out, tmp);
1290
1291    copy_conditional(x_out, x2, z1_is_zero);
1292    copy_conditional(x_out, x1, z2_is_zero);
1293    copy_conditional(y_out, y2, z1_is_zero);
1294    copy_conditional(y_out, y1, z2_is_zero);
1295    copy_conditional(z_out, z2, z1_is_zero);
1296    copy_conditional(z_out, z1, z2_is_zero);
1297    felem_assign(x3, x_out);
1298    felem_assign(y3, y_out);
1299    felem_assign(z3, z_out);
1300}
1301
1302/*-
1303 * Base point pre computation
1304 * --------------------------
1305 *
1306 * Two different sorts of precomputed tables are used in the following code.
1307 * Each contain various points on the curve, where each point is three field
1308 * elements (x, y, z).
1309 *
1310 * For the base point table, z is usually 1 (0 for the point at infinity).
1311 * This table has 16 elements:
1312 * index | bits    | point
1313 * ------+---------+------------------------------
1314 *     0 | 0 0 0 0 | 0G
1315 *     1 | 0 0 0 1 | 1G
1316 *     2 | 0 0 1 0 | 2^130G
1317 *     3 | 0 0 1 1 | (2^130 + 1)G
1318 *     4 | 0 1 0 0 | 2^260G
1319 *     5 | 0 1 0 1 | (2^260 + 1)G
1320 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1321 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1322 *     8 | 1 0 0 0 | 2^390G
1323 *     9 | 1 0 0 1 | (2^390 + 1)G
1324 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1325 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1326 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1327 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1328 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1329 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1330 *
1331 * The reason for this is so that we can clock bits into four different
1332 * locations when doing simple scalar multiplies against the base point.
1333 *
1334 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1335
1336/* gmul is the table of precomputed base points */
1337static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1338                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
1339                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1340{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1341  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1342  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1343 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1344  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1345  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1346 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1347{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1348  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1349  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1350 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1351  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1352  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1353 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1354{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1355  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1356  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1357 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1358  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1359  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1360 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1361{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1362  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1363  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1364 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1365  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1366  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1367 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1368{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1369  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1370  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1371 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1372  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1373  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1374 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1375{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1376  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1377  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1378 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1379  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1380  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1381 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1382{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1383  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1384  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1385 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1386  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1387  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1388 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1390  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1391  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1392 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1393  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1394  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1395 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1397  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1398  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1399 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1400  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1401  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1402 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1404  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1405  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1406 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1407  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1408  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1409 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1411  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1412  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1413 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1414  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1415  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1416 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1418  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1419  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1420 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1421  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1422  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1423 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1425  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1426  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1427 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1428  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1429  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1430 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1432  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1433  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1434 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1435  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1436  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1437 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1439  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1440  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1441 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1442  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1443  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1444 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1445};
1446
1447/*
1448 * select_point selects the |idx|th point from a precomputation table and
1449 * copies it to out.
1450 */
1451 /* pre_comp below is of the size provided in |size| */
1452static void select_point(const limb idx, unsigned int size,
1453                         const felem pre_comp[][3], felem out[3])
1454{
1455    unsigned i, j;
1456    limb *outlimbs = &out[0][0];
1457    memset(outlimbs, 0, 3 * sizeof(felem));
1458
1459    for (i = 0; i < size; i++) {
1460        const limb *inlimbs = &pre_comp[i][0][0];
1461        limb mask = i ^ idx;
1462        mask |= mask >> 4;
1463        mask |= mask >> 2;
1464        mask |= mask >> 1;
1465        mask &= 1;
1466        mask--;
1467        for (j = 0; j < NLIMBS * 3; j++)
1468            outlimbs[j] |= inlimbs[j] & mask;
1469    }
1470}
1471
1472/* get_bit returns the |i|th bit in |in| */
1473static char get_bit(const felem_bytearray in, int i)
1474{
1475    if (i < 0)
1476        return 0;
1477    return (in[i >> 3] >> (i & 7)) & 1;
1478}
1479
1480/*
1481 * Interleaved point multiplication using precomputed point multiples: The
1482 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1483 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1484 * generator, using certain (large) precomputed multiples in g_pre_comp.
1485 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1486 */
1487static void batch_mul(felem x_out, felem y_out, felem z_out,
1488                      const felem_bytearray scalars[],
1489                      const unsigned num_points, const u8 *g_scalar,
1490                      const int mixed, const felem pre_comp[][17][3],
1491                      const felem g_pre_comp[16][3])
1492{
1493    int i, skip;
1494    unsigned num, gen_mul = (g_scalar != NULL);
1495    felem nq[3], tmp[4];
1496    limb bits;
1497    u8 sign, digit;
1498
1499    /* set nq to the point at infinity */
1500    memset(nq, 0, 3 * sizeof(felem));
1501
1502    /*
1503     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1504     * of the generator (last quarter of rounds) and additions of other
1505     * points multiples (every 5th round).
1506     */
1507    skip = 1;                   /* save two point operations in the first
1508                                 * round */
1509    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1510        /* double */
1511        if (!skip)
1512            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1513
1514        /* add multiples of the generator */
1515        if (gen_mul && (i <= 130)) {
1516            bits = get_bit(g_scalar, i + 390) << 3;
1517            if (i < 130) {
1518                bits |= get_bit(g_scalar, i + 260) << 2;
1519                bits |= get_bit(g_scalar, i + 130) << 1;
1520                bits |= get_bit(g_scalar, i);
1521            }
1522            /* select the point to add, in constant time */
1523            select_point(bits, 16, g_pre_comp, tmp);
1524            if (!skip) {
1525                /* The 1 argument below is for "mixed" */
1526                point_add(nq[0], nq[1], nq[2],
1527                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1528            } else {
1529                memcpy(nq, tmp, 3 * sizeof(felem));
1530                skip = 0;
1531            }
1532        }
1533
1534        /* do other additions every 5 doublings */
1535        if (num_points && (i % 5 == 0)) {
1536            /* loop over all scalars */
1537            for (num = 0; num < num_points; ++num) {
1538                bits = get_bit(scalars[num], i + 4) << 5;
1539                bits |= get_bit(scalars[num], i + 3) << 4;
1540                bits |= get_bit(scalars[num], i + 2) << 3;
1541                bits |= get_bit(scalars[num], i + 1) << 2;
1542                bits |= get_bit(scalars[num], i) << 1;
1543                bits |= get_bit(scalars[num], i - 1);
1544                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1545
1546                /*
1547                 * select the point to add or subtract, in constant time
1548                 */
1549                select_point(digit, 17, pre_comp[num], tmp);
1550                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1551                                            * point */
1552                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1553
1554                if (!skip) {
1555                    point_add(nq[0], nq[1], nq[2],
1556                              nq[0], nq[1], nq[2],
1557                              mixed, tmp[0], tmp[1], tmp[2]);
1558                } else {
1559                    memcpy(nq, tmp, 3 * sizeof(felem));
1560                    skip = 0;
1561                }
1562            }
1563        }
1564    }
1565    felem_assign(x_out, nq[0]);
1566    felem_assign(y_out, nq[1]);
1567    felem_assign(z_out, nq[2]);
1568}
1569
1570/* Precomputation for the group generator. */
1571typedef struct {
1572    felem g_pre_comp[16][3];
1573    int references;
1574} NISTP521_PRE_COMP;
1575
1576const EC_METHOD *EC_GFp_nistp521_method(void)
1577{
1578    static const EC_METHOD ret = {
1579        EC_FLAGS_DEFAULT_OCT,
1580        NID_X9_62_prime_field,
1581        ec_GFp_nistp521_group_init,
1582        ec_GFp_simple_group_finish,
1583        ec_GFp_simple_group_clear_finish,
1584        ec_GFp_nist_group_copy,
1585        ec_GFp_nistp521_group_set_curve,
1586        ec_GFp_simple_group_get_curve,
1587        ec_GFp_simple_group_get_degree,
1588        ec_GFp_simple_group_check_discriminant,
1589        ec_GFp_simple_point_init,
1590        ec_GFp_simple_point_finish,
1591        ec_GFp_simple_point_clear_finish,
1592        ec_GFp_simple_point_copy,
1593        ec_GFp_simple_point_set_to_infinity,
1594        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1595        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1596        ec_GFp_simple_point_set_affine_coordinates,
1597        ec_GFp_nistp521_point_get_affine_coordinates,
1598        0 /* point_set_compressed_coordinates */ ,
1599        0 /* point2oct */ ,
1600        0 /* oct2point */ ,
1601        ec_GFp_simple_add,
1602        ec_GFp_simple_dbl,
1603        ec_GFp_simple_invert,
1604        ec_GFp_simple_is_at_infinity,
1605        ec_GFp_simple_is_on_curve,
1606        ec_GFp_simple_cmp,
1607        ec_GFp_simple_make_affine,
1608        ec_GFp_simple_points_make_affine,
1609        ec_GFp_nistp521_points_mul,
1610        ec_GFp_nistp521_precompute_mult,
1611        ec_GFp_nistp521_have_precompute_mult,
1612        ec_GFp_nist_field_mul,
1613        ec_GFp_nist_field_sqr,
1614        0 /* field_div */ ,
1615        0 /* field_encode */ ,
1616        0 /* field_decode */ ,
1617        0                       /* field_set_to_one */
1618    };
1619
1620    return &ret;
1621}
1622
1623/******************************************************************************/
1624/*
1625 * FUNCTIONS TO MANAGE PRECOMPUTATION
1626 */
1627
1628static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1629{
1630    NISTP521_PRE_COMP *ret = NULL;
1631    ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1632    if (!ret) {
1633        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1634        return ret;
1635    }
1636    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1637    ret->references = 1;
1638    return ret;
1639}
1640
1641static void *nistp521_pre_comp_dup(void *src_)
1642{
1643    NISTP521_PRE_COMP *src = src_;
1644
1645    /* no need to actually copy, these objects never change! */
1646    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1647
1648    return src_;
1649}
1650
1651static void nistp521_pre_comp_free(void *pre_)
1652{
1653    int i;
1654    NISTP521_PRE_COMP *pre = pre_;
1655
1656    if (!pre)
1657        return;
1658
1659    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1660    if (i > 0)
1661        return;
1662
1663    OPENSSL_free(pre);
1664}
1665
1666static void nistp521_pre_comp_clear_free(void *pre_)
1667{
1668    int i;
1669    NISTP521_PRE_COMP *pre = pre_;
1670
1671    if (!pre)
1672        return;
1673
1674    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1675    if (i > 0)
1676        return;
1677
1678    OPENSSL_cleanse(pre, sizeof(*pre));
1679    OPENSSL_free(pre);
1680}
1681
1682/******************************************************************************/
1683/*
1684 * OPENSSL EC_METHOD FUNCTIONS
1685 */
1686
1687int ec_GFp_nistp521_group_init(EC_GROUP *group)
1688{
1689    int ret;
1690    ret = ec_GFp_simple_group_init(group);
1691    group->a_is_minus3 = 1;
1692    return ret;
1693}
1694
1695int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1696                                    const BIGNUM *a, const BIGNUM *b,
1697                                    BN_CTX *ctx)
1698{
1699    int ret = 0;
1700    BN_CTX *new_ctx = NULL;
1701    BIGNUM *curve_p, *curve_a, *curve_b;
1702
1703    if (ctx == NULL)
1704        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1705            return 0;
1706    BN_CTX_start(ctx);
1707    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1708        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1709        ((curve_b = BN_CTX_get(ctx)) == NULL))
1710        goto err;
1711    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1712    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1713    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1714    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1715        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1716              EC_R_WRONG_CURVE_PARAMETERS);
1717        goto err;
1718    }
1719    group->field_mod_func = BN_nist_mod_521;
1720    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1721 err:
1722    BN_CTX_end(ctx);
1723    if (new_ctx != NULL)
1724        BN_CTX_free(new_ctx);
1725    return ret;
1726}
1727
1728/*
1729 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1730 * (X/Z^2, Y/Z^3)
1731 */
1732int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1733                                                 const EC_POINT *point,
1734                                                 BIGNUM *x, BIGNUM *y,
1735                                                 BN_CTX *ctx)
1736{
1737    felem z1, z2, x_in, y_in, x_out, y_out;
1738    largefelem tmp;
1739
1740    if (EC_POINT_is_at_infinity(group, point)) {
1741        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1742              EC_R_POINT_AT_INFINITY);
1743        return 0;
1744    }
1745    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1746        (!BN_to_felem(z1, &point->Z)))
1747        return 0;
1748    felem_inv(z2, z1);
1749    felem_square(tmp, z2);
1750    felem_reduce(z1, tmp);
1751    felem_mul(tmp, x_in, z1);
1752    felem_reduce(x_in, tmp);
1753    felem_contract(x_out, x_in);
1754    if (x != NULL) {
1755        if (!felem_to_BN(x, x_out)) {
1756            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1757                  ERR_R_BN_LIB);
1758            return 0;
1759        }
1760    }
1761    felem_mul(tmp, z1, z2);
1762    felem_reduce(z1, tmp);
1763    felem_mul(tmp, y_in, z1);
1764    felem_reduce(y_in, tmp);
1765    felem_contract(y_out, y_in);
1766    if (y != NULL) {
1767        if (!felem_to_BN(y, y_out)) {
1768            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1769                  ERR_R_BN_LIB);
1770            return 0;
1771        }
1772    }
1773    return 1;
1774}
1775
1776/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1777static void make_points_affine(size_t num, felem points[][3],
1778                               felem tmp_felems[])
1779{
1780    /*
1781     * Runs in constant time, unless an input is the point at infinity (which
1782     * normally shouldn't happen).
1783     */
1784    ec_GFp_nistp_points_make_affine_internal(num,
1785                                             points,
1786                                             sizeof(felem),
1787                                             tmp_felems,
1788                                             (void (*)(void *))felem_one,
1789                                             felem_is_zero_int,
1790                                             (void (*)(void *, const void *))
1791                                             felem_assign,
1792                                             (void (*)(void *, const void *))
1793                                             felem_square_reduce, (void (*)
1794                                                                   (void *,
1795                                                                    const void
1796                                                                    *,
1797                                                                    const void
1798                                                                    *))
1799                                             felem_mul_reduce,
1800                                             (void (*)(void *, const void *))
1801                                             felem_inv,
1802                                             (void (*)(void *, const void *))
1803                                             felem_contract);
1804}
1805
1806/*
1807 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1808 * values Result is stored in r (r can equal one of the inputs).
1809 */
1810int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1811                               const BIGNUM *scalar, size_t num,
1812                               const EC_POINT *points[],
1813                               const BIGNUM *scalars[], BN_CTX *ctx)
1814{
1815    int ret = 0;
1816    int j;
1817    int mixed = 0;
1818    BN_CTX *new_ctx = NULL;
1819    BIGNUM *x, *y, *z, *tmp_scalar;
1820    felem_bytearray g_secret;
1821    felem_bytearray *secrets = NULL;
1822    felem(*pre_comp)[17][3] = NULL;
1823    felem *tmp_felems = NULL;
1824    felem_bytearray tmp;
1825    unsigned i, num_bytes;
1826    int have_pre_comp = 0;
1827    size_t num_points = num;
1828    felem x_in, y_in, z_in, x_out, y_out, z_out;
1829    NISTP521_PRE_COMP *pre = NULL;
1830    felem(*g_pre_comp)[3] = NULL;
1831    EC_POINT *generator = NULL;
1832    const EC_POINT *p = NULL;
1833    const BIGNUM *p_scalar = NULL;
1834
1835    if (ctx == NULL)
1836        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1837            return 0;
1838    BN_CTX_start(ctx);
1839    if (((x = BN_CTX_get(ctx)) == NULL) ||
1840        ((y = BN_CTX_get(ctx)) == NULL) ||
1841        ((z = BN_CTX_get(ctx)) == NULL) ||
1842        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1843        goto err;
1844
1845    if (scalar != NULL) {
1846        pre = EC_EX_DATA_get_data(group->extra_data,
1847                                  nistp521_pre_comp_dup,
1848                                  nistp521_pre_comp_free,
1849                                  nistp521_pre_comp_clear_free);
1850        if (pre)
1851            /* we have precomputation, try to use it */
1852            g_pre_comp = &pre->g_pre_comp[0];
1853        else
1854            /* try to use the standard precomputation */
1855            g_pre_comp = (felem(*)[3]) gmul;
1856        generator = EC_POINT_new(group);
1857        if (generator == NULL)
1858            goto err;
1859        /* get the generator from precomputation */
1860        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1861            !felem_to_BN(y, g_pre_comp[1][1]) ||
1862            !felem_to_BN(z, g_pre_comp[1][2])) {
1863            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1864            goto err;
1865        }
1866        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1867                                                      generator, x, y, z,
1868                                                      ctx))
1869            goto err;
1870        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1871            /* precomputation matches generator */
1872            have_pre_comp = 1;
1873        else
1874            /*
1875             * we don't have valid precomputation: treat the generator as a
1876             * random point
1877             */
1878            num_points++;
1879    }
1880
1881    if (num_points > 0) {
1882        if (num_points >= 2) {
1883            /*
1884             * unless we precompute multiples for just one point, converting
1885             * those into affine form is time well spent
1886             */
1887            mixed = 1;
1888        }
1889        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1890        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1891        if (mixed)
1892            tmp_felems =
1893                OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1894        if ((secrets == NULL) || (pre_comp == NULL)
1895            || (mixed && (tmp_felems == NULL))) {
1896            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1897            goto err;
1898        }
1899
1900        /*
1901         * we treat NULL scalars as 0, and NULL points as points at infinity,
1902         * i.e., they contribute nothing to the linear combination
1903         */
1904        memset(secrets, 0, num_points * sizeof(felem_bytearray));
1905        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1906        for (i = 0; i < num_points; ++i) {
1907            if (i == num)
1908                /*
1909                 * we didn't have a valid precomputation, so we pick the
1910                 * generator
1911                 */
1912            {
1913                p = EC_GROUP_get0_generator(group);
1914                p_scalar = scalar;
1915            } else
1916                /* the i^th point */
1917            {
1918                p = points[i];
1919                p_scalar = scalars[i];
1920            }
1921            if ((p_scalar != NULL) && (p != NULL)) {
1922                /* reduce scalar to 0 <= scalar < 2^521 */
1923                if ((BN_num_bits(p_scalar) > 521)
1924                    || (BN_is_negative(p_scalar))) {
1925                    /*
1926                     * this is an unusual input, and we don't guarantee
1927                     * constant-timeness
1928                     */
1929                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1930                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1931                        goto err;
1932                    }
1933                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
1934                } else
1935                    num_bytes = BN_bn2bin(p_scalar, tmp);
1936                flip_endian(secrets[i], tmp, num_bytes);
1937                /* precompute multiples */
1938                if ((!BN_to_felem(x_out, &p->X)) ||
1939                    (!BN_to_felem(y_out, &p->Y)) ||
1940                    (!BN_to_felem(z_out, &p->Z)))
1941                    goto err;
1942                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1943                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1944                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1945                for (j = 2; j <= 16; ++j) {
1946                    if (j & 1) {
1947                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1948                                  pre_comp[i][j][2], pre_comp[i][1][0],
1949                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1950                                  pre_comp[i][j - 1][0],
1951                                  pre_comp[i][j - 1][1],
1952                                  pre_comp[i][j - 1][2]);
1953                    } else {
1954                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1955                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
1956                                     pre_comp[i][j / 2][1],
1957                                     pre_comp[i][j / 2][2]);
1958                    }
1959                }
1960            }
1961        }
1962        if (mixed)
1963            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1964    }
1965
1966    /* the scalar for the generator */
1967    if ((scalar != NULL) && (have_pre_comp)) {
1968        memset(g_secret, 0, sizeof(g_secret));
1969        /* reduce scalar to 0 <= scalar < 2^521 */
1970        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1971            /*
1972             * this is an unusual input, and we don't guarantee
1973             * constant-timeness
1974             */
1975            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1976                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1977                goto err;
1978            }
1979            num_bytes = BN_bn2bin(tmp_scalar, tmp);
1980        } else
1981            num_bytes = BN_bn2bin(scalar, tmp);
1982        flip_endian(g_secret, tmp, num_bytes);
1983        /* do the multiplication with generator precomputation */
1984        batch_mul(x_out, y_out, z_out,
1985                  (const felem_bytearray(*))secrets, num_points,
1986                  g_secret,
1987                  mixed, (const felem(*)[17][3])pre_comp,
1988                  (const felem(*)[3])g_pre_comp);
1989    } else
1990        /* do the multiplication without generator precomputation */
1991        batch_mul(x_out, y_out, z_out,
1992                  (const felem_bytearray(*))secrets, num_points,
1993                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1994    /* reduce the output to its unique minimal representation */
1995    felem_contract(x_in, x_out);
1996    felem_contract(y_in, y_out);
1997    felem_contract(z_in, z_out);
1998    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1999        (!felem_to_BN(z, z_in))) {
2000        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2001        goto err;
2002    }
2003    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2004
2005 err:
2006    BN_CTX_end(ctx);
2007    if (generator != NULL)
2008        EC_POINT_free(generator);
2009    if (new_ctx != NULL)
2010        BN_CTX_free(new_ctx);
2011    if (secrets != NULL)
2012        OPENSSL_free(secrets);
2013    if (pre_comp != NULL)
2014        OPENSSL_free(pre_comp);
2015    if (tmp_felems != NULL)
2016        OPENSSL_free(tmp_felems);
2017    return ret;
2018}
2019
2020int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2021{
2022    int ret = 0;
2023    NISTP521_PRE_COMP *pre = NULL;
2024    int i, j;
2025    BN_CTX *new_ctx = NULL;
2026    BIGNUM *x, *y;
2027    EC_POINT *generator = NULL;
2028    felem tmp_felems[16];
2029
2030    /* throw away old precomputation */
2031    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2032                         nistp521_pre_comp_free,
2033                         nistp521_pre_comp_clear_free);
2034    if (ctx == NULL)
2035        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2036            return 0;
2037    BN_CTX_start(ctx);
2038    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2039        goto err;
2040    /* get the generator */
2041    if (group->generator == NULL)
2042        goto err;
2043    generator = EC_POINT_new(group);
2044    if (generator == NULL)
2045        goto err;
2046    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2047    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2048    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2049        goto err;
2050    if ((pre = nistp521_pre_comp_new()) == NULL)
2051        goto err;
2052    /*
2053     * if the generator is the standard one, use built-in precomputation
2054     */
2055    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2056        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2057        goto done;
2058    }
2059    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2060        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2061        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2062        goto err;
2063    /* compute 2^130*G, 2^260*G, 2^390*G */
2064    for (i = 1; i <= 4; i <<= 1) {
2065        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2066                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2067                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2068        for (j = 0; j < 129; ++j) {
2069            point_double(pre->g_pre_comp[2 * i][0],
2070                         pre->g_pre_comp[2 * i][1],
2071                         pre->g_pre_comp[2 * i][2],
2072                         pre->g_pre_comp[2 * i][0],
2073                         pre->g_pre_comp[2 * i][1],
2074                         pre->g_pre_comp[2 * i][2]);
2075        }
2076    }
2077    /* g_pre_comp[0] is the point at infinity */
2078    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2079    /* the remaining multiples */
2080    /* 2^130*G + 2^260*G */
2081    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2082              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2083              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2084              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2085              pre->g_pre_comp[2][2]);
2086    /* 2^130*G + 2^390*G */
2087    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2088              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2089              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2090              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2091              pre->g_pre_comp[2][2]);
2092    /* 2^260*G + 2^390*G */
2093    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2094              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2095              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2096              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2097              pre->g_pre_comp[4][2]);
2098    /* 2^130*G + 2^260*G + 2^390*G */
2099    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2100              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2101              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2102              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2103              pre->g_pre_comp[2][2]);
2104    for (i = 1; i < 8; ++i) {
2105        /* odd multiples: add G */
2106        point_add(pre->g_pre_comp[2 * i + 1][0],
2107                  pre->g_pre_comp[2 * i + 1][1],
2108                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2109                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2110                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2111                  pre->g_pre_comp[1][2]);
2112    }
2113    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2114
2115 done:
2116    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2117                             nistp521_pre_comp_free,
2118                             nistp521_pre_comp_clear_free))
2119        goto err;
2120    ret = 1;
2121    pre = NULL;
2122 err:
2123    BN_CTX_end(ctx);
2124    if (generator != NULL)
2125        EC_POINT_free(generator);
2126    if (new_ctx != NULL)
2127        BN_CTX_free(new_ctx);
2128    if (pre)
2129        nistp521_pre_comp_free(pre);
2130    return ret;
2131}
2132
2133int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2134{
2135    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2136                            nistp521_pre_comp_free,
2137                            nistp521_pre_comp_clear_free)
2138        != NULL)
2139        return 1;
2140    else
2141        return 0;
2142}
2143
2144#else
2145static void *dummy = &dummy;
2146#endif
2147