1/*
2 * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the OpenSSL license (the "License").  You may not use
5 * this file except in compliance with the License.  You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10/* Copyright 2011 Google Inc.
11 *
12 * Licensed under the Apache License, Version 2.0 (the "License");
13 *
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 *     http://www.apache.org/licenses/LICENSE-2.0
18 *
19 *  Unless required by applicable law or agreed to in writing, software
20 *  distributed under the License is distributed on an "AS IS" BASIS,
21 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 *  See the License for the specific language governing permissions and
23 *  limitations under the License.
24 */
25
26/*
27 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
28 *
29 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31 * work which got its smarts from Daniel J. Bernstein's work on the same.
32 */
33
34#include <openssl/e_os2.h>
35#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36NON_EMPTY_TRANSLATION_UNIT
37#else
38
39# include <string.h>
40# include <openssl/err.h>
41# include "ec_local.h"
42
43# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
44  /* even with gcc, the typedef won't work for 32-bit platforms */
45typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46                                 * platforms */
47# else
48#  error "Your compiler doesn't appear to support 128-bit integer types"
49# endif
50
51typedef uint8_t u8;
52typedef uint64_t u64;
53
54/*
55 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56 * element of this field into 66 bytes where the most significant byte
57 * contains only a single bit. We call this an felem_bytearray.
58 */
59
60typedef u8 felem_bytearray[66];
61
62/*
63 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64 * These values are big-endian.
65 */
66static const felem_bytearray nistp521_curve_params[5] = {
67    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75     0xff, 0xff},
76    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84     0xff, 0xfc},
85    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93     0x3f, 0x00},
94    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102     0xbd, 0x66},
103    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111     0x66, 0x50}
112};
113
114/*-
115 * The representation of field elements.
116 * ------------------------------------
117 *
118 * We represent field elements with nine values. These values are either 64 or
119 * 128 bits and the field element represented is:
120 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122 * 58 bits apart, but are greater than 58 bits in length, the most significant
123 * bits of each limb overlap with the least significant bits of the next.
124 *
125 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126 * 'largefelem' */
127
128# define NLIMBS 9
129
130typedef uint64_t limb;
131typedef limb limb_aX __attribute((__aligned__(1)));
132typedef limb felem[NLIMBS];
133typedef uint128_t largefelem[NLIMBS];
134
135static const limb bottom57bits = 0x1ffffffffffffff;
136static const limb bottom58bits = 0x3ffffffffffffff;
137
138/*
139 * bin66_to_felem takes a little-endian byte array and converts it into felem
140 * form. This assumes that the CPU is little-endian.
141 */
142static void bin66_to_felem(felem out, const u8 in[66])
143{
144    out[0] = (*((limb *) & in[0])) & bottom58bits;
145    out[1] = (*((limb_aX *) & in[7]) >> 2) & bottom58bits;
146    out[2] = (*((limb_aX *) & in[14]) >> 4) & bottom58bits;
147    out[3] = (*((limb_aX *) & in[21]) >> 6) & bottom58bits;
148    out[4] = (*((limb_aX *) & in[29])) & bottom58bits;
149    out[5] = (*((limb_aX *) & in[36]) >> 2) & bottom58bits;
150    out[6] = (*((limb_aX *) & in[43]) >> 4) & bottom58bits;
151    out[7] = (*((limb_aX *) & in[50]) >> 6) & bottom58bits;
152    out[8] = (*((limb_aX *) & in[58])) & bottom57bits;
153}
154
155/*
156 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
157 * array. This assumes that the CPU is little-endian.
158 */
159static void felem_to_bin66(u8 out[66], const felem in)
160{
161    memset(out, 0, 66);
162    (*((limb *) & out[0])) = in[0];
163    (*((limb_aX *) & out[7])) |= in[1] << 2;
164    (*((limb_aX *) & out[14])) |= in[2] << 4;
165    (*((limb_aX *) & out[21])) |= in[3] << 6;
166    (*((limb_aX *) & out[29])) = in[4];
167    (*((limb_aX *) & out[36])) |= in[5] << 2;
168    (*((limb_aX *) & out[43])) |= in[6] << 4;
169    (*((limb_aX *) & out[50])) |= in[7] << 6;
170    (*((limb_aX *) & out[58])) = in[8];
171}
172
173/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
174static int BN_to_felem(felem out, const BIGNUM *bn)
175{
176    felem_bytearray b_out;
177    int num_bytes;
178
179    if (BN_is_negative(bn)) {
180        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
181        return 0;
182    }
183    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
184    if (num_bytes < 0) {
185        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
186        return 0;
187    }
188    bin66_to_felem(out, b_out);
189    return 1;
190}
191
192/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
193static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
194{
195    felem_bytearray b_out;
196    felem_to_bin66(b_out, in);
197    return BN_lebin2bn(b_out, sizeof(b_out), out);
198}
199
200/*-
201 * Field operations
202 * ----------------
203 */
204
205static void felem_one(felem out)
206{
207    out[0] = 1;
208    out[1] = 0;
209    out[2] = 0;
210    out[3] = 0;
211    out[4] = 0;
212    out[5] = 0;
213    out[6] = 0;
214    out[7] = 0;
215    out[8] = 0;
216}
217
218static void felem_assign(felem out, const felem in)
219{
220    out[0] = in[0];
221    out[1] = in[1];
222    out[2] = in[2];
223    out[3] = in[3];
224    out[4] = in[4];
225    out[5] = in[5];
226    out[6] = in[6];
227    out[7] = in[7];
228    out[8] = in[8];
229}
230
231/* felem_sum64 sets out = out + in. */
232static void felem_sum64(felem out, const felem in)
233{
234    out[0] += in[0];
235    out[1] += in[1];
236    out[2] += in[2];
237    out[3] += in[3];
238    out[4] += in[4];
239    out[5] += in[5];
240    out[6] += in[6];
241    out[7] += in[7];
242    out[8] += in[8];
243}
244
245/* felem_scalar sets out = in * scalar */
246static void felem_scalar(felem out, const felem in, limb scalar)
247{
248    out[0] = in[0] * scalar;
249    out[1] = in[1] * scalar;
250    out[2] = in[2] * scalar;
251    out[3] = in[3] * scalar;
252    out[4] = in[4] * scalar;
253    out[5] = in[5] * scalar;
254    out[6] = in[6] * scalar;
255    out[7] = in[7] * scalar;
256    out[8] = in[8] * scalar;
257}
258
259/* felem_scalar64 sets out = out * scalar */
260static void felem_scalar64(felem out, limb scalar)
261{
262    out[0] *= scalar;
263    out[1] *= scalar;
264    out[2] *= scalar;
265    out[3] *= scalar;
266    out[4] *= scalar;
267    out[5] *= scalar;
268    out[6] *= scalar;
269    out[7] *= scalar;
270    out[8] *= scalar;
271}
272
273/* felem_scalar128 sets out = out * scalar */
274static void felem_scalar128(largefelem out, limb scalar)
275{
276    out[0] *= scalar;
277    out[1] *= scalar;
278    out[2] *= scalar;
279    out[3] *= scalar;
280    out[4] *= scalar;
281    out[5] *= scalar;
282    out[6] *= scalar;
283    out[7] *= scalar;
284    out[8] *= scalar;
285}
286
287/*-
288 * felem_neg sets |out| to |-in|
289 * On entry:
290 *   in[i] < 2^59 + 2^14
291 * On exit:
292 *   out[i] < 2^62
293 */
294static void felem_neg(felem out, const felem in)
295{
296    /* In order to prevent underflow, we subtract from 0 mod p. */
297    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
298    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
299
300    out[0] = two62m3 - in[0];
301    out[1] = two62m2 - in[1];
302    out[2] = two62m2 - in[2];
303    out[3] = two62m2 - in[3];
304    out[4] = two62m2 - in[4];
305    out[5] = two62m2 - in[5];
306    out[6] = two62m2 - in[6];
307    out[7] = two62m2 - in[7];
308    out[8] = two62m2 - in[8];
309}
310
311/*-
312 * felem_diff64 subtracts |in| from |out|
313 * On entry:
314 *   in[i] < 2^59 + 2^14
315 * On exit:
316 *   out[i] < out[i] + 2^62
317 */
318static void felem_diff64(felem out, const felem in)
319{
320    /*
321     * In order to prevent underflow, we add 0 mod p before subtracting.
322     */
323    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
324    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
325
326    out[0] += two62m3 - in[0];
327    out[1] += two62m2 - in[1];
328    out[2] += two62m2 - in[2];
329    out[3] += two62m2 - in[3];
330    out[4] += two62m2 - in[4];
331    out[5] += two62m2 - in[5];
332    out[6] += two62m2 - in[6];
333    out[7] += two62m2 - in[7];
334    out[8] += two62m2 - in[8];
335}
336
337/*-
338 * felem_diff_128_64 subtracts |in| from |out|
339 * On entry:
340 *   in[i] < 2^62 + 2^17
341 * On exit:
342 *   out[i] < out[i] + 2^63
343 */
344static void felem_diff_128_64(largefelem out, const felem in)
345{
346    /*
347     * In order to prevent underflow, we add 64p mod p (which is equivalent
348     * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
349     * digit number with all bits set to 1. See "The representation of field
350     * elements" comment above for a description of how limbs are used to
351     * represent a number. 64p is represented with 8 limbs containing a number
352     * with 58 bits set and one limb with a number with 57 bits set.
353     */
354    static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
355    static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
356
357    out[0] += two63m6 - in[0];
358    out[1] += two63m5 - in[1];
359    out[2] += two63m5 - in[2];
360    out[3] += two63m5 - in[3];
361    out[4] += two63m5 - in[4];
362    out[5] += two63m5 - in[5];
363    out[6] += two63m5 - in[6];
364    out[7] += two63m5 - in[7];
365    out[8] += two63m5 - in[8];
366}
367
368/*-
369 * felem_diff_128_64 subtracts |in| from |out|
370 * On entry:
371 *   in[i] < 2^126
372 * On exit:
373 *   out[i] < out[i] + 2^127 - 2^69
374 */
375static void felem_diff128(largefelem out, const largefelem in)
376{
377    /*
378     * In order to prevent underflow, we add 0 mod p before subtracting.
379     */
380    static const uint128_t two127m70 =
381        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
382    static const uint128_t two127m69 =
383        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
384
385    out[0] += (two127m70 - in[0]);
386    out[1] += (two127m69 - in[1]);
387    out[2] += (two127m69 - in[2]);
388    out[3] += (two127m69 - in[3]);
389    out[4] += (two127m69 - in[4]);
390    out[5] += (two127m69 - in[5]);
391    out[6] += (two127m69 - in[6]);
392    out[7] += (two127m69 - in[7]);
393    out[8] += (two127m69 - in[8]);
394}
395
396/*-
397 * felem_square sets |out| = |in|^2
398 * On entry:
399 *   in[i] < 2^62
400 * On exit:
401 *   out[i] < 17 * max(in[i]) * max(in[i])
402 */
403static void felem_square(largefelem out, const felem in)
404{
405    felem inx2, inx4;
406    felem_scalar(inx2, in, 2);
407    felem_scalar(inx4, in, 4);
408
409    /*-
410     * We have many cases were we want to do
411     *   in[x] * in[y] +
412     *   in[y] * in[x]
413     * This is obviously just
414     *   2 * in[x] * in[y]
415     * However, rather than do the doubling on the 128 bit result, we
416     * double one of the inputs to the multiplication by reading from
417     * |inx2|
418     */
419
420    out[0] = ((uint128_t) in[0]) * in[0];
421    out[1] = ((uint128_t) in[0]) * inx2[1];
422    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
423    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
424    out[4] = ((uint128_t) in[0]) * inx2[4] +
425             ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
426    out[5] = ((uint128_t) in[0]) * inx2[5] +
427             ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
428    out[6] = ((uint128_t) in[0]) * inx2[6] +
429             ((uint128_t) in[1]) * inx2[5] +
430             ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
431    out[7] = ((uint128_t) in[0]) * inx2[7] +
432             ((uint128_t) in[1]) * inx2[6] +
433             ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
434    out[8] = ((uint128_t) in[0]) * inx2[8] +
435             ((uint128_t) in[1]) * inx2[7] +
436             ((uint128_t) in[2]) * inx2[6] +
437             ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
438
439    /*
440     * The remaining limbs fall above 2^521, with the first falling at 2^522.
441     * They correspond to locations one bit up from the limbs produced above
442     * so we would have to multiply by two to align them. Again, rather than
443     * operate on the 128-bit result, we double one of the inputs to the
444     * multiplication. If we want to double for both this reason, and the
445     * reason above, then we end up multiplying by four.
446     */
447
448    /* 9 */
449    out[0] += ((uint128_t) in[1]) * inx4[8] +
450              ((uint128_t) in[2]) * inx4[7] +
451              ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
452
453    /* 10 */
454    out[1] += ((uint128_t) in[2]) * inx4[8] +
455              ((uint128_t) in[3]) * inx4[7] +
456              ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
457
458    /* 11 */
459    out[2] += ((uint128_t) in[3]) * inx4[8] +
460              ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
461
462    /* 12 */
463    out[3] += ((uint128_t) in[4]) * inx4[8] +
464              ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
465
466    /* 13 */
467    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
468
469    /* 14 */
470    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
471
472    /* 15 */
473    out[6] += ((uint128_t) in[7]) * inx4[8];
474
475    /* 16 */
476    out[7] += ((uint128_t) in[8]) * inx2[8];
477}
478
479/*-
480 * felem_mul sets |out| = |in1| * |in2|
481 * On entry:
482 *   in1[i] < 2^64
483 *   in2[i] < 2^63
484 * On exit:
485 *   out[i] < 17 * max(in1[i]) * max(in2[i])
486 */
487static void felem_mul(largefelem out, const felem in1, const felem in2)
488{
489    felem in2x2;
490    felem_scalar(in2x2, in2, 2);
491
492    out[0] = ((uint128_t) in1[0]) * in2[0];
493
494    out[1] = ((uint128_t) in1[0]) * in2[1] +
495             ((uint128_t) in1[1]) * in2[0];
496
497    out[2] = ((uint128_t) in1[0]) * in2[2] +
498             ((uint128_t) in1[1]) * in2[1] +
499             ((uint128_t) in1[2]) * in2[0];
500
501    out[3] = ((uint128_t) in1[0]) * in2[3] +
502             ((uint128_t) in1[1]) * in2[2] +
503             ((uint128_t) in1[2]) * in2[1] +
504             ((uint128_t) in1[3]) * in2[0];
505
506    out[4] = ((uint128_t) in1[0]) * in2[4] +
507             ((uint128_t) in1[1]) * in2[3] +
508             ((uint128_t) in1[2]) * in2[2] +
509             ((uint128_t) in1[3]) * in2[1] +
510             ((uint128_t) in1[4]) * in2[0];
511
512    out[5] = ((uint128_t) in1[0]) * in2[5] +
513             ((uint128_t) in1[1]) * in2[4] +
514             ((uint128_t) in1[2]) * in2[3] +
515             ((uint128_t) in1[3]) * in2[2] +
516             ((uint128_t) in1[4]) * in2[1] +
517             ((uint128_t) in1[5]) * in2[0];
518
519    out[6] = ((uint128_t) in1[0]) * in2[6] +
520             ((uint128_t) in1[1]) * in2[5] +
521             ((uint128_t) in1[2]) * in2[4] +
522             ((uint128_t) in1[3]) * in2[3] +
523             ((uint128_t) in1[4]) * in2[2] +
524             ((uint128_t) in1[5]) * in2[1] +
525             ((uint128_t) in1[6]) * in2[0];
526
527    out[7] = ((uint128_t) in1[0]) * in2[7] +
528             ((uint128_t) in1[1]) * in2[6] +
529             ((uint128_t) in1[2]) * in2[5] +
530             ((uint128_t) in1[3]) * in2[4] +
531             ((uint128_t) in1[4]) * in2[3] +
532             ((uint128_t) in1[5]) * in2[2] +
533             ((uint128_t) in1[6]) * in2[1] +
534             ((uint128_t) in1[7]) * in2[0];
535
536    out[8] = ((uint128_t) in1[0]) * in2[8] +
537             ((uint128_t) in1[1]) * in2[7] +
538             ((uint128_t) in1[2]) * in2[6] +
539             ((uint128_t) in1[3]) * in2[5] +
540             ((uint128_t) in1[4]) * in2[4] +
541             ((uint128_t) in1[5]) * in2[3] +
542             ((uint128_t) in1[6]) * in2[2] +
543             ((uint128_t) in1[7]) * in2[1] +
544             ((uint128_t) in1[8]) * in2[0];
545
546    /* See comment in felem_square about the use of in2x2 here */
547
548    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
549              ((uint128_t) in1[2]) * in2x2[7] +
550              ((uint128_t) in1[3]) * in2x2[6] +
551              ((uint128_t) in1[4]) * in2x2[5] +
552              ((uint128_t) in1[5]) * in2x2[4] +
553              ((uint128_t) in1[6]) * in2x2[3] +
554              ((uint128_t) in1[7]) * in2x2[2] +
555              ((uint128_t) in1[8]) * in2x2[1];
556
557    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
558              ((uint128_t) in1[3]) * in2x2[7] +
559              ((uint128_t) in1[4]) * in2x2[6] +
560              ((uint128_t) in1[5]) * in2x2[5] +
561              ((uint128_t) in1[6]) * in2x2[4] +
562              ((uint128_t) in1[7]) * in2x2[3] +
563              ((uint128_t) in1[8]) * in2x2[2];
564
565    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
566              ((uint128_t) in1[4]) * in2x2[7] +
567              ((uint128_t) in1[5]) * in2x2[6] +
568              ((uint128_t) in1[6]) * in2x2[5] +
569              ((uint128_t) in1[7]) * in2x2[4] +
570              ((uint128_t) in1[8]) * in2x2[3];
571
572    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
573              ((uint128_t) in1[5]) * in2x2[7] +
574              ((uint128_t) in1[6]) * in2x2[6] +
575              ((uint128_t) in1[7]) * in2x2[5] +
576              ((uint128_t) in1[8]) * in2x2[4];
577
578    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
579              ((uint128_t) in1[6]) * in2x2[7] +
580              ((uint128_t) in1[7]) * in2x2[6] +
581              ((uint128_t) in1[8]) * in2x2[5];
582
583    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
584              ((uint128_t) in1[7]) * in2x2[7] +
585              ((uint128_t) in1[8]) * in2x2[6];
586
587    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
588              ((uint128_t) in1[8]) * in2x2[7];
589
590    out[7] += ((uint128_t) in1[8]) * in2x2[8];
591}
592
593static const limb bottom52bits = 0xfffffffffffff;
594
595/*-
596 * felem_reduce converts a largefelem to an felem.
597 * On entry:
598 *   in[i] < 2^128
599 * On exit:
600 *   out[i] < 2^59 + 2^14
601 */
602static void felem_reduce(felem out, const largefelem in)
603{
604    u64 overflow1, overflow2;
605
606    out[0] = ((limb) in[0]) & bottom58bits;
607    out[1] = ((limb) in[1]) & bottom58bits;
608    out[2] = ((limb) in[2]) & bottom58bits;
609    out[3] = ((limb) in[3]) & bottom58bits;
610    out[4] = ((limb) in[4]) & bottom58bits;
611    out[5] = ((limb) in[5]) & bottom58bits;
612    out[6] = ((limb) in[6]) & bottom58bits;
613    out[7] = ((limb) in[7]) & bottom58bits;
614    out[8] = ((limb) in[8]) & bottom58bits;
615
616    /* out[i] < 2^58 */
617
618    out[1] += ((limb) in[0]) >> 58;
619    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
620    /*-
621     * out[1] < 2^58 + 2^6 + 2^58
622     *        = 2^59 + 2^6
623     */
624    out[2] += ((limb) (in[0] >> 64)) >> 52;
625
626    out[2] += ((limb) in[1]) >> 58;
627    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
628    out[3] += ((limb) (in[1] >> 64)) >> 52;
629
630    out[3] += ((limb) in[2]) >> 58;
631    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
632    out[4] += ((limb) (in[2] >> 64)) >> 52;
633
634    out[4] += ((limb) in[3]) >> 58;
635    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
636    out[5] += ((limb) (in[3] >> 64)) >> 52;
637
638    out[5] += ((limb) in[4]) >> 58;
639    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
640    out[6] += ((limb) (in[4] >> 64)) >> 52;
641
642    out[6] += ((limb) in[5]) >> 58;
643    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
644    out[7] += ((limb) (in[5] >> 64)) >> 52;
645
646    out[7] += ((limb) in[6]) >> 58;
647    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
648    out[8] += ((limb) (in[6] >> 64)) >> 52;
649
650    out[8] += ((limb) in[7]) >> 58;
651    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
652    /*-
653     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
654     *            < 2^59 + 2^13
655     */
656    overflow1 = ((limb) (in[7] >> 64)) >> 52;
657
658    overflow1 += ((limb) in[8]) >> 58;
659    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
660    overflow2 = ((limb) (in[8] >> 64)) >> 52;
661
662    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
663    overflow2 <<= 1;            /* overflow2 < 2^13 */
664
665    out[0] += overflow1;        /* out[0] < 2^60 */
666    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
667
668    out[1] += out[0] >> 58;
669    out[0] &= bottom58bits;
670    /*-
671     * out[0] < 2^58
672     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
673     *        < 2^59 + 2^14
674     */
675}
676
677static void felem_square_reduce(felem out, const felem in)
678{
679    largefelem tmp;
680    felem_square(tmp, in);
681    felem_reduce(out, tmp);
682}
683
684static void felem_mul_reduce(felem out, const felem in1, const felem in2)
685{
686    largefelem tmp;
687    felem_mul(tmp, in1, in2);
688    felem_reduce(out, tmp);
689}
690
691/*-
692 * felem_inv calculates |out| = |in|^{-1}
693 *
694 * Based on Fermat's Little Theorem:
695 *   a^p = a (mod p)
696 *   a^{p-1} = 1 (mod p)
697 *   a^{p-2} = a^{-1} (mod p)
698 */
699static void felem_inv(felem out, const felem in)
700{
701    felem ftmp, ftmp2, ftmp3, ftmp4;
702    largefelem tmp;
703    unsigned i;
704
705    felem_square(tmp, in);
706    felem_reduce(ftmp, tmp);    /* 2^1 */
707    felem_mul(tmp, in, ftmp);
708    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
709    felem_assign(ftmp2, ftmp);
710    felem_square(tmp, ftmp);
711    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
712    felem_mul(tmp, in, ftmp);
713    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
714    felem_square(tmp, ftmp);
715    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
716
717    felem_square(tmp, ftmp2);
718    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
719    felem_square(tmp, ftmp3);
720    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
721    felem_mul(tmp, ftmp3, ftmp2);
722    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
723
724    felem_assign(ftmp2, ftmp3);
725    felem_square(tmp, ftmp3);
726    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
727    felem_square(tmp, ftmp3);
728    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
729    felem_square(tmp, ftmp3);
730    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
731    felem_square(tmp, ftmp3);
732    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
733    felem_assign(ftmp4, ftmp3);
734    felem_mul(tmp, ftmp3, ftmp);
735    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
736    felem_square(tmp, ftmp4);
737    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
738    felem_mul(tmp, ftmp3, ftmp2);
739    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
740    felem_assign(ftmp2, ftmp3);
741
742    for (i = 0; i < 8; i++) {
743        felem_square(tmp, ftmp3);
744        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
745    }
746    felem_mul(tmp, ftmp3, ftmp2);
747    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
748    felem_assign(ftmp2, ftmp3);
749
750    for (i = 0; i < 16; i++) {
751        felem_square(tmp, ftmp3);
752        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
753    }
754    felem_mul(tmp, ftmp3, ftmp2);
755    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
756    felem_assign(ftmp2, ftmp3);
757
758    for (i = 0; i < 32; i++) {
759        felem_square(tmp, ftmp3);
760        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
761    }
762    felem_mul(tmp, ftmp3, ftmp2);
763    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
764    felem_assign(ftmp2, ftmp3);
765
766    for (i = 0; i < 64; i++) {
767        felem_square(tmp, ftmp3);
768        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
769    }
770    felem_mul(tmp, ftmp3, ftmp2);
771    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
772    felem_assign(ftmp2, ftmp3);
773
774    for (i = 0; i < 128; i++) {
775        felem_square(tmp, ftmp3);
776        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
777    }
778    felem_mul(tmp, ftmp3, ftmp2);
779    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
780    felem_assign(ftmp2, ftmp3);
781
782    for (i = 0; i < 256; i++) {
783        felem_square(tmp, ftmp3);
784        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
785    }
786    felem_mul(tmp, ftmp3, ftmp2);
787    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
788
789    for (i = 0; i < 9; i++) {
790        felem_square(tmp, ftmp3);
791        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
792    }
793    felem_mul(tmp, ftmp3, ftmp4);
794    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
795    felem_mul(tmp, ftmp3, in);
796    felem_reduce(out, tmp);     /* 2^512 - 3 */
797}
798
799/* This is 2^521-1, expressed as an felem */
800static const felem kPrime = {
801    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
802    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
803    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
804};
805
806/*-
807 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
808 * otherwise.
809 * On entry:
810 *   in[i] < 2^59 + 2^14
811 */
812static limb felem_is_zero(const felem in)
813{
814    felem ftmp;
815    limb is_zero, is_p;
816    felem_assign(ftmp, in);
817
818    ftmp[0] += ftmp[8] >> 57;
819    ftmp[8] &= bottom57bits;
820    /* ftmp[8] < 2^57 */
821    ftmp[1] += ftmp[0] >> 58;
822    ftmp[0] &= bottom58bits;
823    ftmp[2] += ftmp[1] >> 58;
824    ftmp[1] &= bottom58bits;
825    ftmp[3] += ftmp[2] >> 58;
826    ftmp[2] &= bottom58bits;
827    ftmp[4] += ftmp[3] >> 58;
828    ftmp[3] &= bottom58bits;
829    ftmp[5] += ftmp[4] >> 58;
830    ftmp[4] &= bottom58bits;
831    ftmp[6] += ftmp[5] >> 58;
832    ftmp[5] &= bottom58bits;
833    ftmp[7] += ftmp[6] >> 58;
834    ftmp[6] &= bottom58bits;
835    ftmp[8] += ftmp[7] >> 58;
836    ftmp[7] &= bottom58bits;
837    /* ftmp[8] < 2^57 + 4 */
838
839    /*
840     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
841     * than our bound for ftmp[8]. Therefore we only have to check if the
842     * zero is zero or 2^521-1.
843     */
844
845    is_zero = 0;
846    is_zero |= ftmp[0];
847    is_zero |= ftmp[1];
848    is_zero |= ftmp[2];
849    is_zero |= ftmp[3];
850    is_zero |= ftmp[4];
851    is_zero |= ftmp[5];
852    is_zero |= ftmp[6];
853    is_zero |= ftmp[7];
854    is_zero |= ftmp[8];
855
856    is_zero--;
857    /*
858     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
859     * can be set is if is_zero was 0 before the decrement.
860     */
861    is_zero = 0 - (is_zero >> 63);
862
863    is_p = ftmp[0] ^ kPrime[0];
864    is_p |= ftmp[1] ^ kPrime[1];
865    is_p |= ftmp[2] ^ kPrime[2];
866    is_p |= ftmp[3] ^ kPrime[3];
867    is_p |= ftmp[4] ^ kPrime[4];
868    is_p |= ftmp[5] ^ kPrime[5];
869    is_p |= ftmp[6] ^ kPrime[6];
870    is_p |= ftmp[7] ^ kPrime[7];
871    is_p |= ftmp[8] ^ kPrime[8];
872
873    is_p--;
874    is_p = 0 - (is_p >> 63);
875
876    is_zero |= is_p;
877    return is_zero;
878}
879
880static int felem_is_zero_int(const void *in)
881{
882    return (int)(felem_is_zero(in) & ((limb) 1));
883}
884
885/*-
886 * felem_contract converts |in| to its unique, minimal representation.
887 * On entry:
888 *   in[i] < 2^59 + 2^14
889 */
890static void felem_contract(felem out, const felem in)
891{
892    limb is_p, is_greater, sign;
893    static const limb two58 = ((limb) 1) << 58;
894
895    felem_assign(out, in);
896
897    out[0] += out[8] >> 57;
898    out[8] &= bottom57bits;
899    /* out[8] < 2^57 */
900    out[1] += out[0] >> 58;
901    out[0] &= bottom58bits;
902    out[2] += out[1] >> 58;
903    out[1] &= bottom58bits;
904    out[3] += out[2] >> 58;
905    out[2] &= bottom58bits;
906    out[4] += out[3] >> 58;
907    out[3] &= bottom58bits;
908    out[5] += out[4] >> 58;
909    out[4] &= bottom58bits;
910    out[6] += out[5] >> 58;
911    out[5] &= bottom58bits;
912    out[7] += out[6] >> 58;
913    out[6] &= bottom58bits;
914    out[8] += out[7] >> 58;
915    out[7] &= bottom58bits;
916    /* out[8] < 2^57 + 4 */
917
918    /*
919     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
920     * out. See the comments in felem_is_zero regarding why we don't test for
921     * other multiples of the prime.
922     */
923
924    /*
925     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
926     */
927
928    is_p = out[0] ^ kPrime[0];
929    is_p |= out[1] ^ kPrime[1];
930    is_p |= out[2] ^ kPrime[2];
931    is_p |= out[3] ^ kPrime[3];
932    is_p |= out[4] ^ kPrime[4];
933    is_p |= out[5] ^ kPrime[5];
934    is_p |= out[6] ^ kPrime[6];
935    is_p |= out[7] ^ kPrime[7];
936    is_p |= out[8] ^ kPrime[8];
937
938    is_p--;
939    is_p &= is_p << 32;
940    is_p &= is_p << 16;
941    is_p &= is_p << 8;
942    is_p &= is_p << 4;
943    is_p &= is_p << 2;
944    is_p &= is_p << 1;
945    is_p = 0 - (is_p >> 63);
946    is_p = ~is_p;
947
948    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
949
950    out[0] &= is_p;
951    out[1] &= is_p;
952    out[2] &= is_p;
953    out[3] &= is_p;
954    out[4] &= is_p;
955    out[5] &= is_p;
956    out[6] &= is_p;
957    out[7] &= is_p;
958    out[8] &= is_p;
959
960    /*
961     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
962     * 57 is greater than zero as (2^521-1) + x >= 2^522
963     */
964    is_greater = out[8] >> 57;
965    is_greater |= is_greater << 32;
966    is_greater |= is_greater << 16;
967    is_greater |= is_greater << 8;
968    is_greater |= is_greater << 4;
969    is_greater |= is_greater << 2;
970    is_greater |= is_greater << 1;
971    is_greater = 0 - (is_greater >> 63);
972
973    out[0] -= kPrime[0] & is_greater;
974    out[1] -= kPrime[1] & is_greater;
975    out[2] -= kPrime[2] & is_greater;
976    out[3] -= kPrime[3] & is_greater;
977    out[4] -= kPrime[4] & is_greater;
978    out[5] -= kPrime[5] & is_greater;
979    out[6] -= kPrime[6] & is_greater;
980    out[7] -= kPrime[7] & is_greater;
981    out[8] -= kPrime[8] & is_greater;
982
983    /* Eliminate negative coefficients */
984    sign = -(out[0] >> 63);
985    out[0] += (two58 & sign);
986    out[1] -= (1 & sign);
987    sign = -(out[1] >> 63);
988    out[1] += (two58 & sign);
989    out[2] -= (1 & sign);
990    sign = -(out[2] >> 63);
991    out[2] += (two58 & sign);
992    out[3] -= (1 & sign);
993    sign = -(out[3] >> 63);
994    out[3] += (two58 & sign);
995    out[4] -= (1 & sign);
996    sign = -(out[4] >> 63);
997    out[4] += (two58 & sign);
998    out[5] -= (1 & sign);
999    sign = -(out[0] >> 63);
1000    out[5] += (two58 & sign);
1001    out[6] -= (1 & sign);
1002    sign = -(out[6] >> 63);
1003    out[6] += (two58 & sign);
1004    out[7] -= (1 & sign);
1005    sign = -(out[7] >> 63);
1006    out[7] += (two58 & sign);
1007    out[8] -= (1 & sign);
1008    sign = -(out[5] >> 63);
1009    out[5] += (two58 & sign);
1010    out[6] -= (1 & sign);
1011    sign = -(out[6] >> 63);
1012    out[6] += (two58 & sign);
1013    out[7] -= (1 & sign);
1014    sign = -(out[7] >> 63);
1015    out[7] += (two58 & sign);
1016    out[8] -= (1 & sign);
1017}
1018
1019/*-
1020 * Group operations
1021 * ----------------
1022 *
1023 * Building on top of the field operations we have the operations on the
1024 * elliptic curve group itself. Points on the curve are represented in Jacobian
1025 * coordinates */
1026
1027/*-
1028 * point_double calculates 2*(x_in, y_in, z_in)
1029 *
1030 * The method is taken from:
1031 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1032 *
1033 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1034 * while x_out == y_in is not (maybe this works, but it's not tested). */
1035static void
1036point_double(felem x_out, felem y_out, felem z_out,
1037             const felem x_in, const felem y_in, const felem z_in)
1038{
1039    largefelem tmp, tmp2;
1040    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1041
1042    felem_assign(ftmp, x_in);
1043    felem_assign(ftmp2, x_in);
1044
1045    /* delta = z^2 */
1046    felem_square(tmp, z_in);
1047    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1048
1049    /* gamma = y^2 */
1050    felem_square(tmp, y_in);
1051    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1052
1053    /* beta = x*gamma */
1054    felem_mul(tmp, x_in, gamma);
1055    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1056
1057    /* alpha = 3*(x-delta)*(x+delta) */
1058    felem_diff64(ftmp, delta);
1059    /* ftmp[i] < 2^61 */
1060    felem_sum64(ftmp2, delta);
1061    /* ftmp2[i] < 2^60 + 2^15 */
1062    felem_scalar64(ftmp2, 3);
1063    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1064    felem_mul(tmp, ftmp, ftmp2);
1065    /*-
1066     * tmp[i] < 17(3*2^121 + 3*2^76)
1067     *        = 61*2^121 + 61*2^76
1068     *        < 64*2^121 + 64*2^76
1069     *        = 2^127 + 2^82
1070     *        < 2^128
1071     */
1072    felem_reduce(alpha, tmp);
1073
1074    /* x' = alpha^2 - 8*beta */
1075    felem_square(tmp, alpha);
1076    /*
1077     * tmp[i] < 17*2^120 < 2^125
1078     */
1079    felem_assign(ftmp, beta);
1080    felem_scalar64(ftmp, 8);
1081    /* ftmp[i] < 2^62 + 2^17 */
1082    felem_diff_128_64(tmp, ftmp);
1083    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1084    felem_reduce(x_out, tmp);
1085
1086    /* z' = (y + z)^2 - gamma - delta */
1087    felem_sum64(delta, gamma);
1088    /* delta[i] < 2^60 + 2^15 */
1089    felem_assign(ftmp, y_in);
1090    felem_sum64(ftmp, z_in);
1091    /* ftmp[i] < 2^60 + 2^15 */
1092    felem_square(tmp, ftmp);
1093    /*
1094     * tmp[i] < 17(2^122) < 2^127
1095     */
1096    felem_diff_128_64(tmp, delta);
1097    /* tmp[i] < 2^127 + 2^63 */
1098    felem_reduce(z_out, tmp);
1099
1100    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1101    felem_scalar64(beta, 4);
1102    /* beta[i] < 2^61 + 2^16 */
1103    felem_diff64(beta, x_out);
1104    /* beta[i] < 2^61 + 2^60 + 2^16 */
1105    felem_mul(tmp, alpha, beta);
1106    /*-
1107     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1108     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1109     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1110     *        < 2^128
1111     */
1112    felem_square(tmp2, gamma);
1113    /*-
1114     * tmp2[i] < 17*(2^59 + 2^14)^2
1115     *         = 17*(2^118 + 2^74 + 2^28)
1116     */
1117    felem_scalar128(tmp2, 8);
1118    /*-
1119     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1120     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1121     *         < 2^126
1122     */
1123    felem_diff128(tmp, tmp2);
1124    /*-
1125     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1126     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1127     *          2^74 + 2^69 + 2^34 + 2^30
1128     *        < 2^128
1129     */
1130    felem_reduce(y_out, tmp);
1131}
1132
1133/* copy_conditional copies in to out iff mask is all ones. */
1134static void copy_conditional(felem out, const felem in, limb mask)
1135{
1136    unsigned i;
1137    for (i = 0; i < NLIMBS; ++i) {
1138        const limb tmp = mask & (in[i] ^ out[i]);
1139        out[i] ^= tmp;
1140    }
1141}
1142
1143/*-
1144 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1145 *
1146 * The method is taken from
1147 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1148 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1149 *
1150 * This function includes a branch for checking whether the two input points
1151 * are equal (while not equal to the point at infinity). See comment below
1152 * on constant-time.
1153 */
1154static void point_add(felem x3, felem y3, felem z3,
1155                      const felem x1, const felem y1, const felem z1,
1156                      const int mixed, const felem x2, const felem y2,
1157                      const felem z2)
1158{
1159    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1160    largefelem tmp, tmp2;
1161    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1162    limb points_equal;
1163
1164    z1_is_zero = felem_is_zero(z1);
1165    z2_is_zero = felem_is_zero(z2);
1166
1167    /* ftmp = z1z1 = z1**2 */
1168    felem_square(tmp, z1);
1169    felem_reduce(ftmp, tmp);
1170
1171    if (!mixed) {
1172        /* ftmp2 = z2z2 = z2**2 */
1173        felem_square(tmp, z2);
1174        felem_reduce(ftmp2, tmp);
1175
1176        /* u1 = ftmp3 = x1*z2z2 */
1177        felem_mul(tmp, x1, ftmp2);
1178        felem_reduce(ftmp3, tmp);
1179
1180        /* ftmp5 = z1 + z2 */
1181        felem_assign(ftmp5, z1);
1182        felem_sum64(ftmp5, z2);
1183        /* ftmp5[i] < 2^61 */
1184
1185        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1186        felem_square(tmp, ftmp5);
1187        /* tmp[i] < 17*2^122 */
1188        felem_diff_128_64(tmp, ftmp);
1189        /* tmp[i] < 17*2^122 + 2^63 */
1190        felem_diff_128_64(tmp, ftmp2);
1191        /* tmp[i] < 17*2^122 + 2^64 */
1192        felem_reduce(ftmp5, tmp);
1193
1194        /* ftmp2 = z2 * z2z2 */
1195        felem_mul(tmp, ftmp2, z2);
1196        felem_reduce(ftmp2, tmp);
1197
1198        /* s1 = ftmp6 = y1 * z2**3 */
1199        felem_mul(tmp, y1, ftmp2);
1200        felem_reduce(ftmp6, tmp);
1201    } else {
1202        /*
1203         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1204         */
1205
1206        /* u1 = ftmp3 = x1*z2z2 */
1207        felem_assign(ftmp3, x1);
1208
1209        /* ftmp5 = 2*z1z2 */
1210        felem_scalar(ftmp5, z1, 2);
1211
1212        /* s1 = ftmp6 = y1 * z2**3 */
1213        felem_assign(ftmp6, y1);
1214    }
1215
1216    /* u2 = x2*z1z1 */
1217    felem_mul(tmp, x2, ftmp);
1218    /* tmp[i] < 17*2^120 */
1219
1220    /* h = ftmp4 = u2 - u1 */
1221    felem_diff_128_64(tmp, ftmp3);
1222    /* tmp[i] < 17*2^120 + 2^63 */
1223    felem_reduce(ftmp4, tmp);
1224
1225    x_equal = felem_is_zero(ftmp4);
1226
1227    /* z_out = ftmp5 * h */
1228    felem_mul(tmp, ftmp5, ftmp4);
1229    felem_reduce(z_out, tmp);
1230
1231    /* ftmp = z1 * z1z1 */
1232    felem_mul(tmp, ftmp, z1);
1233    felem_reduce(ftmp, tmp);
1234
1235    /* s2 = tmp = y2 * z1**3 */
1236    felem_mul(tmp, y2, ftmp);
1237    /* tmp[i] < 17*2^120 */
1238
1239    /* r = ftmp5 = (s2 - s1)*2 */
1240    felem_diff_128_64(tmp, ftmp6);
1241    /* tmp[i] < 17*2^120 + 2^63 */
1242    felem_reduce(ftmp5, tmp);
1243    y_equal = felem_is_zero(ftmp5);
1244    felem_scalar64(ftmp5, 2);
1245    /* ftmp5[i] < 2^61 */
1246
1247    /*
1248     * The formulae are incorrect if the points are equal, in affine coordinates
1249     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1250     * happens.
1251     *
1252     * We use bitwise operations to avoid potential side-channels introduced by
1253     * the short-circuiting behaviour of boolean operators.
1254     *
1255     * The special case of either point being the point at infinity (z1 and/or
1256     * z2 are zero), is handled separately later on in this function, so we
1257     * avoid jumping to point_double here in those special cases.
1258     *
1259     * Notice the comment below on the implications of this branching for timing
1260     * leaks and why it is considered practically irrelevant.
1261     */
1262    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1263
1264    if (points_equal) {
1265        /*
1266         * This is obviously not constant-time but it will almost-never happen
1267         * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1268         * where the intermediate value gets very close to the group order.
1269         * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1270         * the scalar, it's possible for the intermediate value to be a small
1271         * negative multiple of the base point, and for the final signed digit
1272         * to be the same value. We believe that this only occurs for the scalar
1273         * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1274         * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1275         * 71e913863f7, in that case the penultimate intermediate is -9G and
1276         * the final digit is also -9G. Since this only happens for a single
1277         * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1278         * check whether a secret scalar was that exact value, can already do
1279         * so.)
1280         */
1281        point_double(x3, y3, z3, x1, y1, z1);
1282        return;
1283    }
1284
1285    /* I = ftmp = (2h)**2 */
1286    felem_assign(ftmp, ftmp4);
1287    felem_scalar64(ftmp, 2);
1288    /* ftmp[i] < 2^61 */
1289    felem_square(tmp, ftmp);
1290    /* tmp[i] < 17*2^122 */
1291    felem_reduce(ftmp, tmp);
1292
1293    /* J = ftmp2 = h * I */
1294    felem_mul(tmp, ftmp4, ftmp);
1295    felem_reduce(ftmp2, tmp);
1296
1297    /* V = ftmp4 = U1 * I */
1298    felem_mul(tmp, ftmp3, ftmp);
1299    felem_reduce(ftmp4, tmp);
1300
1301    /* x_out = r**2 - J - 2V */
1302    felem_square(tmp, ftmp5);
1303    /* tmp[i] < 17*2^122 */
1304    felem_diff_128_64(tmp, ftmp2);
1305    /* tmp[i] < 17*2^122 + 2^63 */
1306    felem_assign(ftmp3, ftmp4);
1307    felem_scalar64(ftmp4, 2);
1308    /* ftmp4[i] < 2^61 */
1309    felem_diff_128_64(tmp, ftmp4);
1310    /* tmp[i] < 17*2^122 + 2^64 */
1311    felem_reduce(x_out, tmp);
1312
1313    /* y_out = r(V-x_out) - 2 * s1 * J */
1314    felem_diff64(ftmp3, x_out);
1315    /*
1316     * ftmp3[i] < 2^60 + 2^60 = 2^61
1317     */
1318    felem_mul(tmp, ftmp5, ftmp3);
1319    /* tmp[i] < 17*2^122 */
1320    felem_mul(tmp2, ftmp6, ftmp2);
1321    /* tmp2[i] < 17*2^120 */
1322    felem_scalar128(tmp2, 2);
1323    /* tmp2[i] < 17*2^121 */
1324    felem_diff128(tmp, tmp2);
1325        /*-
1326         * tmp[i] < 2^127 - 2^69 + 17*2^122
1327         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1328         *        < 2^127
1329         */
1330    felem_reduce(y_out, tmp);
1331
1332    copy_conditional(x_out, x2, z1_is_zero);
1333    copy_conditional(x_out, x1, z2_is_zero);
1334    copy_conditional(y_out, y2, z1_is_zero);
1335    copy_conditional(y_out, y1, z2_is_zero);
1336    copy_conditional(z_out, z2, z1_is_zero);
1337    copy_conditional(z_out, z1, z2_is_zero);
1338    felem_assign(x3, x_out);
1339    felem_assign(y3, y_out);
1340    felem_assign(z3, z_out);
1341}
1342
1343/*-
1344 * Base point pre computation
1345 * --------------------------
1346 *
1347 * Two different sorts of precomputed tables are used in the following code.
1348 * Each contain various points on the curve, where each point is three field
1349 * elements (x, y, z).
1350 *
1351 * For the base point table, z is usually 1 (0 for the point at infinity).
1352 * This table has 16 elements:
1353 * index | bits    | point
1354 * ------+---------+------------------------------
1355 *     0 | 0 0 0 0 | 0G
1356 *     1 | 0 0 0 1 | 1G
1357 *     2 | 0 0 1 0 | 2^130G
1358 *     3 | 0 0 1 1 | (2^130 + 1)G
1359 *     4 | 0 1 0 0 | 2^260G
1360 *     5 | 0 1 0 1 | (2^260 + 1)G
1361 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1362 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1363 *     8 | 1 0 0 0 | 2^390G
1364 *     9 | 1 0 0 1 | (2^390 + 1)G
1365 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1366 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1367 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1368 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1369 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1370 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1371 *
1372 * The reason for this is so that we can clock bits into four different
1373 * locations when doing simple scalar multiplies against the base point.
1374 *
1375 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1376
1377/* gmul is the table of precomputed base points */
1378static const felem gmul[16][3] = {
1379{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1380 {0, 0, 0, 0, 0, 0, 0, 0, 0},
1381 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1382{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1383  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1384  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1385 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1386  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1387  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1388 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1390  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1391  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1392 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1393  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1394  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1395 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1397  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1398  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1399 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1400  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1401  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1402 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1404  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1405  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1406 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1407  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1408  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1409 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1411  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1412  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1413 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1414  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1415  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1416 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1418  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1419  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1420 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1421  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1422  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1423 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1425  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1426  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1427 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1428  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1429  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1430 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1432  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1433  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1434 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1435  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1436  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1437 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1439  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1440  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1441 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1442  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1443  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1444 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1445{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1446  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1447  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1448 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1449  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1450  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1451 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1452{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1453  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1454  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1455 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1456  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1457  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1458 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1459{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1460  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1461  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1462 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1463  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1464  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1465 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1466{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1467  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1468  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1469 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1470  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1471  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1472 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1473{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1474  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1475  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1476 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1477  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1478  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1479 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1480{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1481  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1482  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1483 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1484  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1485  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1486 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1487};
1488
1489/*
1490 * select_point selects the |idx|th point from a precomputation table and
1491 * copies it to out.
1492 */
1493 /* pre_comp below is of the size provided in |size| */
1494static void select_point(const limb idx, unsigned int size,
1495                         const felem pre_comp[][3], felem out[3])
1496{
1497    unsigned i, j;
1498    limb *outlimbs = &out[0][0];
1499
1500    memset(out, 0, sizeof(*out) * 3);
1501
1502    for (i = 0; i < size; i++) {
1503        const limb *inlimbs = &pre_comp[i][0][0];
1504        limb mask = i ^ idx;
1505        mask |= mask >> 4;
1506        mask |= mask >> 2;
1507        mask |= mask >> 1;
1508        mask &= 1;
1509        mask--;
1510        for (j = 0; j < NLIMBS * 3; j++)
1511            outlimbs[j] |= inlimbs[j] & mask;
1512    }
1513}
1514
1515/* get_bit returns the |i|th bit in |in| */
1516static char get_bit(const felem_bytearray in, int i)
1517{
1518    if (i < 0)
1519        return 0;
1520    return (in[i >> 3] >> (i & 7)) & 1;
1521}
1522
1523/*
1524 * Interleaved point multiplication using precomputed point multiples: The
1525 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1526 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1527 * generator, using certain (large) precomputed multiples in g_pre_comp.
1528 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1529 */
1530static void batch_mul(felem x_out, felem y_out, felem z_out,
1531                      const felem_bytearray scalars[],
1532                      const unsigned num_points, const u8 *g_scalar,
1533                      const int mixed, const felem pre_comp[][17][3],
1534                      const felem g_pre_comp[16][3])
1535{
1536    int i, skip;
1537    unsigned num, gen_mul = (g_scalar != NULL);
1538    felem nq[3], tmp[4];
1539    limb bits;
1540    u8 sign, digit;
1541
1542    /* set nq to the point at infinity */
1543    memset(nq, 0, sizeof(nq));
1544
1545    /*
1546     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1547     * of the generator (last quarter of rounds) and additions of other
1548     * points multiples (every 5th round).
1549     */
1550    skip = 1;                   /* save two point operations in the first
1551                                 * round */
1552    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1553        /* double */
1554        if (!skip)
1555            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1556
1557        /* add multiples of the generator */
1558        if (gen_mul && (i <= 130)) {
1559            bits = get_bit(g_scalar, i + 390) << 3;
1560            if (i < 130) {
1561                bits |= get_bit(g_scalar, i + 260) << 2;
1562                bits |= get_bit(g_scalar, i + 130) << 1;
1563                bits |= get_bit(g_scalar, i);
1564            }
1565            /* select the point to add, in constant time */
1566            select_point(bits, 16, g_pre_comp, tmp);
1567            if (!skip) {
1568                /* The 1 argument below is for "mixed" */
1569                point_add(nq[0], nq[1], nq[2],
1570                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1571            } else {
1572                memcpy(nq, tmp, 3 * sizeof(felem));
1573                skip = 0;
1574            }
1575        }
1576
1577        /* do other additions every 5 doublings */
1578        if (num_points && (i % 5 == 0)) {
1579            /* loop over all scalars */
1580            for (num = 0; num < num_points; ++num) {
1581                bits = get_bit(scalars[num], i + 4) << 5;
1582                bits |= get_bit(scalars[num], i + 3) << 4;
1583                bits |= get_bit(scalars[num], i + 2) << 3;
1584                bits |= get_bit(scalars[num], i + 1) << 2;
1585                bits |= get_bit(scalars[num], i) << 1;
1586                bits |= get_bit(scalars[num], i - 1);
1587                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1588
1589                /*
1590                 * select the point to add or subtract, in constant time
1591                 */
1592                select_point(digit, 17, pre_comp[num], tmp);
1593                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1594                                            * point */
1595                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1596
1597                if (!skip) {
1598                    point_add(nq[0], nq[1], nq[2],
1599                              nq[0], nq[1], nq[2],
1600                              mixed, tmp[0], tmp[1], tmp[2]);
1601                } else {
1602                    memcpy(nq, tmp, 3 * sizeof(felem));
1603                    skip = 0;
1604                }
1605            }
1606        }
1607    }
1608    felem_assign(x_out, nq[0]);
1609    felem_assign(y_out, nq[1]);
1610    felem_assign(z_out, nq[2]);
1611}
1612
1613/* Precomputation for the group generator. */
1614struct nistp521_pre_comp_st {
1615    felem g_pre_comp[16][3];
1616    CRYPTO_REF_COUNT references;
1617    CRYPTO_RWLOCK *lock;
1618};
1619
1620const EC_METHOD *EC_GFp_nistp521_method(void)
1621{
1622    static const EC_METHOD ret = {
1623        EC_FLAGS_DEFAULT_OCT,
1624        NID_X9_62_prime_field,
1625        ec_GFp_nistp521_group_init,
1626        ec_GFp_simple_group_finish,
1627        ec_GFp_simple_group_clear_finish,
1628        ec_GFp_nist_group_copy,
1629        ec_GFp_nistp521_group_set_curve,
1630        ec_GFp_simple_group_get_curve,
1631        ec_GFp_simple_group_get_degree,
1632        ec_group_simple_order_bits,
1633        ec_GFp_simple_group_check_discriminant,
1634        ec_GFp_simple_point_init,
1635        ec_GFp_simple_point_finish,
1636        ec_GFp_simple_point_clear_finish,
1637        ec_GFp_simple_point_copy,
1638        ec_GFp_simple_point_set_to_infinity,
1639        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1640        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1641        ec_GFp_simple_point_set_affine_coordinates,
1642        ec_GFp_nistp521_point_get_affine_coordinates,
1643        0 /* point_set_compressed_coordinates */ ,
1644        0 /* point2oct */ ,
1645        0 /* oct2point */ ,
1646        ec_GFp_simple_add,
1647        ec_GFp_simple_dbl,
1648        ec_GFp_simple_invert,
1649        ec_GFp_simple_is_at_infinity,
1650        ec_GFp_simple_is_on_curve,
1651        ec_GFp_simple_cmp,
1652        ec_GFp_simple_make_affine,
1653        ec_GFp_simple_points_make_affine,
1654        ec_GFp_nistp521_points_mul,
1655        ec_GFp_nistp521_precompute_mult,
1656        ec_GFp_nistp521_have_precompute_mult,
1657        ec_GFp_nist_field_mul,
1658        ec_GFp_nist_field_sqr,
1659        0 /* field_div */ ,
1660        ec_GFp_simple_field_inv,
1661        0 /* field_encode */ ,
1662        0 /* field_decode */ ,
1663        0,                      /* field_set_to_one */
1664        ec_key_simple_priv2oct,
1665        ec_key_simple_oct2priv,
1666        0, /* set private */
1667        ec_key_simple_generate_key,
1668        ec_key_simple_check_key,
1669        ec_key_simple_generate_public_key,
1670        0, /* keycopy */
1671        0, /* keyfinish */
1672        ecdh_simple_compute_key,
1673        0, /* field_inverse_mod_ord */
1674        0, /* blind_coordinates */
1675        0, /* ladder_pre */
1676        0, /* ladder_step */
1677        0  /* ladder_post */
1678    };
1679
1680    return &ret;
1681}
1682
1683/******************************************************************************/
1684/*
1685 * FUNCTIONS TO MANAGE PRECOMPUTATION
1686 */
1687
1688static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1689{
1690    NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1691
1692    if (ret == NULL) {
1693        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1694        return ret;
1695    }
1696
1697    ret->references = 1;
1698
1699    ret->lock = CRYPTO_THREAD_lock_new();
1700    if (ret->lock == NULL) {
1701        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1702        OPENSSL_free(ret);
1703        return NULL;
1704    }
1705    return ret;
1706}
1707
1708NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1709{
1710    int i;
1711    if (p != NULL)
1712        CRYPTO_UP_REF(&p->references, &i, p->lock);
1713    return p;
1714}
1715
1716void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1717{
1718    int i;
1719
1720    if (p == NULL)
1721        return;
1722
1723    CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1724    REF_PRINT_COUNT("EC_nistp521", x);
1725    if (i > 0)
1726        return;
1727    REF_ASSERT_ISNT(i < 0);
1728
1729    CRYPTO_THREAD_lock_free(p->lock);
1730    OPENSSL_free(p);
1731}
1732
1733/******************************************************************************/
1734/*
1735 * OPENSSL EC_METHOD FUNCTIONS
1736 */
1737
1738int ec_GFp_nistp521_group_init(EC_GROUP *group)
1739{
1740    int ret;
1741    ret = ec_GFp_simple_group_init(group);
1742    group->a_is_minus3 = 1;
1743    return ret;
1744}
1745
1746int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1747                                    const BIGNUM *a, const BIGNUM *b,
1748                                    BN_CTX *ctx)
1749{
1750    int ret = 0;
1751    BN_CTX *new_ctx = NULL;
1752    BIGNUM *curve_p, *curve_a, *curve_b;
1753
1754    if (ctx == NULL)
1755        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1756            return 0;
1757    BN_CTX_start(ctx);
1758    curve_p = BN_CTX_get(ctx);
1759    curve_a = BN_CTX_get(ctx);
1760    curve_b = BN_CTX_get(ctx);
1761    if (curve_b == NULL)
1762        goto err;
1763    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1764    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1765    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1766    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1767        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1768              EC_R_WRONG_CURVE_PARAMETERS);
1769        goto err;
1770    }
1771    group->field_mod_func = BN_nist_mod_521;
1772    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1773 err:
1774    BN_CTX_end(ctx);
1775    BN_CTX_free(new_ctx);
1776    return ret;
1777}
1778
1779/*
1780 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1781 * (X/Z^2, Y/Z^3)
1782 */
1783int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1784                                                 const EC_POINT *point,
1785                                                 BIGNUM *x, BIGNUM *y,
1786                                                 BN_CTX *ctx)
1787{
1788    felem z1, z2, x_in, y_in, x_out, y_out;
1789    largefelem tmp;
1790
1791    if (EC_POINT_is_at_infinity(group, point)) {
1792        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1793              EC_R_POINT_AT_INFINITY);
1794        return 0;
1795    }
1796    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1797        (!BN_to_felem(z1, point->Z)))
1798        return 0;
1799    felem_inv(z2, z1);
1800    felem_square(tmp, z2);
1801    felem_reduce(z1, tmp);
1802    felem_mul(tmp, x_in, z1);
1803    felem_reduce(x_in, tmp);
1804    felem_contract(x_out, x_in);
1805    if (x != NULL) {
1806        if (!felem_to_BN(x, x_out)) {
1807            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1808                  ERR_R_BN_LIB);
1809            return 0;
1810        }
1811    }
1812    felem_mul(tmp, z1, z2);
1813    felem_reduce(z1, tmp);
1814    felem_mul(tmp, y_in, z1);
1815    felem_reduce(y_in, tmp);
1816    felem_contract(y_out, y_in);
1817    if (y != NULL) {
1818        if (!felem_to_BN(y, y_out)) {
1819            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1820                  ERR_R_BN_LIB);
1821            return 0;
1822        }
1823    }
1824    return 1;
1825}
1826
1827/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1828static void make_points_affine(size_t num, felem points[][3],
1829                               felem tmp_felems[])
1830{
1831    /*
1832     * Runs in constant time, unless an input is the point at infinity (which
1833     * normally shouldn't happen).
1834     */
1835    ec_GFp_nistp_points_make_affine_internal(num,
1836                                             points,
1837                                             sizeof(felem),
1838                                             tmp_felems,
1839                                             (void (*)(void *))felem_one,
1840                                             felem_is_zero_int,
1841                                             (void (*)(void *, const void *))
1842                                             felem_assign,
1843                                             (void (*)(void *, const void *))
1844                                             felem_square_reduce, (void (*)
1845                                                                   (void *,
1846                                                                    const void
1847                                                                    *,
1848                                                                    const void
1849                                                                    *))
1850                                             felem_mul_reduce,
1851                                             (void (*)(void *, const void *))
1852                                             felem_inv,
1853                                             (void (*)(void *, const void *))
1854                                             felem_contract);
1855}
1856
1857/*
1858 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1859 * values Result is stored in r (r can equal one of the inputs).
1860 */
1861int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1862                               const BIGNUM *scalar, size_t num,
1863                               const EC_POINT *points[],
1864                               const BIGNUM *scalars[], BN_CTX *ctx)
1865{
1866    int ret = 0;
1867    int j;
1868    int mixed = 0;
1869    BIGNUM *x, *y, *z, *tmp_scalar;
1870    felem_bytearray g_secret;
1871    felem_bytearray *secrets = NULL;
1872    felem (*pre_comp)[17][3] = NULL;
1873    felem *tmp_felems = NULL;
1874    unsigned i;
1875    int num_bytes;
1876    int have_pre_comp = 0;
1877    size_t num_points = num;
1878    felem x_in, y_in, z_in, x_out, y_out, z_out;
1879    NISTP521_PRE_COMP *pre = NULL;
1880    felem(*g_pre_comp)[3] = NULL;
1881    EC_POINT *generator = NULL;
1882    const EC_POINT *p = NULL;
1883    const BIGNUM *p_scalar = NULL;
1884
1885    BN_CTX_start(ctx);
1886    x = BN_CTX_get(ctx);
1887    y = BN_CTX_get(ctx);
1888    z = BN_CTX_get(ctx);
1889    tmp_scalar = BN_CTX_get(ctx);
1890    if (tmp_scalar == NULL)
1891        goto err;
1892
1893    if (scalar != NULL) {
1894        pre = group->pre_comp.nistp521;
1895        if (pre)
1896            /* we have precomputation, try to use it */
1897            g_pre_comp = &pre->g_pre_comp[0];
1898        else
1899            /* try to use the standard precomputation */
1900            g_pre_comp = (felem(*)[3]) gmul;
1901        generator = EC_POINT_new(group);
1902        if (generator == NULL)
1903            goto err;
1904        /* get the generator from precomputation */
1905        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1906            !felem_to_BN(y, g_pre_comp[1][1]) ||
1907            !felem_to_BN(z, g_pre_comp[1][2])) {
1908            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1909            goto err;
1910        }
1911        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1912                                                      generator, x, y, z,
1913                                                      ctx))
1914            goto err;
1915        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1916            /* precomputation matches generator */
1917            have_pre_comp = 1;
1918        else
1919            /*
1920             * we don't have valid precomputation: treat the generator as a
1921             * random point
1922             */
1923            num_points++;
1924    }
1925
1926    if (num_points > 0) {
1927        if (num_points >= 2) {
1928            /*
1929             * unless we precompute multiples for just one point, converting
1930             * those into affine form is time well spent
1931             */
1932            mixed = 1;
1933        }
1934        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1935        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1936        if (mixed)
1937            tmp_felems =
1938                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1939        if ((secrets == NULL) || (pre_comp == NULL)
1940            || (mixed && (tmp_felems == NULL))) {
1941            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1942            goto err;
1943        }
1944
1945        /*
1946         * we treat NULL scalars as 0, and NULL points as points at infinity,
1947         * i.e., they contribute nothing to the linear combination
1948         */
1949        for (i = 0; i < num_points; ++i) {
1950            if (i == num) {
1951                /*
1952                 * we didn't have a valid precomputation, so we pick the
1953                 * generator
1954                 */
1955                p = EC_GROUP_get0_generator(group);
1956                p_scalar = scalar;
1957            } else {
1958                /* the i^th point */
1959                p = points[i];
1960                p_scalar = scalars[i];
1961            }
1962            if ((p_scalar != NULL) && (p != NULL)) {
1963                /* reduce scalar to 0 <= scalar < 2^521 */
1964                if ((BN_num_bits(p_scalar) > 521)
1965                    || (BN_is_negative(p_scalar))) {
1966                    /*
1967                     * this is an unusual input, and we don't guarantee
1968                     * constant-timeness
1969                     */
1970                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1971                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1972                        goto err;
1973                    }
1974                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1975                                               secrets[i], sizeof(secrets[i]));
1976                } else {
1977                    num_bytes = BN_bn2lebinpad(p_scalar,
1978                                               secrets[i], sizeof(secrets[i]));
1979                }
1980                if (num_bytes < 0) {
1981                    ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1982                    goto err;
1983                }
1984                /* precompute multiples */
1985                if ((!BN_to_felem(x_out, p->X)) ||
1986                    (!BN_to_felem(y_out, p->Y)) ||
1987                    (!BN_to_felem(z_out, p->Z)))
1988                    goto err;
1989                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1990                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1991                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1992                for (j = 2; j <= 16; ++j) {
1993                    if (j & 1) {
1994                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1995                                  pre_comp[i][j][2], pre_comp[i][1][0],
1996                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1997                                  pre_comp[i][j - 1][0],
1998                                  pre_comp[i][j - 1][1],
1999                                  pre_comp[i][j - 1][2]);
2000                    } else {
2001                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2002                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
2003                                     pre_comp[i][j / 2][1],
2004                                     pre_comp[i][j / 2][2]);
2005                    }
2006                }
2007            }
2008        }
2009        if (mixed)
2010            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2011    }
2012
2013    /* the scalar for the generator */
2014    if ((scalar != NULL) && (have_pre_comp)) {
2015        memset(g_secret, 0, sizeof(g_secret));
2016        /* reduce scalar to 0 <= scalar < 2^521 */
2017        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2018            /*
2019             * this is an unusual input, and we don't guarantee
2020             * constant-timeness
2021             */
2022            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2023                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2024                goto err;
2025            }
2026            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2027        } else {
2028            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2029        }
2030        /* do the multiplication with generator precomputation */
2031        batch_mul(x_out, y_out, z_out,
2032                  (const felem_bytearray(*))secrets, num_points,
2033                  g_secret,
2034                  mixed, (const felem(*)[17][3])pre_comp,
2035                  (const felem(*)[3])g_pre_comp);
2036    } else {
2037        /* do the multiplication without generator precomputation */
2038        batch_mul(x_out, y_out, z_out,
2039                  (const felem_bytearray(*))secrets, num_points,
2040                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2041    }
2042    /* reduce the output to its unique minimal representation */
2043    felem_contract(x_in, x_out);
2044    felem_contract(y_in, y_out);
2045    felem_contract(z_in, z_out);
2046    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2047        (!felem_to_BN(z, z_in))) {
2048        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2049        goto err;
2050    }
2051    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2052
2053 err:
2054    BN_CTX_end(ctx);
2055    EC_POINT_free(generator);
2056    OPENSSL_free(secrets);
2057    OPENSSL_free(pre_comp);
2058    OPENSSL_free(tmp_felems);
2059    return ret;
2060}
2061
2062int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2063{
2064    int ret = 0;
2065    NISTP521_PRE_COMP *pre = NULL;
2066    int i, j;
2067    BN_CTX *new_ctx = NULL;
2068    BIGNUM *x, *y;
2069    EC_POINT *generator = NULL;
2070    felem tmp_felems[16];
2071
2072    /* throw away old precomputation */
2073    EC_pre_comp_free(group);
2074    if (ctx == NULL)
2075        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2076            return 0;
2077    BN_CTX_start(ctx);
2078    x = BN_CTX_get(ctx);
2079    y = BN_CTX_get(ctx);
2080    if (y == NULL)
2081        goto err;
2082    /* get the generator */
2083    if (group->generator == NULL)
2084        goto err;
2085    generator = EC_POINT_new(group);
2086    if (generator == NULL)
2087        goto err;
2088    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2089    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2090    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2091        goto err;
2092    if ((pre = nistp521_pre_comp_new()) == NULL)
2093        goto err;
2094    /*
2095     * if the generator is the standard one, use built-in precomputation
2096     */
2097    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2098        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2099        goto done;
2100    }
2101    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2102        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2103        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2104        goto err;
2105    /* compute 2^130*G, 2^260*G, 2^390*G */
2106    for (i = 1; i <= 4; i <<= 1) {
2107        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2108                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2109                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2110        for (j = 0; j < 129; ++j) {
2111            point_double(pre->g_pre_comp[2 * i][0],
2112                         pre->g_pre_comp[2 * i][1],
2113                         pre->g_pre_comp[2 * i][2],
2114                         pre->g_pre_comp[2 * i][0],
2115                         pre->g_pre_comp[2 * i][1],
2116                         pre->g_pre_comp[2 * i][2]);
2117        }
2118    }
2119    /* g_pre_comp[0] is the point at infinity */
2120    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2121    /* the remaining multiples */
2122    /* 2^130*G + 2^260*G */
2123    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2124              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2125              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2126              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2127              pre->g_pre_comp[2][2]);
2128    /* 2^130*G + 2^390*G */
2129    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2130              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2131              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2132              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2133              pre->g_pre_comp[2][2]);
2134    /* 2^260*G + 2^390*G */
2135    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2136              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2137              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2138              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2139              pre->g_pre_comp[4][2]);
2140    /* 2^130*G + 2^260*G + 2^390*G */
2141    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2142              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2143              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2144              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2145              pre->g_pre_comp[2][2]);
2146    for (i = 1; i < 8; ++i) {
2147        /* odd multiples: add G */
2148        point_add(pre->g_pre_comp[2 * i + 1][0],
2149                  pre->g_pre_comp[2 * i + 1][1],
2150                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2151                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2152                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2153                  pre->g_pre_comp[1][2]);
2154    }
2155    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2156
2157 done:
2158    SETPRECOMP(group, nistp521, pre);
2159    ret = 1;
2160    pre = NULL;
2161 err:
2162    BN_CTX_end(ctx);
2163    EC_POINT_free(generator);
2164    BN_CTX_free(new_ctx);
2165    EC_nistp521_pre_comp_free(pre);
2166    return ret;
2167}
2168
2169int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2170{
2171    return HAVEPRECOMP(group, nistp521);
2172}
2173
2174#endif
2175