ecp_nistp521.c revision 352193
1/* crypto/ec/ecp_nistp521.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41# include "bn_int.h" /* bn_bn2lebinpad, bn_lebin2bn */
42
43# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
44  /* even with gcc, the typedef won't work for 32-bit platforms */
45typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46                                 * platforms */
47# else
48#  error "Need GCC 3.1 or later to define type uint128_t"
49# endif
50
51typedef uint8_t u8;
52typedef uint64_t u64;
53
54/*
55 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56 * element of this field into 66 bytes where the most significant byte
57 * contains only a single bit. We call this an felem_bytearray.
58 */
59
60typedef u8 felem_bytearray[66];
61
62/*
63 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64 * These values are big-endian.
65 */
66static const felem_bytearray nistp521_curve_params[5] = {
67    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75     0xff, 0xff},
76    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84     0xff, 0xfc},
85    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93     0x3f, 0x00},
94    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102     0xbd, 0x66},
103    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111     0x66, 0x50}
112};
113
114/*-
115 * The representation of field elements.
116 * ------------------------------------
117 *
118 * We represent field elements with nine values. These values are either 64 or
119 * 128 bits and the field element represented is:
120 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122 * 58 bits apart, but are greater than 58 bits in length, the most significant
123 * bits of each limb overlap with the least significant bits of the next.
124 *
125 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126 * 'largefelem' */
127
128# define NLIMBS 9
129
130typedef uint64_t limb;
131typedef limb felem[NLIMBS];
132typedef uint128_t largefelem[NLIMBS];
133
134static const limb bottom57bits = 0x1ffffffffffffff;
135static const limb bottom58bits = 0x3ffffffffffffff;
136
137/*
138 * bin66_to_felem takes a little-endian byte array and converts it into felem
139 * form. This assumes that the CPU is little-endian.
140 */
141static void bin66_to_felem(felem out, const u8 in[66])
142{
143    out[0] = (*((limb *) & in[0])) & bottom58bits;
144    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147    out[4] = (*((limb *) & in[29])) & bottom58bits;
148    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151    out[8] = (*((limb *) & in[58])) & bottom57bits;
152}
153
154/*
155 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156 * array. This assumes that the CPU is little-endian.
157 */
158static void felem_to_bin66(u8 out[66], const felem in)
159{
160    memset(out, 0, 66);
161    (*((limb *) & out[0])) = in[0];
162    (*((limb *) & out[7])) |= in[1] << 2;
163    (*((limb *) & out[14])) |= in[2] << 4;
164    (*((limb *) & out[21])) |= in[3] << 6;
165    (*((limb *) & out[29])) = in[4];
166    (*((limb *) & out[36])) |= in[5] << 2;
167    (*((limb *) & out[43])) |= in[6] << 4;
168    (*((limb *) & out[50])) |= in[7] << 6;
169    (*((limb *) & out[58])) = in[8];
170}
171
172/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
173static int BN_to_felem(felem out, const BIGNUM *bn)
174{
175    felem_bytearray b_out;
176    int num_bytes;
177
178    if (BN_is_negative(bn)) {
179        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180        return 0;
181    }
182    num_bytes = bn_bn2lebinpad(bn, b_out, sizeof(b_out));
183    if (num_bytes < 0) {
184        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185        return 0;
186    }
187    bin66_to_felem(out, b_out);
188    return 1;
189}
190
191/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
192static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
193{
194    felem_bytearray b_out;
195    felem_to_bin66(b_out, in);
196    return bn_lebin2bn(b_out, sizeof(b_out), out);
197}
198
199/*-
200 * Field operations
201 * ----------------
202 */
203
204static void felem_one(felem out)
205{
206    out[0] = 1;
207    out[1] = 0;
208    out[2] = 0;
209    out[3] = 0;
210    out[4] = 0;
211    out[5] = 0;
212    out[6] = 0;
213    out[7] = 0;
214    out[8] = 0;
215}
216
217static void felem_assign(felem out, const felem in)
218{
219    out[0] = in[0];
220    out[1] = in[1];
221    out[2] = in[2];
222    out[3] = in[3];
223    out[4] = in[4];
224    out[5] = in[5];
225    out[6] = in[6];
226    out[7] = in[7];
227    out[8] = in[8];
228}
229
230/* felem_sum64 sets out = out + in. */
231static void felem_sum64(felem out, const felem in)
232{
233    out[0] += in[0];
234    out[1] += in[1];
235    out[2] += in[2];
236    out[3] += in[3];
237    out[4] += in[4];
238    out[5] += in[5];
239    out[6] += in[6];
240    out[7] += in[7];
241    out[8] += in[8];
242}
243
244/* felem_scalar sets out = in * scalar */
245static void felem_scalar(felem out, const felem in, limb scalar)
246{
247    out[0] = in[0] * scalar;
248    out[1] = in[1] * scalar;
249    out[2] = in[2] * scalar;
250    out[3] = in[3] * scalar;
251    out[4] = in[4] * scalar;
252    out[5] = in[5] * scalar;
253    out[6] = in[6] * scalar;
254    out[7] = in[7] * scalar;
255    out[8] = in[8] * scalar;
256}
257
258/* felem_scalar64 sets out = out * scalar */
259static void felem_scalar64(felem out, limb scalar)
260{
261    out[0] *= scalar;
262    out[1] *= scalar;
263    out[2] *= scalar;
264    out[3] *= scalar;
265    out[4] *= scalar;
266    out[5] *= scalar;
267    out[6] *= scalar;
268    out[7] *= scalar;
269    out[8] *= scalar;
270}
271
272/* felem_scalar128 sets out = out * scalar */
273static void felem_scalar128(largefelem out, limb scalar)
274{
275    out[0] *= scalar;
276    out[1] *= scalar;
277    out[2] *= scalar;
278    out[3] *= scalar;
279    out[4] *= scalar;
280    out[5] *= scalar;
281    out[6] *= scalar;
282    out[7] *= scalar;
283    out[8] *= scalar;
284}
285
286/*-
287 * felem_neg sets |out| to |-in|
288 * On entry:
289 *   in[i] < 2^59 + 2^14
290 * On exit:
291 *   out[i] < 2^62
292 */
293static void felem_neg(felem out, const felem in)
294{
295    /* In order to prevent underflow, we subtract from 0 mod p. */
296    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
297    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
298
299    out[0] = two62m3 - in[0];
300    out[1] = two62m2 - in[1];
301    out[2] = two62m2 - in[2];
302    out[3] = two62m2 - in[3];
303    out[4] = two62m2 - in[4];
304    out[5] = two62m2 - in[5];
305    out[6] = two62m2 - in[6];
306    out[7] = two62m2 - in[7];
307    out[8] = two62m2 - in[8];
308}
309
310/*-
311 * felem_diff64 subtracts |in| from |out|
312 * On entry:
313 *   in[i] < 2^59 + 2^14
314 * On exit:
315 *   out[i] < out[i] + 2^62
316 */
317static void felem_diff64(felem out, const felem in)
318{
319    /*
320     * In order to prevent underflow, we add 0 mod p before subtracting.
321     */
322    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
323    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
324
325    out[0] += two62m3 - in[0];
326    out[1] += two62m2 - in[1];
327    out[2] += two62m2 - in[2];
328    out[3] += two62m2 - in[3];
329    out[4] += two62m2 - in[4];
330    out[5] += two62m2 - in[5];
331    out[6] += two62m2 - in[6];
332    out[7] += two62m2 - in[7];
333    out[8] += two62m2 - in[8];
334}
335
336/*-
337 * felem_diff_128_64 subtracts |in| from |out|
338 * On entry:
339 *   in[i] < 2^62 + 2^17
340 * On exit:
341 *   out[i] < out[i] + 2^63
342 */
343static void felem_diff_128_64(largefelem out, const felem in)
344{
345    /*
346     * In order to prevent underflow, we add 64p mod p (which is equivalent
347     * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348     * digit number with all bits set to 1. See "The representation of field
349     * elements" comment above for a description of how limbs are used to
350     * represent a number. 64p is represented with 8 limbs containing a number
351     * with 58 bits set and one limb with a number with 57 bits set.
352     */
353    static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
354    static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
355
356    out[0] += two63m6 - in[0];
357    out[1] += two63m5 - in[1];
358    out[2] += two63m5 - in[2];
359    out[3] += two63m5 - in[3];
360    out[4] += two63m5 - in[4];
361    out[5] += two63m5 - in[5];
362    out[6] += two63m5 - in[6];
363    out[7] += two63m5 - in[7];
364    out[8] += two63m5 - in[8];
365}
366
367/*-
368 * felem_diff_128_64 subtracts |in| from |out|
369 * On entry:
370 *   in[i] < 2^126
371 * On exit:
372 *   out[i] < out[i] + 2^127 - 2^69
373 */
374static void felem_diff128(largefelem out, const largefelem in)
375{
376    /*
377     * In order to prevent underflow, we add 0 mod p before subtracting.
378     */
379    static const uint128_t two127m70 =
380        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
381    static const uint128_t two127m69 =
382        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
383
384    out[0] += (two127m70 - in[0]);
385    out[1] += (two127m69 - in[1]);
386    out[2] += (two127m69 - in[2]);
387    out[3] += (two127m69 - in[3]);
388    out[4] += (two127m69 - in[4]);
389    out[5] += (two127m69 - in[5]);
390    out[6] += (two127m69 - in[6]);
391    out[7] += (two127m69 - in[7]);
392    out[8] += (two127m69 - in[8]);
393}
394
395/*-
396 * felem_square sets |out| = |in|^2
397 * On entry:
398 *   in[i] < 2^62
399 * On exit:
400 *   out[i] < 17 * max(in[i]) * max(in[i])
401 */
402static void felem_square(largefelem out, const felem in)
403{
404    felem inx2, inx4;
405    felem_scalar(inx2, in, 2);
406    felem_scalar(inx4, in, 4);
407
408    /*-
409     * We have many cases were we want to do
410     *   in[x] * in[y] +
411     *   in[y] * in[x]
412     * This is obviously just
413     *   2 * in[x] * in[y]
414     * However, rather than do the doubling on the 128 bit result, we
415     * double one of the inputs to the multiplication by reading from
416     * |inx2|
417     */
418
419    out[0] = ((uint128_t) in[0]) * in[0];
420    out[1] = ((uint128_t) in[0]) * inx2[1];
421    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
422    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
423    out[4] = ((uint128_t) in[0]) * inx2[4] +
424        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
425    out[5] = ((uint128_t) in[0]) * inx2[5] +
426        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
427    out[6] = ((uint128_t) in[0]) * inx2[6] +
428        ((uint128_t) in[1]) * inx2[5] +
429        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
430    out[7] = ((uint128_t) in[0]) * inx2[7] +
431        ((uint128_t) in[1]) * inx2[6] +
432        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
433    out[8] = ((uint128_t) in[0]) * inx2[8] +
434        ((uint128_t) in[1]) * inx2[7] +
435        ((uint128_t) in[2]) * inx2[6] +
436        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
437
438    /*
439     * The remaining limbs fall above 2^521, with the first falling at 2^522.
440     * They correspond to locations one bit up from the limbs produced above
441     * so we would have to multiply by two to align them. Again, rather than
442     * operate on the 128-bit result, we double one of the inputs to the
443     * multiplication. If we want to double for both this reason, and the
444     * reason above, then we end up multiplying by four.
445     */
446
447    /* 9 */
448    out[0] += ((uint128_t) in[1]) * inx4[8] +
449        ((uint128_t) in[2]) * inx4[7] +
450        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
451
452    /* 10 */
453    out[1] += ((uint128_t) in[2]) * inx4[8] +
454        ((uint128_t) in[3]) * inx4[7] +
455        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
456
457    /* 11 */
458    out[2] += ((uint128_t) in[3]) * inx4[8] +
459        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
460
461    /* 12 */
462    out[3] += ((uint128_t) in[4]) * inx4[8] +
463        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
464
465    /* 13 */
466    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
467
468    /* 14 */
469    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
470
471    /* 15 */
472    out[6] += ((uint128_t) in[7]) * inx4[8];
473
474    /* 16 */
475    out[7] += ((uint128_t) in[8]) * inx2[8];
476}
477
478/*-
479 * felem_mul sets |out| = |in1| * |in2|
480 * On entry:
481 *   in1[i] < 2^64
482 *   in2[i] < 2^63
483 * On exit:
484 *   out[i] < 17 * max(in1[i]) * max(in2[i])
485 */
486static void felem_mul(largefelem out, const felem in1, const felem in2)
487{
488    felem in2x2;
489    felem_scalar(in2x2, in2, 2);
490
491    out[0] = ((uint128_t) in1[0]) * in2[0];
492
493    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
494
495    out[2] = ((uint128_t) in1[0]) * in2[2] +
496        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
497
498    out[3] = ((uint128_t) in1[0]) * in2[3] +
499        ((uint128_t) in1[1]) * in2[2] +
500        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
501
502    out[4] = ((uint128_t) in1[0]) * in2[4] +
503        ((uint128_t) in1[1]) * in2[3] +
504        ((uint128_t) in1[2]) * in2[2] +
505        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
506
507    out[5] = ((uint128_t) in1[0]) * in2[5] +
508        ((uint128_t) in1[1]) * in2[4] +
509        ((uint128_t) in1[2]) * in2[3] +
510        ((uint128_t) in1[3]) * in2[2] +
511        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
512
513    out[6] = ((uint128_t) in1[0]) * in2[6] +
514        ((uint128_t) in1[1]) * in2[5] +
515        ((uint128_t) in1[2]) * in2[4] +
516        ((uint128_t) in1[3]) * in2[3] +
517        ((uint128_t) in1[4]) * in2[2] +
518        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
519
520    out[7] = ((uint128_t) in1[0]) * in2[7] +
521        ((uint128_t) in1[1]) * in2[6] +
522        ((uint128_t) in1[2]) * in2[5] +
523        ((uint128_t) in1[3]) * in2[4] +
524        ((uint128_t) in1[4]) * in2[3] +
525        ((uint128_t) in1[5]) * in2[2] +
526        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
527
528    out[8] = ((uint128_t) in1[0]) * in2[8] +
529        ((uint128_t) in1[1]) * in2[7] +
530        ((uint128_t) in1[2]) * in2[6] +
531        ((uint128_t) in1[3]) * in2[5] +
532        ((uint128_t) in1[4]) * in2[4] +
533        ((uint128_t) in1[5]) * in2[3] +
534        ((uint128_t) in1[6]) * in2[2] +
535        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
536
537    /* See comment in felem_square about the use of in2x2 here */
538
539    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
540        ((uint128_t) in1[2]) * in2x2[7] +
541        ((uint128_t) in1[3]) * in2x2[6] +
542        ((uint128_t) in1[4]) * in2x2[5] +
543        ((uint128_t) in1[5]) * in2x2[4] +
544        ((uint128_t) in1[6]) * in2x2[3] +
545        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
546
547    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
548        ((uint128_t) in1[3]) * in2x2[7] +
549        ((uint128_t) in1[4]) * in2x2[6] +
550        ((uint128_t) in1[5]) * in2x2[5] +
551        ((uint128_t) in1[6]) * in2x2[4] +
552        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
553
554    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
555        ((uint128_t) in1[4]) * in2x2[7] +
556        ((uint128_t) in1[5]) * in2x2[6] +
557        ((uint128_t) in1[6]) * in2x2[5] +
558        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
559
560    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
561        ((uint128_t) in1[5]) * in2x2[7] +
562        ((uint128_t) in1[6]) * in2x2[6] +
563        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
564
565    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
566        ((uint128_t) in1[6]) * in2x2[7] +
567        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
568
569    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
570        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
571
572    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
573        ((uint128_t) in1[8]) * in2x2[7];
574
575    out[7] += ((uint128_t) in1[8]) * in2x2[8];
576}
577
578static const limb bottom52bits = 0xfffffffffffff;
579
580/*-
581 * felem_reduce converts a largefelem to an felem.
582 * On entry:
583 *   in[i] < 2^128
584 * On exit:
585 *   out[i] < 2^59 + 2^14
586 */
587static void felem_reduce(felem out, const largefelem in)
588{
589    u64 overflow1, overflow2;
590
591    out[0] = ((limb) in[0]) & bottom58bits;
592    out[1] = ((limb) in[1]) & bottom58bits;
593    out[2] = ((limb) in[2]) & bottom58bits;
594    out[3] = ((limb) in[3]) & bottom58bits;
595    out[4] = ((limb) in[4]) & bottom58bits;
596    out[5] = ((limb) in[5]) & bottom58bits;
597    out[6] = ((limb) in[6]) & bottom58bits;
598    out[7] = ((limb) in[7]) & bottom58bits;
599    out[8] = ((limb) in[8]) & bottom58bits;
600
601    /* out[i] < 2^58 */
602
603    out[1] += ((limb) in[0]) >> 58;
604    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
605    /*-
606     * out[1] < 2^58 + 2^6 + 2^58
607     *        = 2^59 + 2^6
608     */
609    out[2] += ((limb) (in[0] >> 64)) >> 52;
610
611    out[2] += ((limb) in[1]) >> 58;
612    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
613    out[3] += ((limb) (in[1] >> 64)) >> 52;
614
615    out[3] += ((limb) in[2]) >> 58;
616    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
617    out[4] += ((limb) (in[2] >> 64)) >> 52;
618
619    out[4] += ((limb) in[3]) >> 58;
620    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
621    out[5] += ((limb) (in[3] >> 64)) >> 52;
622
623    out[5] += ((limb) in[4]) >> 58;
624    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
625    out[6] += ((limb) (in[4] >> 64)) >> 52;
626
627    out[6] += ((limb) in[5]) >> 58;
628    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
629    out[7] += ((limb) (in[5] >> 64)) >> 52;
630
631    out[7] += ((limb) in[6]) >> 58;
632    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
633    out[8] += ((limb) (in[6] >> 64)) >> 52;
634
635    out[8] += ((limb) in[7]) >> 58;
636    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
637    /*-
638     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
639     *            < 2^59 + 2^13
640     */
641    overflow1 = ((limb) (in[7] >> 64)) >> 52;
642
643    overflow1 += ((limb) in[8]) >> 58;
644    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
645    overflow2 = ((limb) (in[8] >> 64)) >> 52;
646
647    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
648    overflow2 <<= 1;            /* overflow2 < 2^13 */
649
650    out[0] += overflow1;        /* out[0] < 2^60 */
651    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
652
653    out[1] += out[0] >> 58;
654    out[0] &= bottom58bits;
655    /*-
656     * out[0] < 2^58
657     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
658     *        < 2^59 + 2^14
659     */
660}
661
662static void felem_square_reduce(felem out, const felem in)
663{
664    largefelem tmp;
665    felem_square(tmp, in);
666    felem_reduce(out, tmp);
667}
668
669static void felem_mul_reduce(felem out, const felem in1, const felem in2)
670{
671    largefelem tmp;
672    felem_mul(tmp, in1, in2);
673    felem_reduce(out, tmp);
674}
675
676/*-
677 * felem_inv calculates |out| = |in|^{-1}
678 *
679 * Based on Fermat's Little Theorem:
680 *   a^p = a (mod p)
681 *   a^{p-1} = 1 (mod p)
682 *   a^{p-2} = a^{-1} (mod p)
683 */
684static void felem_inv(felem out, const felem in)
685{
686    felem ftmp, ftmp2, ftmp3, ftmp4;
687    largefelem tmp;
688    unsigned i;
689
690    felem_square(tmp, in);
691    felem_reduce(ftmp, tmp);    /* 2^1 */
692    felem_mul(tmp, in, ftmp);
693    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
694    felem_assign(ftmp2, ftmp);
695    felem_square(tmp, ftmp);
696    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
697    felem_mul(tmp, in, ftmp);
698    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
699    felem_square(tmp, ftmp);
700    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
701
702    felem_square(tmp, ftmp2);
703    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
704    felem_square(tmp, ftmp3);
705    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
706    felem_mul(tmp, ftmp3, ftmp2);
707    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
708
709    felem_assign(ftmp2, ftmp3);
710    felem_square(tmp, ftmp3);
711    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
712    felem_square(tmp, ftmp3);
713    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
714    felem_square(tmp, ftmp3);
715    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
716    felem_square(tmp, ftmp3);
717    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
718    felem_assign(ftmp4, ftmp3);
719    felem_mul(tmp, ftmp3, ftmp);
720    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
721    felem_square(tmp, ftmp4);
722    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
723    felem_mul(tmp, ftmp3, ftmp2);
724    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
725    felem_assign(ftmp2, ftmp3);
726
727    for (i = 0; i < 8; i++) {
728        felem_square(tmp, ftmp3);
729        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
730    }
731    felem_mul(tmp, ftmp3, ftmp2);
732    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
733    felem_assign(ftmp2, ftmp3);
734
735    for (i = 0; i < 16; i++) {
736        felem_square(tmp, ftmp3);
737        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
738    }
739    felem_mul(tmp, ftmp3, ftmp2);
740    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
741    felem_assign(ftmp2, ftmp3);
742
743    for (i = 0; i < 32; i++) {
744        felem_square(tmp, ftmp3);
745        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
746    }
747    felem_mul(tmp, ftmp3, ftmp2);
748    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
749    felem_assign(ftmp2, ftmp3);
750
751    for (i = 0; i < 64; i++) {
752        felem_square(tmp, ftmp3);
753        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
754    }
755    felem_mul(tmp, ftmp3, ftmp2);
756    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
757    felem_assign(ftmp2, ftmp3);
758
759    for (i = 0; i < 128; i++) {
760        felem_square(tmp, ftmp3);
761        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
762    }
763    felem_mul(tmp, ftmp3, ftmp2);
764    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
765    felem_assign(ftmp2, ftmp3);
766
767    for (i = 0; i < 256; i++) {
768        felem_square(tmp, ftmp3);
769        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
770    }
771    felem_mul(tmp, ftmp3, ftmp2);
772    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
773
774    for (i = 0; i < 9; i++) {
775        felem_square(tmp, ftmp3);
776        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
777    }
778    felem_mul(tmp, ftmp3, ftmp4);
779    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
780    felem_mul(tmp, ftmp3, in);
781    felem_reduce(out, tmp);     /* 2^512 - 3 */
782}
783
784/* This is 2^521-1, expressed as an felem */
785static const felem kPrime = {
786    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
787    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
788    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
789};
790
791/*-
792 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
793 * otherwise.
794 * On entry:
795 *   in[i] < 2^59 + 2^14
796 */
797static limb felem_is_zero(const felem in)
798{
799    felem ftmp;
800    limb is_zero, is_p;
801    felem_assign(ftmp, in);
802
803    ftmp[0] += ftmp[8] >> 57;
804    ftmp[8] &= bottom57bits;
805    /* ftmp[8] < 2^57 */
806    ftmp[1] += ftmp[0] >> 58;
807    ftmp[0] &= bottom58bits;
808    ftmp[2] += ftmp[1] >> 58;
809    ftmp[1] &= bottom58bits;
810    ftmp[3] += ftmp[2] >> 58;
811    ftmp[2] &= bottom58bits;
812    ftmp[4] += ftmp[3] >> 58;
813    ftmp[3] &= bottom58bits;
814    ftmp[5] += ftmp[4] >> 58;
815    ftmp[4] &= bottom58bits;
816    ftmp[6] += ftmp[5] >> 58;
817    ftmp[5] &= bottom58bits;
818    ftmp[7] += ftmp[6] >> 58;
819    ftmp[6] &= bottom58bits;
820    ftmp[8] += ftmp[7] >> 58;
821    ftmp[7] &= bottom58bits;
822    /* ftmp[8] < 2^57 + 4 */
823
824    /*
825     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
826     * than our bound for ftmp[8]. Therefore we only have to check if the
827     * zero is zero or 2^521-1.
828     */
829
830    is_zero = 0;
831    is_zero |= ftmp[0];
832    is_zero |= ftmp[1];
833    is_zero |= ftmp[2];
834    is_zero |= ftmp[3];
835    is_zero |= ftmp[4];
836    is_zero |= ftmp[5];
837    is_zero |= ftmp[6];
838    is_zero |= ftmp[7];
839    is_zero |= ftmp[8];
840
841    is_zero--;
842    /*
843     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
844     * can be set is if is_zero was 0 before the decrement.
845     */
846    is_zero = 0 - (is_zero >> 63);
847
848    is_p = ftmp[0] ^ kPrime[0];
849    is_p |= ftmp[1] ^ kPrime[1];
850    is_p |= ftmp[2] ^ kPrime[2];
851    is_p |= ftmp[3] ^ kPrime[3];
852    is_p |= ftmp[4] ^ kPrime[4];
853    is_p |= ftmp[5] ^ kPrime[5];
854    is_p |= ftmp[6] ^ kPrime[6];
855    is_p |= ftmp[7] ^ kPrime[7];
856    is_p |= ftmp[8] ^ kPrime[8];
857
858    is_p--;
859    is_p = 0 - (is_p >> 63);
860
861    is_zero |= is_p;
862    return is_zero;
863}
864
865static int felem_is_zero_int(const void *in)
866{
867    return (int)(felem_is_zero(in) & ((limb) 1));
868}
869
870/*-
871 * felem_contract converts |in| to its unique, minimal representation.
872 * On entry:
873 *   in[i] < 2^59 + 2^14
874 */
875static void felem_contract(felem out, const felem in)
876{
877    limb is_p, is_greater, sign;
878    static const limb two58 = ((limb) 1) << 58;
879
880    felem_assign(out, in);
881
882    out[0] += out[8] >> 57;
883    out[8] &= bottom57bits;
884    /* out[8] < 2^57 */
885    out[1] += out[0] >> 58;
886    out[0] &= bottom58bits;
887    out[2] += out[1] >> 58;
888    out[1] &= bottom58bits;
889    out[3] += out[2] >> 58;
890    out[2] &= bottom58bits;
891    out[4] += out[3] >> 58;
892    out[3] &= bottom58bits;
893    out[5] += out[4] >> 58;
894    out[4] &= bottom58bits;
895    out[6] += out[5] >> 58;
896    out[5] &= bottom58bits;
897    out[7] += out[6] >> 58;
898    out[6] &= bottom58bits;
899    out[8] += out[7] >> 58;
900    out[7] &= bottom58bits;
901    /* out[8] < 2^57 + 4 */
902
903    /*
904     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
905     * out. See the comments in felem_is_zero regarding why we don't test for
906     * other multiples of the prime.
907     */
908
909    /*
910     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
911     */
912
913    is_p = out[0] ^ kPrime[0];
914    is_p |= out[1] ^ kPrime[1];
915    is_p |= out[2] ^ kPrime[2];
916    is_p |= out[3] ^ kPrime[3];
917    is_p |= out[4] ^ kPrime[4];
918    is_p |= out[5] ^ kPrime[5];
919    is_p |= out[6] ^ kPrime[6];
920    is_p |= out[7] ^ kPrime[7];
921    is_p |= out[8] ^ kPrime[8];
922
923    is_p--;
924    is_p &= is_p << 32;
925    is_p &= is_p << 16;
926    is_p &= is_p << 8;
927    is_p &= is_p << 4;
928    is_p &= is_p << 2;
929    is_p &= is_p << 1;
930    is_p = 0 - (is_p >> 63);
931    is_p = ~is_p;
932
933    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
934
935    out[0] &= is_p;
936    out[1] &= is_p;
937    out[2] &= is_p;
938    out[3] &= is_p;
939    out[4] &= is_p;
940    out[5] &= is_p;
941    out[6] &= is_p;
942    out[7] &= is_p;
943    out[8] &= is_p;
944
945    /*
946     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
947     * 57 is greater than zero as (2^521-1) + x >= 2^522
948     */
949    is_greater = out[8] >> 57;
950    is_greater |= is_greater << 32;
951    is_greater |= is_greater << 16;
952    is_greater |= is_greater << 8;
953    is_greater |= is_greater << 4;
954    is_greater |= is_greater << 2;
955    is_greater |= is_greater << 1;
956    is_greater = 0 - (is_greater >> 63);
957
958    out[0] -= kPrime[0] & is_greater;
959    out[1] -= kPrime[1] & is_greater;
960    out[2] -= kPrime[2] & is_greater;
961    out[3] -= kPrime[3] & is_greater;
962    out[4] -= kPrime[4] & is_greater;
963    out[5] -= kPrime[5] & is_greater;
964    out[6] -= kPrime[6] & is_greater;
965    out[7] -= kPrime[7] & is_greater;
966    out[8] -= kPrime[8] & is_greater;
967
968    /* Eliminate negative coefficients */
969    sign = -(out[0] >> 63);
970    out[0] += (two58 & sign);
971    out[1] -= (1 & sign);
972    sign = -(out[1] >> 63);
973    out[1] += (two58 & sign);
974    out[2] -= (1 & sign);
975    sign = -(out[2] >> 63);
976    out[2] += (two58 & sign);
977    out[3] -= (1 & sign);
978    sign = -(out[3] >> 63);
979    out[3] += (two58 & sign);
980    out[4] -= (1 & sign);
981    sign = -(out[4] >> 63);
982    out[4] += (two58 & sign);
983    out[5] -= (1 & sign);
984    sign = -(out[0] >> 63);
985    out[5] += (two58 & sign);
986    out[6] -= (1 & sign);
987    sign = -(out[6] >> 63);
988    out[6] += (two58 & sign);
989    out[7] -= (1 & sign);
990    sign = -(out[7] >> 63);
991    out[7] += (two58 & sign);
992    out[8] -= (1 & sign);
993    sign = -(out[5] >> 63);
994    out[5] += (two58 & sign);
995    out[6] -= (1 & sign);
996    sign = -(out[6] >> 63);
997    out[6] += (two58 & sign);
998    out[7] -= (1 & sign);
999    sign = -(out[7] >> 63);
1000    out[7] += (two58 & sign);
1001    out[8] -= (1 & sign);
1002}
1003
1004/*-
1005 * Group operations
1006 * ----------------
1007 *
1008 * Building on top of the field operations we have the operations on the
1009 * elliptic curve group itself. Points on the curve are represented in Jacobian
1010 * coordinates */
1011
1012/*-
1013 * point_double calcuates 2*(x_in, y_in, z_in)
1014 *
1015 * The method is taken from:
1016 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1017 *
1018 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1019 * while x_out == y_in is not (maybe this works, but it's not tested). */
1020static void
1021point_double(felem x_out, felem y_out, felem z_out,
1022             const felem x_in, const felem y_in, const felem z_in)
1023{
1024    largefelem tmp, tmp2;
1025    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1026
1027    felem_assign(ftmp, x_in);
1028    felem_assign(ftmp2, x_in);
1029
1030    /* delta = z^2 */
1031    felem_square(tmp, z_in);
1032    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1033
1034    /* gamma = y^2 */
1035    felem_square(tmp, y_in);
1036    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1037
1038    /* beta = x*gamma */
1039    felem_mul(tmp, x_in, gamma);
1040    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1041
1042    /* alpha = 3*(x-delta)*(x+delta) */
1043    felem_diff64(ftmp, delta);
1044    /* ftmp[i] < 2^61 */
1045    felem_sum64(ftmp2, delta);
1046    /* ftmp2[i] < 2^60 + 2^15 */
1047    felem_scalar64(ftmp2, 3);
1048    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1049    felem_mul(tmp, ftmp, ftmp2);
1050    /*-
1051     * tmp[i] < 17(3*2^121 + 3*2^76)
1052     *        = 61*2^121 + 61*2^76
1053     *        < 64*2^121 + 64*2^76
1054     *        = 2^127 + 2^82
1055     *        < 2^128
1056     */
1057    felem_reduce(alpha, tmp);
1058
1059    /* x' = alpha^2 - 8*beta */
1060    felem_square(tmp, alpha);
1061    /*
1062     * tmp[i] < 17*2^120 < 2^125
1063     */
1064    felem_assign(ftmp, beta);
1065    felem_scalar64(ftmp, 8);
1066    /* ftmp[i] < 2^62 + 2^17 */
1067    felem_diff_128_64(tmp, ftmp);
1068    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1069    felem_reduce(x_out, tmp);
1070
1071    /* z' = (y + z)^2 - gamma - delta */
1072    felem_sum64(delta, gamma);
1073    /* delta[i] < 2^60 + 2^15 */
1074    felem_assign(ftmp, y_in);
1075    felem_sum64(ftmp, z_in);
1076    /* ftmp[i] < 2^60 + 2^15 */
1077    felem_square(tmp, ftmp);
1078    /*
1079     * tmp[i] < 17(2^122) < 2^127
1080     */
1081    felem_diff_128_64(tmp, delta);
1082    /* tmp[i] < 2^127 + 2^63 */
1083    felem_reduce(z_out, tmp);
1084
1085    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1086    felem_scalar64(beta, 4);
1087    /* beta[i] < 2^61 + 2^16 */
1088    felem_diff64(beta, x_out);
1089    /* beta[i] < 2^61 + 2^60 + 2^16 */
1090    felem_mul(tmp, alpha, beta);
1091    /*-
1092     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1093     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1094     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1095     *        < 2^128
1096     */
1097    felem_square(tmp2, gamma);
1098    /*-
1099     * tmp2[i] < 17*(2^59 + 2^14)^2
1100     *         = 17*(2^118 + 2^74 + 2^28)
1101     */
1102    felem_scalar128(tmp2, 8);
1103    /*-
1104     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1105     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1106     *         < 2^126
1107     */
1108    felem_diff128(tmp, tmp2);
1109    /*-
1110     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1111     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1112     *          2^74 + 2^69 + 2^34 + 2^30
1113     *        < 2^128
1114     */
1115    felem_reduce(y_out, tmp);
1116}
1117
1118/* copy_conditional copies in to out iff mask is all ones. */
1119static void copy_conditional(felem out, const felem in, limb mask)
1120{
1121    unsigned i;
1122    for (i = 0; i < NLIMBS; ++i) {
1123        const limb tmp = mask & (in[i] ^ out[i]);
1124        out[i] ^= tmp;
1125    }
1126}
1127
1128/*-
1129 * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1130 *
1131 * The method is taken from
1132 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1133 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1134 *
1135 * This function includes a branch for checking whether the two input points
1136 * are equal (while not equal to the point at infinity). This case never
1137 * happens during single point multiplication, so there is no timing leak for
1138 * ECDH or ECDSA signing. */
1139static void point_add(felem x3, felem y3, felem z3,
1140                      const felem x1, const felem y1, const felem z1,
1141                      const int mixed, const felem x2, const felem y2,
1142                      const felem z2)
1143{
1144    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1145    largefelem tmp, tmp2;
1146    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1147
1148    z1_is_zero = felem_is_zero(z1);
1149    z2_is_zero = felem_is_zero(z2);
1150
1151    /* ftmp = z1z1 = z1**2 */
1152    felem_square(tmp, z1);
1153    felem_reduce(ftmp, tmp);
1154
1155    if (!mixed) {
1156        /* ftmp2 = z2z2 = z2**2 */
1157        felem_square(tmp, z2);
1158        felem_reduce(ftmp2, tmp);
1159
1160        /* u1 = ftmp3 = x1*z2z2 */
1161        felem_mul(tmp, x1, ftmp2);
1162        felem_reduce(ftmp3, tmp);
1163
1164        /* ftmp5 = z1 + z2 */
1165        felem_assign(ftmp5, z1);
1166        felem_sum64(ftmp5, z2);
1167        /* ftmp5[i] < 2^61 */
1168
1169        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1170        felem_square(tmp, ftmp5);
1171        /* tmp[i] < 17*2^122 */
1172        felem_diff_128_64(tmp, ftmp);
1173        /* tmp[i] < 17*2^122 + 2^63 */
1174        felem_diff_128_64(tmp, ftmp2);
1175        /* tmp[i] < 17*2^122 + 2^64 */
1176        felem_reduce(ftmp5, tmp);
1177
1178        /* ftmp2 = z2 * z2z2 */
1179        felem_mul(tmp, ftmp2, z2);
1180        felem_reduce(ftmp2, tmp);
1181
1182        /* s1 = ftmp6 = y1 * z2**3 */
1183        felem_mul(tmp, y1, ftmp2);
1184        felem_reduce(ftmp6, tmp);
1185    } else {
1186        /*
1187         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1188         */
1189
1190        /* u1 = ftmp3 = x1*z2z2 */
1191        felem_assign(ftmp3, x1);
1192
1193        /* ftmp5 = 2*z1z2 */
1194        felem_scalar(ftmp5, z1, 2);
1195
1196        /* s1 = ftmp6 = y1 * z2**3 */
1197        felem_assign(ftmp6, y1);
1198    }
1199
1200    /* u2 = x2*z1z1 */
1201    felem_mul(tmp, x2, ftmp);
1202    /* tmp[i] < 17*2^120 */
1203
1204    /* h = ftmp4 = u2 - u1 */
1205    felem_diff_128_64(tmp, ftmp3);
1206    /* tmp[i] < 17*2^120 + 2^63 */
1207    felem_reduce(ftmp4, tmp);
1208
1209    x_equal = felem_is_zero(ftmp4);
1210
1211    /* z_out = ftmp5 * h */
1212    felem_mul(tmp, ftmp5, ftmp4);
1213    felem_reduce(z_out, tmp);
1214
1215    /* ftmp = z1 * z1z1 */
1216    felem_mul(tmp, ftmp, z1);
1217    felem_reduce(ftmp, tmp);
1218
1219    /* s2 = tmp = y2 * z1**3 */
1220    felem_mul(tmp, y2, ftmp);
1221    /* tmp[i] < 17*2^120 */
1222
1223    /* r = ftmp5 = (s2 - s1)*2 */
1224    felem_diff_128_64(tmp, ftmp6);
1225    /* tmp[i] < 17*2^120 + 2^63 */
1226    felem_reduce(ftmp5, tmp);
1227    y_equal = felem_is_zero(ftmp5);
1228    felem_scalar64(ftmp5, 2);
1229    /* ftmp5[i] < 2^61 */
1230
1231    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1232        point_double(x3, y3, z3, x1, y1, z1);
1233        return;
1234    }
1235
1236    /* I = ftmp = (2h)**2 */
1237    felem_assign(ftmp, ftmp4);
1238    felem_scalar64(ftmp, 2);
1239    /* ftmp[i] < 2^61 */
1240    felem_square(tmp, ftmp);
1241    /* tmp[i] < 17*2^122 */
1242    felem_reduce(ftmp, tmp);
1243
1244    /* J = ftmp2 = h * I */
1245    felem_mul(tmp, ftmp4, ftmp);
1246    felem_reduce(ftmp2, tmp);
1247
1248    /* V = ftmp4 = U1 * I */
1249    felem_mul(tmp, ftmp3, ftmp);
1250    felem_reduce(ftmp4, tmp);
1251
1252    /* x_out = r**2 - J - 2V */
1253    felem_square(tmp, ftmp5);
1254    /* tmp[i] < 17*2^122 */
1255    felem_diff_128_64(tmp, ftmp2);
1256    /* tmp[i] < 17*2^122 + 2^63 */
1257    felem_assign(ftmp3, ftmp4);
1258    felem_scalar64(ftmp4, 2);
1259    /* ftmp4[i] < 2^61 */
1260    felem_diff_128_64(tmp, ftmp4);
1261    /* tmp[i] < 17*2^122 + 2^64 */
1262    felem_reduce(x_out, tmp);
1263
1264    /* y_out = r(V-x_out) - 2 * s1 * J */
1265    felem_diff64(ftmp3, x_out);
1266    /*
1267     * ftmp3[i] < 2^60 + 2^60 = 2^61
1268     */
1269    felem_mul(tmp, ftmp5, ftmp3);
1270    /* tmp[i] < 17*2^122 */
1271    felem_mul(tmp2, ftmp6, ftmp2);
1272    /* tmp2[i] < 17*2^120 */
1273    felem_scalar128(tmp2, 2);
1274    /* tmp2[i] < 17*2^121 */
1275    felem_diff128(tmp, tmp2);
1276        /*-
1277         * tmp[i] < 2^127 - 2^69 + 17*2^122
1278         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1279         *        < 2^127
1280         */
1281    felem_reduce(y_out, tmp);
1282
1283    copy_conditional(x_out, x2, z1_is_zero);
1284    copy_conditional(x_out, x1, z2_is_zero);
1285    copy_conditional(y_out, y2, z1_is_zero);
1286    copy_conditional(y_out, y1, z2_is_zero);
1287    copy_conditional(z_out, z2, z1_is_zero);
1288    copy_conditional(z_out, z1, z2_is_zero);
1289    felem_assign(x3, x_out);
1290    felem_assign(y3, y_out);
1291    felem_assign(z3, z_out);
1292}
1293
1294/*-
1295 * Base point pre computation
1296 * --------------------------
1297 *
1298 * Two different sorts of precomputed tables are used in the following code.
1299 * Each contain various points on the curve, where each point is three field
1300 * elements (x, y, z).
1301 *
1302 * For the base point table, z is usually 1 (0 for the point at infinity).
1303 * This table has 16 elements:
1304 * index | bits    | point
1305 * ------+---------+------------------------------
1306 *     0 | 0 0 0 0 | 0G
1307 *     1 | 0 0 0 1 | 1G
1308 *     2 | 0 0 1 0 | 2^130G
1309 *     3 | 0 0 1 1 | (2^130 + 1)G
1310 *     4 | 0 1 0 0 | 2^260G
1311 *     5 | 0 1 0 1 | (2^260 + 1)G
1312 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1313 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1314 *     8 | 1 0 0 0 | 2^390G
1315 *     9 | 1 0 0 1 | (2^390 + 1)G
1316 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1317 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1318 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1319 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1320 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1321 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1322 *
1323 * The reason for this is so that we can clock bits into four different
1324 * locations when doing simple scalar multiplies against the base point.
1325 *
1326 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1327
1328/* gmul is the table of precomputed base points */
1329static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1330                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
1331                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1332{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1333  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1334  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1335 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1336  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1337  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1338 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1339{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1340  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1341  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1342 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1343  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1344  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1345 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1346{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1347  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1348  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1349 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1350  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1351  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1352 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1353{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1354  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1355  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1356 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1357  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1358  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1359 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1360{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1361  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1362  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1363 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1364  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1365  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1366 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1367{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1368  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1369  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1370 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1371  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1372  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1373 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1374{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1375  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1376  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1377 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1378  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1379  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1380 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1381{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1382  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1383  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1384 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1385  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1386  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1387 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1388{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1389  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1390  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1391 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1392  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1393  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1394 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1395{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1396  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1397  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1398 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1399  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1400  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1401 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1402{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1403  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1404  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1405 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1406  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1407  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1408 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1409{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1410  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1411  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1412 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1413  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1414  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1415 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1416{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1417  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1418  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1419 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1420  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1421  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1422 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1423{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1424  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1425  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1426 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1427  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1428  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1429 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1430{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1431  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1432  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1433 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1434  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1435  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1436 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1437};
1438
1439/*
1440 * select_point selects the |idx|th point from a precomputation table and
1441 * copies it to out.
1442 */
1443 /* pre_comp below is of the size provided in |size| */
1444static void select_point(const limb idx, unsigned int size,
1445                         const felem pre_comp[][3], felem out[3])
1446{
1447    unsigned i, j;
1448    limb *outlimbs = &out[0][0];
1449    memset(outlimbs, 0, 3 * sizeof(felem));
1450
1451    for (i = 0; i < size; i++) {
1452        const limb *inlimbs = &pre_comp[i][0][0];
1453        limb mask = i ^ idx;
1454        mask |= mask >> 4;
1455        mask |= mask >> 2;
1456        mask |= mask >> 1;
1457        mask &= 1;
1458        mask--;
1459        for (j = 0; j < NLIMBS * 3; j++)
1460            outlimbs[j] |= inlimbs[j] & mask;
1461    }
1462}
1463
1464/* get_bit returns the |i|th bit in |in| */
1465static char get_bit(const felem_bytearray in, int i)
1466{
1467    if (i < 0)
1468        return 0;
1469    return (in[i >> 3] >> (i & 7)) & 1;
1470}
1471
1472/*
1473 * Interleaved point multiplication using precomputed point multiples: The
1474 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1475 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1476 * generator, using certain (large) precomputed multiples in g_pre_comp.
1477 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1478 */
1479static void batch_mul(felem x_out, felem y_out, felem z_out,
1480                      const felem_bytearray scalars[],
1481                      const unsigned num_points, const u8 *g_scalar,
1482                      const int mixed, const felem pre_comp[][17][3],
1483                      const felem g_pre_comp[16][3])
1484{
1485    int i, skip;
1486    unsigned num, gen_mul = (g_scalar != NULL);
1487    felem nq[3], tmp[4];
1488    limb bits;
1489    u8 sign, digit;
1490
1491    /* set nq to the point at infinity */
1492    memset(nq, 0, 3 * sizeof(felem));
1493
1494    /*
1495     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1496     * of the generator (last quarter of rounds) and additions of other
1497     * points multiples (every 5th round).
1498     */
1499    skip = 1;                   /* save two point operations in the first
1500                                 * round */
1501    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1502        /* double */
1503        if (!skip)
1504            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1505
1506        /* add multiples of the generator */
1507        if (gen_mul && (i <= 130)) {
1508            bits = get_bit(g_scalar, i + 390) << 3;
1509            if (i < 130) {
1510                bits |= get_bit(g_scalar, i + 260) << 2;
1511                bits |= get_bit(g_scalar, i + 130) << 1;
1512                bits |= get_bit(g_scalar, i);
1513            }
1514            /* select the point to add, in constant time */
1515            select_point(bits, 16, g_pre_comp, tmp);
1516            if (!skip) {
1517                /* The 1 argument below is for "mixed" */
1518                point_add(nq[0], nq[1], nq[2],
1519                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1520            } else {
1521                memcpy(nq, tmp, 3 * sizeof(felem));
1522                skip = 0;
1523            }
1524        }
1525
1526        /* do other additions every 5 doublings */
1527        if (num_points && (i % 5 == 0)) {
1528            /* loop over all scalars */
1529            for (num = 0; num < num_points; ++num) {
1530                bits = get_bit(scalars[num], i + 4) << 5;
1531                bits |= get_bit(scalars[num], i + 3) << 4;
1532                bits |= get_bit(scalars[num], i + 2) << 3;
1533                bits |= get_bit(scalars[num], i + 1) << 2;
1534                bits |= get_bit(scalars[num], i) << 1;
1535                bits |= get_bit(scalars[num], i - 1);
1536                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1537
1538                /*
1539                 * select the point to add or subtract, in constant time
1540                 */
1541                select_point(digit, 17, pre_comp[num], tmp);
1542                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1543                                            * point */
1544                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1545
1546                if (!skip) {
1547                    point_add(nq[0], nq[1], nq[2],
1548                              nq[0], nq[1], nq[2],
1549                              mixed, tmp[0], tmp[1], tmp[2]);
1550                } else {
1551                    memcpy(nq, tmp, 3 * sizeof(felem));
1552                    skip = 0;
1553                }
1554            }
1555        }
1556    }
1557    felem_assign(x_out, nq[0]);
1558    felem_assign(y_out, nq[1]);
1559    felem_assign(z_out, nq[2]);
1560}
1561
1562/* Precomputation for the group generator. */
1563typedef struct {
1564    felem g_pre_comp[16][3];
1565    int references;
1566} NISTP521_PRE_COMP;
1567
1568const EC_METHOD *EC_GFp_nistp521_method(void)
1569{
1570    static const EC_METHOD ret = {
1571        EC_FLAGS_DEFAULT_OCT,
1572        NID_X9_62_prime_field,
1573        ec_GFp_nistp521_group_init,
1574        ec_GFp_simple_group_finish,
1575        ec_GFp_simple_group_clear_finish,
1576        ec_GFp_nist_group_copy,
1577        ec_GFp_nistp521_group_set_curve,
1578        ec_GFp_simple_group_get_curve,
1579        ec_GFp_simple_group_get_degree,
1580        ec_GFp_simple_group_check_discriminant,
1581        ec_GFp_simple_point_init,
1582        ec_GFp_simple_point_finish,
1583        ec_GFp_simple_point_clear_finish,
1584        ec_GFp_simple_point_copy,
1585        ec_GFp_simple_point_set_to_infinity,
1586        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1587        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1588        ec_GFp_simple_point_set_affine_coordinates,
1589        ec_GFp_nistp521_point_get_affine_coordinates,
1590        0 /* point_set_compressed_coordinates */ ,
1591        0 /* point2oct */ ,
1592        0 /* oct2point */ ,
1593        ec_GFp_simple_add,
1594        ec_GFp_simple_dbl,
1595        ec_GFp_simple_invert,
1596        ec_GFp_simple_is_at_infinity,
1597        ec_GFp_simple_is_on_curve,
1598        ec_GFp_simple_cmp,
1599        ec_GFp_simple_make_affine,
1600        ec_GFp_simple_points_make_affine,
1601        ec_GFp_nistp521_points_mul,
1602        ec_GFp_nistp521_precompute_mult,
1603        ec_GFp_nistp521_have_precompute_mult,
1604        ec_GFp_nist_field_mul,
1605        ec_GFp_nist_field_sqr,
1606        0 /* field_div */ ,
1607        0 /* field_encode */ ,
1608        0 /* field_decode */ ,
1609        0                       /* field_set_to_one */
1610    };
1611
1612    return &ret;
1613}
1614
1615/******************************************************************************/
1616/*
1617 * FUNCTIONS TO MANAGE PRECOMPUTATION
1618 */
1619
1620static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1621{
1622    NISTP521_PRE_COMP *ret = NULL;
1623    ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1624    if (!ret) {
1625        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1626        return ret;
1627    }
1628    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1629    ret->references = 1;
1630    return ret;
1631}
1632
1633static void *nistp521_pre_comp_dup(void *src_)
1634{
1635    NISTP521_PRE_COMP *src = src_;
1636
1637    /* no need to actually copy, these objects never change! */
1638    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1639
1640    return src_;
1641}
1642
1643static void nistp521_pre_comp_free(void *pre_)
1644{
1645    int i;
1646    NISTP521_PRE_COMP *pre = pre_;
1647
1648    if (!pre)
1649        return;
1650
1651    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1652    if (i > 0)
1653        return;
1654
1655    OPENSSL_free(pre);
1656}
1657
1658static void nistp521_pre_comp_clear_free(void *pre_)
1659{
1660    int i;
1661    NISTP521_PRE_COMP *pre = pre_;
1662
1663    if (!pre)
1664        return;
1665
1666    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1667    if (i > 0)
1668        return;
1669
1670    OPENSSL_cleanse(pre, sizeof(*pre));
1671    OPENSSL_free(pre);
1672}
1673
1674/******************************************************************************/
1675/*
1676 * OPENSSL EC_METHOD FUNCTIONS
1677 */
1678
1679int ec_GFp_nistp521_group_init(EC_GROUP *group)
1680{
1681    int ret;
1682    ret = ec_GFp_simple_group_init(group);
1683    group->a_is_minus3 = 1;
1684    return ret;
1685}
1686
1687int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1688                                    const BIGNUM *a, const BIGNUM *b,
1689                                    BN_CTX *ctx)
1690{
1691    int ret = 0;
1692    BN_CTX *new_ctx = NULL;
1693    BIGNUM *curve_p, *curve_a, *curve_b;
1694
1695    if (ctx == NULL)
1696        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1697            return 0;
1698    BN_CTX_start(ctx);
1699    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1700        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1701        ((curve_b = BN_CTX_get(ctx)) == NULL))
1702        goto err;
1703    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1704    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1705    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1706    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1707        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1708              EC_R_WRONG_CURVE_PARAMETERS);
1709        goto err;
1710    }
1711    group->field_mod_func = BN_nist_mod_521;
1712    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1713 err:
1714    BN_CTX_end(ctx);
1715    if (new_ctx != NULL)
1716        BN_CTX_free(new_ctx);
1717    return ret;
1718}
1719
1720/*
1721 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1722 * (X/Z^2, Y/Z^3)
1723 */
1724int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1725                                                 const EC_POINT *point,
1726                                                 BIGNUM *x, BIGNUM *y,
1727                                                 BN_CTX *ctx)
1728{
1729    felem z1, z2, x_in, y_in, x_out, y_out;
1730    largefelem tmp;
1731
1732    if (EC_POINT_is_at_infinity(group, point)) {
1733        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1734              EC_R_POINT_AT_INFINITY);
1735        return 0;
1736    }
1737    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1738        (!BN_to_felem(z1, &point->Z)))
1739        return 0;
1740    felem_inv(z2, z1);
1741    felem_square(tmp, z2);
1742    felem_reduce(z1, tmp);
1743    felem_mul(tmp, x_in, z1);
1744    felem_reduce(x_in, tmp);
1745    felem_contract(x_out, x_in);
1746    if (x != NULL) {
1747        if (!felem_to_BN(x, x_out)) {
1748            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1749                  ERR_R_BN_LIB);
1750            return 0;
1751        }
1752    }
1753    felem_mul(tmp, z1, z2);
1754    felem_reduce(z1, tmp);
1755    felem_mul(tmp, y_in, z1);
1756    felem_reduce(y_in, tmp);
1757    felem_contract(y_out, y_in);
1758    if (y != NULL) {
1759        if (!felem_to_BN(y, y_out)) {
1760            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1761                  ERR_R_BN_LIB);
1762            return 0;
1763        }
1764    }
1765    return 1;
1766}
1767
1768/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1769static void make_points_affine(size_t num, felem points[][3],
1770                               felem tmp_felems[])
1771{
1772    /*
1773     * Runs in constant time, unless an input is the point at infinity (which
1774     * normally shouldn't happen).
1775     */
1776    ec_GFp_nistp_points_make_affine_internal(num,
1777                                             points,
1778                                             sizeof(felem),
1779                                             tmp_felems,
1780                                             (void (*)(void *))felem_one,
1781                                             felem_is_zero_int,
1782                                             (void (*)(void *, const void *))
1783                                             felem_assign,
1784                                             (void (*)(void *, const void *))
1785                                             felem_square_reduce, (void (*)
1786                                                                   (void *,
1787                                                                    const void
1788                                                                    *,
1789                                                                    const void
1790                                                                    *))
1791                                             felem_mul_reduce,
1792                                             (void (*)(void *, const void *))
1793                                             felem_inv,
1794                                             (void (*)(void *, const void *))
1795                                             felem_contract);
1796}
1797
1798/*
1799 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1800 * values Result is stored in r (r can equal one of the inputs).
1801 */
1802int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1803                               const BIGNUM *scalar, size_t num,
1804                               const EC_POINT *points[],
1805                               const BIGNUM *scalars[], BN_CTX *ctx)
1806{
1807    int ret = 0;
1808    int j;
1809    int mixed = 0;
1810    BN_CTX *new_ctx = NULL;
1811    BIGNUM *x, *y, *z, *tmp_scalar;
1812    felem_bytearray g_secret;
1813    felem_bytearray *secrets = NULL;
1814    felem(*pre_comp)[17][3] = NULL;
1815    felem *tmp_felems = NULL;
1816    unsigned i;
1817    int num_bytes;
1818    int have_pre_comp = 0;
1819    size_t num_points = num;
1820    felem x_in, y_in, z_in, x_out, y_out, z_out;
1821    NISTP521_PRE_COMP *pre = NULL;
1822    felem(*g_pre_comp)[3] = NULL;
1823    EC_POINT *generator = NULL;
1824    const EC_POINT *p = NULL;
1825    const BIGNUM *p_scalar = NULL;
1826
1827    if (ctx == NULL)
1828        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1829            return 0;
1830    BN_CTX_start(ctx);
1831    if (((x = BN_CTX_get(ctx)) == NULL) ||
1832        ((y = BN_CTX_get(ctx)) == NULL) ||
1833        ((z = BN_CTX_get(ctx)) == NULL) ||
1834        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1835        goto err;
1836
1837    if (scalar != NULL) {
1838        pre = EC_EX_DATA_get_data(group->extra_data,
1839                                  nistp521_pre_comp_dup,
1840                                  nistp521_pre_comp_free,
1841                                  nistp521_pre_comp_clear_free);
1842        if (pre)
1843            /* we have precomputation, try to use it */
1844            g_pre_comp = &pre->g_pre_comp[0];
1845        else
1846            /* try to use the standard precomputation */
1847            g_pre_comp = (felem(*)[3]) gmul;
1848        generator = EC_POINT_new(group);
1849        if (generator == NULL)
1850            goto err;
1851        /* get the generator from precomputation */
1852        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1853            !felem_to_BN(y, g_pre_comp[1][1]) ||
1854            !felem_to_BN(z, g_pre_comp[1][2])) {
1855            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1856            goto err;
1857        }
1858        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1859                                                      generator, x, y, z,
1860                                                      ctx))
1861            goto err;
1862        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1863            /* precomputation matches generator */
1864            have_pre_comp = 1;
1865        else
1866            /*
1867             * we don't have valid precomputation: treat the generator as a
1868             * random point
1869             */
1870            num_points++;
1871    }
1872
1873    if (num_points > 0) {
1874        if (num_points >= 2) {
1875            /*
1876             * unless we precompute multiples for just one point, converting
1877             * those into affine form is time well spent
1878             */
1879            mixed = 1;
1880        }
1881        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1882        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1883        if (mixed)
1884            tmp_felems =
1885                OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1886        if ((secrets == NULL) || (pre_comp == NULL)
1887            || (mixed && (tmp_felems == NULL))) {
1888            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1889            goto err;
1890        }
1891
1892        /*
1893         * we treat NULL scalars as 0, and NULL points as points at infinity,
1894         * i.e., they contribute nothing to the linear combination
1895         */
1896        memset(secrets, 0, num_points * sizeof(felem_bytearray));
1897        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1898        for (i = 0; i < num_points; ++i) {
1899            if (i == num) {
1900                /*
1901                 * we didn't have a valid precomputation, so we pick the
1902                 * generator
1903                 */
1904                p = EC_GROUP_get0_generator(group);
1905                p_scalar = scalar;
1906            } else {
1907                /* the i^th point */
1908                p = points[i];
1909                p_scalar = scalars[i];
1910            }
1911            if ((p_scalar != NULL) && (p != NULL)) {
1912                /* reduce scalar to 0 <= scalar < 2^521 */
1913                if ((BN_num_bits(p_scalar) > 521)
1914                    || (BN_is_negative(p_scalar))) {
1915                    /*
1916                     * this is an unusual input, and we don't guarantee
1917                     * constant-timeness
1918                     */
1919                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1920                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1921                        goto err;
1922                    }
1923                    num_bytes = bn_bn2lebinpad(tmp_scalar,
1924                                               secrets[i], sizeof(secrets[i]));
1925                } else {
1926                    num_bytes = bn_bn2lebinpad(p_scalar,
1927                                               secrets[i], sizeof(secrets[i]));
1928                }
1929                if (num_bytes < 0) {
1930                    ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1931                    goto err;
1932                }
1933                /* precompute multiples */
1934                if ((!BN_to_felem(x_out, &p->X)) ||
1935                    (!BN_to_felem(y_out, &p->Y)) ||
1936                    (!BN_to_felem(z_out, &p->Z)))
1937                    goto err;
1938                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1939                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1940                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1941                for (j = 2; j <= 16; ++j) {
1942                    if (j & 1) {
1943                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1944                                  pre_comp[i][j][2], pre_comp[i][1][0],
1945                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1946                                  pre_comp[i][j - 1][0],
1947                                  pre_comp[i][j - 1][1],
1948                                  pre_comp[i][j - 1][2]);
1949                    } else {
1950                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1951                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
1952                                     pre_comp[i][j / 2][1],
1953                                     pre_comp[i][j / 2][2]);
1954                    }
1955                }
1956            }
1957        }
1958        if (mixed)
1959            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1960    }
1961
1962    /* the scalar for the generator */
1963    if ((scalar != NULL) && (have_pre_comp)) {
1964        memset(g_secret, 0, sizeof(g_secret));
1965        /* reduce scalar to 0 <= scalar < 2^521 */
1966        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1967            /*
1968             * this is an unusual input, and we don't guarantee
1969             * constant-timeness
1970             */
1971            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1972                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1973                goto err;
1974            }
1975            num_bytes = bn_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1976        } else {
1977            num_bytes = bn_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1978        }
1979        /* do the multiplication with generator precomputation */
1980        batch_mul(x_out, y_out, z_out,
1981                  (const felem_bytearray(*))secrets, num_points,
1982                  g_secret,
1983                  mixed, (const felem(*)[17][3])pre_comp,
1984                  (const felem(*)[3])g_pre_comp);
1985    } else {
1986        /* do the multiplication without generator precomputation */
1987        batch_mul(x_out, y_out, z_out,
1988                  (const felem_bytearray(*))secrets, num_points,
1989                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1990    }
1991    /* reduce the output to its unique minimal representation */
1992    felem_contract(x_in, x_out);
1993    felem_contract(y_in, y_out);
1994    felem_contract(z_in, z_out);
1995    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1996        (!felem_to_BN(z, z_in))) {
1997        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1998        goto err;
1999    }
2000    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2001
2002 err:
2003    BN_CTX_end(ctx);
2004    if (generator != NULL)
2005        EC_POINT_free(generator);
2006    if (new_ctx != NULL)
2007        BN_CTX_free(new_ctx);
2008    if (secrets != NULL)
2009        OPENSSL_free(secrets);
2010    if (pre_comp != NULL)
2011        OPENSSL_free(pre_comp);
2012    if (tmp_felems != NULL)
2013        OPENSSL_free(tmp_felems);
2014    return ret;
2015}
2016
2017int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2018{
2019    int ret = 0;
2020    NISTP521_PRE_COMP *pre = NULL;
2021    int i, j;
2022    BN_CTX *new_ctx = NULL;
2023    BIGNUM *x, *y;
2024    EC_POINT *generator = NULL;
2025    felem tmp_felems[16];
2026
2027    /* throw away old precomputation */
2028    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2029                         nistp521_pre_comp_free,
2030                         nistp521_pre_comp_clear_free);
2031    if (ctx == NULL)
2032        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2033            return 0;
2034    BN_CTX_start(ctx);
2035    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2036        goto err;
2037    /* get the generator */
2038    if (group->generator == NULL)
2039        goto err;
2040    generator = EC_POINT_new(group);
2041    if (generator == NULL)
2042        goto err;
2043    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2044    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2045    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2046        goto err;
2047    if ((pre = nistp521_pre_comp_new()) == NULL)
2048        goto err;
2049    /*
2050     * if the generator is the standard one, use built-in precomputation
2051     */
2052    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2053        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2054        goto done;
2055    }
2056    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2057        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2058        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2059        goto err;
2060    /* compute 2^130*G, 2^260*G, 2^390*G */
2061    for (i = 1; i <= 4; i <<= 1) {
2062        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2063                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2064                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2065        for (j = 0; j < 129; ++j) {
2066            point_double(pre->g_pre_comp[2 * i][0],
2067                         pre->g_pre_comp[2 * i][1],
2068                         pre->g_pre_comp[2 * i][2],
2069                         pre->g_pre_comp[2 * i][0],
2070                         pre->g_pre_comp[2 * i][1],
2071                         pre->g_pre_comp[2 * i][2]);
2072        }
2073    }
2074    /* g_pre_comp[0] is the point at infinity */
2075    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2076    /* the remaining multiples */
2077    /* 2^130*G + 2^260*G */
2078    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2079              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2080              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2081              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2082              pre->g_pre_comp[2][2]);
2083    /* 2^130*G + 2^390*G */
2084    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2085              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2086              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2087              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2088              pre->g_pre_comp[2][2]);
2089    /* 2^260*G + 2^390*G */
2090    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2091              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2092              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2093              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2094              pre->g_pre_comp[4][2]);
2095    /* 2^130*G + 2^260*G + 2^390*G */
2096    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2097              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2098              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2099              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2100              pre->g_pre_comp[2][2]);
2101    for (i = 1; i < 8; ++i) {
2102        /* odd multiples: add G */
2103        point_add(pre->g_pre_comp[2 * i + 1][0],
2104                  pre->g_pre_comp[2 * i + 1][1],
2105                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2106                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2107                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2108                  pre->g_pre_comp[1][2]);
2109    }
2110    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2111
2112 done:
2113    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2114                             nistp521_pre_comp_free,
2115                             nistp521_pre_comp_clear_free))
2116        goto err;
2117    ret = 1;
2118    pre = NULL;
2119 err:
2120    BN_CTX_end(ctx);
2121    if (generator != NULL)
2122        EC_POINT_free(generator);
2123    if (new_ctx != NULL)
2124        BN_CTX_free(new_ctx);
2125    if (pre)
2126        nistp521_pre_comp_free(pre);
2127    return ret;
2128}
2129
2130int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2131{
2132    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2133                            nistp521_pre_comp_free,
2134                            nistp521_pre_comp_clear_free)
2135        != NULL)
2136        return 1;
2137    else
2138        return 0;
2139}
2140
2141#else
2142static void *dummy = &dummy;
2143#endif
2144