1238384Sjkim/* crypto/ec/ecp_nistp521.c */
2238384Sjkim/*
3238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project
4238384Sjkim */
5238384Sjkim/* Copyright 2011 Google Inc.
6238384Sjkim *
7238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License");
8238384Sjkim *
9238384Sjkim * you may not use this file except in compliance with the License.
10238384Sjkim * You may obtain a copy of the License at
11238384Sjkim *
12238384Sjkim *     http://www.apache.org/licenses/LICENSE-2.0
13238384Sjkim *
14238384Sjkim *  Unless required by applicable law or agreed to in writing, software
15238384Sjkim *  distributed under the License is distributed on an "AS IS" BASIS,
16238384Sjkim *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17238384Sjkim *  See the License for the specific language governing permissions and
18238384Sjkim *  limitations under the License.
19238384Sjkim */
20238384Sjkim
21238384Sjkim/*
22238384Sjkim * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23238384Sjkim *
24238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same.
27238384Sjkim */
28238384Sjkim
29238384Sjkim#include <openssl/opensslconf.h>
30238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31238384Sjkim
32280304Sjkim# ifndef OPENSSL_SYS_VMS
33280304Sjkim#  include <stdint.h>
34280304Sjkim# else
35280304Sjkim#  include <inttypes.h>
36280304Sjkim# endif
37238384Sjkim
38280304Sjkim# include <string.h>
39280304Sjkim# include <openssl/err.h>
40280304Sjkim# include "ec_lcl.h"
41238384Sjkim
42280304Sjkim# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43238384Sjkim  /* even with gcc, the typedef won't work for 32-bit platforms */
44280304Sjkimtypedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45280304Sjkim                                 * platforms */
46280304Sjkim# else
47280304Sjkim#  error "Need GCC 3.1 or later to define type uint128_t"
48280304Sjkim# endif
49238384Sjkim
50238384Sjkimtypedef uint8_t u8;
51238384Sjkimtypedef uint64_t u64;
52238384Sjkimtypedef int64_t s64;
53238384Sjkim
54280304Sjkim/*
55280304Sjkim * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56280304Sjkim * element of this field into 66 bytes where the most significant byte
57280304Sjkim * contains only a single bit. We call this an felem_bytearray.
58280304Sjkim */
59238384Sjkim
60238384Sjkimtypedef u8 felem_bytearray[66];
61238384Sjkim
62280304Sjkim/*
63280304Sjkim * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64280304Sjkim * These values are big-endian.
65280304Sjkim */
66280304Sjkimstatic const felem_bytearray nistp521_curve_params[5] = {
67280304Sjkim    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75280304Sjkim     0xff, 0xff},
76280304Sjkim    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84280304Sjkim     0xff, 0xfc},
85280304Sjkim    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86280304Sjkim     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87280304Sjkim     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88280304Sjkim     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89280304Sjkim     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90280304Sjkim     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91280304Sjkim     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92280304Sjkim     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93280304Sjkim     0x3f, 0x00},
94280304Sjkim    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95280304Sjkim     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96280304Sjkim     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97280304Sjkim     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98280304Sjkim     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99280304Sjkim     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100280304Sjkim     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101280304Sjkim     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102280304Sjkim     0xbd, 0x66},
103280304Sjkim    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104280304Sjkim     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105280304Sjkim     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106280304Sjkim     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107280304Sjkim     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108280304Sjkim     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109280304Sjkim     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110280304Sjkim     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111280304Sjkim     0x66, 0x50}
112280304Sjkim};
113238384Sjkim
114280304Sjkim/*-
115280304Sjkim * The representation of field elements.
116238384Sjkim * ------------------------------------
117238384Sjkim *
118238384Sjkim * We represent field elements with nine values. These values are either 64 or
119238384Sjkim * 128 bits and the field element represented is:
120238384Sjkim *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121238384Sjkim * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122238384Sjkim * 58 bits apart, but are greater than 58 bits in length, the most significant
123238384Sjkim * bits of each limb overlap with the least significant bits of the next.
124238384Sjkim *
125238384Sjkim * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126238384Sjkim * 'largefelem' */
127238384Sjkim
128280304Sjkim# define NLIMBS 9
129238384Sjkim
130238384Sjkimtypedef uint64_t limb;
131238384Sjkimtypedef limb felem[NLIMBS];
132238384Sjkimtypedef uint128_t largefelem[NLIMBS];
133238384Sjkim
134238384Sjkimstatic const limb bottom57bits = 0x1ffffffffffffff;
135238384Sjkimstatic const limb bottom58bits = 0x3ffffffffffffff;
136238384Sjkim
137280304Sjkim/*
138280304Sjkim * bin66_to_felem takes a little-endian byte array and converts it into felem
139280304Sjkim * form. This assumes that the CPU is little-endian.
140280304Sjkim */
141238384Sjkimstatic void bin66_to_felem(felem out, const u8 in[66])
142280304Sjkim{
143280304Sjkim    out[0] = (*((limb *) & in[0])) & bottom58bits;
144280304Sjkim    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145280304Sjkim    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146280304Sjkim    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147280304Sjkim    out[4] = (*((limb *) & in[29])) & bottom58bits;
148280304Sjkim    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149280304Sjkim    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150280304Sjkim    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151280304Sjkim    out[8] = (*((limb *) & in[58])) & bottom57bits;
152280304Sjkim}
153238384Sjkim
154280304Sjkim/*
155280304Sjkim * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156280304Sjkim * array. This assumes that the CPU is little-endian.
157280304Sjkim */
158238384Sjkimstatic void felem_to_bin66(u8 out[66], const felem in)
159280304Sjkim{
160280304Sjkim    memset(out, 0, 66);
161280304Sjkim    (*((limb *) & out[0])) = in[0];
162280304Sjkim    (*((limb *) & out[7])) |= in[1] << 2;
163280304Sjkim    (*((limb *) & out[14])) |= in[2] << 4;
164280304Sjkim    (*((limb *) & out[21])) |= in[3] << 6;
165280304Sjkim    (*((limb *) & out[29])) = in[4];
166280304Sjkim    (*((limb *) & out[36])) |= in[5] << 2;
167280304Sjkim    (*((limb *) & out[43])) |= in[6] << 4;
168280304Sjkim    (*((limb *) & out[50])) |= in[7] << 6;
169280304Sjkim    (*((limb *) & out[58])) = in[8];
170280304Sjkim}
171238384Sjkim
172238384Sjkim/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
173238384Sjkimstatic void flip_endian(u8 *out, const u8 *in, unsigned len)
174280304Sjkim{
175280304Sjkim    unsigned i;
176280304Sjkim    for (i = 0; i < len; ++i)
177280304Sjkim        out[i] = in[len - 1 - i];
178280304Sjkim}
179238384Sjkim
180238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
181238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn)
182280304Sjkim{
183280304Sjkim    felem_bytearray b_in;
184280304Sjkim    felem_bytearray b_out;
185280304Sjkim    unsigned num_bytes;
186238384Sjkim
187280304Sjkim    /* BN_bn2bin eats leading zeroes */
188280304Sjkim    memset(b_out, 0, sizeof b_out);
189280304Sjkim    num_bytes = BN_num_bytes(bn);
190280304Sjkim    if (num_bytes > sizeof b_out) {
191280304Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
192280304Sjkim        return 0;
193280304Sjkim    }
194280304Sjkim    if (BN_is_negative(bn)) {
195280304Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
196280304Sjkim        return 0;
197280304Sjkim    }
198280304Sjkim    num_bytes = BN_bn2bin(bn, b_in);
199280304Sjkim    flip_endian(b_out, b_in, num_bytes);
200280304Sjkim    bin66_to_felem(out, b_out);
201280304Sjkim    return 1;
202280304Sjkim}
203238384Sjkim
204238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
205238384Sjkimstatic BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
206280304Sjkim{
207280304Sjkim    felem_bytearray b_in, b_out;
208280304Sjkim    felem_to_bin66(b_in, in);
209280304Sjkim    flip_endian(b_out, b_in, sizeof b_out);
210280304Sjkim    return BN_bin2bn(b_out, sizeof b_out, out);
211280304Sjkim}
212238384Sjkim
213280304Sjkim/*-
214280304Sjkim * Field operations
215280304Sjkim * ----------------
216280304Sjkim */
217238384Sjkim
218238384Sjkimstatic void felem_one(felem out)
219280304Sjkim{
220280304Sjkim    out[0] = 1;
221280304Sjkim    out[1] = 0;
222280304Sjkim    out[2] = 0;
223280304Sjkim    out[3] = 0;
224280304Sjkim    out[4] = 0;
225280304Sjkim    out[5] = 0;
226280304Sjkim    out[6] = 0;
227280304Sjkim    out[7] = 0;
228280304Sjkim    out[8] = 0;
229280304Sjkim}
230238384Sjkim
231238384Sjkimstatic void felem_assign(felem out, const felem in)
232280304Sjkim{
233280304Sjkim    out[0] = in[0];
234280304Sjkim    out[1] = in[1];
235280304Sjkim    out[2] = in[2];
236280304Sjkim    out[3] = in[3];
237280304Sjkim    out[4] = in[4];
238280304Sjkim    out[5] = in[5];
239280304Sjkim    out[6] = in[6];
240280304Sjkim    out[7] = in[7];
241280304Sjkim    out[8] = in[8];
242280304Sjkim}
243238384Sjkim
244238384Sjkim/* felem_sum64 sets out = out + in. */
245238384Sjkimstatic void felem_sum64(felem out, const felem in)
246280304Sjkim{
247280304Sjkim    out[0] += in[0];
248280304Sjkim    out[1] += in[1];
249280304Sjkim    out[2] += in[2];
250280304Sjkim    out[3] += in[3];
251280304Sjkim    out[4] += in[4];
252280304Sjkim    out[5] += in[5];
253280304Sjkim    out[6] += in[6];
254280304Sjkim    out[7] += in[7];
255280304Sjkim    out[8] += in[8];
256280304Sjkim}
257238384Sjkim
258238384Sjkim/* felem_scalar sets out = in * scalar */
259238384Sjkimstatic void felem_scalar(felem out, const felem in, limb scalar)
260280304Sjkim{
261280304Sjkim    out[0] = in[0] * scalar;
262280304Sjkim    out[1] = in[1] * scalar;
263280304Sjkim    out[2] = in[2] * scalar;
264280304Sjkim    out[3] = in[3] * scalar;
265280304Sjkim    out[4] = in[4] * scalar;
266280304Sjkim    out[5] = in[5] * scalar;
267280304Sjkim    out[6] = in[6] * scalar;
268280304Sjkim    out[7] = in[7] * scalar;
269280304Sjkim    out[8] = in[8] * scalar;
270280304Sjkim}
271238384Sjkim
272238384Sjkim/* felem_scalar64 sets out = out * scalar */
273238384Sjkimstatic void felem_scalar64(felem out, limb scalar)
274280304Sjkim{
275280304Sjkim    out[0] *= scalar;
276280304Sjkim    out[1] *= scalar;
277280304Sjkim    out[2] *= scalar;
278280304Sjkim    out[3] *= scalar;
279280304Sjkim    out[4] *= scalar;
280280304Sjkim    out[5] *= scalar;
281280304Sjkim    out[6] *= scalar;
282280304Sjkim    out[7] *= scalar;
283280304Sjkim    out[8] *= scalar;
284280304Sjkim}
285238384Sjkim
286238384Sjkim/* felem_scalar128 sets out = out * scalar */
287238384Sjkimstatic void felem_scalar128(largefelem out, limb scalar)
288280304Sjkim{
289280304Sjkim    out[0] *= scalar;
290280304Sjkim    out[1] *= scalar;
291280304Sjkim    out[2] *= scalar;
292280304Sjkim    out[3] *= scalar;
293280304Sjkim    out[4] *= scalar;
294280304Sjkim    out[5] *= scalar;
295280304Sjkim    out[6] *= scalar;
296280304Sjkim    out[7] *= scalar;
297280304Sjkim    out[8] *= scalar;
298280304Sjkim}
299238384Sjkim
300280304Sjkim/*-
301280304Sjkim * felem_neg sets |out| to |-in|
302238384Sjkim * On entry:
303238384Sjkim *   in[i] < 2^59 + 2^14
304238384Sjkim * On exit:
305238384Sjkim *   out[i] < 2^62
306238384Sjkim */
307238384Sjkimstatic void felem_neg(felem out, const felem in)
308280304Sjkim{
309280304Sjkim    /* In order to prevent underflow, we subtract from 0 mod p. */
310280304Sjkim    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
311280304Sjkim    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
312238384Sjkim
313280304Sjkim    out[0] = two62m3 - in[0];
314280304Sjkim    out[1] = two62m2 - in[1];
315280304Sjkim    out[2] = two62m2 - in[2];
316280304Sjkim    out[3] = two62m2 - in[3];
317280304Sjkim    out[4] = two62m2 - in[4];
318280304Sjkim    out[5] = two62m2 - in[5];
319280304Sjkim    out[6] = two62m2 - in[6];
320280304Sjkim    out[7] = two62m2 - in[7];
321280304Sjkim    out[8] = two62m2 - in[8];
322280304Sjkim}
323238384Sjkim
324280304Sjkim/*-
325280304Sjkim * felem_diff64 subtracts |in| from |out|
326238384Sjkim * On entry:
327238384Sjkim *   in[i] < 2^59 + 2^14
328238384Sjkim * On exit:
329238384Sjkim *   out[i] < out[i] + 2^62
330238384Sjkim */
331238384Sjkimstatic void felem_diff64(felem out, const felem in)
332280304Sjkim{
333280304Sjkim    /*
334280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
335280304Sjkim     */
336280304Sjkim    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
337280304Sjkim    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
338238384Sjkim
339280304Sjkim    out[0] += two62m3 - in[0];
340280304Sjkim    out[1] += two62m2 - in[1];
341280304Sjkim    out[2] += two62m2 - in[2];
342280304Sjkim    out[3] += two62m2 - in[3];
343280304Sjkim    out[4] += two62m2 - in[4];
344280304Sjkim    out[5] += two62m2 - in[5];
345280304Sjkim    out[6] += two62m2 - in[6];
346280304Sjkim    out[7] += two62m2 - in[7];
347280304Sjkim    out[8] += two62m2 - in[8];
348280304Sjkim}
349238384Sjkim
350280304Sjkim/*-
351280304Sjkim * felem_diff_128_64 subtracts |in| from |out|
352238384Sjkim * On entry:
353238384Sjkim *   in[i] < 2^62 + 2^17
354238384Sjkim * On exit:
355238384Sjkim *   out[i] < out[i] + 2^63
356238384Sjkim */
357238384Sjkimstatic void felem_diff_128_64(largefelem out, const felem in)
358280304Sjkim{
359280304Sjkim    /*
360280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
361280304Sjkim     */
362280304Sjkim    static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
363280304Sjkim    static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
364238384Sjkim
365280304Sjkim    out[0] += two63m6 - in[0];
366280304Sjkim    out[1] += two63m5 - in[1];
367280304Sjkim    out[2] += two63m5 - in[2];
368280304Sjkim    out[3] += two63m5 - in[3];
369280304Sjkim    out[4] += two63m5 - in[4];
370280304Sjkim    out[5] += two63m5 - in[5];
371280304Sjkim    out[6] += two63m5 - in[6];
372280304Sjkim    out[7] += two63m5 - in[7];
373280304Sjkim    out[8] += two63m5 - in[8];
374280304Sjkim}
375238384Sjkim
376280304Sjkim/*-
377280304Sjkim * felem_diff_128_64 subtracts |in| from |out|
378238384Sjkim * On entry:
379238384Sjkim *   in[i] < 2^126
380238384Sjkim * On exit:
381238384Sjkim *   out[i] < out[i] + 2^127 - 2^69
382238384Sjkim */
383238384Sjkimstatic void felem_diff128(largefelem out, const largefelem in)
384280304Sjkim{
385280304Sjkim    /*
386280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
387280304Sjkim     */
388280304Sjkim    static const uint128_t two127m70 =
389280304Sjkim        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
390280304Sjkim    static const uint128_t two127m69 =
391280304Sjkim        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
392238384Sjkim
393280304Sjkim    out[0] += (two127m70 - in[0]);
394280304Sjkim    out[1] += (two127m69 - in[1]);
395280304Sjkim    out[2] += (two127m69 - in[2]);
396280304Sjkim    out[3] += (two127m69 - in[3]);
397280304Sjkim    out[4] += (two127m69 - in[4]);
398280304Sjkim    out[5] += (two127m69 - in[5]);
399280304Sjkim    out[6] += (two127m69 - in[6]);
400280304Sjkim    out[7] += (two127m69 - in[7]);
401280304Sjkim    out[8] += (two127m69 - in[8]);
402280304Sjkim}
403238384Sjkim
404280304Sjkim/*-
405280304Sjkim * felem_square sets |out| = |in|^2
406238384Sjkim * On entry:
407238384Sjkim *   in[i] < 2^62
408238384Sjkim * On exit:
409238384Sjkim *   out[i] < 17 * max(in[i]) * max(in[i])
410238384Sjkim */
411238384Sjkimstatic void felem_square(largefelem out, const felem in)
412280304Sjkim{
413280304Sjkim    felem inx2, inx4;
414280304Sjkim    felem_scalar(inx2, in, 2);
415280304Sjkim    felem_scalar(inx4, in, 4);
416238384Sjkim
417280304Sjkim    /*-
418280304Sjkim     * We have many cases were we want to do
419280304Sjkim     *   in[x] * in[y] +
420280304Sjkim     *   in[y] * in[x]
421280304Sjkim     * This is obviously just
422280304Sjkim     *   2 * in[x] * in[y]
423280304Sjkim     * However, rather than do the doubling on the 128 bit result, we
424280304Sjkim     * double one of the inputs to the multiplication by reading from
425280304Sjkim     * |inx2|
426280304Sjkim     */
427238384Sjkim
428280304Sjkim    out[0] = ((uint128_t) in[0]) * in[0];
429280304Sjkim    out[1] = ((uint128_t) in[0]) * inx2[1];
430280304Sjkim    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
431280304Sjkim    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
432280304Sjkim    out[4] = ((uint128_t) in[0]) * inx2[4] +
433280304Sjkim        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
434280304Sjkim    out[5] = ((uint128_t) in[0]) * inx2[5] +
435280304Sjkim        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
436280304Sjkim    out[6] = ((uint128_t) in[0]) * inx2[6] +
437280304Sjkim        ((uint128_t) in[1]) * inx2[5] +
438280304Sjkim        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
439280304Sjkim    out[7] = ((uint128_t) in[0]) * inx2[7] +
440280304Sjkim        ((uint128_t) in[1]) * inx2[6] +
441280304Sjkim        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
442280304Sjkim    out[8] = ((uint128_t) in[0]) * inx2[8] +
443280304Sjkim        ((uint128_t) in[1]) * inx2[7] +
444280304Sjkim        ((uint128_t) in[2]) * inx2[6] +
445280304Sjkim        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
446238384Sjkim
447280304Sjkim    /*
448280304Sjkim     * The remaining limbs fall above 2^521, with the first falling at 2^522.
449280304Sjkim     * They correspond to locations one bit up from the limbs produced above
450280304Sjkim     * so we would have to multiply by two to align them. Again, rather than
451280304Sjkim     * operate on the 128-bit result, we double one of the inputs to the
452280304Sjkim     * multiplication. If we want to double for both this reason, and the
453280304Sjkim     * reason above, then we end up multiplying by four.
454280304Sjkim     */
455238384Sjkim
456280304Sjkim    /* 9 */
457280304Sjkim    out[0] += ((uint128_t) in[1]) * inx4[8] +
458280304Sjkim        ((uint128_t) in[2]) * inx4[7] +
459280304Sjkim        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
460238384Sjkim
461280304Sjkim    /* 10 */
462280304Sjkim    out[1] += ((uint128_t) in[2]) * inx4[8] +
463280304Sjkim        ((uint128_t) in[3]) * inx4[7] +
464280304Sjkim        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
465238384Sjkim
466280304Sjkim    /* 11 */
467280304Sjkim    out[2] += ((uint128_t) in[3]) * inx4[8] +
468280304Sjkim        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
469238384Sjkim
470280304Sjkim    /* 12 */
471280304Sjkim    out[3] += ((uint128_t) in[4]) * inx4[8] +
472280304Sjkim        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
473238384Sjkim
474280304Sjkim    /* 13 */
475280304Sjkim    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
476238384Sjkim
477280304Sjkim    /* 14 */
478280304Sjkim    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
479238384Sjkim
480280304Sjkim    /* 15 */
481280304Sjkim    out[6] += ((uint128_t) in[7]) * inx4[8];
482238384Sjkim
483280304Sjkim    /* 16 */
484280304Sjkim    out[7] += ((uint128_t) in[8]) * inx2[8];
485280304Sjkim}
486238384Sjkim
487280304Sjkim/*-
488280304Sjkim * felem_mul sets |out| = |in1| * |in2|
489238384Sjkim * On entry:
490238384Sjkim *   in1[i] < 2^64
491238384Sjkim *   in2[i] < 2^63
492238384Sjkim * On exit:
493238384Sjkim *   out[i] < 17 * max(in1[i]) * max(in2[i])
494238384Sjkim */
495238384Sjkimstatic void felem_mul(largefelem out, const felem in1, const felem in2)
496280304Sjkim{
497280304Sjkim    felem in2x2;
498280304Sjkim    felem_scalar(in2x2, in2, 2);
499238384Sjkim
500280304Sjkim    out[0] = ((uint128_t) in1[0]) * in2[0];
501238384Sjkim
502280304Sjkim    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
503238384Sjkim
504280304Sjkim    out[2] = ((uint128_t) in1[0]) * in2[2] +
505280304Sjkim        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
506238384Sjkim
507280304Sjkim    out[3] = ((uint128_t) in1[0]) * in2[3] +
508280304Sjkim        ((uint128_t) in1[1]) * in2[2] +
509280304Sjkim        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
510238384Sjkim
511280304Sjkim    out[4] = ((uint128_t) in1[0]) * in2[4] +
512280304Sjkim        ((uint128_t) in1[1]) * in2[3] +
513280304Sjkim        ((uint128_t) in1[2]) * in2[2] +
514280304Sjkim        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
515238384Sjkim
516280304Sjkim    out[5] = ((uint128_t) in1[0]) * in2[5] +
517280304Sjkim        ((uint128_t) in1[1]) * in2[4] +
518280304Sjkim        ((uint128_t) in1[2]) * in2[3] +
519280304Sjkim        ((uint128_t) in1[3]) * in2[2] +
520280304Sjkim        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
521238384Sjkim
522280304Sjkim    out[6] = ((uint128_t) in1[0]) * in2[6] +
523280304Sjkim        ((uint128_t) in1[1]) * in2[5] +
524280304Sjkim        ((uint128_t) in1[2]) * in2[4] +
525280304Sjkim        ((uint128_t) in1[3]) * in2[3] +
526280304Sjkim        ((uint128_t) in1[4]) * in2[2] +
527280304Sjkim        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
528238384Sjkim
529280304Sjkim    out[7] = ((uint128_t) in1[0]) * in2[7] +
530280304Sjkim        ((uint128_t) in1[1]) * in2[6] +
531280304Sjkim        ((uint128_t) in1[2]) * in2[5] +
532280304Sjkim        ((uint128_t) in1[3]) * in2[4] +
533280304Sjkim        ((uint128_t) in1[4]) * in2[3] +
534280304Sjkim        ((uint128_t) in1[5]) * in2[2] +
535280304Sjkim        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
536238384Sjkim
537280304Sjkim    out[8] = ((uint128_t) in1[0]) * in2[8] +
538280304Sjkim        ((uint128_t) in1[1]) * in2[7] +
539280304Sjkim        ((uint128_t) in1[2]) * in2[6] +
540280304Sjkim        ((uint128_t) in1[3]) * in2[5] +
541280304Sjkim        ((uint128_t) in1[4]) * in2[4] +
542280304Sjkim        ((uint128_t) in1[5]) * in2[3] +
543280304Sjkim        ((uint128_t) in1[6]) * in2[2] +
544280304Sjkim        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
545238384Sjkim
546280304Sjkim    /* See comment in felem_square about the use of in2x2 here */
547238384Sjkim
548280304Sjkim    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
549280304Sjkim        ((uint128_t) in1[2]) * in2x2[7] +
550280304Sjkim        ((uint128_t) in1[3]) * in2x2[6] +
551280304Sjkim        ((uint128_t) in1[4]) * in2x2[5] +
552280304Sjkim        ((uint128_t) in1[5]) * in2x2[4] +
553280304Sjkim        ((uint128_t) in1[6]) * in2x2[3] +
554280304Sjkim        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
555238384Sjkim
556280304Sjkim    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
557280304Sjkim        ((uint128_t) in1[3]) * in2x2[7] +
558280304Sjkim        ((uint128_t) in1[4]) * in2x2[6] +
559280304Sjkim        ((uint128_t) in1[5]) * in2x2[5] +
560280304Sjkim        ((uint128_t) in1[6]) * in2x2[4] +
561280304Sjkim        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
562238384Sjkim
563280304Sjkim    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
564280304Sjkim        ((uint128_t) in1[4]) * in2x2[7] +
565280304Sjkim        ((uint128_t) in1[5]) * in2x2[6] +
566280304Sjkim        ((uint128_t) in1[6]) * in2x2[5] +
567280304Sjkim        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
568238384Sjkim
569280304Sjkim    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
570280304Sjkim        ((uint128_t) in1[5]) * in2x2[7] +
571280304Sjkim        ((uint128_t) in1[6]) * in2x2[6] +
572280304Sjkim        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
573238384Sjkim
574280304Sjkim    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
575280304Sjkim        ((uint128_t) in1[6]) * in2x2[7] +
576280304Sjkim        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
577238384Sjkim
578280304Sjkim    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
579280304Sjkim        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
580238384Sjkim
581280304Sjkim    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
582280304Sjkim        ((uint128_t) in1[8]) * in2x2[7];
583238384Sjkim
584280304Sjkim    out[7] += ((uint128_t) in1[8]) * in2x2[8];
585280304Sjkim}
586238384Sjkim
587238384Sjkimstatic const limb bottom52bits = 0xfffffffffffff;
588238384Sjkim
589280304Sjkim/*-
590280304Sjkim * felem_reduce converts a largefelem to an felem.
591238384Sjkim * On entry:
592238384Sjkim *   in[i] < 2^128
593238384Sjkim * On exit:
594238384Sjkim *   out[i] < 2^59 + 2^14
595238384Sjkim */
596238384Sjkimstatic void felem_reduce(felem out, const largefelem in)
597280304Sjkim{
598280304Sjkim    u64 overflow1, overflow2;
599238384Sjkim
600280304Sjkim    out[0] = ((limb) in[0]) & bottom58bits;
601280304Sjkim    out[1] = ((limb) in[1]) & bottom58bits;
602280304Sjkim    out[2] = ((limb) in[2]) & bottom58bits;
603280304Sjkim    out[3] = ((limb) in[3]) & bottom58bits;
604280304Sjkim    out[4] = ((limb) in[4]) & bottom58bits;
605280304Sjkim    out[5] = ((limb) in[5]) & bottom58bits;
606280304Sjkim    out[6] = ((limb) in[6]) & bottom58bits;
607280304Sjkim    out[7] = ((limb) in[7]) & bottom58bits;
608280304Sjkim    out[8] = ((limb) in[8]) & bottom58bits;
609238384Sjkim
610280304Sjkim    /* out[i] < 2^58 */
611238384Sjkim
612280304Sjkim    out[1] += ((limb) in[0]) >> 58;
613280304Sjkim    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
614280304Sjkim    /*-
615280304Sjkim     * out[1] < 2^58 + 2^6 + 2^58
616280304Sjkim     *        = 2^59 + 2^6
617280304Sjkim     */
618280304Sjkim    out[2] += ((limb) (in[0] >> 64)) >> 52;
619238384Sjkim
620280304Sjkim    out[2] += ((limb) in[1]) >> 58;
621280304Sjkim    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
622280304Sjkim    out[3] += ((limb) (in[1] >> 64)) >> 52;
623238384Sjkim
624280304Sjkim    out[3] += ((limb) in[2]) >> 58;
625280304Sjkim    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
626280304Sjkim    out[4] += ((limb) (in[2] >> 64)) >> 52;
627238384Sjkim
628280304Sjkim    out[4] += ((limb) in[3]) >> 58;
629280304Sjkim    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
630280304Sjkim    out[5] += ((limb) (in[3] >> 64)) >> 52;
631238384Sjkim
632280304Sjkim    out[5] += ((limb) in[4]) >> 58;
633280304Sjkim    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
634280304Sjkim    out[6] += ((limb) (in[4] >> 64)) >> 52;
635238384Sjkim
636280304Sjkim    out[6] += ((limb) in[5]) >> 58;
637280304Sjkim    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
638280304Sjkim    out[7] += ((limb) (in[5] >> 64)) >> 52;
639238384Sjkim
640280304Sjkim    out[7] += ((limb) in[6]) >> 58;
641280304Sjkim    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
642280304Sjkim    out[8] += ((limb) (in[6] >> 64)) >> 52;
643238384Sjkim
644280304Sjkim    out[8] += ((limb) in[7]) >> 58;
645280304Sjkim    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
646280304Sjkim    /*-
647280304Sjkim     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
648280304Sjkim     *            < 2^59 + 2^13
649280304Sjkim     */
650280304Sjkim    overflow1 = ((limb) (in[7] >> 64)) >> 52;
651238384Sjkim
652280304Sjkim    overflow1 += ((limb) in[8]) >> 58;
653280304Sjkim    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
654280304Sjkim    overflow2 = ((limb) (in[8] >> 64)) >> 52;
655238384Sjkim
656280304Sjkim    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
657280304Sjkim    overflow2 <<= 1;            /* overflow2 < 2^13 */
658238384Sjkim
659280304Sjkim    out[0] += overflow1;        /* out[0] < 2^60 */
660280304Sjkim    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
661238384Sjkim
662280304Sjkim    out[1] += out[0] >> 58;
663280304Sjkim    out[0] &= bottom58bits;
664280304Sjkim    /*-
665280304Sjkim     * out[0] < 2^58
666280304Sjkim     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
667280304Sjkim     *        < 2^59 + 2^14
668280304Sjkim     */
669280304Sjkim}
670238384Sjkim
671238384Sjkimstatic void felem_square_reduce(felem out, const felem in)
672280304Sjkim{
673280304Sjkim    largefelem tmp;
674280304Sjkim    felem_square(tmp, in);
675280304Sjkim    felem_reduce(out, tmp);
676280304Sjkim}
677238384Sjkim
678238384Sjkimstatic void felem_mul_reduce(felem out, const felem in1, const felem in2)
679280304Sjkim{
680280304Sjkim    largefelem tmp;
681280304Sjkim    felem_mul(tmp, in1, in2);
682280304Sjkim    felem_reduce(out, tmp);
683280304Sjkim}
684238384Sjkim
685280304Sjkim/*-
686280304Sjkim * felem_inv calculates |out| = |in|^{-1}
687238384Sjkim *
688238384Sjkim * Based on Fermat's Little Theorem:
689238384Sjkim *   a^p = a (mod p)
690238384Sjkim *   a^{p-1} = 1 (mod p)
691238384Sjkim *   a^{p-2} = a^{-1} (mod p)
692238384Sjkim */
693238384Sjkimstatic void felem_inv(felem out, const felem in)
694280304Sjkim{
695280304Sjkim    felem ftmp, ftmp2, ftmp3, ftmp4;
696280304Sjkim    largefelem tmp;
697280304Sjkim    unsigned i;
698238384Sjkim
699280304Sjkim    felem_square(tmp, in);
700280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^1 */
701280304Sjkim    felem_mul(tmp, in, ftmp);
702280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
703280304Sjkim    felem_assign(ftmp2, ftmp);
704280304Sjkim    felem_square(tmp, ftmp);
705280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
706280304Sjkim    felem_mul(tmp, in, ftmp);
707280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
708280304Sjkim    felem_square(tmp, ftmp);
709280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
710238384Sjkim
711280304Sjkim    felem_square(tmp, ftmp2);
712280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
713280304Sjkim    felem_square(tmp, ftmp3);
714280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
715280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
716280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
717238384Sjkim
718280304Sjkim    felem_assign(ftmp2, ftmp3);
719280304Sjkim    felem_square(tmp, ftmp3);
720280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
721280304Sjkim    felem_square(tmp, ftmp3);
722280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
723280304Sjkim    felem_square(tmp, ftmp3);
724280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
725280304Sjkim    felem_square(tmp, ftmp3);
726280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
727280304Sjkim    felem_assign(ftmp4, ftmp3);
728280304Sjkim    felem_mul(tmp, ftmp3, ftmp);
729280304Sjkim    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
730280304Sjkim    felem_square(tmp, ftmp4);
731280304Sjkim    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
732280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
733280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
734280304Sjkim    felem_assign(ftmp2, ftmp3);
735238384Sjkim
736280304Sjkim    for (i = 0; i < 8; i++) {
737280304Sjkim        felem_square(tmp, ftmp3);
738280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
739280304Sjkim    }
740280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
741280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
742280304Sjkim    felem_assign(ftmp2, ftmp3);
743238384Sjkim
744280304Sjkim    for (i = 0; i < 16; i++) {
745280304Sjkim        felem_square(tmp, ftmp3);
746280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
747280304Sjkim    }
748280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
749280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
750280304Sjkim    felem_assign(ftmp2, ftmp3);
751238384Sjkim
752280304Sjkim    for (i = 0; i < 32; i++) {
753280304Sjkim        felem_square(tmp, ftmp3);
754280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
755280304Sjkim    }
756280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
757280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
758280304Sjkim    felem_assign(ftmp2, ftmp3);
759238384Sjkim
760280304Sjkim    for (i = 0; i < 64; i++) {
761280304Sjkim        felem_square(tmp, ftmp3);
762280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
763280304Sjkim    }
764280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
765280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
766280304Sjkim    felem_assign(ftmp2, ftmp3);
767238384Sjkim
768280304Sjkim    for (i = 0; i < 128; i++) {
769280304Sjkim        felem_square(tmp, ftmp3);
770280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
771280304Sjkim    }
772280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
773280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
774280304Sjkim    felem_assign(ftmp2, ftmp3);
775238384Sjkim
776280304Sjkim    for (i = 0; i < 256; i++) {
777280304Sjkim        felem_square(tmp, ftmp3);
778280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
779280304Sjkim    }
780280304Sjkim    felem_mul(tmp, ftmp3, ftmp2);
781280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
782238384Sjkim
783280304Sjkim    for (i = 0; i < 9; i++) {
784280304Sjkim        felem_square(tmp, ftmp3);
785280304Sjkim        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
786280304Sjkim    }
787280304Sjkim    felem_mul(tmp, ftmp3, ftmp4);
788280304Sjkim    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
789280304Sjkim    felem_mul(tmp, ftmp3, in);
790280304Sjkim    felem_reduce(out, tmp);     /* 2^512 - 3 */
791238384Sjkim}
792238384Sjkim
793238384Sjkim/* This is 2^521-1, expressed as an felem */
794280304Sjkimstatic const felem kPrime = {
795280304Sjkim    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
796280304Sjkim    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
797280304Sjkim    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
798280304Sjkim};
799238384Sjkim
800280304Sjkim/*-
801280304Sjkim * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
802238384Sjkim * otherwise.
803238384Sjkim * On entry:
804238384Sjkim *   in[i] < 2^59 + 2^14
805238384Sjkim */
806238384Sjkimstatic limb felem_is_zero(const felem in)
807280304Sjkim{
808280304Sjkim    felem ftmp;
809280304Sjkim    limb is_zero, is_p;
810280304Sjkim    felem_assign(ftmp, in);
811238384Sjkim
812280304Sjkim    ftmp[0] += ftmp[8] >> 57;
813280304Sjkim    ftmp[8] &= bottom57bits;
814280304Sjkim    /* ftmp[8] < 2^57 */
815280304Sjkim    ftmp[1] += ftmp[0] >> 58;
816280304Sjkim    ftmp[0] &= bottom58bits;
817280304Sjkim    ftmp[2] += ftmp[1] >> 58;
818280304Sjkim    ftmp[1] &= bottom58bits;
819280304Sjkim    ftmp[3] += ftmp[2] >> 58;
820280304Sjkim    ftmp[2] &= bottom58bits;
821280304Sjkim    ftmp[4] += ftmp[3] >> 58;
822280304Sjkim    ftmp[3] &= bottom58bits;
823280304Sjkim    ftmp[5] += ftmp[4] >> 58;
824280304Sjkim    ftmp[4] &= bottom58bits;
825280304Sjkim    ftmp[6] += ftmp[5] >> 58;
826280304Sjkim    ftmp[5] &= bottom58bits;
827280304Sjkim    ftmp[7] += ftmp[6] >> 58;
828280304Sjkim    ftmp[6] &= bottom58bits;
829280304Sjkim    ftmp[8] += ftmp[7] >> 58;
830280304Sjkim    ftmp[7] &= bottom58bits;
831280304Sjkim    /* ftmp[8] < 2^57 + 4 */
832238384Sjkim
833280304Sjkim    /*
834280304Sjkim     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
835280304Sjkim     * than our bound for ftmp[8]. Therefore we only have to check if the
836280304Sjkim     * zero is zero or 2^521-1.
837280304Sjkim     */
838238384Sjkim
839280304Sjkim    is_zero = 0;
840280304Sjkim    is_zero |= ftmp[0];
841280304Sjkim    is_zero |= ftmp[1];
842280304Sjkim    is_zero |= ftmp[2];
843280304Sjkim    is_zero |= ftmp[3];
844280304Sjkim    is_zero |= ftmp[4];
845280304Sjkim    is_zero |= ftmp[5];
846280304Sjkim    is_zero |= ftmp[6];
847280304Sjkim    is_zero |= ftmp[7];
848280304Sjkim    is_zero |= ftmp[8];
849238384Sjkim
850280304Sjkim    is_zero--;
851280304Sjkim    /*
852280304Sjkim     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
853280304Sjkim     * can be set is if is_zero was 0 before the decrement.
854280304Sjkim     */
855280304Sjkim    is_zero = ((s64) is_zero) >> 63;
856238384Sjkim
857280304Sjkim    is_p = ftmp[0] ^ kPrime[0];
858280304Sjkim    is_p |= ftmp[1] ^ kPrime[1];
859280304Sjkim    is_p |= ftmp[2] ^ kPrime[2];
860280304Sjkim    is_p |= ftmp[3] ^ kPrime[3];
861280304Sjkim    is_p |= ftmp[4] ^ kPrime[4];
862280304Sjkim    is_p |= ftmp[5] ^ kPrime[5];
863280304Sjkim    is_p |= ftmp[6] ^ kPrime[6];
864280304Sjkim    is_p |= ftmp[7] ^ kPrime[7];
865280304Sjkim    is_p |= ftmp[8] ^ kPrime[8];
866238384Sjkim
867280304Sjkim    is_p--;
868280304Sjkim    is_p = ((s64) is_p) >> 63;
869238384Sjkim
870280304Sjkim    is_zero |= is_p;
871280304Sjkim    return is_zero;
872280304Sjkim}
873238384Sjkim
874238384Sjkimstatic int felem_is_zero_int(const felem in)
875280304Sjkim{
876280304Sjkim    return (int)(felem_is_zero(in) & ((limb) 1));
877280304Sjkim}
878238384Sjkim
879280304Sjkim/*-
880280304Sjkim * felem_contract converts |in| to its unique, minimal representation.
881238384Sjkim * On entry:
882238384Sjkim *   in[i] < 2^59 + 2^14
883238384Sjkim */
884238384Sjkimstatic void felem_contract(felem out, const felem in)
885280304Sjkim{
886280304Sjkim    limb is_p, is_greater, sign;
887280304Sjkim    static const limb two58 = ((limb) 1) << 58;
888238384Sjkim
889280304Sjkim    felem_assign(out, in);
890238384Sjkim
891280304Sjkim    out[0] += out[8] >> 57;
892280304Sjkim    out[8] &= bottom57bits;
893280304Sjkim    /* out[8] < 2^57 */
894280304Sjkim    out[1] += out[0] >> 58;
895280304Sjkim    out[0] &= bottom58bits;
896280304Sjkim    out[2] += out[1] >> 58;
897280304Sjkim    out[1] &= bottom58bits;
898280304Sjkim    out[3] += out[2] >> 58;
899280304Sjkim    out[2] &= bottom58bits;
900280304Sjkim    out[4] += out[3] >> 58;
901280304Sjkim    out[3] &= bottom58bits;
902280304Sjkim    out[5] += out[4] >> 58;
903280304Sjkim    out[4] &= bottom58bits;
904280304Sjkim    out[6] += out[5] >> 58;
905280304Sjkim    out[5] &= bottom58bits;
906280304Sjkim    out[7] += out[6] >> 58;
907280304Sjkim    out[6] &= bottom58bits;
908280304Sjkim    out[8] += out[7] >> 58;
909280304Sjkim    out[7] &= bottom58bits;
910280304Sjkim    /* out[8] < 2^57 + 4 */
911238384Sjkim
912280304Sjkim    /*
913280304Sjkim     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
914280304Sjkim     * out. See the comments in felem_is_zero regarding why we don't test for
915280304Sjkim     * other multiples of the prime.
916280304Sjkim     */
917238384Sjkim
918280304Sjkim    /*
919280304Sjkim     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
920280304Sjkim     */
921238384Sjkim
922280304Sjkim    is_p = out[0] ^ kPrime[0];
923280304Sjkim    is_p |= out[1] ^ kPrime[1];
924280304Sjkim    is_p |= out[2] ^ kPrime[2];
925280304Sjkim    is_p |= out[3] ^ kPrime[3];
926280304Sjkim    is_p |= out[4] ^ kPrime[4];
927280304Sjkim    is_p |= out[5] ^ kPrime[5];
928280304Sjkim    is_p |= out[6] ^ kPrime[6];
929280304Sjkim    is_p |= out[7] ^ kPrime[7];
930280304Sjkim    is_p |= out[8] ^ kPrime[8];
931238384Sjkim
932280304Sjkim    is_p--;
933280304Sjkim    is_p &= is_p << 32;
934280304Sjkim    is_p &= is_p << 16;
935280304Sjkim    is_p &= is_p << 8;
936280304Sjkim    is_p &= is_p << 4;
937280304Sjkim    is_p &= is_p << 2;
938280304Sjkim    is_p &= is_p << 1;
939280304Sjkim    is_p = ((s64) is_p) >> 63;
940280304Sjkim    is_p = ~is_p;
941238384Sjkim
942280304Sjkim    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
943238384Sjkim
944280304Sjkim    out[0] &= is_p;
945280304Sjkim    out[1] &= is_p;
946280304Sjkim    out[2] &= is_p;
947280304Sjkim    out[3] &= is_p;
948280304Sjkim    out[4] &= is_p;
949280304Sjkim    out[5] &= is_p;
950280304Sjkim    out[6] &= is_p;
951280304Sjkim    out[7] &= is_p;
952280304Sjkim    out[8] &= is_p;
953238384Sjkim
954280304Sjkim    /*
955280304Sjkim     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
956280304Sjkim     * 57 is greater than zero as (2^521-1) + x >= 2^522
957280304Sjkim     */
958280304Sjkim    is_greater = out[8] >> 57;
959280304Sjkim    is_greater |= is_greater << 32;
960280304Sjkim    is_greater |= is_greater << 16;
961280304Sjkim    is_greater |= is_greater << 8;
962280304Sjkim    is_greater |= is_greater << 4;
963280304Sjkim    is_greater |= is_greater << 2;
964280304Sjkim    is_greater |= is_greater << 1;
965280304Sjkim    is_greater = ((s64) is_greater) >> 63;
966238384Sjkim
967280304Sjkim    out[0] -= kPrime[0] & is_greater;
968280304Sjkim    out[1] -= kPrime[1] & is_greater;
969280304Sjkim    out[2] -= kPrime[2] & is_greater;
970280304Sjkim    out[3] -= kPrime[3] & is_greater;
971280304Sjkim    out[4] -= kPrime[4] & is_greater;
972280304Sjkim    out[5] -= kPrime[5] & is_greater;
973280304Sjkim    out[6] -= kPrime[6] & is_greater;
974280304Sjkim    out[7] -= kPrime[7] & is_greater;
975280304Sjkim    out[8] -= kPrime[8] & is_greater;
976238384Sjkim
977280304Sjkim    /* Eliminate negative coefficients */
978280304Sjkim    sign = -(out[0] >> 63);
979280304Sjkim    out[0] += (two58 & sign);
980280304Sjkim    out[1] -= (1 & sign);
981280304Sjkim    sign = -(out[1] >> 63);
982280304Sjkim    out[1] += (two58 & sign);
983280304Sjkim    out[2] -= (1 & sign);
984280304Sjkim    sign = -(out[2] >> 63);
985280304Sjkim    out[2] += (two58 & sign);
986280304Sjkim    out[3] -= (1 & sign);
987280304Sjkim    sign = -(out[3] >> 63);
988280304Sjkim    out[3] += (two58 & sign);
989280304Sjkim    out[4] -= (1 & sign);
990280304Sjkim    sign = -(out[4] >> 63);
991280304Sjkim    out[4] += (two58 & sign);
992280304Sjkim    out[5] -= (1 & sign);
993280304Sjkim    sign = -(out[0] >> 63);
994280304Sjkim    out[5] += (two58 & sign);
995280304Sjkim    out[6] -= (1 & sign);
996280304Sjkim    sign = -(out[6] >> 63);
997280304Sjkim    out[6] += (two58 & sign);
998280304Sjkim    out[7] -= (1 & sign);
999280304Sjkim    sign = -(out[7] >> 63);
1000280304Sjkim    out[7] += (two58 & sign);
1001280304Sjkim    out[8] -= (1 & sign);
1002280304Sjkim    sign = -(out[5] >> 63);
1003280304Sjkim    out[5] += (two58 & sign);
1004280304Sjkim    out[6] -= (1 & sign);
1005280304Sjkim    sign = -(out[6] >> 63);
1006280304Sjkim    out[6] += (two58 & sign);
1007280304Sjkim    out[7] -= (1 & sign);
1008280304Sjkim    sign = -(out[7] >> 63);
1009280304Sjkim    out[7] += (two58 & sign);
1010280304Sjkim    out[8] -= (1 & sign);
1011280304Sjkim}
1012238384Sjkim
1013280304Sjkim/*-
1014280304Sjkim * Group operations
1015238384Sjkim * ----------------
1016238384Sjkim *
1017238384Sjkim * Building on top of the field operations we have the operations on the
1018238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian
1019238384Sjkim * coordinates */
1020238384Sjkim
1021280304Sjkim/*-
1022280304Sjkim * point_double calcuates 2*(x_in, y_in, z_in)
1023238384Sjkim *
1024238384Sjkim * The method is taken from:
1025238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1026238384Sjkim *
1027238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1028238384Sjkim * while x_out == y_in is not (maybe this works, but it's not tested). */
1029238384Sjkimstatic void
1030238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out,
1031280304Sjkim             const felem x_in, const felem y_in, const felem z_in)
1032280304Sjkim{
1033280304Sjkim    largefelem tmp, tmp2;
1034280304Sjkim    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1035238384Sjkim
1036280304Sjkim    felem_assign(ftmp, x_in);
1037280304Sjkim    felem_assign(ftmp2, x_in);
1038238384Sjkim
1039280304Sjkim    /* delta = z^2 */
1040280304Sjkim    felem_square(tmp, z_in);
1041280304Sjkim    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1042238384Sjkim
1043280304Sjkim    /* gamma = y^2 */
1044280304Sjkim    felem_square(tmp, y_in);
1045280304Sjkim    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1046238384Sjkim
1047280304Sjkim    /* beta = x*gamma */
1048280304Sjkim    felem_mul(tmp, x_in, gamma);
1049280304Sjkim    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1050238384Sjkim
1051280304Sjkim    /* alpha = 3*(x-delta)*(x+delta) */
1052280304Sjkim    felem_diff64(ftmp, delta);
1053280304Sjkim    /* ftmp[i] < 2^61 */
1054280304Sjkim    felem_sum64(ftmp2, delta);
1055280304Sjkim    /* ftmp2[i] < 2^60 + 2^15 */
1056280304Sjkim    felem_scalar64(ftmp2, 3);
1057280304Sjkim    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1058280304Sjkim    felem_mul(tmp, ftmp, ftmp2);
1059280304Sjkim    /*-
1060280304Sjkim     * tmp[i] < 17(3*2^121 + 3*2^76)
1061280304Sjkim     *        = 61*2^121 + 61*2^76
1062280304Sjkim     *        < 64*2^121 + 64*2^76
1063280304Sjkim     *        = 2^127 + 2^82
1064280304Sjkim     *        < 2^128
1065280304Sjkim     */
1066280304Sjkim    felem_reduce(alpha, tmp);
1067238384Sjkim
1068280304Sjkim    /* x' = alpha^2 - 8*beta */
1069280304Sjkim    felem_square(tmp, alpha);
1070280304Sjkim    /*
1071280304Sjkim     * tmp[i] < 17*2^120 < 2^125
1072280304Sjkim     */
1073280304Sjkim    felem_assign(ftmp, beta);
1074280304Sjkim    felem_scalar64(ftmp, 8);
1075280304Sjkim    /* ftmp[i] < 2^62 + 2^17 */
1076280304Sjkim    felem_diff_128_64(tmp, ftmp);
1077280304Sjkim    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1078280304Sjkim    felem_reduce(x_out, tmp);
1079238384Sjkim
1080280304Sjkim    /* z' = (y + z)^2 - gamma - delta */
1081280304Sjkim    felem_sum64(delta, gamma);
1082280304Sjkim    /* delta[i] < 2^60 + 2^15 */
1083280304Sjkim    felem_assign(ftmp, y_in);
1084280304Sjkim    felem_sum64(ftmp, z_in);
1085280304Sjkim    /* ftmp[i] < 2^60 + 2^15 */
1086280304Sjkim    felem_square(tmp, ftmp);
1087280304Sjkim    /*
1088280304Sjkim     * tmp[i] < 17(2^122) < 2^127
1089280304Sjkim     */
1090280304Sjkim    felem_diff_128_64(tmp, delta);
1091280304Sjkim    /* tmp[i] < 2^127 + 2^63 */
1092280304Sjkim    felem_reduce(z_out, tmp);
1093238384Sjkim
1094280304Sjkim    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1095280304Sjkim    felem_scalar64(beta, 4);
1096280304Sjkim    /* beta[i] < 2^61 + 2^16 */
1097280304Sjkim    felem_diff64(beta, x_out);
1098280304Sjkim    /* beta[i] < 2^61 + 2^60 + 2^16 */
1099280304Sjkim    felem_mul(tmp, alpha, beta);
1100280304Sjkim    /*-
1101280304Sjkim     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1102280304Sjkim     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1103280304Sjkim     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1104280304Sjkim     *        < 2^128
1105280304Sjkim     */
1106280304Sjkim    felem_square(tmp2, gamma);
1107280304Sjkim    /*-
1108280304Sjkim     * tmp2[i] < 17*(2^59 + 2^14)^2
1109280304Sjkim     *         = 17*(2^118 + 2^74 + 2^28)
1110280304Sjkim     */
1111280304Sjkim    felem_scalar128(tmp2, 8);
1112280304Sjkim    /*-
1113280304Sjkim     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1114280304Sjkim     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1115280304Sjkim     *         < 2^126
1116280304Sjkim     */
1117280304Sjkim    felem_diff128(tmp, tmp2);
1118280304Sjkim    /*-
1119280304Sjkim     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1120280304Sjkim     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1121280304Sjkim     *          2^74 + 2^69 + 2^34 + 2^30
1122280304Sjkim     *        < 2^128
1123280304Sjkim     */
1124280304Sjkim    felem_reduce(y_out, tmp);
1125280304Sjkim}
1126238384Sjkim
1127238384Sjkim/* copy_conditional copies in to out iff mask is all ones. */
1128280304Sjkimstatic void copy_conditional(felem out, const felem in, limb mask)
1129280304Sjkim{
1130280304Sjkim    unsigned i;
1131280304Sjkim    for (i = 0; i < NLIMBS; ++i) {
1132280304Sjkim        const limb tmp = mask & (in[i] ^ out[i]);
1133280304Sjkim        out[i] ^= tmp;
1134280304Sjkim    }
1135280304Sjkim}
1136238384Sjkim
1137280304Sjkim/*-
1138280304Sjkim * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1139238384Sjkim *
1140238384Sjkim * The method is taken from
1141238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1142238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1143238384Sjkim *
1144238384Sjkim * This function includes a branch for checking whether the two input points
1145238384Sjkim * are equal (while not equal to the point at infinity). This case never
1146238384Sjkim * happens during single point multiplication, so there is no timing leak for
1147238384Sjkim * ECDH or ECDSA signing. */
1148238384Sjkimstatic void point_add(felem x3, felem y3, felem z3,
1149280304Sjkim                      const felem x1, const felem y1, const felem z1,
1150280304Sjkim                      const int mixed, const felem x2, const felem y2,
1151280304Sjkim                      const felem z2)
1152280304Sjkim{
1153280304Sjkim    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1154280304Sjkim    largefelem tmp, tmp2;
1155280304Sjkim    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1156238384Sjkim
1157280304Sjkim    z1_is_zero = felem_is_zero(z1);
1158280304Sjkim    z2_is_zero = felem_is_zero(z2);
1159238384Sjkim
1160280304Sjkim    /* ftmp = z1z1 = z1**2 */
1161280304Sjkim    felem_square(tmp, z1);
1162280304Sjkim    felem_reduce(ftmp, tmp);
1163238384Sjkim
1164280304Sjkim    if (!mixed) {
1165280304Sjkim        /* ftmp2 = z2z2 = z2**2 */
1166280304Sjkim        felem_square(tmp, z2);
1167280304Sjkim        felem_reduce(ftmp2, tmp);
1168238384Sjkim
1169280304Sjkim        /* u1 = ftmp3 = x1*z2z2 */
1170280304Sjkim        felem_mul(tmp, x1, ftmp2);
1171280304Sjkim        felem_reduce(ftmp3, tmp);
1172238384Sjkim
1173280304Sjkim        /* ftmp5 = z1 + z2 */
1174280304Sjkim        felem_assign(ftmp5, z1);
1175280304Sjkim        felem_sum64(ftmp5, z2);
1176280304Sjkim        /* ftmp5[i] < 2^61 */
1177238384Sjkim
1178280304Sjkim        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1179280304Sjkim        felem_square(tmp, ftmp5);
1180280304Sjkim        /* tmp[i] < 17*2^122 */
1181280304Sjkim        felem_diff_128_64(tmp, ftmp);
1182280304Sjkim        /* tmp[i] < 17*2^122 + 2^63 */
1183280304Sjkim        felem_diff_128_64(tmp, ftmp2);
1184280304Sjkim        /* tmp[i] < 17*2^122 + 2^64 */
1185280304Sjkim        felem_reduce(ftmp5, tmp);
1186238384Sjkim
1187280304Sjkim        /* ftmp2 = z2 * z2z2 */
1188280304Sjkim        felem_mul(tmp, ftmp2, z2);
1189280304Sjkim        felem_reduce(ftmp2, tmp);
1190238384Sjkim
1191280304Sjkim        /* s1 = ftmp6 = y1 * z2**3 */
1192280304Sjkim        felem_mul(tmp, y1, ftmp2);
1193280304Sjkim        felem_reduce(ftmp6, tmp);
1194280304Sjkim    } else {
1195280304Sjkim        /*
1196280304Sjkim         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1197280304Sjkim         */
1198238384Sjkim
1199280304Sjkim        /* u1 = ftmp3 = x1*z2z2 */
1200280304Sjkim        felem_assign(ftmp3, x1);
1201238384Sjkim
1202280304Sjkim        /* ftmp5 = 2*z1z2 */
1203280304Sjkim        felem_scalar(ftmp5, z1, 2);
1204238384Sjkim
1205280304Sjkim        /* s1 = ftmp6 = y1 * z2**3 */
1206280304Sjkim        felem_assign(ftmp6, y1);
1207280304Sjkim    }
1208238384Sjkim
1209280304Sjkim    /* u2 = x2*z1z1 */
1210280304Sjkim    felem_mul(tmp, x2, ftmp);
1211280304Sjkim    /* tmp[i] < 17*2^120 */
1212238384Sjkim
1213280304Sjkim    /* h = ftmp4 = u2 - u1 */
1214280304Sjkim    felem_diff_128_64(tmp, ftmp3);
1215280304Sjkim    /* tmp[i] < 17*2^120 + 2^63 */
1216280304Sjkim    felem_reduce(ftmp4, tmp);
1217238384Sjkim
1218280304Sjkim    x_equal = felem_is_zero(ftmp4);
1219238384Sjkim
1220280304Sjkim    /* z_out = ftmp5 * h */
1221280304Sjkim    felem_mul(tmp, ftmp5, ftmp4);
1222280304Sjkim    felem_reduce(z_out, tmp);
1223238384Sjkim
1224280304Sjkim    /* ftmp = z1 * z1z1 */
1225280304Sjkim    felem_mul(tmp, ftmp, z1);
1226280304Sjkim    felem_reduce(ftmp, tmp);
1227238384Sjkim
1228280304Sjkim    /* s2 = tmp = y2 * z1**3 */
1229280304Sjkim    felem_mul(tmp, y2, ftmp);
1230280304Sjkim    /* tmp[i] < 17*2^120 */
1231238384Sjkim
1232280304Sjkim    /* r = ftmp5 = (s2 - s1)*2 */
1233280304Sjkim    felem_diff_128_64(tmp, ftmp6);
1234280304Sjkim    /* tmp[i] < 17*2^120 + 2^63 */
1235280304Sjkim    felem_reduce(ftmp5, tmp);
1236280304Sjkim    y_equal = felem_is_zero(ftmp5);
1237280304Sjkim    felem_scalar64(ftmp5, 2);
1238280304Sjkim    /* ftmp5[i] < 2^61 */
1239238384Sjkim
1240280304Sjkim    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1241280304Sjkim        point_double(x3, y3, z3, x1, y1, z1);
1242280304Sjkim        return;
1243280304Sjkim    }
1244238384Sjkim
1245280304Sjkim    /* I = ftmp = (2h)**2 */
1246280304Sjkim    felem_assign(ftmp, ftmp4);
1247280304Sjkim    felem_scalar64(ftmp, 2);
1248280304Sjkim    /* ftmp[i] < 2^61 */
1249280304Sjkim    felem_square(tmp, ftmp);
1250280304Sjkim    /* tmp[i] < 17*2^122 */
1251280304Sjkim    felem_reduce(ftmp, tmp);
1252238384Sjkim
1253280304Sjkim    /* J = ftmp2 = h * I */
1254280304Sjkim    felem_mul(tmp, ftmp4, ftmp);
1255280304Sjkim    felem_reduce(ftmp2, tmp);
1256238384Sjkim
1257280304Sjkim    /* V = ftmp4 = U1 * I */
1258280304Sjkim    felem_mul(tmp, ftmp3, ftmp);
1259280304Sjkim    felem_reduce(ftmp4, tmp);
1260238384Sjkim
1261280304Sjkim    /* x_out = r**2 - J - 2V */
1262280304Sjkim    felem_square(tmp, ftmp5);
1263280304Sjkim    /* tmp[i] < 17*2^122 */
1264280304Sjkim    felem_diff_128_64(tmp, ftmp2);
1265280304Sjkim    /* tmp[i] < 17*2^122 + 2^63 */
1266280304Sjkim    felem_assign(ftmp3, ftmp4);
1267280304Sjkim    felem_scalar64(ftmp4, 2);
1268280304Sjkim    /* ftmp4[i] < 2^61 */
1269280304Sjkim    felem_diff_128_64(tmp, ftmp4);
1270280304Sjkim    /* tmp[i] < 17*2^122 + 2^64 */
1271280304Sjkim    felem_reduce(x_out, tmp);
1272238384Sjkim
1273280304Sjkim    /* y_out = r(V-x_out) - 2 * s1 * J */
1274280304Sjkim    felem_diff64(ftmp3, x_out);
1275280304Sjkim    /*
1276280304Sjkim     * ftmp3[i] < 2^60 + 2^60 = 2^61
1277280304Sjkim     */
1278280304Sjkim    felem_mul(tmp, ftmp5, ftmp3);
1279280304Sjkim    /* tmp[i] < 17*2^122 */
1280280304Sjkim    felem_mul(tmp2, ftmp6, ftmp2);
1281280304Sjkim    /* tmp2[i] < 17*2^120 */
1282280304Sjkim    felem_scalar128(tmp2, 2);
1283280304Sjkim    /* tmp2[i] < 17*2^121 */
1284280304Sjkim    felem_diff128(tmp, tmp2);
1285280304Sjkim    /*-
1286280304Sjkim     * tmp[i] < 2^127 - 2^69 + 17*2^122
1287280304Sjkim     *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1288280304Sjkim     *        < 2^127
1289280304Sjkim     */
1290280304Sjkim    felem_reduce(y_out, tmp);
1291238384Sjkim
1292280304Sjkim    copy_conditional(x_out, x2, z1_is_zero);
1293280304Sjkim    copy_conditional(x_out, x1, z2_is_zero);
1294280304Sjkim    copy_conditional(y_out, y2, z1_is_zero);
1295280304Sjkim    copy_conditional(y_out, y1, z2_is_zero);
1296280304Sjkim    copy_conditional(z_out, z2, z1_is_zero);
1297280304Sjkim    copy_conditional(z_out, z1, z2_is_zero);
1298280304Sjkim    felem_assign(x3, x_out);
1299280304Sjkim    felem_assign(y3, y_out);
1300280304Sjkim    felem_assign(z3, z_out);
1301280304Sjkim}
1302238384Sjkim
1303280304Sjkim/*-
1304280304Sjkim * Base point pre computation
1305238384Sjkim * --------------------------
1306238384Sjkim *
1307238384Sjkim * Two different sorts of precomputed tables are used in the following code.
1308238384Sjkim * Each contain various points on the curve, where each point is three field
1309238384Sjkim * elements (x, y, z).
1310238384Sjkim *
1311238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity).
1312238384Sjkim * This table has 16 elements:
1313238384Sjkim * index | bits    | point
1314238384Sjkim * ------+---------+------------------------------
1315238384Sjkim *     0 | 0 0 0 0 | 0G
1316238384Sjkim *     1 | 0 0 0 1 | 1G
1317238384Sjkim *     2 | 0 0 1 0 | 2^130G
1318238384Sjkim *     3 | 0 0 1 1 | (2^130 + 1)G
1319238384Sjkim *     4 | 0 1 0 0 | 2^260G
1320238384Sjkim *     5 | 0 1 0 1 | (2^260 + 1)G
1321238384Sjkim *     6 | 0 1 1 0 | (2^260 + 2^130)G
1322238384Sjkim *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1323238384Sjkim *     8 | 1 0 0 0 | 2^390G
1324238384Sjkim *     9 | 1 0 0 1 | (2^390 + 1)G
1325238384Sjkim *    10 | 1 0 1 0 | (2^390 + 2^130)G
1326238384Sjkim *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1327238384Sjkim *    12 | 1 1 0 0 | (2^390 + 2^260)G
1328238384Sjkim *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1329238384Sjkim *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1330238384Sjkim *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1331238384Sjkim *
1332238384Sjkim * The reason for this is so that we can clock bits into four different
1333238384Sjkim * locations when doing simple scalar multiplies against the base point.
1334238384Sjkim *
1335238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */
1336238384Sjkim
1337238384Sjkim/* gmul is the table of precomputed base points */
1338280304Sjkimstatic const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1339280304Sjkim                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
1340280304Sjkim                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1341280304Sjkim{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1342280304Sjkim  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1343280304Sjkim  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1344280304Sjkim {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1345280304Sjkim  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1346280304Sjkim  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1347280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1348280304Sjkim{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1349280304Sjkim  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1350280304Sjkim  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1351280304Sjkim {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1352280304Sjkim  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1353280304Sjkim  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1354280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1355280304Sjkim{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1356280304Sjkim  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1357280304Sjkim  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1358280304Sjkim {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1359280304Sjkim  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1360280304Sjkim  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1361280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1362280304Sjkim{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1363280304Sjkim  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1364280304Sjkim  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1365280304Sjkim {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1366280304Sjkim  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1367280304Sjkim  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1368280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1369280304Sjkim{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1370280304Sjkim  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1371280304Sjkim  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1372280304Sjkim {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1373280304Sjkim  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1374280304Sjkim  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1375280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1376280304Sjkim{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1377280304Sjkim  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1378280304Sjkim  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1379280304Sjkim {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1380280304Sjkim  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1381280304Sjkim  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1382280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1383280304Sjkim{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1384280304Sjkim  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1385280304Sjkim  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1386280304Sjkim {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1387280304Sjkim  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1388280304Sjkim  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1389280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1390280304Sjkim{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1391280304Sjkim  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1392280304Sjkim  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1393280304Sjkim {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1394280304Sjkim  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1395280304Sjkim  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1396280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1397280304Sjkim{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1398280304Sjkim  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1399280304Sjkim  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1400280304Sjkim {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1401280304Sjkim  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1402280304Sjkim  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1403280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1404280304Sjkim{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1405280304Sjkim  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1406280304Sjkim  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1407280304Sjkim {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1408280304Sjkim  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1409280304Sjkim  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1410280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1411280304Sjkim{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1412280304Sjkim  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1413280304Sjkim  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1414280304Sjkim {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1415280304Sjkim  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1416280304Sjkim  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1417280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1418280304Sjkim{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1419280304Sjkim  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1420280304Sjkim  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1421280304Sjkim {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1422280304Sjkim  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1423280304Sjkim  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1424280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1425280304Sjkim{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1426280304Sjkim  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1427280304Sjkim  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1428280304Sjkim {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1429280304Sjkim  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1430280304Sjkim  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1431280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1432280304Sjkim{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1433280304Sjkim  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1434280304Sjkim  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1435280304Sjkim {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1436280304Sjkim  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1437280304Sjkim  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1438280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1439280304Sjkim{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1440280304Sjkim  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1441280304Sjkim  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1442280304Sjkim {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1443280304Sjkim  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1444280304Sjkim  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1445280304Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1446280304Sjkim};
1447238384Sjkim
1448280304Sjkim/*
1449280304Sjkim * select_point selects the |idx|th point from a precomputation table and
1450280304Sjkim * copies it to out.
1451280304Sjkim */
1452280304Sjkim /* pre_comp below is of the size provided in |size| */
1453280304Sjkimstatic void select_point(const limb idx, unsigned int size,
1454280304Sjkim                         const felem pre_comp[][3], felem out[3])
1455280304Sjkim{
1456280304Sjkim    unsigned i, j;
1457280304Sjkim    limb *outlimbs = &out[0][0];
1458280304Sjkim    memset(outlimbs, 0, 3 * sizeof(felem));
1459238384Sjkim
1460280304Sjkim    for (i = 0; i < size; i++) {
1461280304Sjkim        const limb *inlimbs = &pre_comp[i][0][0];
1462280304Sjkim        limb mask = i ^ idx;
1463280304Sjkim        mask |= mask >> 4;
1464280304Sjkim        mask |= mask >> 2;
1465280304Sjkim        mask |= mask >> 1;
1466280304Sjkim        mask &= 1;
1467280304Sjkim        mask--;
1468280304Sjkim        for (j = 0; j < NLIMBS * 3; j++)
1469280304Sjkim            outlimbs[j] |= inlimbs[j] & mask;
1470280304Sjkim    }
1471280304Sjkim}
1472238384Sjkim
1473238384Sjkim/* get_bit returns the |i|th bit in |in| */
1474238384Sjkimstatic char get_bit(const felem_bytearray in, int i)
1475280304Sjkim{
1476280304Sjkim    if (i < 0)
1477280304Sjkim        return 0;
1478280304Sjkim    return (in[i >> 3] >> (i & 7)) & 1;
1479280304Sjkim}
1480238384Sjkim
1481280304Sjkim/*
1482280304Sjkim * Interleaved point multiplication using precomputed point multiples: The
1483280304Sjkim * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1484280304Sjkim * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1485280304Sjkim * generator, using certain (large) precomputed multiples in g_pre_comp.
1486280304Sjkim * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1487280304Sjkim */
1488238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out,
1489280304Sjkim                      const felem_bytearray scalars[],
1490280304Sjkim                      const unsigned num_points, const u8 *g_scalar,
1491280304Sjkim                      const int mixed, const felem pre_comp[][17][3],
1492280304Sjkim                      const felem g_pre_comp[16][3])
1493280304Sjkim{
1494280304Sjkim    int i, skip;
1495280304Sjkim    unsigned num, gen_mul = (g_scalar != NULL);
1496280304Sjkim    felem nq[3], tmp[4];
1497280304Sjkim    limb bits;
1498280304Sjkim    u8 sign, digit;
1499238384Sjkim
1500280304Sjkim    /* set nq to the point at infinity */
1501280304Sjkim    memset(nq, 0, 3 * sizeof(felem));
1502238384Sjkim
1503280304Sjkim    /*
1504280304Sjkim     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1505280304Sjkim     * of the generator (last quarter of rounds) and additions of other
1506280304Sjkim     * points multiples (every 5th round).
1507280304Sjkim     */
1508280304Sjkim    skip = 1;                   /* save two point operations in the first
1509280304Sjkim                                 * round */
1510280304Sjkim    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1511280304Sjkim        /* double */
1512280304Sjkim        if (!skip)
1513280304Sjkim            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1514238384Sjkim
1515280304Sjkim        /* add multiples of the generator */
1516280304Sjkim        if (gen_mul && (i <= 130)) {
1517280304Sjkim            bits = get_bit(g_scalar, i + 390) << 3;
1518280304Sjkim            if (i < 130) {
1519280304Sjkim                bits |= get_bit(g_scalar, i + 260) << 2;
1520280304Sjkim                bits |= get_bit(g_scalar, i + 130) << 1;
1521280304Sjkim                bits |= get_bit(g_scalar, i);
1522280304Sjkim            }
1523280304Sjkim            /* select the point to add, in constant time */
1524280304Sjkim            select_point(bits, 16, g_pre_comp, tmp);
1525280304Sjkim            if (!skip) {
1526280304Sjkim                /* The 1 argument below is for "mixed" */
1527280304Sjkim                point_add(nq[0], nq[1], nq[2],
1528280304Sjkim                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1529280304Sjkim            } else {
1530280304Sjkim                memcpy(nq, tmp, 3 * sizeof(felem));
1531280304Sjkim                skip = 0;
1532280304Sjkim            }
1533280304Sjkim        }
1534238384Sjkim
1535280304Sjkim        /* do other additions every 5 doublings */
1536280304Sjkim        if (num_points && (i % 5 == 0)) {
1537280304Sjkim            /* loop over all scalars */
1538280304Sjkim            for (num = 0; num < num_points; ++num) {
1539280304Sjkim                bits = get_bit(scalars[num], i + 4) << 5;
1540280304Sjkim                bits |= get_bit(scalars[num], i + 3) << 4;
1541280304Sjkim                bits |= get_bit(scalars[num], i + 2) << 3;
1542280304Sjkim                bits |= get_bit(scalars[num], i + 1) << 2;
1543280304Sjkim                bits |= get_bit(scalars[num], i) << 1;
1544280304Sjkim                bits |= get_bit(scalars[num], i - 1);
1545280304Sjkim                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1546238384Sjkim
1547280304Sjkim                /*
1548280304Sjkim                 * select the point to add or subtract, in constant time
1549280304Sjkim                 */
1550280304Sjkim                select_point(digit, 17, pre_comp[num], tmp);
1551280304Sjkim                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1552280304Sjkim                                            * point */
1553280304Sjkim                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1554238384Sjkim
1555280304Sjkim                if (!skip) {
1556280304Sjkim                    point_add(nq[0], nq[1], nq[2],
1557280304Sjkim                              nq[0], nq[1], nq[2],
1558280304Sjkim                              mixed, tmp[0], tmp[1], tmp[2]);
1559280304Sjkim                } else {
1560280304Sjkim                    memcpy(nq, tmp, 3 * sizeof(felem));
1561280304Sjkim                    skip = 0;
1562280304Sjkim                }
1563280304Sjkim            }
1564280304Sjkim        }
1565280304Sjkim    }
1566280304Sjkim    felem_assign(x_out, nq[0]);
1567280304Sjkim    felem_assign(y_out, nq[1]);
1568280304Sjkim    felem_assign(z_out, nq[2]);
1569280304Sjkim}
1570238384Sjkim
1571238384Sjkim/* Precomputation for the group generator. */
1572238384Sjkimtypedef struct {
1573280304Sjkim    felem g_pre_comp[16][3];
1574280304Sjkim    int references;
1575238384Sjkim} NISTP521_PRE_COMP;
1576238384Sjkim
1577238384Sjkimconst EC_METHOD *EC_GFp_nistp521_method(void)
1578280304Sjkim{
1579280304Sjkim    static const EC_METHOD ret = {
1580280304Sjkim        EC_FLAGS_DEFAULT_OCT,
1581280304Sjkim        NID_X9_62_prime_field,
1582280304Sjkim        ec_GFp_nistp521_group_init,
1583280304Sjkim        ec_GFp_simple_group_finish,
1584280304Sjkim        ec_GFp_simple_group_clear_finish,
1585280304Sjkim        ec_GFp_nist_group_copy,
1586280304Sjkim        ec_GFp_nistp521_group_set_curve,
1587280304Sjkim        ec_GFp_simple_group_get_curve,
1588280304Sjkim        ec_GFp_simple_group_get_degree,
1589280304Sjkim        ec_GFp_simple_group_check_discriminant,
1590280304Sjkim        ec_GFp_simple_point_init,
1591280304Sjkim        ec_GFp_simple_point_finish,
1592280304Sjkim        ec_GFp_simple_point_clear_finish,
1593280304Sjkim        ec_GFp_simple_point_copy,
1594280304Sjkim        ec_GFp_simple_point_set_to_infinity,
1595280304Sjkim        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1596280304Sjkim        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1597280304Sjkim        ec_GFp_simple_point_set_affine_coordinates,
1598280304Sjkim        ec_GFp_nistp521_point_get_affine_coordinates,
1599280304Sjkim        0 /* point_set_compressed_coordinates */ ,
1600280304Sjkim        0 /* point2oct */ ,
1601280304Sjkim        0 /* oct2point */ ,
1602280304Sjkim        ec_GFp_simple_add,
1603280304Sjkim        ec_GFp_simple_dbl,
1604280304Sjkim        ec_GFp_simple_invert,
1605280304Sjkim        ec_GFp_simple_is_at_infinity,
1606280304Sjkim        ec_GFp_simple_is_on_curve,
1607280304Sjkim        ec_GFp_simple_cmp,
1608280304Sjkim        ec_GFp_simple_make_affine,
1609280304Sjkim        ec_GFp_simple_points_make_affine,
1610280304Sjkim        ec_GFp_nistp521_points_mul,
1611280304Sjkim        ec_GFp_nistp521_precompute_mult,
1612280304Sjkim        ec_GFp_nistp521_have_precompute_mult,
1613280304Sjkim        ec_GFp_nist_field_mul,
1614280304Sjkim        ec_GFp_nist_field_sqr,
1615280304Sjkim        0 /* field_div */ ,
1616280304Sjkim        0 /* field_encode */ ,
1617280304Sjkim        0 /* field_decode */ ,
1618280304Sjkim        0                       /* field_set_to_one */
1619280304Sjkim    };
1620238384Sjkim
1621280304Sjkim    return &ret;
1622280304Sjkim}
1623238384Sjkim
1624238384Sjkim/******************************************************************************/
1625280304Sjkim/*
1626280304Sjkim * FUNCTIONS TO MANAGE PRECOMPUTATION
1627238384Sjkim */
1628238384Sjkim
1629238384Sjkimstatic NISTP521_PRE_COMP *nistp521_pre_comp_new()
1630280304Sjkim{
1631280304Sjkim    NISTP521_PRE_COMP *ret = NULL;
1632280304Sjkim    ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1633280304Sjkim    if (!ret) {
1634280304Sjkim        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1635280304Sjkim        return ret;
1636280304Sjkim    }
1637280304Sjkim    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1638280304Sjkim    ret->references = 1;
1639280304Sjkim    return ret;
1640280304Sjkim}
1641238384Sjkim
1642238384Sjkimstatic void *nistp521_pre_comp_dup(void *src_)
1643280304Sjkim{
1644280304Sjkim    NISTP521_PRE_COMP *src = src_;
1645238384Sjkim
1646280304Sjkim    /* no need to actually copy, these objects never change! */
1647280304Sjkim    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1648238384Sjkim
1649280304Sjkim    return src_;
1650280304Sjkim}
1651238384Sjkim
1652238384Sjkimstatic void nistp521_pre_comp_free(void *pre_)
1653280304Sjkim{
1654280304Sjkim    int i;
1655280304Sjkim    NISTP521_PRE_COMP *pre = pre_;
1656238384Sjkim
1657280304Sjkim    if (!pre)
1658280304Sjkim        return;
1659238384Sjkim
1660280304Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1661280304Sjkim    if (i > 0)
1662280304Sjkim        return;
1663238384Sjkim
1664280304Sjkim    OPENSSL_free(pre);
1665280304Sjkim}
1666238384Sjkim
1667238384Sjkimstatic void nistp521_pre_comp_clear_free(void *pre_)
1668280304Sjkim{
1669280304Sjkim    int i;
1670280304Sjkim    NISTP521_PRE_COMP *pre = pre_;
1671238384Sjkim
1672280304Sjkim    if (!pre)
1673280304Sjkim        return;
1674238384Sjkim
1675280304Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1676280304Sjkim    if (i > 0)
1677280304Sjkim        return;
1678238384Sjkim
1679280304Sjkim    OPENSSL_cleanse(pre, sizeof(*pre));
1680280304Sjkim    OPENSSL_free(pre);
1681280304Sjkim}
1682238384Sjkim
1683238384Sjkim/******************************************************************************/
1684280304Sjkim/*
1685280304Sjkim * OPENSSL EC_METHOD FUNCTIONS
1686238384Sjkim */
1687238384Sjkim
1688238384Sjkimint ec_GFp_nistp521_group_init(EC_GROUP *group)
1689280304Sjkim{
1690280304Sjkim    int ret;
1691280304Sjkim    ret = ec_GFp_simple_group_init(group);
1692280304Sjkim    group->a_is_minus3 = 1;
1693280304Sjkim    return ret;
1694280304Sjkim}
1695238384Sjkim
1696238384Sjkimint ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1697280304Sjkim                                    const BIGNUM *a, const BIGNUM *b,
1698280304Sjkim                                    BN_CTX *ctx)
1699280304Sjkim{
1700280304Sjkim    int ret = 0;
1701280304Sjkim    BN_CTX *new_ctx = NULL;
1702280304Sjkim    BIGNUM *curve_p, *curve_a, *curve_b;
1703238384Sjkim
1704280304Sjkim    if (ctx == NULL)
1705280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1706280304Sjkim            return 0;
1707280304Sjkim    BN_CTX_start(ctx);
1708280304Sjkim    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1709280304Sjkim        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1710280304Sjkim        ((curve_b = BN_CTX_get(ctx)) == NULL))
1711280304Sjkim        goto err;
1712280304Sjkim    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1713280304Sjkim    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1714280304Sjkim    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1715280304Sjkim    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1716280304Sjkim        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1717280304Sjkim              EC_R_WRONG_CURVE_PARAMETERS);
1718280304Sjkim        goto err;
1719280304Sjkim    }
1720280304Sjkim    group->field_mod_func = BN_nist_mod_521;
1721280304Sjkim    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1722280304Sjkim err:
1723280304Sjkim    BN_CTX_end(ctx);
1724280304Sjkim    if (new_ctx != NULL)
1725280304Sjkim        BN_CTX_free(new_ctx);
1726280304Sjkim    return ret;
1727280304Sjkim}
1728238384Sjkim
1729280304Sjkim/*
1730280304Sjkim * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1731280304Sjkim * (X/Z^2, Y/Z^3)
1732280304Sjkim */
1733238384Sjkimint ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1734280304Sjkim                                                 const EC_POINT *point,
1735280304Sjkim                                                 BIGNUM *x, BIGNUM *y,
1736280304Sjkim                                                 BN_CTX *ctx)
1737280304Sjkim{
1738280304Sjkim    felem z1, z2, x_in, y_in, x_out, y_out;
1739280304Sjkim    largefelem tmp;
1740238384Sjkim
1741280304Sjkim    if (EC_POINT_is_at_infinity(group, point)) {
1742280304Sjkim        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1743280304Sjkim              EC_R_POINT_AT_INFINITY);
1744280304Sjkim        return 0;
1745280304Sjkim    }
1746280304Sjkim    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1747280304Sjkim        (!BN_to_felem(z1, &point->Z)))
1748280304Sjkim        return 0;
1749280304Sjkim    felem_inv(z2, z1);
1750280304Sjkim    felem_square(tmp, z2);
1751280304Sjkim    felem_reduce(z1, tmp);
1752280304Sjkim    felem_mul(tmp, x_in, z1);
1753280304Sjkim    felem_reduce(x_in, tmp);
1754280304Sjkim    felem_contract(x_out, x_in);
1755280304Sjkim    if (x != NULL) {
1756280304Sjkim        if (!felem_to_BN(x, x_out)) {
1757280304Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1758280304Sjkim                  ERR_R_BN_LIB);
1759280304Sjkim            return 0;
1760280304Sjkim        }
1761280304Sjkim    }
1762280304Sjkim    felem_mul(tmp, z1, z2);
1763280304Sjkim    felem_reduce(z1, tmp);
1764280304Sjkim    felem_mul(tmp, y_in, z1);
1765280304Sjkim    felem_reduce(y_in, tmp);
1766280304Sjkim    felem_contract(y_out, y_in);
1767280304Sjkim    if (y != NULL) {
1768280304Sjkim        if (!felem_to_BN(y, y_out)) {
1769280304Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1770280304Sjkim                  ERR_R_BN_LIB);
1771280304Sjkim            return 0;
1772280304Sjkim        }
1773280304Sjkim    }
1774280304Sjkim    return 1;
1775280304Sjkim}
1776238384Sjkim
1777280304Sjkim/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1778280304Sjkimstatic void make_points_affine(size_t num, felem points[][3],
1779280304Sjkim                               felem tmp_felems[])
1780280304Sjkim{
1781280304Sjkim    /*
1782280304Sjkim     * Runs in constant time, unless an input is the point at infinity (which
1783280304Sjkim     * normally shouldn't happen).
1784280304Sjkim     */
1785280304Sjkim    ec_GFp_nistp_points_make_affine_internal(num,
1786280304Sjkim                                             points,
1787280304Sjkim                                             sizeof(felem),
1788280304Sjkim                                             tmp_felems,
1789280304Sjkim                                             (void (*)(void *))felem_one,
1790280304Sjkim                                             (int (*)(const void *))
1791280304Sjkim                                             felem_is_zero_int,
1792280304Sjkim                                             (void (*)(void *, const void *))
1793280304Sjkim                                             felem_assign,
1794280304Sjkim                                             (void (*)(void *, const void *))
1795280304Sjkim                                             felem_square_reduce, (void (*)
1796280304Sjkim                                                                   (void *,
1797280304Sjkim                                                                    const void
1798280304Sjkim                                                                    *,
1799280304Sjkim                                                                    const void
1800280304Sjkim                                                                    *))
1801280304Sjkim                                             felem_mul_reduce,
1802280304Sjkim                                             (void (*)(void *, const void *))
1803280304Sjkim                                             felem_inv,
1804280304Sjkim                                             (void (*)(void *, const void *))
1805280304Sjkim                                             felem_contract);
1806280304Sjkim}
1807238384Sjkim
1808280304Sjkim/*
1809280304Sjkim * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1810280304Sjkim * values Result is stored in r (r can equal one of the inputs).
1811280304Sjkim */
1812238384Sjkimint ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1813280304Sjkim                               const BIGNUM *scalar, size_t num,
1814280304Sjkim                               const EC_POINT *points[],
1815280304Sjkim                               const BIGNUM *scalars[], BN_CTX *ctx)
1816280304Sjkim{
1817280304Sjkim    int ret = 0;
1818280304Sjkim    int j;
1819280304Sjkim    int mixed = 0;
1820280304Sjkim    BN_CTX *new_ctx = NULL;
1821280304Sjkim    BIGNUM *x, *y, *z, *tmp_scalar;
1822280304Sjkim    felem_bytearray g_secret;
1823280304Sjkim    felem_bytearray *secrets = NULL;
1824280304Sjkim    felem(*pre_comp)[17][3] = NULL;
1825280304Sjkim    felem *tmp_felems = NULL;
1826280304Sjkim    felem_bytearray tmp;
1827280304Sjkim    unsigned i, num_bytes;
1828280304Sjkim    int have_pre_comp = 0;
1829280304Sjkim    size_t num_points = num;
1830280304Sjkim    felem x_in, y_in, z_in, x_out, y_out, z_out;
1831280304Sjkim    NISTP521_PRE_COMP *pre = NULL;
1832280304Sjkim    felem(*g_pre_comp)[3] = NULL;
1833280304Sjkim    EC_POINT *generator = NULL;
1834280304Sjkim    const EC_POINT *p = NULL;
1835280304Sjkim    const BIGNUM *p_scalar = NULL;
1836238384Sjkim
1837280304Sjkim    if (ctx == NULL)
1838280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1839280304Sjkim            return 0;
1840280304Sjkim    BN_CTX_start(ctx);
1841280304Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) ||
1842280304Sjkim        ((y = BN_CTX_get(ctx)) == NULL) ||
1843280304Sjkim        ((z = BN_CTX_get(ctx)) == NULL) ||
1844280304Sjkim        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1845280304Sjkim        goto err;
1846238384Sjkim
1847280304Sjkim    if (scalar != NULL) {
1848280304Sjkim        pre = EC_EX_DATA_get_data(group->extra_data,
1849280304Sjkim                                  nistp521_pre_comp_dup,
1850280304Sjkim                                  nistp521_pre_comp_free,
1851280304Sjkim                                  nistp521_pre_comp_clear_free);
1852280304Sjkim        if (pre)
1853280304Sjkim            /* we have precomputation, try to use it */
1854280304Sjkim            g_pre_comp = &pre->g_pre_comp[0];
1855280304Sjkim        else
1856280304Sjkim            /* try to use the standard precomputation */
1857280304Sjkim            g_pre_comp = (felem(*)[3]) gmul;
1858280304Sjkim        generator = EC_POINT_new(group);
1859280304Sjkim        if (generator == NULL)
1860280304Sjkim            goto err;
1861280304Sjkim        /* get the generator from precomputation */
1862280304Sjkim        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1863280304Sjkim            !felem_to_BN(y, g_pre_comp[1][1]) ||
1864280304Sjkim            !felem_to_BN(z, g_pre_comp[1][2])) {
1865280304Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1866280304Sjkim            goto err;
1867280304Sjkim        }
1868280304Sjkim        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1869280304Sjkim                                                      generator, x, y, z,
1870280304Sjkim                                                      ctx))
1871280304Sjkim            goto err;
1872280304Sjkim        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1873280304Sjkim            /* precomputation matches generator */
1874280304Sjkim            have_pre_comp = 1;
1875280304Sjkim        else
1876280304Sjkim            /*
1877280304Sjkim             * we don't have valid precomputation: treat the generator as a
1878280304Sjkim             * random point
1879280304Sjkim             */
1880280304Sjkim            num_points++;
1881280304Sjkim    }
1882238384Sjkim
1883280304Sjkim    if (num_points > 0) {
1884280304Sjkim        if (num_points >= 2) {
1885280304Sjkim            /*
1886280304Sjkim             * unless we precompute multiples for just one point, converting
1887280304Sjkim             * those into affine form is time well spent
1888280304Sjkim             */
1889280304Sjkim            mixed = 1;
1890280304Sjkim        }
1891280304Sjkim        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1892280304Sjkim        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1893280304Sjkim        if (mixed)
1894280304Sjkim            tmp_felems =
1895280304Sjkim                OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1896280304Sjkim        if ((secrets == NULL) || (pre_comp == NULL)
1897280304Sjkim            || (mixed && (tmp_felems == NULL))) {
1898280304Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1899280304Sjkim            goto err;
1900280304Sjkim        }
1901238384Sjkim
1902280304Sjkim        /*
1903280304Sjkim         * we treat NULL scalars as 0, and NULL points as points at infinity,
1904280304Sjkim         * i.e., they contribute nothing to the linear combination
1905280304Sjkim         */
1906280304Sjkim        memset(secrets, 0, num_points * sizeof(felem_bytearray));
1907280304Sjkim        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1908280304Sjkim        for (i = 0; i < num_points; ++i) {
1909280304Sjkim            if (i == num)
1910280304Sjkim                /*
1911280304Sjkim                 * we didn't have a valid precomputation, so we pick the
1912280304Sjkim                 * generator
1913280304Sjkim                 */
1914280304Sjkim            {
1915280304Sjkim                p = EC_GROUP_get0_generator(group);
1916280304Sjkim                p_scalar = scalar;
1917280304Sjkim            } else
1918280304Sjkim                /* the i^th point */
1919280304Sjkim            {
1920280304Sjkim                p = points[i];
1921280304Sjkim                p_scalar = scalars[i];
1922280304Sjkim            }
1923280304Sjkim            if ((p_scalar != NULL) && (p != NULL)) {
1924280304Sjkim                /* reduce scalar to 0 <= scalar < 2^521 */
1925280304Sjkim                if ((BN_num_bits(p_scalar) > 521)
1926280304Sjkim                    || (BN_is_negative(p_scalar))) {
1927280304Sjkim                    /*
1928280304Sjkim                     * this is an unusual input, and we don't guarantee
1929280304Sjkim                     * constant-timeness
1930280304Sjkim                     */
1931280304Sjkim                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1932280304Sjkim                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1933280304Sjkim                        goto err;
1934280304Sjkim                    }
1935280304Sjkim                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
1936280304Sjkim                } else
1937280304Sjkim                    num_bytes = BN_bn2bin(p_scalar, tmp);
1938280304Sjkim                flip_endian(secrets[i], tmp, num_bytes);
1939280304Sjkim                /* precompute multiples */
1940280304Sjkim                if ((!BN_to_felem(x_out, &p->X)) ||
1941280304Sjkim                    (!BN_to_felem(y_out, &p->Y)) ||
1942280304Sjkim                    (!BN_to_felem(z_out, &p->Z)))
1943280304Sjkim                    goto err;
1944280304Sjkim                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1945280304Sjkim                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1946280304Sjkim                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1947280304Sjkim                for (j = 2; j <= 16; ++j) {
1948280304Sjkim                    if (j & 1) {
1949280304Sjkim                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1950280304Sjkim                                  pre_comp[i][j][2], pre_comp[i][1][0],
1951280304Sjkim                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1952280304Sjkim                                  pre_comp[i][j - 1][0],
1953280304Sjkim                                  pre_comp[i][j - 1][1],
1954280304Sjkim                                  pre_comp[i][j - 1][2]);
1955280304Sjkim                    } else {
1956280304Sjkim                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1957280304Sjkim                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
1958280304Sjkim                                     pre_comp[i][j / 2][1],
1959280304Sjkim                                     pre_comp[i][j / 2][2]);
1960280304Sjkim                    }
1961280304Sjkim                }
1962280304Sjkim            }
1963280304Sjkim        }
1964280304Sjkim        if (mixed)
1965280304Sjkim            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1966280304Sjkim    }
1967238384Sjkim
1968280304Sjkim    /* the scalar for the generator */
1969280304Sjkim    if ((scalar != NULL) && (have_pre_comp)) {
1970280304Sjkim        memset(g_secret, 0, sizeof(g_secret));
1971280304Sjkim        /* reduce scalar to 0 <= scalar < 2^521 */
1972280304Sjkim        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1973280304Sjkim            /*
1974280304Sjkim             * this is an unusual input, and we don't guarantee
1975280304Sjkim             * constant-timeness
1976280304Sjkim             */
1977280304Sjkim            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1978280304Sjkim                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1979280304Sjkim                goto err;
1980280304Sjkim            }
1981280304Sjkim            num_bytes = BN_bn2bin(tmp_scalar, tmp);
1982280304Sjkim        } else
1983280304Sjkim            num_bytes = BN_bn2bin(scalar, tmp);
1984280304Sjkim        flip_endian(g_secret, tmp, num_bytes);
1985280304Sjkim        /* do the multiplication with generator precomputation */
1986280304Sjkim        batch_mul(x_out, y_out, z_out,
1987280304Sjkim                  (const felem_bytearray(*))secrets, num_points,
1988280304Sjkim                  g_secret,
1989280304Sjkim                  mixed, (const felem(*)[17][3])pre_comp,
1990280304Sjkim                  (const felem(*)[3])g_pre_comp);
1991280304Sjkim    } else
1992280304Sjkim        /* do the multiplication without generator precomputation */
1993280304Sjkim        batch_mul(x_out, y_out, z_out,
1994280304Sjkim                  (const felem_bytearray(*))secrets, num_points,
1995280304Sjkim                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1996280304Sjkim    /* reduce the output to its unique minimal representation */
1997280304Sjkim    felem_contract(x_in, x_out);
1998280304Sjkim    felem_contract(y_in, y_out);
1999280304Sjkim    felem_contract(z_in, z_out);
2000280304Sjkim    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2001280304Sjkim        (!felem_to_BN(z, z_in))) {
2002280304Sjkim        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2003280304Sjkim        goto err;
2004280304Sjkim    }
2005280304Sjkim    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2006238384Sjkim
2007280304Sjkim err:
2008280304Sjkim    BN_CTX_end(ctx);
2009280304Sjkim    if (generator != NULL)
2010280304Sjkim        EC_POINT_free(generator);
2011280304Sjkim    if (new_ctx != NULL)
2012280304Sjkim        BN_CTX_free(new_ctx);
2013280304Sjkim    if (secrets != NULL)
2014280304Sjkim        OPENSSL_free(secrets);
2015280304Sjkim    if (pre_comp != NULL)
2016280304Sjkim        OPENSSL_free(pre_comp);
2017280304Sjkim    if (tmp_felems != NULL)
2018280304Sjkim        OPENSSL_free(tmp_felems);
2019280304Sjkim    return ret;
2020280304Sjkim}
2021238384Sjkim
2022238384Sjkimint ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2023280304Sjkim{
2024280304Sjkim    int ret = 0;
2025280304Sjkim    NISTP521_PRE_COMP *pre = NULL;
2026280304Sjkim    int i, j;
2027280304Sjkim    BN_CTX *new_ctx = NULL;
2028280304Sjkim    BIGNUM *x, *y;
2029280304Sjkim    EC_POINT *generator = NULL;
2030280304Sjkim    felem tmp_felems[16];
2031238384Sjkim
2032280304Sjkim    /* throw away old precomputation */
2033280304Sjkim    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2034280304Sjkim                         nistp521_pre_comp_free,
2035280304Sjkim                         nistp521_pre_comp_clear_free);
2036280304Sjkim    if (ctx == NULL)
2037280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2038280304Sjkim            return 0;
2039280304Sjkim    BN_CTX_start(ctx);
2040280304Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2041280304Sjkim        goto err;
2042280304Sjkim    /* get the generator */
2043280304Sjkim    if (group->generator == NULL)
2044280304Sjkim        goto err;
2045280304Sjkim    generator = EC_POINT_new(group);
2046280304Sjkim    if (generator == NULL)
2047280304Sjkim        goto err;
2048280304Sjkim    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2049280304Sjkim    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2050280304Sjkim    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2051280304Sjkim        goto err;
2052280304Sjkim    if ((pre = nistp521_pre_comp_new()) == NULL)
2053280304Sjkim        goto err;
2054280304Sjkim    /*
2055280304Sjkim     * if the generator is the standard one, use built-in precomputation
2056280304Sjkim     */
2057280304Sjkim    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2058280304Sjkim        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2059280304Sjkim        ret = 1;
2060280304Sjkim        goto err;
2061280304Sjkim    }
2062280304Sjkim    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2063280304Sjkim        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2064280304Sjkim        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2065280304Sjkim        goto err;
2066280304Sjkim    /* compute 2^130*G, 2^260*G, 2^390*G */
2067280304Sjkim    for (i = 1; i <= 4; i <<= 1) {
2068280304Sjkim        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2069280304Sjkim                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2070280304Sjkim                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2071280304Sjkim        for (j = 0; j < 129; ++j) {
2072280304Sjkim            point_double(pre->g_pre_comp[2 * i][0],
2073280304Sjkim                         pre->g_pre_comp[2 * i][1],
2074280304Sjkim                         pre->g_pre_comp[2 * i][2],
2075280304Sjkim                         pre->g_pre_comp[2 * i][0],
2076280304Sjkim                         pre->g_pre_comp[2 * i][1],
2077280304Sjkim                         pre->g_pre_comp[2 * i][2]);
2078280304Sjkim        }
2079280304Sjkim    }
2080280304Sjkim    /* g_pre_comp[0] is the point at infinity */
2081280304Sjkim    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2082280304Sjkim    /* the remaining multiples */
2083280304Sjkim    /* 2^130*G + 2^260*G */
2084280304Sjkim    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2085280304Sjkim              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2086280304Sjkim              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2087280304Sjkim              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2088280304Sjkim              pre->g_pre_comp[2][2]);
2089280304Sjkim    /* 2^130*G + 2^390*G */
2090280304Sjkim    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2091280304Sjkim              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2092280304Sjkim              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2093280304Sjkim              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2094280304Sjkim              pre->g_pre_comp[2][2]);
2095280304Sjkim    /* 2^260*G + 2^390*G */
2096280304Sjkim    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2097280304Sjkim              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2098280304Sjkim              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2099280304Sjkim              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2100280304Sjkim              pre->g_pre_comp[4][2]);
2101280304Sjkim    /* 2^130*G + 2^260*G + 2^390*G */
2102280304Sjkim    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2103280304Sjkim              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2104280304Sjkim              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2105280304Sjkim              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2106280304Sjkim              pre->g_pre_comp[2][2]);
2107280304Sjkim    for (i = 1; i < 8; ++i) {
2108280304Sjkim        /* odd multiples: add G */
2109280304Sjkim        point_add(pre->g_pre_comp[2 * i + 1][0],
2110280304Sjkim                  pre->g_pre_comp[2 * i + 1][1],
2111280304Sjkim                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2112280304Sjkim                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2113280304Sjkim                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2114280304Sjkim                  pre->g_pre_comp[1][2]);
2115280304Sjkim    }
2116280304Sjkim    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2117238384Sjkim
2118280304Sjkim    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2119280304Sjkim                             nistp521_pre_comp_free,
2120280304Sjkim                             nistp521_pre_comp_clear_free))
2121280304Sjkim        goto err;
2122280304Sjkim    ret = 1;
2123280304Sjkim    pre = NULL;
2124238384Sjkim err:
2125280304Sjkim    BN_CTX_end(ctx);
2126280304Sjkim    if (generator != NULL)
2127280304Sjkim        EC_POINT_free(generator);
2128280304Sjkim    if (new_ctx != NULL)
2129280304Sjkim        BN_CTX_free(new_ctx);
2130280304Sjkim    if (pre)
2131280304Sjkim        nistp521_pre_comp_free(pre);
2132280304Sjkim    return ret;
2133280304Sjkim}
2134238384Sjkim
2135238384Sjkimint ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2136280304Sjkim{
2137280304Sjkim    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2138280304Sjkim                            nistp521_pre_comp_free,
2139280304Sjkim                            nistp521_pre_comp_clear_free)
2140280304Sjkim        != NULL)
2141280304Sjkim        return 1;
2142280304Sjkim    else
2143280304Sjkim        return 0;
2144280304Sjkim}
2145238384Sjkim
2146238384Sjkim#else
2147280304Sjkimstatic void *dummy = &dummy;
2148238384Sjkim#endif
2149