1238384Sjkim/* crypto/ec/ecp_nistp521.c */
2238384Sjkim/*
3238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project
4238384Sjkim */
5238384Sjkim/* Copyright 2011 Google Inc.
6238384Sjkim *
7238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License");
8238384Sjkim *
9238384Sjkim * you may not use this file except in compliance with the License.
10238384Sjkim * You may obtain a copy of the License at
11238384Sjkim *
12238384Sjkim *     http://www.apache.org/licenses/LICENSE-2.0
13238384Sjkim *
14238384Sjkim *  Unless required by applicable law or agreed to in writing, software
15238384Sjkim *  distributed under the License is distributed on an "AS IS" BASIS,
16238384Sjkim *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17238384Sjkim *  See the License for the specific language governing permissions and
18238384Sjkim *  limitations under the License.
19238384Sjkim */
20238384Sjkim
21238384Sjkim/*
22238384Sjkim * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23238384Sjkim *
24238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same.
27238384Sjkim */
28238384Sjkim
29238384Sjkim#include <openssl/opensslconf.h>
30238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31238384Sjkim
32280297Sjkim# ifndef OPENSSL_SYS_VMS
33280297Sjkim#  include <stdint.h>
34280297Sjkim# else
35280297Sjkim#  include <inttypes.h>
36280297Sjkim# endif
37238384Sjkim
38280297Sjkim# include <string.h>
39280297Sjkim# include <openssl/err.h>
40280297Sjkim# include "ec_lcl.h"
41352193Sjkim# include "bn_int.h" /* bn_bn2lebinpad, bn_lebin2bn */
42238384Sjkim
43280297Sjkim# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
44238384Sjkim  /* even with gcc, the typedef won't work for 32-bit platforms */
45280297Sjkimtypedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46280297Sjkim                                 * platforms */
47280297Sjkim# else
48280297Sjkim#  error "Need GCC 3.1 or later to define type uint128_t"
49280297Sjkim# endif
50238384Sjkim
51238384Sjkimtypedef uint8_t u8;
52238384Sjkimtypedef uint64_t u64;
53238384Sjkim
54280297Sjkim/*
55280297Sjkim * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56280297Sjkim * element of this field into 66 bytes where the most significant byte
57280297Sjkim * contains only a single bit. We call this an felem_bytearray.
58280297Sjkim */
59238384Sjkim
60238384Sjkimtypedef u8 felem_bytearray[66];
61238384Sjkim
62280297Sjkim/*
63280297Sjkim * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64280297Sjkim * These values are big-endian.
65280297Sjkim */
66280297Sjkimstatic const felem_bytearray nistp521_curve_params[5] = {
67280297Sjkim    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75280297Sjkim     0xff, 0xff},
76280297Sjkim    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83280297Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84280297Sjkim     0xff, 0xfc},
85280297Sjkim    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86280297Sjkim     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87280297Sjkim     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88280297Sjkim     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89280297Sjkim     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90280297Sjkim     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91280297Sjkim     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92280297Sjkim     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93280297Sjkim     0x3f, 0x00},
94280297Sjkim    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95280297Sjkim     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96280297Sjkim     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97280297Sjkim     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98280297Sjkim     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99280297Sjkim     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100280297Sjkim     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101280297Sjkim     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102280297Sjkim     0xbd, 0x66},
103280297Sjkim    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104280297Sjkim     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105280297Sjkim     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106280297Sjkim     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107280297Sjkim     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108280297Sjkim     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109280297Sjkim     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110280297Sjkim     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111280297Sjkim     0x66, 0x50}
112280297Sjkim};
113238384Sjkim
114280297Sjkim/*-
115280297Sjkim * The representation of field elements.
116238384Sjkim * ------------------------------------
117238384Sjkim *
118238384Sjkim * We represent field elements with nine values. These values are either 64 or
119238384Sjkim * 128 bits and the field element represented is:
120238384Sjkim *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121238384Sjkim * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122238384Sjkim * 58 bits apart, but are greater than 58 bits in length, the most significant
123238384Sjkim * bits of each limb overlap with the least significant bits of the next.
124238384Sjkim *
125238384Sjkim * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126238384Sjkim * 'largefelem' */
127238384Sjkim
128280297Sjkim# define NLIMBS 9
129238384Sjkim
130238384Sjkimtypedef uint64_t limb;
131238384Sjkimtypedef limb felem[NLIMBS];
132238384Sjkimtypedef uint128_t largefelem[NLIMBS];
133238384Sjkim
134238384Sjkimstatic const limb bottom57bits = 0x1ffffffffffffff;
135238384Sjkimstatic const limb bottom58bits = 0x3ffffffffffffff;
136238384Sjkim
137280297Sjkim/*
138280297Sjkim * bin66_to_felem takes a little-endian byte array and converts it into felem
139280297Sjkim * form. This assumes that the CPU is little-endian.
140280297Sjkim */
141238384Sjkimstatic void bin66_to_felem(felem out, const u8 in[66])
142280297Sjkim{
143280297Sjkim    out[0] = (*((limb *) & in[0])) & bottom58bits;
144280297Sjkim    out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145280297Sjkim    out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146280297Sjkim    out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147280297Sjkim    out[4] = (*((limb *) & in[29])) & bottom58bits;
148280297Sjkim    out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149280297Sjkim    out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150280297Sjkim    out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151280297Sjkim    out[8] = (*((limb *) & in[58])) & bottom57bits;
152280297Sjkim}
153238384Sjkim
154280297Sjkim/*
155280297Sjkim * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156280297Sjkim * array. This assumes that the CPU is little-endian.
157280297Sjkim */
158238384Sjkimstatic void felem_to_bin66(u8 out[66], const felem in)
159280297Sjkim{
160280297Sjkim    memset(out, 0, 66);
161280297Sjkim    (*((limb *) & out[0])) = in[0];
162280297Sjkim    (*((limb *) & out[7])) |= in[1] << 2;
163280297Sjkim    (*((limb *) & out[14])) |= in[2] << 4;
164280297Sjkim    (*((limb *) & out[21])) |= in[3] << 6;
165280297Sjkim    (*((limb *) & out[29])) = in[4];
166280297Sjkim    (*((limb *) & out[36])) |= in[5] << 2;
167280297Sjkim    (*((limb *) & out[43])) |= in[6] << 4;
168280297Sjkim    (*((limb *) & out[50])) |= in[7] << 6;
169280297Sjkim    (*((limb *) & out[58])) = in[8];
170280297Sjkim}
171238384Sjkim
172238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
173238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn)
174280297Sjkim{
175280297Sjkim    felem_bytearray b_out;
176352193Sjkim    int num_bytes;
177238384Sjkim
178352193Sjkim    if (BN_is_negative(bn)) {
179280297Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180280297Sjkim        return 0;
181280297Sjkim    }
182352193Sjkim    num_bytes = bn_bn2lebinpad(bn, b_out, sizeof(b_out));
183352193Sjkim    if (num_bytes < 0) {
184280297Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185280297Sjkim        return 0;
186280297Sjkim    }
187280297Sjkim    bin66_to_felem(out, b_out);
188280297Sjkim    return 1;
189280297Sjkim}
190238384Sjkim
191238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
192238384Sjkimstatic BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
193280297Sjkim{
194352193Sjkim    felem_bytearray b_out;
195352193Sjkim    felem_to_bin66(b_out, in);
196352193Sjkim    return bn_lebin2bn(b_out, sizeof(b_out), out);
197280297Sjkim}
198238384Sjkim
199280297Sjkim/*-
200280297Sjkim * Field operations
201280297Sjkim * ----------------
202280297Sjkim */
203238384Sjkim
204238384Sjkimstatic void felem_one(felem out)
205280297Sjkim{
206280297Sjkim    out[0] = 1;
207280297Sjkim    out[1] = 0;
208280297Sjkim    out[2] = 0;
209280297Sjkim    out[3] = 0;
210280297Sjkim    out[4] = 0;
211280297Sjkim    out[5] = 0;
212280297Sjkim    out[6] = 0;
213280297Sjkim    out[7] = 0;
214280297Sjkim    out[8] = 0;
215280297Sjkim}
216238384Sjkim
217238384Sjkimstatic void felem_assign(felem out, const felem in)
218280297Sjkim{
219280297Sjkim    out[0] = in[0];
220280297Sjkim    out[1] = in[1];
221280297Sjkim    out[2] = in[2];
222280297Sjkim    out[3] = in[3];
223280297Sjkim    out[4] = in[4];
224280297Sjkim    out[5] = in[5];
225280297Sjkim    out[6] = in[6];
226280297Sjkim    out[7] = in[7];
227280297Sjkim    out[8] = in[8];
228280297Sjkim}
229238384Sjkim
230238384Sjkim/* felem_sum64 sets out = out + in. */
231238384Sjkimstatic void felem_sum64(felem out, const felem in)
232280297Sjkim{
233280297Sjkim    out[0] += in[0];
234280297Sjkim    out[1] += in[1];
235280297Sjkim    out[2] += in[2];
236280297Sjkim    out[3] += in[3];
237280297Sjkim    out[4] += in[4];
238280297Sjkim    out[5] += in[5];
239280297Sjkim    out[6] += in[6];
240280297Sjkim    out[7] += in[7];
241280297Sjkim    out[8] += in[8];
242280297Sjkim}
243238384Sjkim
244238384Sjkim/* felem_scalar sets out = in * scalar */
245238384Sjkimstatic void felem_scalar(felem out, const felem in, limb scalar)
246280297Sjkim{
247280297Sjkim    out[0] = in[0] * scalar;
248280297Sjkim    out[1] = in[1] * scalar;
249280297Sjkim    out[2] = in[2] * scalar;
250280297Sjkim    out[3] = in[3] * scalar;
251280297Sjkim    out[4] = in[4] * scalar;
252280297Sjkim    out[5] = in[5] * scalar;
253280297Sjkim    out[6] = in[6] * scalar;
254280297Sjkim    out[7] = in[7] * scalar;
255280297Sjkim    out[8] = in[8] * scalar;
256280297Sjkim}
257238384Sjkim
258238384Sjkim/* felem_scalar64 sets out = out * scalar */
259238384Sjkimstatic void felem_scalar64(felem out, limb scalar)
260280297Sjkim{
261280297Sjkim    out[0] *= scalar;
262280297Sjkim    out[1] *= scalar;
263280297Sjkim    out[2] *= scalar;
264280297Sjkim    out[3] *= scalar;
265280297Sjkim    out[4] *= scalar;
266280297Sjkim    out[5] *= scalar;
267280297Sjkim    out[6] *= scalar;
268280297Sjkim    out[7] *= scalar;
269280297Sjkim    out[8] *= scalar;
270280297Sjkim}
271238384Sjkim
272238384Sjkim/* felem_scalar128 sets out = out * scalar */
273238384Sjkimstatic void felem_scalar128(largefelem out, limb scalar)
274280297Sjkim{
275280297Sjkim    out[0] *= scalar;
276280297Sjkim    out[1] *= scalar;
277280297Sjkim    out[2] *= scalar;
278280297Sjkim    out[3] *= scalar;
279280297Sjkim    out[4] *= scalar;
280280297Sjkim    out[5] *= scalar;
281280297Sjkim    out[6] *= scalar;
282280297Sjkim    out[7] *= scalar;
283280297Sjkim    out[8] *= scalar;
284280297Sjkim}
285238384Sjkim
286280297Sjkim/*-
287280297Sjkim * felem_neg sets |out| to |-in|
288238384Sjkim * On entry:
289238384Sjkim *   in[i] < 2^59 + 2^14
290238384Sjkim * On exit:
291238384Sjkim *   out[i] < 2^62
292238384Sjkim */
293238384Sjkimstatic void felem_neg(felem out, const felem in)
294280297Sjkim{
295280297Sjkim    /* In order to prevent underflow, we subtract from 0 mod p. */
296280297Sjkim    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
297280297Sjkim    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
298238384Sjkim
299280297Sjkim    out[0] = two62m3 - in[0];
300280297Sjkim    out[1] = two62m2 - in[1];
301280297Sjkim    out[2] = two62m2 - in[2];
302280297Sjkim    out[3] = two62m2 - in[3];
303280297Sjkim    out[4] = two62m2 - in[4];
304280297Sjkim    out[5] = two62m2 - in[5];
305280297Sjkim    out[6] = two62m2 - in[6];
306280297Sjkim    out[7] = two62m2 - in[7];
307280297Sjkim    out[8] = two62m2 - in[8];
308280297Sjkim}
309238384Sjkim
310280297Sjkim/*-
311280297Sjkim * felem_diff64 subtracts |in| from |out|
312238384Sjkim * On entry:
313238384Sjkim *   in[i] < 2^59 + 2^14
314238384Sjkim * On exit:
315238384Sjkim *   out[i] < out[i] + 2^62
316238384Sjkim */
317238384Sjkimstatic void felem_diff64(felem out, const felem in)
318280297Sjkim{
319280297Sjkim    /*
320280297Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
321280297Sjkim     */
322280297Sjkim    static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
323280297Sjkim    static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
324238384Sjkim
325280297Sjkim    out[0] += two62m3 - in[0];
326280297Sjkim    out[1] += two62m2 - in[1];
327280297Sjkim    out[2] += two62m2 - in[2];
328280297Sjkim    out[3] += two62m2 - in[3];
329280297Sjkim    out[4] += two62m2 - in[4];
330280297Sjkim    out[5] += two62m2 - in[5];
331280297Sjkim    out[6] += two62m2 - in[6];
332280297Sjkim    out[7] += two62m2 - in[7];
333280297Sjkim    out[8] += two62m2 - in[8];
334280297Sjkim}
335238384Sjkim
336280297Sjkim/*-
337280297Sjkim * felem_diff_128_64 subtracts |in| from |out|
338238384Sjkim * On entry:
339238384Sjkim *   in[i] < 2^62 + 2^17
340238384Sjkim * On exit:
341238384Sjkim *   out[i] < out[i] + 2^63
342238384Sjkim */
343238384Sjkimstatic void felem_diff_128_64(largefelem out, const felem in)
344280297Sjkim{
345280297Sjkim    /*
346348343Sjkim     * In order to prevent underflow, we add 64p mod p (which is equivalent
347348343Sjkim     * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348348343Sjkim     * digit number with all bits set to 1. See "The representation of field
349348343Sjkim     * elements" comment above for a description of how limbs are used to
350348343Sjkim     * represent a number. 64p is represented with 8 limbs containing a number
351348343Sjkim     * with 58 bits set and one limb with a number with 57 bits set.
352280297Sjkim     */
353348343Sjkim    static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
354348343Sjkim    static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
355238384Sjkim
356280297Sjkim    out[0] += two63m6 - in[0];
357280297Sjkim    out[1] += two63m5 - in[1];
358280297Sjkim    out[2] += two63m5 - in[2];
359280297Sjkim    out[3] += two63m5 - in[3];
360280297Sjkim    out[4] += two63m5 - in[4];
361280297Sjkim    out[5] += two63m5 - in[5];
362280297Sjkim    out[6] += two63m5 - in[6];
363280297Sjkim    out[7] += two63m5 - in[7];
364280297Sjkim    out[8] += two63m5 - in[8];
365280297Sjkim}
366238384Sjkim
367280297Sjkim/*-
368280297Sjkim * felem_diff_128_64 subtracts |in| from |out|
369238384Sjkim * On entry:
370238384Sjkim *   in[i] < 2^126
371238384Sjkim * On exit:
372238384Sjkim *   out[i] < out[i] + 2^127 - 2^69
373238384Sjkim */
374238384Sjkimstatic void felem_diff128(largefelem out, const largefelem in)
375280297Sjkim{
376280297Sjkim    /*
377280297Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
378280297Sjkim     */
379280297Sjkim    static const uint128_t two127m70 =
380280297Sjkim        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
381280297Sjkim    static const uint128_t two127m69 =
382280297Sjkim        (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
383238384Sjkim
384280297Sjkim    out[0] += (two127m70 - in[0]);
385280297Sjkim    out[1] += (two127m69 - in[1]);
386280297Sjkim    out[2] += (two127m69 - in[2]);
387280297Sjkim    out[3] += (two127m69 - in[3]);
388280297Sjkim    out[4] += (two127m69 - in[4]);
389280297Sjkim    out[5] += (two127m69 - in[5]);
390280297Sjkim    out[6] += (two127m69 - in[6]);
391280297Sjkim    out[7] += (two127m69 - in[7]);
392280297Sjkim    out[8] += (two127m69 - in[8]);
393280297Sjkim}
394238384Sjkim
395280297Sjkim/*-
396280297Sjkim * felem_square sets |out| = |in|^2
397238384Sjkim * On entry:
398238384Sjkim *   in[i] < 2^62
399238384Sjkim * On exit:
400238384Sjkim *   out[i] < 17 * max(in[i]) * max(in[i])
401238384Sjkim */
402238384Sjkimstatic void felem_square(largefelem out, const felem in)
403280297Sjkim{
404280297Sjkim    felem inx2, inx4;
405280297Sjkim    felem_scalar(inx2, in, 2);
406280297Sjkim    felem_scalar(inx4, in, 4);
407238384Sjkim
408280297Sjkim    /*-
409280297Sjkim     * We have many cases were we want to do
410280297Sjkim     *   in[x] * in[y] +
411280297Sjkim     *   in[y] * in[x]
412280297Sjkim     * This is obviously just
413280297Sjkim     *   2 * in[x] * in[y]
414280297Sjkim     * However, rather than do the doubling on the 128 bit result, we
415280297Sjkim     * double one of the inputs to the multiplication by reading from
416280297Sjkim     * |inx2|
417280297Sjkim     */
418238384Sjkim
419280297Sjkim    out[0] = ((uint128_t) in[0]) * in[0];
420280297Sjkim    out[1] = ((uint128_t) in[0]) * inx2[1];
421280297Sjkim    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
422280297Sjkim    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
423280297Sjkim    out[4] = ((uint128_t) in[0]) * inx2[4] +
424280297Sjkim        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
425280297Sjkim    out[5] = ((uint128_t) in[0]) * inx2[5] +
426280297Sjkim        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
427280297Sjkim    out[6] = ((uint128_t) in[0]) * inx2[6] +
428280297Sjkim        ((uint128_t) in[1]) * inx2[5] +
429280297Sjkim        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
430280297Sjkim    out[7] = ((uint128_t) in[0]) * inx2[7] +
431280297Sjkim        ((uint128_t) in[1]) * inx2[6] +
432280297Sjkim        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
433280297Sjkim    out[8] = ((uint128_t) in[0]) * inx2[8] +
434280297Sjkim        ((uint128_t) in[1]) * inx2[7] +
435280297Sjkim        ((uint128_t) in[2]) * inx2[6] +
436280297Sjkim        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
437238384Sjkim
438280297Sjkim    /*
439280297Sjkim     * The remaining limbs fall above 2^521, with the first falling at 2^522.
440280297Sjkim     * They correspond to locations one bit up from the limbs produced above
441280297Sjkim     * so we would have to multiply by two to align them. Again, rather than
442280297Sjkim     * operate on the 128-bit result, we double one of the inputs to the
443280297Sjkim     * multiplication. If we want to double for both this reason, and the
444280297Sjkim     * reason above, then we end up multiplying by four.
445280297Sjkim     */
446238384Sjkim
447280297Sjkim    /* 9 */
448280297Sjkim    out[0] += ((uint128_t) in[1]) * inx4[8] +
449280297Sjkim        ((uint128_t) in[2]) * inx4[7] +
450280297Sjkim        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
451238384Sjkim
452280297Sjkim    /* 10 */
453280297Sjkim    out[1] += ((uint128_t) in[2]) * inx4[8] +
454280297Sjkim        ((uint128_t) in[3]) * inx4[7] +
455280297Sjkim        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
456238384Sjkim
457280297Sjkim    /* 11 */
458280297Sjkim    out[2] += ((uint128_t) in[3]) * inx4[8] +
459280297Sjkim        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
460238384Sjkim
461280297Sjkim    /* 12 */
462280297Sjkim    out[3] += ((uint128_t) in[4]) * inx4[8] +
463280297Sjkim        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
464238384Sjkim
465280297Sjkim    /* 13 */
466280297Sjkim    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
467238384Sjkim
468280297Sjkim    /* 14 */
469280297Sjkim    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
470238384Sjkim
471280297Sjkim    /* 15 */
472280297Sjkim    out[6] += ((uint128_t) in[7]) * inx4[8];
473238384Sjkim
474280297Sjkim    /* 16 */
475280297Sjkim    out[7] += ((uint128_t) in[8]) * inx2[8];
476280297Sjkim}
477238384Sjkim
478280297Sjkim/*-
479280297Sjkim * felem_mul sets |out| = |in1| * |in2|
480238384Sjkim * On entry:
481238384Sjkim *   in1[i] < 2^64
482238384Sjkim *   in2[i] < 2^63
483238384Sjkim * On exit:
484238384Sjkim *   out[i] < 17 * max(in1[i]) * max(in2[i])
485238384Sjkim */
486238384Sjkimstatic void felem_mul(largefelem out, const felem in1, const felem in2)
487280297Sjkim{
488280297Sjkim    felem in2x2;
489280297Sjkim    felem_scalar(in2x2, in2, 2);
490238384Sjkim
491280297Sjkim    out[0] = ((uint128_t) in1[0]) * in2[0];
492238384Sjkim
493280297Sjkim    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
494238384Sjkim
495280297Sjkim    out[2] = ((uint128_t) in1[0]) * in2[2] +
496280297Sjkim        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
497238384Sjkim
498280297Sjkim    out[3] = ((uint128_t) in1[0]) * in2[3] +
499280297Sjkim        ((uint128_t) in1[1]) * in2[2] +
500280297Sjkim        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
501238384Sjkim
502280297Sjkim    out[4] = ((uint128_t) in1[0]) * in2[4] +
503280297Sjkim        ((uint128_t) in1[1]) * in2[3] +
504280297Sjkim        ((uint128_t) in1[2]) * in2[2] +
505280297Sjkim        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
506238384Sjkim
507280297Sjkim    out[5] = ((uint128_t) in1[0]) * in2[5] +
508280297Sjkim        ((uint128_t) in1[1]) * in2[4] +
509280297Sjkim        ((uint128_t) in1[2]) * in2[3] +
510280297Sjkim        ((uint128_t) in1[3]) * in2[2] +
511280297Sjkim        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
512238384Sjkim
513280297Sjkim    out[6] = ((uint128_t) in1[0]) * in2[6] +
514280297Sjkim        ((uint128_t) in1[1]) * in2[5] +
515280297Sjkim        ((uint128_t) in1[2]) * in2[4] +
516280297Sjkim        ((uint128_t) in1[3]) * in2[3] +
517280297Sjkim        ((uint128_t) in1[4]) * in2[2] +
518280297Sjkim        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
519238384Sjkim
520280297Sjkim    out[7] = ((uint128_t) in1[0]) * in2[7] +
521280297Sjkim        ((uint128_t) in1[1]) * in2[6] +
522280297Sjkim        ((uint128_t) in1[2]) * in2[5] +
523280297Sjkim        ((uint128_t) in1[3]) * in2[4] +
524280297Sjkim        ((uint128_t) in1[4]) * in2[3] +
525280297Sjkim        ((uint128_t) in1[5]) * in2[2] +
526280297Sjkim        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
527238384Sjkim
528280297Sjkim    out[8] = ((uint128_t) in1[0]) * in2[8] +
529280297Sjkim        ((uint128_t) in1[1]) * in2[7] +
530280297Sjkim        ((uint128_t) in1[2]) * in2[6] +
531280297Sjkim        ((uint128_t) in1[3]) * in2[5] +
532280297Sjkim        ((uint128_t) in1[4]) * in2[4] +
533280297Sjkim        ((uint128_t) in1[5]) * in2[3] +
534280297Sjkim        ((uint128_t) in1[6]) * in2[2] +
535280297Sjkim        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
536238384Sjkim
537280297Sjkim    /* See comment in felem_square about the use of in2x2 here */
538238384Sjkim
539280297Sjkim    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
540280297Sjkim        ((uint128_t) in1[2]) * in2x2[7] +
541280297Sjkim        ((uint128_t) in1[3]) * in2x2[6] +
542280297Sjkim        ((uint128_t) in1[4]) * in2x2[5] +
543280297Sjkim        ((uint128_t) in1[5]) * in2x2[4] +
544280297Sjkim        ((uint128_t) in1[6]) * in2x2[3] +
545280297Sjkim        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
546238384Sjkim
547280297Sjkim    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
548280297Sjkim        ((uint128_t) in1[3]) * in2x2[7] +
549280297Sjkim        ((uint128_t) in1[4]) * in2x2[6] +
550280297Sjkim        ((uint128_t) in1[5]) * in2x2[5] +
551280297Sjkim        ((uint128_t) in1[6]) * in2x2[4] +
552280297Sjkim        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
553238384Sjkim
554280297Sjkim    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
555280297Sjkim        ((uint128_t) in1[4]) * in2x2[7] +
556280297Sjkim        ((uint128_t) in1[5]) * in2x2[6] +
557280297Sjkim        ((uint128_t) in1[6]) * in2x2[5] +
558280297Sjkim        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
559238384Sjkim
560280297Sjkim    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
561280297Sjkim        ((uint128_t) in1[5]) * in2x2[7] +
562280297Sjkim        ((uint128_t) in1[6]) * in2x2[6] +
563280297Sjkim        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
564238384Sjkim
565280297Sjkim    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
566280297Sjkim        ((uint128_t) in1[6]) * in2x2[7] +
567280297Sjkim        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
568238384Sjkim
569280297Sjkim    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
570280297Sjkim        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
571238384Sjkim
572280297Sjkim    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
573280297Sjkim        ((uint128_t) in1[8]) * in2x2[7];
574238384Sjkim
575280297Sjkim    out[7] += ((uint128_t) in1[8]) * in2x2[8];
576280297Sjkim}
577238384Sjkim
578238384Sjkimstatic const limb bottom52bits = 0xfffffffffffff;
579238384Sjkim
580280297Sjkim/*-
581280297Sjkim * felem_reduce converts a largefelem to an felem.
582238384Sjkim * On entry:
583238384Sjkim *   in[i] < 2^128
584238384Sjkim * On exit:
585238384Sjkim *   out[i] < 2^59 + 2^14
586238384Sjkim */
587238384Sjkimstatic void felem_reduce(felem out, const largefelem in)
588280297Sjkim{
589280297Sjkim    u64 overflow1, overflow2;
590238384Sjkim
591280297Sjkim    out[0] = ((limb) in[0]) & bottom58bits;
592280297Sjkim    out[1] = ((limb) in[1]) & bottom58bits;
593280297Sjkim    out[2] = ((limb) in[2]) & bottom58bits;
594280297Sjkim    out[3] = ((limb) in[3]) & bottom58bits;
595280297Sjkim    out[4] = ((limb) in[4]) & bottom58bits;
596280297Sjkim    out[5] = ((limb) in[5]) & bottom58bits;
597280297Sjkim    out[6] = ((limb) in[6]) & bottom58bits;
598280297Sjkim    out[7] = ((limb) in[7]) & bottom58bits;
599280297Sjkim    out[8] = ((limb) in[8]) & bottom58bits;
600238384Sjkim
601280297Sjkim    /* out[i] < 2^58 */
602238384Sjkim
603280297Sjkim    out[1] += ((limb) in[0]) >> 58;
604280297Sjkim    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
605280297Sjkim    /*-
606280297Sjkim     * out[1] < 2^58 + 2^6 + 2^58
607280297Sjkim     *        = 2^59 + 2^6
608280297Sjkim     */
609280297Sjkim    out[2] += ((limb) (in[0] >> 64)) >> 52;
610238384Sjkim
611280297Sjkim    out[2] += ((limb) in[1]) >> 58;
612280297Sjkim    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
613280297Sjkim    out[3] += ((limb) (in[1] >> 64)) >> 52;
614238384Sjkim
615280297Sjkim    out[3] += ((limb) in[2]) >> 58;
616280297Sjkim    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
617280297Sjkim    out[4] += ((limb) (in[2] >> 64)) >> 52;
618238384Sjkim
619280297Sjkim    out[4] += ((limb) in[3]) >> 58;
620280297Sjkim    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
621280297Sjkim    out[5] += ((limb) (in[3] >> 64)) >> 52;
622238384Sjkim
623280297Sjkim    out[5] += ((limb) in[4]) >> 58;
624280297Sjkim    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
625280297Sjkim    out[6] += ((limb) (in[4] >> 64)) >> 52;
626238384Sjkim
627280297Sjkim    out[6] += ((limb) in[5]) >> 58;
628280297Sjkim    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
629280297Sjkim    out[7] += ((limb) (in[5] >> 64)) >> 52;
630238384Sjkim
631280297Sjkim    out[7] += ((limb) in[6]) >> 58;
632280297Sjkim    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
633280297Sjkim    out[8] += ((limb) (in[6] >> 64)) >> 52;
634238384Sjkim
635280297Sjkim    out[8] += ((limb) in[7]) >> 58;
636280297Sjkim    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
637280297Sjkim    /*-
638280297Sjkim     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
639280297Sjkim     *            < 2^59 + 2^13
640280297Sjkim     */
641280297Sjkim    overflow1 = ((limb) (in[7] >> 64)) >> 52;
642238384Sjkim
643280297Sjkim    overflow1 += ((limb) in[8]) >> 58;
644280297Sjkim    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
645280297Sjkim    overflow2 = ((limb) (in[8] >> 64)) >> 52;
646238384Sjkim
647280297Sjkim    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
648280297Sjkim    overflow2 <<= 1;            /* overflow2 < 2^13 */
649238384Sjkim
650280297Sjkim    out[0] += overflow1;        /* out[0] < 2^60 */
651280297Sjkim    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
652238384Sjkim
653280297Sjkim    out[1] += out[0] >> 58;
654280297Sjkim    out[0] &= bottom58bits;
655280297Sjkim    /*-
656280297Sjkim     * out[0] < 2^58
657280297Sjkim     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
658280297Sjkim     *        < 2^59 + 2^14
659280297Sjkim     */
660280297Sjkim}
661238384Sjkim
662238384Sjkimstatic void felem_square_reduce(felem out, const felem in)
663280297Sjkim{
664280297Sjkim    largefelem tmp;
665280297Sjkim    felem_square(tmp, in);
666280297Sjkim    felem_reduce(out, tmp);
667280297Sjkim}
668238384Sjkim
669238384Sjkimstatic void felem_mul_reduce(felem out, const felem in1, const felem in2)
670280297Sjkim{
671280297Sjkim    largefelem tmp;
672280297Sjkim    felem_mul(tmp, in1, in2);
673280297Sjkim    felem_reduce(out, tmp);
674280297Sjkim}
675238384Sjkim
676280297Sjkim/*-
677280297Sjkim * felem_inv calculates |out| = |in|^{-1}
678238384Sjkim *
679238384Sjkim * Based on Fermat's Little Theorem:
680238384Sjkim *   a^p = a (mod p)
681238384Sjkim *   a^{p-1} = 1 (mod p)
682238384Sjkim *   a^{p-2} = a^{-1} (mod p)
683238384Sjkim */
684238384Sjkimstatic void felem_inv(felem out, const felem in)
685280297Sjkim{
686280297Sjkim    felem ftmp, ftmp2, ftmp3, ftmp4;
687280297Sjkim    largefelem tmp;
688280297Sjkim    unsigned i;
689238384Sjkim
690280297Sjkim    felem_square(tmp, in);
691280297Sjkim    felem_reduce(ftmp, tmp);    /* 2^1 */
692280297Sjkim    felem_mul(tmp, in, ftmp);
693280297Sjkim    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
694280297Sjkim    felem_assign(ftmp2, ftmp);
695280297Sjkim    felem_square(tmp, ftmp);
696280297Sjkim    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
697280297Sjkim    felem_mul(tmp, in, ftmp);
698280297Sjkim    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
699280297Sjkim    felem_square(tmp, ftmp);
700280297Sjkim    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
701238384Sjkim
702280297Sjkim    felem_square(tmp, ftmp2);
703280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
704280297Sjkim    felem_square(tmp, ftmp3);
705280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
706280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
707280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
708238384Sjkim
709280297Sjkim    felem_assign(ftmp2, ftmp3);
710280297Sjkim    felem_square(tmp, ftmp3);
711280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
712280297Sjkim    felem_square(tmp, ftmp3);
713280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
714280297Sjkim    felem_square(tmp, ftmp3);
715280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
716280297Sjkim    felem_square(tmp, ftmp3);
717280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
718280297Sjkim    felem_assign(ftmp4, ftmp3);
719280297Sjkim    felem_mul(tmp, ftmp3, ftmp);
720280297Sjkim    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
721280297Sjkim    felem_square(tmp, ftmp4);
722280297Sjkim    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
723280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
724280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
725280297Sjkim    felem_assign(ftmp2, ftmp3);
726238384Sjkim
727280297Sjkim    for (i = 0; i < 8; i++) {
728280297Sjkim        felem_square(tmp, ftmp3);
729280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
730280297Sjkim    }
731280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
732280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
733280297Sjkim    felem_assign(ftmp2, ftmp3);
734238384Sjkim
735280297Sjkim    for (i = 0; i < 16; i++) {
736280297Sjkim        felem_square(tmp, ftmp3);
737280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
738280297Sjkim    }
739280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
740280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
741280297Sjkim    felem_assign(ftmp2, ftmp3);
742238384Sjkim
743280297Sjkim    for (i = 0; i < 32; i++) {
744280297Sjkim        felem_square(tmp, ftmp3);
745280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
746280297Sjkim    }
747280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
748280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
749280297Sjkim    felem_assign(ftmp2, ftmp3);
750238384Sjkim
751280297Sjkim    for (i = 0; i < 64; i++) {
752280297Sjkim        felem_square(tmp, ftmp3);
753280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
754280297Sjkim    }
755280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
756280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
757280297Sjkim    felem_assign(ftmp2, ftmp3);
758238384Sjkim
759280297Sjkim    for (i = 0; i < 128; i++) {
760280297Sjkim        felem_square(tmp, ftmp3);
761280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
762280297Sjkim    }
763280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
764280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
765280297Sjkim    felem_assign(ftmp2, ftmp3);
766238384Sjkim
767280297Sjkim    for (i = 0; i < 256; i++) {
768280297Sjkim        felem_square(tmp, ftmp3);
769280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
770280297Sjkim    }
771280297Sjkim    felem_mul(tmp, ftmp3, ftmp2);
772280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
773238384Sjkim
774280297Sjkim    for (i = 0; i < 9; i++) {
775280297Sjkim        felem_square(tmp, ftmp3);
776280297Sjkim        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
777280297Sjkim    }
778280297Sjkim    felem_mul(tmp, ftmp3, ftmp4);
779280297Sjkim    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
780280297Sjkim    felem_mul(tmp, ftmp3, in);
781280297Sjkim    felem_reduce(out, tmp);     /* 2^512 - 3 */
782238384Sjkim}
783238384Sjkim
784238384Sjkim/* This is 2^521-1, expressed as an felem */
785280297Sjkimstatic const felem kPrime = {
786280297Sjkim    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
787280297Sjkim    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
788280297Sjkim    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
789280297Sjkim};
790238384Sjkim
791280297Sjkim/*-
792280297Sjkim * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
793238384Sjkim * otherwise.
794238384Sjkim * On entry:
795238384Sjkim *   in[i] < 2^59 + 2^14
796238384Sjkim */
797238384Sjkimstatic limb felem_is_zero(const felem in)
798280297Sjkim{
799280297Sjkim    felem ftmp;
800280297Sjkim    limb is_zero, is_p;
801280297Sjkim    felem_assign(ftmp, in);
802238384Sjkim
803280297Sjkim    ftmp[0] += ftmp[8] >> 57;
804280297Sjkim    ftmp[8] &= bottom57bits;
805280297Sjkim    /* ftmp[8] < 2^57 */
806280297Sjkim    ftmp[1] += ftmp[0] >> 58;
807280297Sjkim    ftmp[0] &= bottom58bits;
808280297Sjkim    ftmp[2] += ftmp[1] >> 58;
809280297Sjkim    ftmp[1] &= bottom58bits;
810280297Sjkim    ftmp[3] += ftmp[2] >> 58;
811280297Sjkim    ftmp[2] &= bottom58bits;
812280297Sjkim    ftmp[4] += ftmp[3] >> 58;
813280297Sjkim    ftmp[3] &= bottom58bits;
814280297Sjkim    ftmp[5] += ftmp[4] >> 58;
815280297Sjkim    ftmp[4] &= bottom58bits;
816280297Sjkim    ftmp[6] += ftmp[5] >> 58;
817280297Sjkim    ftmp[5] &= bottom58bits;
818280297Sjkim    ftmp[7] += ftmp[6] >> 58;
819280297Sjkim    ftmp[6] &= bottom58bits;
820280297Sjkim    ftmp[8] += ftmp[7] >> 58;
821280297Sjkim    ftmp[7] &= bottom58bits;
822280297Sjkim    /* ftmp[8] < 2^57 + 4 */
823238384Sjkim
824280297Sjkim    /*
825280297Sjkim     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
826280297Sjkim     * than our bound for ftmp[8]. Therefore we only have to check if the
827280297Sjkim     * zero is zero or 2^521-1.
828280297Sjkim     */
829238384Sjkim
830280297Sjkim    is_zero = 0;
831280297Sjkim    is_zero |= ftmp[0];
832280297Sjkim    is_zero |= ftmp[1];
833280297Sjkim    is_zero |= ftmp[2];
834280297Sjkim    is_zero |= ftmp[3];
835280297Sjkim    is_zero |= ftmp[4];
836280297Sjkim    is_zero |= ftmp[5];
837280297Sjkim    is_zero |= ftmp[6];
838280297Sjkim    is_zero |= ftmp[7];
839280297Sjkim    is_zero |= ftmp[8];
840238384Sjkim
841280297Sjkim    is_zero--;
842280297Sjkim    /*
843280297Sjkim     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
844280297Sjkim     * can be set is if is_zero was 0 before the decrement.
845280297Sjkim     */
846331638Sjkim    is_zero = 0 - (is_zero >> 63);
847238384Sjkim
848280297Sjkim    is_p = ftmp[0] ^ kPrime[0];
849280297Sjkim    is_p |= ftmp[1] ^ kPrime[1];
850280297Sjkim    is_p |= ftmp[2] ^ kPrime[2];
851280297Sjkim    is_p |= ftmp[3] ^ kPrime[3];
852280297Sjkim    is_p |= ftmp[4] ^ kPrime[4];
853280297Sjkim    is_p |= ftmp[5] ^ kPrime[5];
854280297Sjkim    is_p |= ftmp[6] ^ kPrime[6];
855280297Sjkim    is_p |= ftmp[7] ^ kPrime[7];
856280297Sjkim    is_p |= ftmp[8] ^ kPrime[8];
857238384Sjkim
858280297Sjkim    is_p--;
859331638Sjkim    is_p = 0 - (is_p >> 63);
860238384Sjkim
861280297Sjkim    is_zero |= is_p;
862280297Sjkim    return is_zero;
863280297Sjkim}
864238384Sjkim
865325337Sjkimstatic int felem_is_zero_int(const void *in)
866280297Sjkim{
867280297Sjkim    return (int)(felem_is_zero(in) & ((limb) 1));
868280297Sjkim}
869238384Sjkim
870280297Sjkim/*-
871280297Sjkim * felem_contract converts |in| to its unique, minimal representation.
872238384Sjkim * On entry:
873238384Sjkim *   in[i] < 2^59 + 2^14
874238384Sjkim */
875238384Sjkimstatic void felem_contract(felem out, const felem in)
876280297Sjkim{
877280297Sjkim    limb is_p, is_greater, sign;
878280297Sjkim    static const limb two58 = ((limb) 1) << 58;
879238384Sjkim
880280297Sjkim    felem_assign(out, in);
881238384Sjkim
882280297Sjkim    out[0] += out[8] >> 57;
883280297Sjkim    out[8] &= bottom57bits;
884280297Sjkim    /* out[8] < 2^57 */
885280297Sjkim    out[1] += out[0] >> 58;
886280297Sjkim    out[0] &= bottom58bits;
887280297Sjkim    out[2] += out[1] >> 58;
888280297Sjkim    out[1] &= bottom58bits;
889280297Sjkim    out[3] += out[2] >> 58;
890280297Sjkim    out[2] &= bottom58bits;
891280297Sjkim    out[4] += out[3] >> 58;
892280297Sjkim    out[3] &= bottom58bits;
893280297Sjkim    out[5] += out[4] >> 58;
894280297Sjkim    out[4] &= bottom58bits;
895280297Sjkim    out[6] += out[5] >> 58;
896280297Sjkim    out[5] &= bottom58bits;
897280297Sjkim    out[7] += out[6] >> 58;
898280297Sjkim    out[6] &= bottom58bits;
899280297Sjkim    out[8] += out[7] >> 58;
900280297Sjkim    out[7] &= bottom58bits;
901280297Sjkim    /* out[8] < 2^57 + 4 */
902238384Sjkim
903280297Sjkim    /*
904280297Sjkim     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
905280297Sjkim     * out. See the comments in felem_is_zero regarding why we don't test for
906280297Sjkim     * other multiples of the prime.
907280297Sjkim     */
908238384Sjkim
909280297Sjkim    /*
910280297Sjkim     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
911280297Sjkim     */
912238384Sjkim
913280297Sjkim    is_p = out[0] ^ kPrime[0];
914280297Sjkim    is_p |= out[1] ^ kPrime[1];
915280297Sjkim    is_p |= out[2] ^ kPrime[2];
916280297Sjkim    is_p |= out[3] ^ kPrime[3];
917280297Sjkim    is_p |= out[4] ^ kPrime[4];
918280297Sjkim    is_p |= out[5] ^ kPrime[5];
919280297Sjkim    is_p |= out[6] ^ kPrime[6];
920280297Sjkim    is_p |= out[7] ^ kPrime[7];
921280297Sjkim    is_p |= out[8] ^ kPrime[8];
922238384Sjkim
923280297Sjkim    is_p--;
924280297Sjkim    is_p &= is_p << 32;
925280297Sjkim    is_p &= is_p << 16;
926280297Sjkim    is_p &= is_p << 8;
927280297Sjkim    is_p &= is_p << 4;
928280297Sjkim    is_p &= is_p << 2;
929280297Sjkim    is_p &= is_p << 1;
930331638Sjkim    is_p = 0 - (is_p >> 63);
931280297Sjkim    is_p = ~is_p;
932238384Sjkim
933280297Sjkim    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
934238384Sjkim
935280297Sjkim    out[0] &= is_p;
936280297Sjkim    out[1] &= is_p;
937280297Sjkim    out[2] &= is_p;
938280297Sjkim    out[3] &= is_p;
939280297Sjkim    out[4] &= is_p;
940280297Sjkim    out[5] &= is_p;
941280297Sjkim    out[6] &= is_p;
942280297Sjkim    out[7] &= is_p;
943280297Sjkim    out[8] &= is_p;
944238384Sjkim
945280297Sjkim    /*
946280297Sjkim     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
947280297Sjkim     * 57 is greater than zero as (2^521-1) + x >= 2^522
948280297Sjkim     */
949280297Sjkim    is_greater = out[8] >> 57;
950280297Sjkim    is_greater |= is_greater << 32;
951280297Sjkim    is_greater |= is_greater << 16;
952280297Sjkim    is_greater |= is_greater << 8;
953280297Sjkim    is_greater |= is_greater << 4;
954280297Sjkim    is_greater |= is_greater << 2;
955280297Sjkim    is_greater |= is_greater << 1;
956331638Sjkim    is_greater = 0 - (is_greater >> 63);
957238384Sjkim
958280297Sjkim    out[0] -= kPrime[0] & is_greater;
959280297Sjkim    out[1] -= kPrime[1] & is_greater;
960280297Sjkim    out[2] -= kPrime[2] & is_greater;
961280297Sjkim    out[3] -= kPrime[3] & is_greater;
962280297Sjkim    out[4] -= kPrime[4] & is_greater;
963280297Sjkim    out[5] -= kPrime[5] & is_greater;
964280297Sjkim    out[6] -= kPrime[6] & is_greater;
965280297Sjkim    out[7] -= kPrime[7] & is_greater;
966280297Sjkim    out[8] -= kPrime[8] & is_greater;
967238384Sjkim
968280297Sjkim    /* Eliminate negative coefficients */
969280297Sjkim    sign = -(out[0] >> 63);
970280297Sjkim    out[0] += (two58 & sign);
971280297Sjkim    out[1] -= (1 & sign);
972280297Sjkim    sign = -(out[1] >> 63);
973280297Sjkim    out[1] += (two58 & sign);
974280297Sjkim    out[2] -= (1 & sign);
975280297Sjkim    sign = -(out[2] >> 63);
976280297Sjkim    out[2] += (two58 & sign);
977280297Sjkim    out[3] -= (1 & sign);
978280297Sjkim    sign = -(out[3] >> 63);
979280297Sjkim    out[3] += (two58 & sign);
980280297Sjkim    out[4] -= (1 & sign);
981280297Sjkim    sign = -(out[4] >> 63);
982280297Sjkim    out[4] += (two58 & sign);
983280297Sjkim    out[5] -= (1 & sign);
984280297Sjkim    sign = -(out[0] >> 63);
985280297Sjkim    out[5] += (two58 & sign);
986280297Sjkim    out[6] -= (1 & sign);
987280297Sjkim    sign = -(out[6] >> 63);
988280297Sjkim    out[6] += (two58 & sign);
989280297Sjkim    out[7] -= (1 & sign);
990280297Sjkim    sign = -(out[7] >> 63);
991280297Sjkim    out[7] += (two58 & sign);
992280297Sjkim    out[8] -= (1 & sign);
993280297Sjkim    sign = -(out[5] >> 63);
994280297Sjkim    out[5] += (two58 & sign);
995280297Sjkim    out[6] -= (1 & sign);
996280297Sjkim    sign = -(out[6] >> 63);
997280297Sjkim    out[6] += (two58 & sign);
998280297Sjkim    out[7] -= (1 & sign);
999280297Sjkim    sign = -(out[7] >> 63);
1000280297Sjkim    out[7] += (two58 & sign);
1001280297Sjkim    out[8] -= (1 & sign);
1002280297Sjkim}
1003238384Sjkim
1004280297Sjkim/*-
1005280297Sjkim * Group operations
1006238384Sjkim * ----------------
1007238384Sjkim *
1008238384Sjkim * Building on top of the field operations we have the operations on the
1009238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian
1010238384Sjkim * coordinates */
1011238384Sjkim
1012280297Sjkim/*-
1013280297Sjkim * point_double calcuates 2*(x_in, y_in, z_in)
1014238384Sjkim *
1015238384Sjkim * The method is taken from:
1016238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1017238384Sjkim *
1018238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1019238384Sjkim * while x_out == y_in is not (maybe this works, but it's not tested). */
1020238384Sjkimstatic void
1021238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out,
1022280297Sjkim             const felem x_in, const felem y_in, const felem z_in)
1023280297Sjkim{
1024280297Sjkim    largefelem tmp, tmp2;
1025280297Sjkim    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1026238384Sjkim
1027280297Sjkim    felem_assign(ftmp, x_in);
1028280297Sjkim    felem_assign(ftmp2, x_in);
1029238384Sjkim
1030280297Sjkim    /* delta = z^2 */
1031280297Sjkim    felem_square(tmp, z_in);
1032280297Sjkim    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1033238384Sjkim
1034280297Sjkim    /* gamma = y^2 */
1035280297Sjkim    felem_square(tmp, y_in);
1036280297Sjkim    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1037238384Sjkim
1038280297Sjkim    /* beta = x*gamma */
1039280297Sjkim    felem_mul(tmp, x_in, gamma);
1040280297Sjkim    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1041238384Sjkim
1042280297Sjkim    /* alpha = 3*(x-delta)*(x+delta) */
1043280297Sjkim    felem_diff64(ftmp, delta);
1044280297Sjkim    /* ftmp[i] < 2^61 */
1045280297Sjkim    felem_sum64(ftmp2, delta);
1046280297Sjkim    /* ftmp2[i] < 2^60 + 2^15 */
1047280297Sjkim    felem_scalar64(ftmp2, 3);
1048280297Sjkim    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1049280297Sjkim    felem_mul(tmp, ftmp, ftmp2);
1050280297Sjkim    /*-
1051280297Sjkim     * tmp[i] < 17(3*2^121 + 3*2^76)
1052280297Sjkim     *        = 61*2^121 + 61*2^76
1053280297Sjkim     *        < 64*2^121 + 64*2^76
1054280297Sjkim     *        = 2^127 + 2^82
1055280297Sjkim     *        < 2^128
1056280297Sjkim     */
1057280297Sjkim    felem_reduce(alpha, tmp);
1058238384Sjkim
1059280297Sjkim    /* x' = alpha^2 - 8*beta */
1060280297Sjkim    felem_square(tmp, alpha);
1061280297Sjkim    /*
1062280297Sjkim     * tmp[i] < 17*2^120 < 2^125
1063280297Sjkim     */
1064280297Sjkim    felem_assign(ftmp, beta);
1065280297Sjkim    felem_scalar64(ftmp, 8);
1066280297Sjkim    /* ftmp[i] < 2^62 + 2^17 */
1067280297Sjkim    felem_diff_128_64(tmp, ftmp);
1068280297Sjkim    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1069280297Sjkim    felem_reduce(x_out, tmp);
1070238384Sjkim
1071280297Sjkim    /* z' = (y + z)^2 - gamma - delta */
1072280297Sjkim    felem_sum64(delta, gamma);
1073280297Sjkim    /* delta[i] < 2^60 + 2^15 */
1074280297Sjkim    felem_assign(ftmp, y_in);
1075280297Sjkim    felem_sum64(ftmp, z_in);
1076280297Sjkim    /* ftmp[i] < 2^60 + 2^15 */
1077280297Sjkim    felem_square(tmp, ftmp);
1078280297Sjkim    /*
1079280297Sjkim     * tmp[i] < 17(2^122) < 2^127
1080280297Sjkim     */
1081280297Sjkim    felem_diff_128_64(tmp, delta);
1082280297Sjkim    /* tmp[i] < 2^127 + 2^63 */
1083280297Sjkim    felem_reduce(z_out, tmp);
1084238384Sjkim
1085280297Sjkim    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1086280297Sjkim    felem_scalar64(beta, 4);
1087280297Sjkim    /* beta[i] < 2^61 + 2^16 */
1088280297Sjkim    felem_diff64(beta, x_out);
1089280297Sjkim    /* beta[i] < 2^61 + 2^60 + 2^16 */
1090280297Sjkim    felem_mul(tmp, alpha, beta);
1091280297Sjkim    /*-
1092280297Sjkim     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1093280297Sjkim     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1094280297Sjkim     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1095280297Sjkim     *        < 2^128
1096280297Sjkim     */
1097280297Sjkim    felem_square(tmp2, gamma);
1098280297Sjkim    /*-
1099280297Sjkim     * tmp2[i] < 17*(2^59 + 2^14)^2
1100280297Sjkim     *         = 17*(2^118 + 2^74 + 2^28)
1101280297Sjkim     */
1102280297Sjkim    felem_scalar128(tmp2, 8);
1103280297Sjkim    /*-
1104280297Sjkim     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1105280297Sjkim     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1106280297Sjkim     *         < 2^126
1107280297Sjkim     */
1108280297Sjkim    felem_diff128(tmp, tmp2);
1109280297Sjkim    /*-
1110280297Sjkim     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1111280297Sjkim     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1112280297Sjkim     *          2^74 + 2^69 + 2^34 + 2^30
1113280297Sjkim     *        < 2^128
1114280297Sjkim     */
1115280297Sjkim    felem_reduce(y_out, tmp);
1116280297Sjkim}
1117238384Sjkim
1118238384Sjkim/* copy_conditional copies in to out iff mask is all ones. */
1119280297Sjkimstatic void copy_conditional(felem out, const felem in, limb mask)
1120280297Sjkim{
1121280297Sjkim    unsigned i;
1122280297Sjkim    for (i = 0; i < NLIMBS; ++i) {
1123280297Sjkim        const limb tmp = mask & (in[i] ^ out[i]);
1124280297Sjkim        out[i] ^= tmp;
1125280297Sjkim    }
1126280297Sjkim}
1127238384Sjkim
1128280297Sjkim/*-
1129280297Sjkim * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1130238384Sjkim *
1131238384Sjkim * The method is taken from
1132238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1133238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1134238384Sjkim *
1135238384Sjkim * This function includes a branch for checking whether the two input points
1136238384Sjkim * are equal (while not equal to the point at infinity). This case never
1137238384Sjkim * happens during single point multiplication, so there is no timing leak for
1138238384Sjkim * ECDH or ECDSA signing. */
1139238384Sjkimstatic void point_add(felem x3, felem y3, felem z3,
1140280297Sjkim                      const felem x1, const felem y1, const felem z1,
1141280297Sjkim                      const int mixed, const felem x2, const felem y2,
1142280297Sjkim                      const felem z2)
1143280297Sjkim{
1144280297Sjkim    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1145280297Sjkim    largefelem tmp, tmp2;
1146280297Sjkim    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1147238384Sjkim
1148280297Sjkim    z1_is_zero = felem_is_zero(z1);
1149280297Sjkim    z2_is_zero = felem_is_zero(z2);
1150238384Sjkim
1151280297Sjkim    /* ftmp = z1z1 = z1**2 */
1152280297Sjkim    felem_square(tmp, z1);
1153280297Sjkim    felem_reduce(ftmp, tmp);
1154238384Sjkim
1155280297Sjkim    if (!mixed) {
1156280297Sjkim        /* ftmp2 = z2z2 = z2**2 */
1157280297Sjkim        felem_square(tmp, z2);
1158280297Sjkim        felem_reduce(ftmp2, tmp);
1159238384Sjkim
1160280297Sjkim        /* u1 = ftmp3 = x1*z2z2 */
1161280297Sjkim        felem_mul(tmp, x1, ftmp2);
1162280297Sjkim        felem_reduce(ftmp3, tmp);
1163238384Sjkim
1164280297Sjkim        /* ftmp5 = z1 + z2 */
1165280297Sjkim        felem_assign(ftmp5, z1);
1166280297Sjkim        felem_sum64(ftmp5, z2);
1167280297Sjkim        /* ftmp5[i] < 2^61 */
1168238384Sjkim
1169280297Sjkim        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1170280297Sjkim        felem_square(tmp, ftmp5);
1171280297Sjkim        /* tmp[i] < 17*2^122 */
1172280297Sjkim        felem_diff_128_64(tmp, ftmp);
1173280297Sjkim        /* tmp[i] < 17*2^122 + 2^63 */
1174280297Sjkim        felem_diff_128_64(tmp, ftmp2);
1175280297Sjkim        /* tmp[i] < 17*2^122 + 2^64 */
1176280297Sjkim        felem_reduce(ftmp5, tmp);
1177238384Sjkim
1178280297Sjkim        /* ftmp2 = z2 * z2z2 */
1179280297Sjkim        felem_mul(tmp, ftmp2, z2);
1180280297Sjkim        felem_reduce(ftmp2, tmp);
1181238384Sjkim
1182280297Sjkim        /* s1 = ftmp6 = y1 * z2**3 */
1183280297Sjkim        felem_mul(tmp, y1, ftmp2);
1184280297Sjkim        felem_reduce(ftmp6, tmp);
1185280297Sjkim    } else {
1186280297Sjkim        /*
1187280297Sjkim         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1188280297Sjkim         */
1189238384Sjkim
1190280297Sjkim        /* u1 = ftmp3 = x1*z2z2 */
1191280297Sjkim        felem_assign(ftmp3, x1);
1192238384Sjkim
1193280297Sjkim        /* ftmp5 = 2*z1z2 */
1194280297Sjkim        felem_scalar(ftmp5, z1, 2);
1195238384Sjkim
1196280297Sjkim        /* s1 = ftmp6 = y1 * z2**3 */
1197280297Sjkim        felem_assign(ftmp6, y1);
1198280297Sjkim    }
1199238384Sjkim
1200280297Sjkim    /* u2 = x2*z1z1 */
1201280297Sjkim    felem_mul(tmp, x2, ftmp);
1202280297Sjkim    /* tmp[i] < 17*2^120 */
1203238384Sjkim
1204280297Sjkim    /* h = ftmp4 = u2 - u1 */
1205280297Sjkim    felem_diff_128_64(tmp, ftmp3);
1206280297Sjkim    /* tmp[i] < 17*2^120 + 2^63 */
1207280297Sjkim    felem_reduce(ftmp4, tmp);
1208238384Sjkim
1209280297Sjkim    x_equal = felem_is_zero(ftmp4);
1210238384Sjkim
1211280297Sjkim    /* z_out = ftmp5 * h */
1212280297Sjkim    felem_mul(tmp, ftmp5, ftmp4);
1213280297Sjkim    felem_reduce(z_out, tmp);
1214238384Sjkim
1215280297Sjkim    /* ftmp = z1 * z1z1 */
1216280297Sjkim    felem_mul(tmp, ftmp, z1);
1217280297Sjkim    felem_reduce(ftmp, tmp);
1218238384Sjkim
1219280297Sjkim    /* s2 = tmp = y2 * z1**3 */
1220280297Sjkim    felem_mul(tmp, y2, ftmp);
1221280297Sjkim    /* tmp[i] < 17*2^120 */
1222238384Sjkim
1223280297Sjkim    /* r = ftmp5 = (s2 - s1)*2 */
1224280297Sjkim    felem_diff_128_64(tmp, ftmp6);
1225280297Sjkim    /* tmp[i] < 17*2^120 + 2^63 */
1226280297Sjkim    felem_reduce(ftmp5, tmp);
1227280297Sjkim    y_equal = felem_is_zero(ftmp5);
1228280297Sjkim    felem_scalar64(ftmp5, 2);
1229280297Sjkim    /* ftmp5[i] < 2^61 */
1230238384Sjkim
1231280297Sjkim    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1232280297Sjkim        point_double(x3, y3, z3, x1, y1, z1);
1233280297Sjkim        return;
1234280297Sjkim    }
1235238384Sjkim
1236280297Sjkim    /* I = ftmp = (2h)**2 */
1237280297Sjkim    felem_assign(ftmp, ftmp4);
1238280297Sjkim    felem_scalar64(ftmp, 2);
1239280297Sjkim    /* ftmp[i] < 2^61 */
1240280297Sjkim    felem_square(tmp, ftmp);
1241280297Sjkim    /* tmp[i] < 17*2^122 */
1242280297Sjkim    felem_reduce(ftmp, tmp);
1243238384Sjkim
1244280297Sjkim    /* J = ftmp2 = h * I */
1245280297Sjkim    felem_mul(tmp, ftmp4, ftmp);
1246280297Sjkim    felem_reduce(ftmp2, tmp);
1247238384Sjkim
1248280297Sjkim    /* V = ftmp4 = U1 * I */
1249280297Sjkim    felem_mul(tmp, ftmp3, ftmp);
1250280297Sjkim    felem_reduce(ftmp4, tmp);
1251238384Sjkim
1252280297Sjkim    /* x_out = r**2 - J - 2V */
1253280297Sjkim    felem_square(tmp, ftmp5);
1254280297Sjkim    /* tmp[i] < 17*2^122 */
1255280297Sjkim    felem_diff_128_64(tmp, ftmp2);
1256280297Sjkim    /* tmp[i] < 17*2^122 + 2^63 */
1257280297Sjkim    felem_assign(ftmp3, ftmp4);
1258280297Sjkim    felem_scalar64(ftmp4, 2);
1259280297Sjkim    /* ftmp4[i] < 2^61 */
1260280297Sjkim    felem_diff_128_64(tmp, ftmp4);
1261280297Sjkim    /* tmp[i] < 17*2^122 + 2^64 */
1262280297Sjkim    felem_reduce(x_out, tmp);
1263238384Sjkim
1264280297Sjkim    /* y_out = r(V-x_out) - 2 * s1 * J */
1265280297Sjkim    felem_diff64(ftmp3, x_out);
1266280297Sjkim    /*
1267280297Sjkim     * ftmp3[i] < 2^60 + 2^60 = 2^61
1268280297Sjkim     */
1269280297Sjkim    felem_mul(tmp, ftmp5, ftmp3);
1270280297Sjkim    /* tmp[i] < 17*2^122 */
1271280297Sjkim    felem_mul(tmp2, ftmp6, ftmp2);
1272280297Sjkim    /* tmp2[i] < 17*2^120 */
1273280297Sjkim    felem_scalar128(tmp2, 2);
1274280297Sjkim    /* tmp2[i] < 17*2^121 */
1275280297Sjkim    felem_diff128(tmp, tmp2);
1276290207Sjkim        /*-
1277290207Sjkim         * tmp[i] < 2^127 - 2^69 + 17*2^122
1278290207Sjkim         *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1279290207Sjkim         *        < 2^127
1280290207Sjkim         */
1281280297Sjkim    felem_reduce(y_out, tmp);
1282238384Sjkim
1283280297Sjkim    copy_conditional(x_out, x2, z1_is_zero);
1284280297Sjkim    copy_conditional(x_out, x1, z2_is_zero);
1285280297Sjkim    copy_conditional(y_out, y2, z1_is_zero);
1286280297Sjkim    copy_conditional(y_out, y1, z2_is_zero);
1287280297Sjkim    copy_conditional(z_out, z2, z1_is_zero);
1288280297Sjkim    copy_conditional(z_out, z1, z2_is_zero);
1289280297Sjkim    felem_assign(x3, x_out);
1290280297Sjkim    felem_assign(y3, y_out);
1291280297Sjkim    felem_assign(z3, z_out);
1292280297Sjkim}
1293238384Sjkim
1294280297Sjkim/*-
1295280297Sjkim * Base point pre computation
1296238384Sjkim * --------------------------
1297238384Sjkim *
1298238384Sjkim * Two different sorts of precomputed tables are used in the following code.
1299238384Sjkim * Each contain various points on the curve, where each point is three field
1300238384Sjkim * elements (x, y, z).
1301238384Sjkim *
1302238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity).
1303238384Sjkim * This table has 16 elements:
1304238384Sjkim * index | bits    | point
1305238384Sjkim * ------+---------+------------------------------
1306238384Sjkim *     0 | 0 0 0 0 | 0G
1307238384Sjkim *     1 | 0 0 0 1 | 1G
1308238384Sjkim *     2 | 0 0 1 0 | 2^130G
1309238384Sjkim *     3 | 0 0 1 1 | (2^130 + 1)G
1310238384Sjkim *     4 | 0 1 0 0 | 2^260G
1311238384Sjkim *     5 | 0 1 0 1 | (2^260 + 1)G
1312238384Sjkim *     6 | 0 1 1 0 | (2^260 + 2^130)G
1313238384Sjkim *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1314238384Sjkim *     8 | 1 0 0 0 | 2^390G
1315238384Sjkim *     9 | 1 0 0 1 | (2^390 + 1)G
1316238384Sjkim *    10 | 1 0 1 0 | (2^390 + 2^130)G
1317238384Sjkim *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1318238384Sjkim *    12 | 1 1 0 0 | (2^390 + 2^260)G
1319238384Sjkim *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1320238384Sjkim *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1321238384Sjkim *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1322238384Sjkim *
1323238384Sjkim * The reason for this is so that we can clock bits into four different
1324238384Sjkim * locations when doing simple scalar multiplies against the base point.
1325238384Sjkim *
1326238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */
1327238384Sjkim
1328238384Sjkim/* gmul is the table of precomputed base points */
1329280297Sjkimstatic const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1330280297Sjkim                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
1331280297Sjkim                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1332280297Sjkim{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1333280297Sjkim  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1334280297Sjkim  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1335280297Sjkim {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1336280297Sjkim  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1337280297Sjkim  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1338280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1339280297Sjkim{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1340280297Sjkim  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1341280297Sjkim  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1342280297Sjkim {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1343280297Sjkim  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1344280297Sjkim  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1345280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1346280297Sjkim{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1347280297Sjkim  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1348280297Sjkim  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1349280297Sjkim {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1350280297Sjkim  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1351280297Sjkim  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1352280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1353280297Sjkim{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1354280297Sjkim  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1355280297Sjkim  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1356280297Sjkim {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1357280297Sjkim  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1358280297Sjkim  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1359280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1360280297Sjkim{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1361280297Sjkim  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1362280297Sjkim  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1363280297Sjkim {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1364280297Sjkim  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1365280297Sjkim  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1366280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1367280297Sjkim{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1368280297Sjkim  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1369280297Sjkim  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1370280297Sjkim {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1371280297Sjkim  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1372280297Sjkim  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1373280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1374280297Sjkim{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1375280297Sjkim  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1376280297Sjkim  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1377280297Sjkim {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1378280297Sjkim  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1379280297Sjkim  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1380280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1381280297Sjkim{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1382280297Sjkim  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1383280297Sjkim  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1384280297Sjkim {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1385280297Sjkim  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1386280297Sjkim  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1387280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1388280297Sjkim{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1389280297Sjkim  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1390280297Sjkim  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1391280297Sjkim {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1392280297Sjkim  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1393280297Sjkim  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1394280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1395280297Sjkim{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1396280297Sjkim  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1397280297Sjkim  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1398280297Sjkim {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1399280297Sjkim  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1400280297Sjkim  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1401280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1402280297Sjkim{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1403280297Sjkim  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1404280297Sjkim  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1405280297Sjkim {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1406280297Sjkim  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1407280297Sjkim  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1408280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1409280297Sjkim{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1410280297Sjkim  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1411280297Sjkim  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1412280297Sjkim {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1413280297Sjkim  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1414280297Sjkim  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1415280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1416280297Sjkim{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1417280297Sjkim  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1418280297Sjkim  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1419280297Sjkim {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1420280297Sjkim  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1421280297Sjkim  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1422280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1423280297Sjkim{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1424280297Sjkim  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1425280297Sjkim  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1426280297Sjkim {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1427280297Sjkim  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1428280297Sjkim  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1429280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1430280297Sjkim{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1431280297Sjkim  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1432280297Sjkim  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1433280297Sjkim {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1434280297Sjkim  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1435280297Sjkim  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1436280297Sjkim {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1437280297Sjkim};
1438238384Sjkim
1439280297Sjkim/*
1440280297Sjkim * select_point selects the |idx|th point from a precomputation table and
1441280297Sjkim * copies it to out.
1442280297Sjkim */
1443280297Sjkim /* pre_comp below is of the size provided in |size| */
1444280297Sjkimstatic void select_point(const limb idx, unsigned int size,
1445280297Sjkim                         const felem pre_comp[][3], felem out[3])
1446280297Sjkim{
1447280297Sjkim    unsigned i, j;
1448280297Sjkim    limb *outlimbs = &out[0][0];
1449280297Sjkim    memset(outlimbs, 0, 3 * sizeof(felem));
1450238384Sjkim
1451280297Sjkim    for (i = 0; i < size; i++) {
1452280297Sjkim        const limb *inlimbs = &pre_comp[i][0][0];
1453280297Sjkim        limb mask = i ^ idx;
1454280297Sjkim        mask |= mask >> 4;
1455280297Sjkim        mask |= mask >> 2;
1456280297Sjkim        mask |= mask >> 1;
1457280297Sjkim        mask &= 1;
1458280297Sjkim        mask--;
1459280297Sjkim        for (j = 0; j < NLIMBS * 3; j++)
1460280297Sjkim            outlimbs[j] |= inlimbs[j] & mask;
1461280297Sjkim    }
1462280297Sjkim}
1463238384Sjkim
1464238384Sjkim/* get_bit returns the |i|th bit in |in| */
1465238384Sjkimstatic char get_bit(const felem_bytearray in, int i)
1466280297Sjkim{
1467280297Sjkim    if (i < 0)
1468280297Sjkim        return 0;
1469280297Sjkim    return (in[i >> 3] >> (i & 7)) & 1;
1470280297Sjkim}
1471238384Sjkim
1472280297Sjkim/*
1473280297Sjkim * Interleaved point multiplication using precomputed point multiples: The
1474280297Sjkim * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1475280297Sjkim * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1476280297Sjkim * generator, using certain (large) precomputed multiples in g_pre_comp.
1477280297Sjkim * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1478280297Sjkim */
1479238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out,
1480280297Sjkim                      const felem_bytearray scalars[],
1481280297Sjkim                      const unsigned num_points, const u8 *g_scalar,
1482280297Sjkim                      const int mixed, const felem pre_comp[][17][3],
1483280297Sjkim                      const felem g_pre_comp[16][3])
1484280297Sjkim{
1485280297Sjkim    int i, skip;
1486280297Sjkim    unsigned num, gen_mul = (g_scalar != NULL);
1487280297Sjkim    felem nq[3], tmp[4];
1488280297Sjkim    limb bits;
1489280297Sjkim    u8 sign, digit;
1490238384Sjkim
1491280297Sjkim    /* set nq to the point at infinity */
1492280297Sjkim    memset(nq, 0, 3 * sizeof(felem));
1493238384Sjkim
1494280297Sjkim    /*
1495280297Sjkim     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1496280297Sjkim     * of the generator (last quarter of rounds) and additions of other
1497280297Sjkim     * points multiples (every 5th round).
1498280297Sjkim     */
1499280297Sjkim    skip = 1;                   /* save two point operations in the first
1500280297Sjkim                                 * round */
1501280297Sjkim    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1502280297Sjkim        /* double */
1503280297Sjkim        if (!skip)
1504280297Sjkim            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1505238384Sjkim
1506280297Sjkim        /* add multiples of the generator */
1507280297Sjkim        if (gen_mul && (i <= 130)) {
1508280297Sjkim            bits = get_bit(g_scalar, i + 390) << 3;
1509280297Sjkim            if (i < 130) {
1510280297Sjkim                bits |= get_bit(g_scalar, i + 260) << 2;
1511280297Sjkim                bits |= get_bit(g_scalar, i + 130) << 1;
1512280297Sjkim                bits |= get_bit(g_scalar, i);
1513280297Sjkim            }
1514280297Sjkim            /* select the point to add, in constant time */
1515280297Sjkim            select_point(bits, 16, g_pre_comp, tmp);
1516280297Sjkim            if (!skip) {
1517280297Sjkim                /* The 1 argument below is for "mixed" */
1518280297Sjkim                point_add(nq[0], nq[1], nq[2],
1519280297Sjkim                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1520280297Sjkim            } else {
1521280297Sjkim                memcpy(nq, tmp, 3 * sizeof(felem));
1522280297Sjkim                skip = 0;
1523280297Sjkim            }
1524280297Sjkim        }
1525238384Sjkim
1526280297Sjkim        /* do other additions every 5 doublings */
1527280297Sjkim        if (num_points && (i % 5 == 0)) {
1528280297Sjkim            /* loop over all scalars */
1529280297Sjkim            for (num = 0; num < num_points; ++num) {
1530280297Sjkim                bits = get_bit(scalars[num], i + 4) << 5;
1531280297Sjkim                bits |= get_bit(scalars[num], i + 3) << 4;
1532280297Sjkim                bits |= get_bit(scalars[num], i + 2) << 3;
1533280297Sjkim                bits |= get_bit(scalars[num], i + 1) << 2;
1534280297Sjkim                bits |= get_bit(scalars[num], i) << 1;
1535280297Sjkim                bits |= get_bit(scalars[num], i - 1);
1536280297Sjkim                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1537238384Sjkim
1538280297Sjkim                /*
1539280297Sjkim                 * select the point to add or subtract, in constant time
1540280297Sjkim                 */
1541280297Sjkim                select_point(digit, 17, pre_comp[num], tmp);
1542280297Sjkim                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1543280297Sjkim                                            * point */
1544280297Sjkim                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1545238384Sjkim
1546280297Sjkim                if (!skip) {
1547280297Sjkim                    point_add(nq[0], nq[1], nq[2],
1548280297Sjkim                              nq[0], nq[1], nq[2],
1549280297Sjkim                              mixed, tmp[0], tmp[1], tmp[2]);
1550280297Sjkim                } else {
1551280297Sjkim                    memcpy(nq, tmp, 3 * sizeof(felem));
1552280297Sjkim                    skip = 0;
1553280297Sjkim                }
1554280297Sjkim            }
1555280297Sjkim        }
1556280297Sjkim    }
1557280297Sjkim    felem_assign(x_out, nq[0]);
1558280297Sjkim    felem_assign(y_out, nq[1]);
1559280297Sjkim    felem_assign(z_out, nq[2]);
1560280297Sjkim}
1561238384Sjkim
1562238384Sjkim/* Precomputation for the group generator. */
1563238384Sjkimtypedef struct {
1564280297Sjkim    felem g_pre_comp[16][3];
1565280297Sjkim    int references;
1566238384Sjkim} NISTP521_PRE_COMP;
1567238384Sjkim
1568238384Sjkimconst EC_METHOD *EC_GFp_nistp521_method(void)
1569280297Sjkim{
1570280297Sjkim    static const EC_METHOD ret = {
1571280297Sjkim        EC_FLAGS_DEFAULT_OCT,
1572280297Sjkim        NID_X9_62_prime_field,
1573280297Sjkim        ec_GFp_nistp521_group_init,
1574280297Sjkim        ec_GFp_simple_group_finish,
1575280297Sjkim        ec_GFp_simple_group_clear_finish,
1576280297Sjkim        ec_GFp_nist_group_copy,
1577280297Sjkim        ec_GFp_nistp521_group_set_curve,
1578280297Sjkim        ec_GFp_simple_group_get_curve,
1579280297Sjkim        ec_GFp_simple_group_get_degree,
1580280297Sjkim        ec_GFp_simple_group_check_discriminant,
1581280297Sjkim        ec_GFp_simple_point_init,
1582280297Sjkim        ec_GFp_simple_point_finish,
1583280297Sjkim        ec_GFp_simple_point_clear_finish,
1584280297Sjkim        ec_GFp_simple_point_copy,
1585280297Sjkim        ec_GFp_simple_point_set_to_infinity,
1586280297Sjkim        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1587280297Sjkim        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1588280297Sjkim        ec_GFp_simple_point_set_affine_coordinates,
1589280297Sjkim        ec_GFp_nistp521_point_get_affine_coordinates,
1590280297Sjkim        0 /* point_set_compressed_coordinates */ ,
1591280297Sjkim        0 /* point2oct */ ,
1592280297Sjkim        0 /* oct2point */ ,
1593280297Sjkim        ec_GFp_simple_add,
1594280297Sjkim        ec_GFp_simple_dbl,
1595280297Sjkim        ec_GFp_simple_invert,
1596280297Sjkim        ec_GFp_simple_is_at_infinity,
1597280297Sjkim        ec_GFp_simple_is_on_curve,
1598280297Sjkim        ec_GFp_simple_cmp,
1599280297Sjkim        ec_GFp_simple_make_affine,
1600280297Sjkim        ec_GFp_simple_points_make_affine,
1601280297Sjkim        ec_GFp_nistp521_points_mul,
1602280297Sjkim        ec_GFp_nistp521_precompute_mult,
1603280297Sjkim        ec_GFp_nistp521_have_precompute_mult,
1604280297Sjkim        ec_GFp_nist_field_mul,
1605280297Sjkim        ec_GFp_nist_field_sqr,
1606280297Sjkim        0 /* field_div */ ,
1607280297Sjkim        0 /* field_encode */ ,
1608280297Sjkim        0 /* field_decode */ ,
1609280297Sjkim        0                       /* field_set_to_one */
1610280297Sjkim    };
1611238384Sjkim
1612280297Sjkim    return &ret;
1613280297Sjkim}
1614238384Sjkim
1615238384Sjkim/******************************************************************************/
1616280297Sjkim/*
1617280297Sjkim * FUNCTIONS TO MANAGE PRECOMPUTATION
1618238384Sjkim */
1619238384Sjkim
1620238384Sjkimstatic NISTP521_PRE_COMP *nistp521_pre_comp_new()
1621280297Sjkim{
1622280297Sjkim    NISTP521_PRE_COMP *ret = NULL;
1623280297Sjkim    ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1624280297Sjkim    if (!ret) {
1625280297Sjkim        ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1626280297Sjkim        return ret;
1627280297Sjkim    }
1628280297Sjkim    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1629280297Sjkim    ret->references = 1;
1630280297Sjkim    return ret;
1631280297Sjkim}
1632238384Sjkim
1633238384Sjkimstatic void *nistp521_pre_comp_dup(void *src_)
1634280297Sjkim{
1635280297Sjkim    NISTP521_PRE_COMP *src = src_;
1636238384Sjkim
1637280297Sjkim    /* no need to actually copy, these objects never change! */
1638280297Sjkim    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1639238384Sjkim
1640280297Sjkim    return src_;
1641280297Sjkim}
1642238384Sjkim
1643238384Sjkimstatic void nistp521_pre_comp_free(void *pre_)
1644280297Sjkim{
1645280297Sjkim    int i;
1646280297Sjkim    NISTP521_PRE_COMP *pre = pre_;
1647238384Sjkim
1648280297Sjkim    if (!pre)
1649280297Sjkim        return;
1650238384Sjkim
1651280297Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1652280297Sjkim    if (i > 0)
1653280297Sjkim        return;
1654238384Sjkim
1655280297Sjkim    OPENSSL_free(pre);
1656280297Sjkim}
1657238384Sjkim
1658238384Sjkimstatic void nistp521_pre_comp_clear_free(void *pre_)
1659280297Sjkim{
1660280297Sjkim    int i;
1661280297Sjkim    NISTP521_PRE_COMP *pre = pre_;
1662238384Sjkim
1663280297Sjkim    if (!pre)
1664280297Sjkim        return;
1665238384Sjkim
1666280297Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1667280297Sjkim    if (i > 0)
1668280297Sjkim        return;
1669238384Sjkim
1670280297Sjkim    OPENSSL_cleanse(pre, sizeof(*pre));
1671280297Sjkim    OPENSSL_free(pre);
1672280297Sjkim}
1673238384Sjkim
1674238384Sjkim/******************************************************************************/
1675280297Sjkim/*
1676280297Sjkim * OPENSSL EC_METHOD FUNCTIONS
1677238384Sjkim */
1678238384Sjkim
1679238384Sjkimint ec_GFp_nistp521_group_init(EC_GROUP *group)
1680280297Sjkim{
1681280297Sjkim    int ret;
1682280297Sjkim    ret = ec_GFp_simple_group_init(group);
1683280297Sjkim    group->a_is_minus3 = 1;
1684280297Sjkim    return ret;
1685280297Sjkim}
1686238384Sjkim
1687238384Sjkimint ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1688280297Sjkim                                    const BIGNUM *a, const BIGNUM *b,
1689280297Sjkim                                    BN_CTX *ctx)
1690280297Sjkim{
1691280297Sjkim    int ret = 0;
1692280297Sjkim    BN_CTX *new_ctx = NULL;
1693280297Sjkim    BIGNUM *curve_p, *curve_a, *curve_b;
1694238384Sjkim
1695280297Sjkim    if (ctx == NULL)
1696280297Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1697280297Sjkim            return 0;
1698280297Sjkim    BN_CTX_start(ctx);
1699280297Sjkim    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1700280297Sjkim        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1701280297Sjkim        ((curve_b = BN_CTX_get(ctx)) == NULL))
1702280297Sjkim        goto err;
1703280297Sjkim    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1704280297Sjkim    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1705280297Sjkim    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1706280297Sjkim    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1707280297Sjkim        ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1708280297Sjkim              EC_R_WRONG_CURVE_PARAMETERS);
1709280297Sjkim        goto err;
1710280297Sjkim    }
1711280297Sjkim    group->field_mod_func = BN_nist_mod_521;
1712280297Sjkim    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1713280297Sjkim err:
1714280297Sjkim    BN_CTX_end(ctx);
1715280297Sjkim    if (new_ctx != NULL)
1716280297Sjkim        BN_CTX_free(new_ctx);
1717280297Sjkim    return ret;
1718280297Sjkim}
1719238384Sjkim
1720280297Sjkim/*
1721280297Sjkim * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1722280297Sjkim * (X/Z^2, Y/Z^3)
1723280297Sjkim */
1724238384Sjkimint ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1725280297Sjkim                                                 const EC_POINT *point,
1726280297Sjkim                                                 BIGNUM *x, BIGNUM *y,
1727280297Sjkim                                                 BN_CTX *ctx)
1728280297Sjkim{
1729280297Sjkim    felem z1, z2, x_in, y_in, x_out, y_out;
1730280297Sjkim    largefelem tmp;
1731238384Sjkim
1732280297Sjkim    if (EC_POINT_is_at_infinity(group, point)) {
1733280297Sjkim        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1734280297Sjkim              EC_R_POINT_AT_INFINITY);
1735280297Sjkim        return 0;
1736280297Sjkim    }
1737280297Sjkim    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1738280297Sjkim        (!BN_to_felem(z1, &point->Z)))
1739280297Sjkim        return 0;
1740280297Sjkim    felem_inv(z2, z1);
1741280297Sjkim    felem_square(tmp, z2);
1742280297Sjkim    felem_reduce(z1, tmp);
1743280297Sjkim    felem_mul(tmp, x_in, z1);
1744280297Sjkim    felem_reduce(x_in, tmp);
1745280297Sjkim    felem_contract(x_out, x_in);
1746280297Sjkim    if (x != NULL) {
1747280297Sjkim        if (!felem_to_BN(x, x_out)) {
1748280297Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1749280297Sjkim                  ERR_R_BN_LIB);
1750280297Sjkim            return 0;
1751280297Sjkim        }
1752280297Sjkim    }
1753280297Sjkim    felem_mul(tmp, z1, z2);
1754280297Sjkim    felem_reduce(z1, tmp);
1755280297Sjkim    felem_mul(tmp, y_in, z1);
1756280297Sjkim    felem_reduce(y_in, tmp);
1757280297Sjkim    felem_contract(y_out, y_in);
1758280297Sjkim    if (y != NULL) {
1759280297Sjkim        if (!felem_to_BN(y, y_out)) {
1760280297Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1761280297Sjkim                  ERR_R_BN_LIB);
1762280297Sjkim            return 0;
1763280297Sjkim        }
1764280297Sjkim    }
1765280297Sjkim    return 1;
1766280297Sjkim}
1767238384Sjkim
1768280297Sjkim/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1769280297Sjkimstatic void make_points_affine(size_t num, felem points[][3],
1770280297Sjkim                               felem tmp_felems[])
1771280297Sjkim{
1772280297Sjkim    /*
1773280297Sjkim     * Runs in constant time, unless an input is the point at infinity (which
1774280297Sjkim     * normally shouldn't happen).
1775280297Sjkim     */
1776280297Sjkim    ec_GFp_nistp_points_make_affine_internal(num,
1777280297Sjkim                                             points,
1778280297Sjkim                                             sizeof(felem),
1779280297Sjkim                                             tmp_felems,
1780280297Sjkim                                             (void (*)(void *))felem_one,
1781280297Sjkim                                             felem_is_zero_int,
1782280297Sjkim                                             (void (*)(void *, const void *))
1783280297Sjkim                                             felem_assign,
1784280297Sjkim                                             (void (*)(void *, const void *))
1785280297Sjkim                                             felem_square_reduce, (void (*)
1786280297Sjkim                                                                   (void *,
1787280297Sjkim                                                                    const void
1788280297Sjkim                                                                    *,
1789280297Sjkim                                                                    const void
1790280297Sjkim                                                                    *))
1791280297Sjkim                                             felem_mul_reduce,
1792280297Sjkim                                             (void (*)(void *, const void *))
1793280297Sjkim                                             felem_inv,
1794280297Sjkim                                             (void (*)(void *, const void *))
1795280297Sjkim                                             felem_contract);
1796280297Sjkim}
1797238384Sjkim
1798280297Sjkim/*
1799280297Sjkim * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1800280297Sjkim * values Result is stored in r (r can equal one of the inputs).
1801280297Sjkim */
1802238384Sjkimint ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1803280297Sjkim                               const BIGNUM *scalar, size_t num,
1804280297Sjkim                               const EC_POINT *points[],
1805280297Sjkim                               const BIGNUM *scalars[], BN_CTX *ctx)
1806280297Sjkim{
1807280297Sjkim    int ret = 0;
1808280297Sjkim    int j;
1809280297Sjkim    int mixed = 0;
1810280297Sjkim    BN_CTX *new_ctx = NULL;
1811280297Sjkim    BIGNUM *x, *y, *z, *tmp_scalar;
1812280297Sjkim    felem_bytearray g_secret;
1813280297Sjkim    felem_bytearray *secrets = NULL;
1814280297Sjkim    felem(*pre_comp)[17][3] = NULL;
1815280297Sjkim    felem *tmp_felems = NULL;
1816352193Sjkim    unsigned i;
1817352193Sjkim    int num_bytes;
1818280297Sjkim    int have_pre_comp = 0;
1819280297Sjkim    size_t num_points = num;
1820280297Sjkim    felem x_in, y_in, z_in, x_out, y_out, z_out;
1821280297Sjkim    NISTP521_PRE_COMP *pre = NULL;
1822280297Sjkim    felem(*g_pre_comp)[3] = NULL;
1823280297Sjkim    EC_POINT *generator = NULL;
1824280297Sjkim    const EC_POINT *p = NULL;
1825280297Sjkim    const BIGNUM *p_scalar = NULL;
1826238384Sjkim
1827280297Sjkim    if (ctx == NULL)
1828280297Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1829280297Sjkim            return 0;
1830280297Sjkim    BN_CTX_start(ctx);
1831280297Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) ||
1832280297Sjkim        ((y = BN_CTX_get(ctx)) == NULL) ||
1833280297Sjkim        ((z = BN_CTX_get(ctx)) == NULL) ||
1834280297Sjkim        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1835280297Sjkim        goto err;
1836238384Sjkim
1837280297Sjkim    if (scalar != NULL) {
1838280297Sjkim        pre = EC_EX_DATA_get_data(group->extra_data,
1839280297Sjkim                                  nistp521_pre_comp_dup,
1840280297Sjkim                                  nistp521_pre_comp_free,
1841280297Sjkim                                  nistp521_pre_comp_clear_free);
1842280297Sjkim        if (pre)
1843280297Sjkim            /* we have precomputation, try to use it */
1844280297Sjkim            g_pre_comp = &pre->g_pre_comp[0];
1845280297Sjkim        else
1846280297Sjkim            /* try to use the standard precomputation */
1847280297Sjkim            g_pre_comp = (felem(*)[3]) gmul;
1848280297Sjkim        generator = EC_POINT_new(group);
1849280297Sjkim        if (generator == NULL)
1850280297Sjkim            goto err;
1851280297Sjkim        /* get the generator from precomputation */
1852280297Sjkim        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1853280297Sjkim            !felem_to_BN(y, g_pre_comp[1][1]) ||
1854280297Sjkim            !felem_to_BN(z, g_pre_comp[1][2])) {
1855280297Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1856280297Sjkim            goto err;
1857280297Sjkim        }
1858280297Sjkim        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1859280297Sjkim                                                      generator, x, y, z,
1860280297Sjkim                                                      ctx))
1861280297Sjkim            goto err;
1862280297Sjkim        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1863280297Sjkim            /* precomputation matches generator */
1864280297Sjkim            have_pre_comp = 1;
1865280297Sjkim        else
1866280297Sjkim            /*
1867280297Sjkim             * we don't have valid precomputation: treat the generator as a
1868280297Sjkim             * random point
1869280297Sjkim             */
1870280297Sjkim            num_points++;
1871280297Sjkim    }
1872238384Sjkim
1873280297Sjkim    if (num_points > 0) {
1874280297Sjkim        if (num_points >= 2) {
1875280297Sjkim            /*
1876280297Sjkim             * unless we precompute multiples for just one point, converting
1877280297Sjkim             * those into affine form is time well spent
1878280297Sjkim             */
1879280297Sjkim            mixed = 1;
1880280297Sjkim        }
1881280297Sjkim        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1882280297Sjkim        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1883280297Sjkim        if (mixed)
1884280297Sjkim            tmp_felems =
1885280297Sjkim                OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1886280297Sjkim        if ((secrets == NULL) || (pre_comp == NULL)
1887280297Sjkim            || (mixed && (tmp_felems == NULL))) {
1888280297Sjkim            ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1889280297Sjkim            goto err;
1890280297Sjkim        }
1891238384Sjkim
1892280297Sjkim        /*
1893280297Sjkim         * we treat NULL scalars as 0, and NULL points as points at infinity,
1894280297Sjkim         * i.e., they contribute nothing to the linear combination
1895280297Sjkim         */
1896280297Sjkim        memset(secrets, 0, num_points * sizeof(felem_bytearray));
1897280297Sjkim        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1898280297Sjkim        for (i = 0; i < num_points; ++i) {
1899352193Sjkim            if (i == num) {
1900280297Sjkim                /*
1901280297Sjkim                 * we didn't have a valid precomputation, so we pick the
1902280297Sjkim                 * generator
1903280297Sjkim                 */
1904280297Sjkim                p = EC_GROUP_get0_generator(group);
1905280297Sjkim                p_scalar = scalar;
1906352193Sjkim            } else {
1907280297Sjkim                /* the i^th point */
1908280297Sjkim                p = points[i];
1909280297Sjkim                p_scalar = scalars[i];
1910280297Sjkim            }
1911280297Sjkim            if ((p_scalar != NULL) && (p != NULL)) {
1912280297Sjkim                /* reduce scalar to 0 <= scalar < 2^521 */
1913280297Sjkim                if ((BN_num_bits(p_scalar) > 521)
1914280297Sjkim                    || (BN_is_negative(p_scalar))) {
1915280297Sjkim                    /*
1916280297Sjkim                     * this is an unusual input, and we don't guarantee
1917280297Sjkim                     * constant-timeness
1918280297Sjkim                     */
1919280297Sjkim                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1920280297Sjkim                        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1921280297Sjkim                        goto err;
1922280297Sjkim                    }
1923352193Sjkim                    num_bytes = bn_bn2lebinpad(tmp_scalar,
1924352193Sjkim                                               secrets[i], sizeof(secrets[i]));
1925352193Sjkim                } else {
1926352193Sjkim                    num_bytes = bn_bn2lebinpad(p_scalar,
1927352193Sjkim                                               secrets[i], sizeof(secrets[i]));
1928352193Sjkim                }
1929352193Sjkim                if (num_bytes < 0) {
1930352193Sjkim                    ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1931352193Sjkim                    goto err;
1932352193Sjkim                }
1933280297Sjkim                /* precompute multiples */
1934280297Sjkim                if ((!BN_to_felem(x_out, &p->X)) ||
1935280297Sjkim                    (!BN_to_felem(y_out, &p->Y)) ||
1936280297Sjkim                    (!BN_to_felem(z_out, &p->Z)))
1937280297Sjkim                    goto err;
1938280297Sjkim                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1939280297Sjkim                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1940280297Sjkim                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1941280297Sjkim                for (j = 2; j <= 16; ++j) {
1942280297Sjkim                    if (j & 1) {
1943280297Sjkim                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1944280297Sjkim                                  pre_comp[i][j][2], pre_comp[i][1][0],
1945280297Sjkim                                  pre_comp[i][1][1], pre_comp[i][1][2], 0,
1946280297Sjkim                                  pre_comp[i][j - 1][0],
1947280297Sjkim                                  pre_comp[i][j - 1][1],
1948280297Sjkim                                  pre_comp[i][j - 1][2]);
1949280297Sjkim                    } else {
1950280297Sjkim                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1951280297Sjkim                                     pre_comp[i][j][2], pre_comp[i][j / 2][0],
1952280297Sjkim                                     pre_comp[i][j / 2][1],
1953280297Sjkim                                     pre_comp[i][j / 2][2]);
1954280297Sjkim                    }
1955280297Sjkim                }
1956280297Sjkim            }
1957280297Sjkim        }
1958280297Sjkim        if (mixed)
1959280297Sjkim            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1960280297Sjkim    }
1961238384Sjkim
1962280297Sjkim    /* the scalar for the generator */
1963280297Sjkim    if ((scalar != NULL) && (have_pre_comp)) {
1964280297Sjkim        memset(g_secret, 0, sizeof(g_secret));
1965280297Sjkim        /* reduce scalar to 0 <= scalar < 2^521 */
1966280297Sjkim        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1967280297Sjkim            /*
1968280297Sjkim             * this is an unusual input, and we don't guarantee
1969280297Sjkim             * constant-timeness
1970280297Sjkim             */
1971280297Sjkim            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1972280297Sjkim                ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1973280297Sjkim                goto err;
1974280297Sjkim            }
1975352193Sjkim            num_bytes = bn_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1976352193Sjkim        } else {
1977352193Sjkim            num_bytes = bn_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1978352193Sjkim        }
1979280297Sjkim        /* do the multiplication with generator precomputation */
1980280297Sjkim        batch_mul(x_out, y_out, z_out,
1981280297Sjkim                  (const felem_bytearray(*))secrets, num_points,
1982280297Sjkim                  g_secret,
1983280297Sjkim                  mixed, (const felem(*)[17][3])pre_comp,
1984280297Sjkim                  (const felem(*)[3])g_pre_comp);
1985352193Sjkim    } else {
1986280297Sjkim        /* do the multiplication without generator precomputation */
1987280297Sjkim        batch_mul(x_out, y_out, z_out,
1988280297Sjkim                  (const felem_bytearray(*))secrets, num_points,
1989280297Sjkim                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1990352193Sjkim    }
1991280297Sjkim    /* reduce the output to its unique minimal representation */
1992280297Sjkim    felem_contract(x_in, x_out);
1993280297Sjkim    felem_contract(y_in, y_out);
1994280297Sjkim    felem_contract(z_in, z_out);
1995280297Sjkim    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1996280297Sjkim        (!felem_to_BN(z, z_in))) {
1997280297Sjkim        ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1998280297Sjkim        goto err;
1999280297Sjkim    }
2000280297Sjkim    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2001238384Sjkim
2002280297Sjkim err:
2003280297Sjkim    BN_CTX_end(ctx);
2004280297Sjkim    if (generator != NULL)
2005280297Sjkim        EC_POINT_free(generator);
2006280297Sjkim    if (new_ctx != NULL)
2007280297Sjkim        BN_CTX_free(new_ctx);
2008280297Sjkim    if (secrets != NULL)
2009280297Sjkim        OPENSSL_free(secrets);
2010280297Sjkim    if (pre_comp != NULL)
2011280297Sjkim        OPENSSL_free(pre_comp);
2012280297Sjkim    if (tmp_felems != NULL)
2013280297Sjkim        OPENSSL_free(tmp_felems);
2014280297Sjkim    return ret;
2015280297Sjkim}
2016238384Sjkim
2017238384Sjkimint ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2018280297Sjkim{
2019280297Sjkim    int ret = 0;
2020280297Sjkim    NISTP521_PRE_COMP *pre = NULL;
2021280297Sjkim    int i, j;
2022280297Sjkim    BN_CTX *new_ctx = NULL;
2023280297Sjkim    BIGNUM *x, *y;
2024280297Sjkim    EC_POINT *generator = NULL;
2025280297Sjkim    felem tmp_felems[16];
2026238384Sjkim
2027280297Sjkim    /* throw away old precomputation */
2028280297Sjkim    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2029280297Sjkim                         nistp521_pre_comp_free,
2030280297Sjkim                         nistp521_pre_comp_clear_free);
2031280297Sjkim    if (ctx == NULL)
2032280297Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2033280297Sjkim            return 0;
2034280297Sjkim    BN_CTX_start(ctx);
2035280297Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2036280297Sjkim        goto err;
2037280297Sjkim    /* get the generator */
2038280297Sjkim    if (group->generator == NULL)
2039280297Sjkim        goto err;
2040280297Sjkim    generator = EC_POINT_new(group);
2041280297Sjkim    if (generator == NULL)
2042280297Sjkim        goto err;
2043280297Sjkim    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2044280297Sjkim    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2045280297Sjkim    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2046280297Sjkim        goto err;
2047280297Sjkim    if ((pre = nistp521_pre_comp_new()) == NULL)
2048280297Sjkim        goto err;
2049280297Sjkim    /*
2050280297Sjkim     * if the generator is the standard one, use built-in precomputation
2051280297Sjkim     */
2052280297Sjkim    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2053280297Sjkim        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2054296279Sjkim        goto done;
2055280297Sjkim    }
2056280297Sjkim    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2057280297Sjkim        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2058280297Sjkim        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2059280297Sjkim        goto err;
2060280297Sjkim    /* compute 2^130*G, 2^260*G, 2^390*G */
2061280297Sjkim    for (i = 1; i <= 4; i <<= 1) {
2062280297Sjkim        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2063280297Sjkim                     pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2064280297Sjkim                     pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2065280297Sjkim        for (j = 0; j < 129; ++j) {
2066280297Sjkim            point_double(pre->g_pre_comp[2 * i][0],
2067280297Sjkim                         pre->g_pre_comp[2 * i][1],
2068280297Sjkim                         pre->g_pre_comp[2 * i][2],
2069280297Sjkim                         pre->g_pre_comp[2 * i][0],
2070280297Sjkim                         pre->g_pre_comp[2 * i][1],
2071280297Sjkim                         pre->g_pre_comp[2 * i][2]);
2072280297Sjkim        }
2073280297Sjkim    }
2074280297Sjkim    /* g_pre_comp[0] is the point at infinity */
2075280297Sjkim    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2076280297Sjkim    /* the remaining multiples */
2077280297Sjkim    /* 2^130*G + 2^260*G */
2078280297Sjkim    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2079280297Sjkim              pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2080280297Sjkim              pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2081280297Sjkim              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2082280297Sjkim              pre->g_pre_comp[2][2]);
2083280297Sjkim    /* 2^130*G + 2^390*G */
2084280297Sjkim    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2085280297Sjkim              pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2086280297Sjkim              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2087280297Sjkim              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2088280297Sjkim              pre->g_pre_comp[2][2]);
2089280297Sjkim    /* 2^260*G + 2^390*G */
2090280297Sjkim    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2091280297Sjkim              pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2092280297Sjkim              pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2093280297Sjkim              0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2094280297Sjkim              pre->g_pre_comp[4][2]);
2095280297Sjkim    /* 2^130*G + 2^260*G + 2^390*G */
2096280297Sjkim    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2097280297Sjkim              pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2098280297Sjkim              pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2099280297Sjkim              0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2100280297Sjkim              pre->g_pre_comp[2][2]);
2101280297Sjkim    for (i = 1; i < 8; ++i) {
2102280297Sjkim        /* odd multiples: add G */
2103280297Sjkim        point_add(pre->g_pre_comp[2 * i + 1][0],
2104280297Sjkim                  pre->g_pre_comp[2 * i + 1][1],
2105280297Sjkim                  pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2106280297Sjkim                  pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2107280297Sjkim                  pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2108280297Sjkim                  pre->g_pre_comp[1][2]);
2109280297Sjkim    }
2110280297Sjkim    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2111238384Sjkim
2112296279Sjkim done:
2113280297Sjkim    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2114280297Sjkim                             nistp521_pre_comp_free,
2115280297Sjkim                             nistp521_pre_comp_clear_free))
2116280297Sjkim        goto err;
2117280297Sjkim    ret = 1;
2118280297Sjkim    pre = NULL;
2119238384Sjkim err:
2120280297Sjkim    BN_CTX_end(ctx);
2121280297Sjkim    if (generator != NULL)
2122280297Sjkim        EC_POINT_free(generator);
2123280297Sjkim    if (new_ctx != NULL)
2124280297Sjkim        BN_CTX_free(new_ctx);
2125280297Sjkim    if (pre)
2126280297Sjkim        nistp521_pre_comp_free(pre);
2127280297Sjkim    return ret;
2128280297Sjkim}
2129238384Sjkim
2130238384Sjkimint ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2131280297Sjkim{
2132280297Sjkim    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2133280297Sjkim                            nistp521_pre_comp_free,
2134280297Sjkim                            nistp521_pre_comp_clear_free)
2135280297Sjkim        != NULL)
2136280297Sjkim        return 1;
2137280297Sjkim    else
2138280297Sjkim        return 0;
2139280297Sjkim}
2140238384Sjkim
2141238384Sjkim#else
2142280297Sjkimstatic void *dummy = &dummy;
2143238384Sjkim#endif
2144