1238384Sjkim/* crypto/ec/ecp_nistp256.c */
2238384Sjkim/*
3238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project
4238384Sjkim */
5238384Sjkim/* Copyright 2011 Google Inc.
6238384Sjkim *
7238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License");
8238384Sjkim *
9238384Sjkim * you may not use this file except in compliance with the License.
10238384Sjkim * You may obtain a copy of the License at
11238384Sjkim *
12238384Sjkim *     http://www.apache.org/licenses/LICENSE-2.0
13238384Sjkim *
14238384Sjkim *  Unless required by applicable law or agreed to in writing, software
15238384Sjkim *  distributed under the License is distributed on an "AS IS" BASIS,
16238384Sjkim *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17238384Sjkim *  See the License for the specific language governing permissions and
18238384Sjkim *  limitations under the License.
19238384Sjkim */
20238384Sjkim
21238384Sjkim/*
22238384Sjkim * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
23238384Sjkim *
24238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same.
27238384Sjkim */
28238384Sjkim
29238384Sjkim#include <openssl/opensslconf.h>
30238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31238384Sjkim
32280304Sjkim# ifndef OPENSSL_SYS_VMS
33280304Sjkim#  include <stdint.h>
34280304Sjkim# else
35280304Sjkim#  include <inttypes.h>
36280304Sjkim# endif
37238384Sjkim
38280304Sjkim# include <string.h>
39280304Sjkim# include <openssl/err.h>
40280304Sjkim# include "ec_lcl.h"
41238384Sjkim
42280304Sjkim# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43238384Sjkim  /* even with gcc, the typedef won't work for 32-bit platforms */
44280304Sjkimtypedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45280304Sjkim                                 * platforms */
46280304Sjkimtypedef __int128_t int128_t;
47280304Sjkim# else
48280304Sjkim#  error "Need GCC 3.1 or later to define type uint128_t"
49280304Sjkim# endif
50238384Sjkim
51238384Sjkimtypedef uint8_t u8;
52238384Sjkimtypedef uint32_t u32;
53238384Sjkimtypedef uint64_t u64;
54238384Sjkimtypedef int64_t s64;
55238384Sjkim
56280304Sjkim/*
57280304Sjkim * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
58280304Sjkim * can serialise an element of this field into 32 bytes. We call this an
59280304Sjkim * felem_bytearray.
60280304Sjkim */
61238384Sjkim
62238384Sjkimtypedef u8 felem_bytearray[32];
63238384Sjkim
64280304Sjkim/*
65280304Sjkim * These are the parameters of P256, taken from FIPS 186-3, page 86. These
66280304Sjkim * values are big-endian.
67280304Sjkim */
68238384Sjkimstatic const felem_bytearray nistp256_curve_params[5] = {
69280304Sjkim    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
70280304Sjkim     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
71280304Sjkim     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
72280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
73280304Sjkim    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
74280304Sjkim     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
75280304Sjkim     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
76280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
77280304Sjkim    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
78280304Sjkim     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
79280304Sjkim     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
80280304Sjkim     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
81280304Sjkim    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
82280304Sjkim     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
83280304Sjkim     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
84280304Sjkim     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
85280304Sjkim    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
86280304Sjkim     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
87280304Sjkim     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
88280304Sjkim     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
89238384Sjkim};
90238384Sjkim
91280304Sjkim/*-
92280304Sjkim * The representation of field elements.
93238384Sjkim * ------------------------------------
94238384Sjkim *
95238384Sjkim * We represent field elements with either four 128-bit values, eight 128-bit
96238384Sjkim * values, or four 64-bit values. The field element represented is:
97238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
98238384Sjkim * or:
99238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
100238384Sjkim *
101238384Sjkim * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
102238384Sjkim * apart, but are 128-bits wide, the most significant bits of each limb overlap
103238384Sjkim * with the least significant bits of the next.
104238384Sjkim *
105238384Sjkim * A field element with four limbs is an 'felem'. One with eight limbs is a
106238384Sjkim * 'longfelem'
107238384Sjkim *
108238384Sjkim * A field element with four, 64-bit values is called a 'smallfelem'. Small
109238384Sjkim * values are used as intermediate values before multiplication.
110238384Sjkim */
111238384Sjkim
112280304Sjkim# define NLIMBS 4
113238384Sjkim
114238384Sjkimtypedef uint128_t limb;
115238384Sjkimtypedef limb felem[NLIMBS];
116238384Sjkimtypedef limb longfelem[NLIMBS * 2];
117238384Sjkimtypedef u64 smallfelem[NLIMBS];
118238384Sjkim
119238384Sjkim/* This is the value of the prime as four 64-bit words, little-endian. */
120280304Sjkimstatic const u64 kPrime[4] =
121280304Sjkim    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
122238384Sjkimstatic const u64 bottom63bits = 0x7ffffffffffffffful;
123238384Sjkim
124280304Sjkim/*
125280304Sjkim * bin32_to_felem takes a little-endian byte array and converts it into felem
126280304Sjkim * form. This assumes that the CPU is little-endian.
127280304Sjkim */
128238384Sjkimstatic void bin32_to_felem(felem out, const u8 in[32])
129280304Sjkim{
130280304Sjkim    out[0] = *((u64 *)&in[0]);
131280304Sjkim    out[1] = *((u64 *)&in[8]);
132280304Sjkim    out[2] = *((u64 *)&in[16]);
133280304Sjkim    out[3] = *((u64 *)&in[24]);
134280304Sjkim}
135238384Sjkim
136280304Sjkim/*
137280304Sjkim * smallfelem_to_bin32 takes a smallfelem and serialises into a little
138280304Sjkim * endian, 32 byte array. This assumes that the CPU is little-endian.
139280304Sjkim */
140238384Sjkimstatic void smallfelem_to_bin32(u8 out[32], const smallfelem in)
141280304Sjkim{
142280304Sjkim    *((u64 *)&out[0]) = in[0];
143280304Sjkim    *((u64 *)&out[8]) = in[1];
144280304Sjkim    *((u64 *)&out[16]) = in[2];
145280304Sjkim    *((u64 *)&out[24]) = in[3];
146280304Sjkim}
147238384Sjkim
148238384Sjkim/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
149238384Sjkimstatic void flip_endian(u8 *out, const u8 *in, unsigned len)
150280304Sjkim{
151280304Sjkim    unsigned i;
152280304Sjkim    for (i = 0; i < len; ++i)
153280304Sjkim        out[i] = in[len - 1 - i];
154280304Sjkim}
155238384Sjkim
156238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
157238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn)
158280304Sjkim{
159280304Sjkim    felem_bytearray b_in;
160280304Sjkim    felem_bytearray b_out;
161280304Sjkim    unsigned num_bytes;
162238384Sjkim
163280304Sjkim    /* BN_bn2bin eats leading zeroes */
164280304Sjkim    memset(b_out, 0, sizeof b_out);
165280304Sjkim    num_bytes = BN_num_bytes(bn);
166280304Sjkim    if (num_bytes > sizeof b_out) {
167280304Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
168280304Sjkim        return 0;
169280304Sjkim    }
170280304Sjkim    if (BN_is_negative(bn)) {
171280304Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
172280304Sjkim        return 0;
173280304Sjkim    }
174280304Sjkim    num_bytes = BN_bn2bin(bn, b_in);
175280304Sjkim    flip_endian(b_out, b_in, num_bytes);
176280304Sjkim    bin32_to_felem(out, b_out);
177280304Sjkim    return 1;
178280304Sjkim}
179238384Sjkim
180238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
181238384Sjkimstatic BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
182280304Sjkim{
183280304Sjkim    felem_bytearray b_in, b_out;
184280304Sjkim    smallfelem_to_bin32(b_in, in);
185280304Sjkim    flip_endian(b_out, b_in, sizeof b_out);
186280304Sjkim    return BN_bin2bn(b_out, sizeof b_out, out);
187280304Sjkim}
188238384Sjkim
189280304Sjkim/*-
190280304Sjkim * Field operations
191280304Sjkim * ----------------
192280304Sjkim */
193238384Sjkim
194238384Sjkimstatic void smallfelem_one(smallfelem out)
195280304Sjkim{
196280304Sjkim    out[0] = 1;
197280304Sjkim    out[1] = 0;
198280304Sjkim    out[2] = 0;
199280304Sjkim    out[3] = 0;
200280304Sjkim}
201238384Sjkim
202238384Sjkimstatic void smallfelem_assign(smallfelem out, const smallfelem in)
203280304Sjkim{
204280304Sjkim    out[0] = in[0];
205280304Sjkim    out[1] = in[1];
206280304Sjkim    out[2] = in[2];
207280304Sjkim    out[3] = in[3];
208280304Sjkim}
209238384Sjkim
210238384Sjkimstatic void felem_assign(felem out, const felem in)
211280304Sjkim{
212280304Sjkim    out[0] = in[0];
213280304Sjkim    out[1] = in[1];
214280304Sjkim    out[2] = in[2];
215280304Sjkim    out[3] = in[3];
216280304Sjkim}
217238384Sjkim
218238384Sjkim/* felem_sum sets out = out + in. */
219238384Sjkimstatic void felem_sum(felem out, const felem in)
220280304Sjkim{
221280304Sjkim    out[0] += in[0];
222280304Sjkim    out[1] += in[1];
223280304Sjkim    out[2] += in[2];
224280304Sjkim    out[3] += in[3];
225280304Sjkim}
226238384Sjkim
227238384Sjkim/* felem_small_sum sets out = out + in. */
228238384Sjkimstatic void felem_small_sum(felem out, const smallfelem in)
229280304Sjkim{
230280304Sjkim    out[0] += in[0];
231280304Sjkim    out[1] += in[1];
232280304Sjkim    out[2] += in[2];
233280304Sjkim    out[3] += in[3];
234280304Sjkim}
235238384Sjkim
236238384Sjkim/* felem_scalar sets out = out * scalar */
237238384Sjkimstatic void felem_scalar(felem out, const u64 scalar)
238280304Sjkim{
239280304Sjkim    out[0] *= scalar;
240280304Sjkim    out[1] *= scalar;
241280304Sjkim    out[2] *= scalar;
242280304Sjkim    out[3] *= scalar;
243280304Sjkim}
244238384Sjkim
245238384Sjkim/* longfelem_scalar sets out = out * scalar */
246238384Sjkimstatic void longfelem_scalar(longfelem out, const u64 scalar)
247280304Sjkim{
248280304Sjkim    out[0] *= scalar;
249280304Sjkim    out[1] *= scalar;
250280304Sjkim    out[2] *= scalar;
251280304Sjkim    out[3] *= scalar;
252280304Sjkim    out[4] *= scalar;
253280304Sjkim    out[5] *= scalar;
254280304Sjkim    out[6] *= scalar;
255280304Sjkim    out[7] *= scalar;
256280304Sjkim}
257238384Sjkim
258280304Sjkim# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
259280304Sjkim# define two105 (((limb)1) << 105)
260280304Sjkim# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
261238384Sjkim
262238384Sjkim/* zero105 is 0 mod p */
263280304Sjkimstatic const felem zero105 =
264280304Sjkim    { two105m41m9, two105, two105m41p9, two105m41p9 };
265238384Sjkim
266280304Sjkim/*-
267280304Sjkim * smallfelem_neg sets |out| to |-small|
268238384Sjkim * On exit:
269238384Sjkim *   out[i] < out[i] + 2^105
270238384Sjkim */
271238384Sjkimstatic void smallfelem_neg(felem out, const smallfelem small)
272280304Sjkim{
273280304Sjkim    /* In order to prevent underflow, we subtract from 0 mod p. */
274280304Sjkim    out[0] = zero105[0] - small[0];
275280304Sjkim    out[1] = zero105[1] - small[1];
276280304Sjkim    out[2] = zero105[2] - small[2];
277280304Sjkim    out[3] = zero105[3] - small[3];
278280304Sjkim}
279238384Sjkim
280280304Sjkim/*-
281280304Sjkim * felem_diff subtracts |in| from |out|
282238384Sjkim * On entry:
283238384Sjkim *   in[i] < 2^104
284238384Sjkim * On exit:
285238384Sjkim *   out[i] < out[i] + 2^105
286238384Sjkim */
287238384Sjkimstatic void felem_diff(felem out, const felem in)
288280304Sjkim{
289280304Sjkim    /*
290280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
291280304Sjkim     */
292280304Sjkim    out[0] += zero105[0];
293280304Sjkim    out[1] += zero105[1];
294280304Sjkim    out[2] += zero105[2];
295280304Sjkim    out[3] += zero105[3];
296238384Sjkim
297280304Sjkim    out[0] -= in[0];
298280304Sjkim    out[1] -= in[1];
299280304Sjkim    out[2] -= in[2];
300280304Sjkim    out[3] -= in[3];
301280304Sjkim}
302238384Sjkim
303280304Sjkim# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
304280304Sjkim# define two107 (((limb)1) << 107)
305280304Sjkim# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
306238384Sjkim
307238384Sjkim/* zero107 is 0 mod p */
308280304Sjkimstatic const felem zero107 =
309280304Sjkim    { two107m43m11, two107, two107m43p11, two107m43p11 };
310238384Sjkim
311280304Sjkim/*-
312280304Sjkim * An alternative felem_diff for larger inputs |in|
313238384Sjkim * felem_diff_zero107 subtracts |in| from |out|
314238384Sjkim * On entry:
315238384Sjkim *   in[i] < 2^106
316238384Sjkim * On exit:
317238384Sjkim *   out[i] < out[i] + 2^107
318238384Sjkim */
319238384Sjkimstatic void felem_diff_zero107(felem out, const felem in)
320280304Sjkim{
321280304Sjkim    /*
322280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
323280304Sjkim     */
324280304Sjkim    out[0] += zero107[0];
325280304Sjkim    out[1] += zero107[1];
326280304Sjkim    out[2] += zero107[2];
327280304Sjkim    out[3] += zero107[3];
328238384Sjkim
329280304Sjkim    out[0] -= in[0];
330280304Sjkim    out[1] -= in[1];
331280304Sjkim    out[2] -= in[2];
332280304Sjkim    out[3] -= in[3];
333280304Sjkim}
334238384Sjkim
335280304Sjkim/*-
336280304Sjkim * longfelem_diff subtracts |in| from |out|
337238384Sjkim * On entry:
338238384Sjkim *   in[i] < 7*2^67
339238384Sjkim * On exit:
340238384Sjkim *   out[i] < out[i] + 2^70 + 2^40
341238384Sjkim */
342238384Sjkimstatic void longfelem_diff(longfelem out, const longfelem in)
343280304Sjkim{
344280304Sjkim    static const limb two70m8p6 =
345280304Sjkim        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
346280304Sjkim    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
347280304Sjkim    static const limb two70 = (((limb) 1) << 70);
348280304Sjkim    static const limb two70m40m38p6 =
349280304Sjkim        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
350280304Sjkim        (((limb) 1) << 6);
351280304Sjkim    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
352238384Sjkim
353280304Sjkim    /* add 0 mod p to avoid underflow */
354280304Sjkim    out[0] += two70m8p6;
355280304Sjkim    out[1] += two70p40;
356280304Sjkim    out[2] += two70;
357280304Sjkim    out[3] += two70m40m38p6;
358280304Sjkim    out[4] += two70m6;
359280304Sjkim    out[5] += two70m6;
360280304Sjkim    out[6] += two70m6;
361280304Sjkim    out[7] += two70m6;
362238384Sjkim
363280304Sjkim    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
364280304Sjkim    out[0] -= in[0];
365280304Sjkim    out[1] -= in[1];
366280304Sjkim    out[2] -= in[2];
367280304Sjkim    out[3] -= in[3];
368280304Sjkim    out[4] -= in[4];
369280304Sjkim    out[5] -= in[5];
370280304Sjkim    out[6] -= in[6];
371280304Sjkim    out[7] -= in[7];
372280304Sjkim}
373238384Sjkim
374280304Sjkim# define two64m0 (((limb)1) << 64) - 1
375280304Sjkim# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
376280304Sjkim# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
377280304Sjkim# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
378238384Sjkim
379238384Sjkim/* zero110 is 0 mod p */
380238384Sjkimstatic const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
381238384Sjkim
382280304Sjkim/*-
383280304Sjkim * felem_shrink converts an felem into a smallfelem. The result isn't quite
384238384Sjkim * minimal as the value may be greater than p.
385238384Sjkim *
386238384Sjkim * On entry:
387238384Sjkim *   in[i] < 2^109
388238384Sjkim * On exit:
389238384Sjkim *   out[i] < 2^64
390238384Sjkim */
391238384Sjkimstatic void felem_shrink(smallfelem out, const felem in)
392280304Sjkim{
393280304Sjkim    felem tmp;
394280304Sjkim    u64 a, b, mask;
395280304Sjkim    s64 high, low;
396280304Sjkim    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
397238384Sjkim
398280304Sjkim    /* Carry 2->3 */
399280304Sjkim    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
400280304Sjkim    /* tmp[3] < 2^110 */
401238384Sjkim
402280304Sjkim    tmp[2] = zero110[2] + (u64)in[2];
403280304Sjkim    tmp[0] = zero110[0] + in[0];
404280304Sjkim    tmp[1] = zero110[1] + in[1];
405280304Sjkim    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
406238384Sjkim
407280304Sjkim    /*
408280304Sjkim     * We perform two partial reductions where we eliminate the high-word of
409280304Sjkim     * tmp[3]. We don't update the other words till the end.
410280304Sjkim     */
411280304Sjkim    a = tmp[3] >> 64;           /* a < 2^46 */
412280304Sjkim    tmp[3] = (u64)tmp[3];
413280304Sjkim    tmp[3] -= a;
414280304Sjkim    tmp[3] += ((limb) a) << 32;
415280304Sjkim    /* tmp[3] < 2^79 */
416238384Sjkim
417280304Sjkim    b = a;
418280304Sjkim    a = tmp[3] >> 64;           /* a < 2^15 */
419280304Sjkim    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
420280304Sjkim    tmp[3] = (u64)tmp[3];
421280304Sjkim    tmp[3] -= a;
422280304Sjkim    tmp[3] += ((limb) a) << 32;
423280304Sjkim    /* tmp[3] < 2^64 + 2^47 */
424238384Sjkim
425280304Sjkim    /*
426280304Sjkim     * This adjusts the other two words to complete the two partial
427280304Sjkim     * reductions.
428280304Sjkim     */
429280304Sjkim    tmp[0] += b;
430280304Sjkim    tmp[1] -= (((limb) b) << 32);
431238384Sjkim
432280304Sjkim    /*
433280304Sjkim     * In order to make space in tmp[3] for the carry from 2 -> 3, we
434280304Sjkim     * conditionally subtract kPrime if tmp[3] is large enough.
435280304Sjkim     */
436280304Sjkim    high = tmp[3] >> 64;
437280304Sjkim    /* As tmp[3] < 2^65, high is either 1 or 0 */
438280304Sjkim    high <<= 63;
439280304Sjkim    high >>= 63;
440280304Sjkim    /*-
441280304Sjkim     * high is:
442280304Sjkim     *   all ones   if the high word of tmp[3] is 1
443280304Sjkim     *   all zeros  if the high word of tmp[3] if 0 */
444280304Sjkim    low = tmp[3];
445280304Sjkim    mask = low >> 63;
446280304Sjkim    /*-
447280304Sjkim     * mask is:
448280304Sjkim     *   all ones   if the MSB of low is 1
449280304Sjkim     *   all zeros  if the MSB of low if 0 */
450280304Sjkim    low &= bottom63bits;
451280304Sjkim    low -= kPrime3Test;
452280304Sjkim    /* if low was greater than kPrime3Test then the MSB is zero */
453280304Sjkim    low = ~low;
454280304Sjkim    low >>= 63;
455280304Sjkim    /*-
456280304Sjkim     * low is:
457280304Sjkim     *   all ones   if low was > kPrime3Test
458280304Sjkim     *   all zeros  if low was <= kPrime3Test */
459280304Sjkim    mask = (mask & low) | high;
460280304Sjkim    tmp[0] -= mask & kPrime[0];
461280304Sjkim    tmp[1] -= mask & kPrime[1];
462280304Sjkim    /* kPrime[2] is zero, so omitted */
463280304Sjkim    tmp[3] -= mask & kPrime[3];
464280304Sjkim    /* tmp[3] < 2**64 - 2**32 + 1 */
465238384Sjkim
466280304Sjkim    tmp[1] += ((u64)(tmp[0] >> 64));
467280304Sjkim    tmp[0] = (u64)tmp[0];
468280304Sjkim    tmp[2] += ((u64)(tmp[1] >> 64));
469280304Sjkim    tmp[1] = (u64)tmp[1];
470280304Sjkim    tmp[3] += ((u64)(tmp[2] >> 64));
471280304Sjkim    tmp[2] = (u64)tmp[2];
472280304Sjkim    /* tmp[i] < 2^64 */
473238384Sjkim
474280304Sjkim    out[0] = tmp[0];
475280304Sjkim    out[1] = tmp[1];
476280304Sjkim    out[2] = tmp[2];
477280304Sjkim    out[3] = tmp[3];
478280304Sjkim}
479238384Sjkim
480238384Sjkim/* smallfelem_expand converts a smallfelem to an felem */
481238384Sjkimstatic void smallfelem_expand(felem out, const smallfelem in)
482280304Sjkim{
483280304Sjkim    out[0] = in[0];
484280304Sjkim    out[1] = in[1];
485280304Sjkim    out[2] = in[2];
486280304Sjkim    out[3] = in[3];
487280304Sjkim}
488238384Sjkim
489280304Sjkim/*-
490280304Sjkim * smallfelem_square sets |out| = |small|^2
491238384Sjkim * On entry:
492238384Sjkim *   small[i] < 2^64
493238384Sjkim * On exit:
494238384Sjkim *   out[i] < 7 * 2^64 < 2^67
495238384Sjkim */
496238384Sjkimstatic void smallfelem_square(longfelem out, const smallfelem small)
497280304Sjkim{
498280304Sjkim    limb a;
499280304Sjkim    u64 high, low;
500238384Sjkim
501280304Sjkim    a = ((uint128_t) small[0]) * small[0];
502280304Sjkim    low = a;
503280304Sjkim    high = a >> 64;
504280304Sjkim    out[0] = low;
505280304Sjkim    out[1] = high;
506238384Sjkim
507280304Sjkim    a = ((uint128_t) small[0]) * small[1];
508280304Sjkim    low = a;
509280304Sjkim    high = a >> 64;
510280304Sjkim    out[1] += low;
511280304Sjkim    out[1] += low;
512280304Sjkim    out[2] = high;
513238384Sjkim
514280304Sjkim    a = ((uint128_t) small[0]) * small[2];
515280304Sjkim    low = a;
516280304Sjkim    high = a >> 64;
517280304Sjkim    out[2] += low;
518280304Sjkim    out[2] *= 2;
519280304Sjkim    out[3] = high;
520238384Sjkim
521280304Sjkim    a = ((uint128_t) small[0]) * small[3];
522280304Sjkim    low = a;
523280304Sjkim    high = a >> 64;
524280304Sjkim    out[3] += low;
525280304Sjkim    out[4] = high;
526238384Sjkim
527280304Sjkim    a = ((uint128_t) small[1]) * small[2];
528280304Sjkim    low = a;
529280304Sjkim    high = a >> 64;
530280304Sjkim    out[3] += low;
531280304Sjkim    out[3] *= 2;
532280304Sjkim    out[4] += high;
533238384Sjkim
534280304Sjkim    a = ((uint128_t) small[1]) * small[1];
535280304Sjkim    low = a;
536280304Sjkim    high = a >> 64;
537280304Sjkim    out[2] += low;
538280304Sjkim    out[3] += high;
539238384Sjkim
540280304Sjkim    a = ((uint128_t) small[1]) * small[3];
541280304Sjkim    low = a;
542280304Sjkim    high = a >> 64;
543280304Sjkim    out[4] += low;
544280304Sjkim    out[4] *= 2;
545280304Sjkim    out[5] = high;
546238384Sjkim
547280304Sjkim    a = ((uint128_t) small[2]) * small[3];
548280304Sjkim    low = a;
549280304Sjkim    high = a >> 64;
550280304Sjkim    out[5] += low;
551280304Sjkim    out[5] *= 2;
552280304Sjkim    out[6] = high;
553280304Sjkim    out[6] += high;
554238384Sjkim
555280304Sjkim    a = ((uint128_t) small[2]) * small[2];
556280304Sjkim    low = a;
557280304Sjkim    high = a >> 64;
558280304Sjkim    out[4] += low;
559280304Sjkim    out[5] += high;
560238384Sjkim
561280304Sjkim    a = ((uint128_t) small[3]) * small[3];
562280304Sjkim    low = a;
563280304Sjkim    high = a >> 64;
564280304Sjkim    out[6] += low;
565280304Sjkim    out[7] = high;
566280304Sjkim}
567238384Sjkim
568280304Sjkim/*-
569280304Sjkim * felem_square sets |out| = |in|^2
570238384Sjkim * On entry:
571238384Sjkim *   in[i] < 2^109
572238384Sjkim * On exit:
573238384Sjkim *   out[i] < 7 * 2^64 < 2^67
574238384Sjkim */
575238384Sjkimstatic void felem_square(longfelem out, const felem in)
576280304Sjkim{
577280304Sjkim    u64 small[4];
578280304Sjkim    felem_shrink(small, in);
579280304Sjkim    smallfelem_square(out, small);
580280304Sjkim}
581238384Sjkim
582280304Sjkim/*-
583280304Sjkim * smallfelem_mul sets |out| = |small1| * |small2|
584238384Sjkim * On entry:
585238384Sjkim *   small1[i] < 2^64
586238384Sjkim *   small2[i] < 2^64
587238384Sjkim * On exit:
588238384Sjkim *   out[i] < 7 * 2^64 < 2^67
589238384Sjkim */
590280304Sjkimstatic void smallfelem_mul(longfelem out, const smallfelem small1,
591280304Sjkim                           const smallfelem small2)
592280304Sjkim{
593280304Sjkim    limb a;
594280304Sjkim    u64 high, low;
595238384Sjkim
596280304Sjkim    a = ((uint128_t) small1[0]) * small2[0];
597280304Sjkim    low = a;
598280304Sjkim    high = a >> 64;
599280304Sjkim    out[0] = low;
600280304Sjkim    out[1] = high;
601238384Sjkim
602280304Sjkim    a = ((uint128_t) small1[0]) * small2[1];
603280304Sjkim    low = a;
604280304Sjkim    high = a >> 64;
605280304Sjkim    out[1] += low;
606280304Sjkim    out[2] = high;
607238384Sjkim
608280304Sjkim    a = ((uint128_t) small1[1]) * small2[0];
609280304Sjkim    low = a;
610280304Sjkim    high = a >> 64;
611280304Sjkim    out[1] += low;
612280304Sjkim    out[2] += high;
613238384Sjkim
614280304Sjkim    a = ((uint128_t) small1[0]) * small2[2];
615280304Sjkim    low = a;
616280304Sjkim    high = a >> 64;
617280304Sjkim    out[2] += low;
618280304Sjkim    out[3] = high;
619238384Sjkim
620280304Sjkim    a = ((uint128_t) small1[1]) * small2[1];
621280304Sjkim    low = a;
622280304Sjkim    high = a >> 64;
623280304Sjkim    out[2] += low;
624280304Sjkim    out[3] += high;
625238384Sjkim
626280304Sjkim    a = ((uint128_t) small1[2]) * small2[0];
627280304Sjkim    low = a;
628280304Sjkim    high = a >> 64;
629280304Sjkim    out[2] += low;
630280304Sjkim    out[3] += high;
631238384Sjkim
632280304Sjkim    a = ((uint128_t) small1[0]) * small2[3];
633280304Sjkim    low = a;
634280304Sjkim    high = a >> 64;
635280304Sjkim    out[3] += low;
636280304Sjkim    out[4] = high;
637238384Sjkim
638280304Sjkim    a = ((uint128_t) small1[1]) * small2[2];
639280304Sjkim    low = a;
640280304Sjkim    high = a >> 64;
641280304Sjkim    out[3] += low;
642280304Sjkim    out[4] += high;
643238384Sjkim
644280304Sjkim    a = ((uint128_t) small1[2]) * small2[1];
645280304Sjkim    low = a;
646280304Sjkim    high = a >> 64;
647280304Sjkim    out[3] += low;
648280304Sjkim    out[4] += high;
649238384Sjkim
650280304Sjkim    a = ((uint128_t) small1[3]) * small2[0];
651280304Sjkim    low = a;
652280304Sjkim    high = a >> 64;
653280304Sjkim    out[3] += low;
654280304Sjkim    out[4] += high;
655238384Sjkim
656280304Sjkim    a = ((uint128_t) small1[1]) * small2[3];
657280304Sjkim    low = a;
658280304Sjkim    high = a >> 64;
659280304Sjkim    out[4] += low;
660280304Sjkim    out[5] = high;
661238384Sjkim
662280304Sjkim    a = ((uint128_t) small1[2]) * small2[2];
663280304Sjkim    low = a;
664280304Sjkim    high = a >> 64;
665280304Sjkim    out[4] += low;
666280304Sjkim    out[5] += high;
667238384Sjkim
668280304Sjkim    a = ((uint128_t) small1[3]) * small2[1];
669280304Sjkim    low = a;
670280304Sjkim    high = a >> 64;
671280304Sjkim    out[4] += low;
672280304Sjkim    out[5] += high;
673238384Sjkim
674280304Sjkim    a = ((uint128_t) small1[2]) * small2[3];
675280304Sjkim    low = a;
676280304Sjkim    high = a >> 64;
677280304Sjkim    out[5] += low;
678280304Sjkim    out[6] = high;
679238384Sjkim
680280304Sjkim    a = ((uint128_t) small1[3]) * small2[2];
681280304Sjkim    low = a;
682280304Sjkim    high = a >> 64;
683280304Sjkim    out[5] += low;
684280304Sjkim    out[6] += high;
685238384Sjkim
686280304Sjkim    a = ((uint128_t) small1[3]) * small2[3];
687280304Sjkim    low = a;
688280304Sjkim    high = a >> 64;
689280304Sjkim    out[6] += low;
690280304Sjkim    out[7] = high;
691280304Sjkim}
692238384Sjkim
693280304Sjkim/*-
694280304Sjkim * felem_mul sets |out| = |in1| * |in2|
695238384Sjkim * On entry:
696238384Sjkim *   in1[i] < 2^109
697238384Sjkim *   in2[i] < 2^109
698238384Sjkim * On exit:
699238384Sjkim *   out[i] < 7 * 2^64 < 2^67
700238384Sjkim */
701238384Sjkimstatic void felem_mul(longfelem out, const felem in1, const felem in2)
702280304Sjkim{
703280304Sjkim    smallfelem small1, small2;
704280304Sjkim    felem_shrink(small1, in1);
705280304Sjkim    felem_shrink(small2, in2);
706280304Sjkim    smallfelem_mul(out, small1, small2);
707280304Sjkim}
708238384Sjkim
709280304Sjkim/*-
710280304Sjkim * felem_small_mul sets |out| = |small1| * |in2|
711238384Sjkim * On entry:
712238384Sjkim *   small1[i] < 2^64
713238384Sjkim *   in2[i] < 2^109
714238384Sjkim * On exit:
715238384Sjkim *   out[i] < 7 * 2^64 < 2^67
716238384Sjkim */
717280304Sjkimstatic void felem_small_mul(longfelem out, const smallfelem small1,
718280304Sjkim                            const felem in2)
719280304Sjkim{
720280304Sjkim    smallfelem small2;
721280304Sjkim    felem_shrink(small2, in2);
722280304Sjkim    smallfelem_mul(out, small1, small2);
723280304Sjkim}
724238384Sjkim
725280304Sjkim# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
726280304Sjkim# define two100 (((limb)1) << 100)
727280304Sjkim# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
728238384Sjkim/* zero100 is 0 mod p */
729280304Sjkimstatic const felem zero100 =
730280304Sjkim    { two100m36m4, two100, two100m36p4, two100m36p4 };
731238384Sjkim
732280304Sjkim/*-
733280304Sjkim * Internal function for the different flavours of felem_reduce.
734238384Sjkim * felem_reduce_ reduces the higher coefficients in[4]-in[7].
735238384Sjkim * On entry:
736280304Sjkim *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
737238384Sjkim *   out[1] >= in[7] + 2^32*in[4]
738238384Sjkim *   out[2] >= in[5] + 2^32*in[5]
739238384Sjkim *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
740238384Sjkim * On exit:
741238384Sjkim *   out[0] <= out[0] + in[4] + 2^32*in[5]
742238384Sjkim *   out[1] <= out[1] + in[5] + 2^33*in[6]
743238384Sjkim *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
744238384Sjkim *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
745238384Sjkim */
746238384Sjkimstatic void felem_reduce_(felem out, const longfelem in)
747280304Sjkim{
748280304Sjkim    int128_t c;
749280304Sjkim    /* combine common terms from below */
750280304Sjkim    c = in[4] + (in[5] << 32);
751280304Sjkim    out[0] += c;
752280304Sjkim    out[3] -= c;
753238384Sjkim
754280304Sjkim    c = in[5] - in[7];
755280304Sjkim    out[1] += c;
756280304Sjkim    out[2] -= c;
757238384Sjkim
758280304Sjkim    /* the remaining terms */
759280304Sjkim    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
760280304Sjkim    out[1] -= (in[4] << 32);
761280304Sjkim    out[3] += (in[4] << 32);
762238384Sjkim
763280304Sjkim    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
764280304Sjkim    out[2] -= (in[5] << 32);
765238384Sjkim
766280304Sjkim    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
767280304Sjkim    out[0] -= in[6];
768280304Sjkim    out[0] -= (in[6] << 32);
769280304Sjkim    out[1] += (in[6] << 33);
770280304Sjkim    out[2] += (in[6] * 2);
771280304Sjkim    out[3] -= (in[6] << 32);
772238384Sjkim
773280304Sjkim    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
774280304Sjkim    out[0] -= in[7];
775280304Sjkim    out[0] -= (in[7] << 32);
776280304Sjkim    out[2] += (in[7] << 33);
777280304Sjkim    out[3] += (in[7] * 3);
778280304Sjkim}
779238384Sjkim
780280304Sjkim/*-
781280304Sjkim * felem_reduce converts a longfelem into an felem.
782238384Sjkim * To be called directly after felem_square or felem_mul.
783238384Sjkim * On entry:
784238384Sjkim *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
785238384Sjkim *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
786238384Sjkim * On exit:
787238384Sjkim *   out[i] < 2^101
788238384Sjkim */
789238384Sjkimstatic void felem_reduce(felem out, const longfelem in)
790280304Sjkim{
791280304Sjkim    out[0] = zero100[0] + in[0];
792280304Sjkim    out[1] = zero100[1] + in[1];
793280304Sjkim    out[2] = zero100[2] + in[2];
794280304Sjkim    out[3] = zero100[3] + in[3];
795238384Sjkim
796280304Sjkim    felem_reduce_(out, in);
797238384Sjkim
798280304Sjkim    /*-
799280304Sjkim     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
800280304Sjkim     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
801280304Sjkim     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
802280304Sjkim     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
803280304Sjkim     *
804280304Sjkim     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
805280304Sjkim     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
806280304Sjkim     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
807280304Sjkim     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
808280304Sjkim     */
809280304Sjkim}
810238384Sjkim
811280304Sjkim/*-
812280304Sjkim * felem_reduce_zero105 converts a larger longfelem into an felem.
813238384Sjkim * On entry:
814238384Sjkim *   in[0] < 2^71
815238384Sjkim * On exit:
816238384Sjkim *   out[i] < 2^106
817238384Sjkim */
818238384Sjkimstatic void felem_reduce_zero105(felem out, const longfelem in)
819280304Sjkim{
820280304Sjkim    out[0] = zero105[0] + in[0];
821280304Sjkim    out[1] = zero105[1] + in[1];
822280304Sjkim    out[2] = zero105[2] + in[2];
823280304Sjkim    out[3] = zero105[3] + in[3];
824238384Sjkim
825280304Sjkim    felem_reduce_(out, in);
826238384Sjkim
827280304Sjkim    /*-
828280304Sjkim     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
829280304Sjkim     * out[1] > 2^105 - 2^71 - 2^103 > 0
830280304Sjkim     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
831280304Sjkim     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
832280304Sjkim     *
833280304Sjkim     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
834280304Sjkim     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
835280304Sjkim     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
836280304Sjkim     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
837280304Sjkim     */
838280304Sjkim}
839238384Sjkim
840280304Sjkim/*
841280304Sjkim * subtract_u64 sets *result = *result - v and *carry to one if the
842280304Sjkim * subtraction underflowed.
843280304Sjkim */
844280304Sjkimstatic void subtract_u64(u64 *result, u64 *carry, u64 v)
845280304Sjkim{
846280304Sjkim    uint128_t r = *result;
847280304Sjkim    r -= v;
848280304Sjkim    *carry = (r >> 64) & 1;
849280304Sjkim    *result = (u64)r;
850280304Sjkim}
851238384Sjkim
852280304Sjkim/*
853280304Sjkim * felem_contract converts |in| to its unique, minimal representation. On
854280304Sjkim * entry: in[i] < 2^109
855238384Sjkim */
856238384Sjkimstatic void felem_contract(smallfelem out, const felem in)
857280304Sjkim{
858280304Sjkim    unsigned i;
859280304Sjkim    u64 all_equal_so_far = 0, result = 0, carry;
860238384Sjkim
861280304Sjkim    felem_shrink(out, in);
862280304Sjkim    /* small is minimal except that the value might be > p */
863238384Sjkim
864280304Sjkim    all_equal_so_far--;
865280304Sjkim    /*
866280304Sjkim     * We are doing a constant time test if out >= kPrime. We need to compare
867280304Sjkim     * each u64, from most-significant to least significant. For each one, if
868280304Sjkim     * all words so far have been equal (m is all ones) then a non-equal
869280304Sjkim     * result is the answer. Otherwise we continue.
870280304Sjkim     */
871280304Sjkim    for (i = 3; i < 4; i--) {
872280304Sjkim        u64 equal;
873280304Sjkim        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
874280304Sjkim        /*
875280304Sjkim         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
876280304Sjkim         * will all be set.
877280304Sjkim         */
878280304Sjkim        result |= all_equal_so_far & ((u64)(a >> 64));
879238384Sjkim
880280304Sjkim        /*
881280304Sjkim         * if kPrime[i] == out[i] then |equal| will be all zeros and the
882280304Sjkim         * decrement will make it all ones.
883280304Sjkim         */
884280304Sjkim        equal = kPrime[i] ^ out[i];
885280304Sjkim        equal--;
886280304Sjkim        equal &= equal << 32;
887280304Sjkim        equal &= equal << 16;
888280304Sjkim        equal &= equal << 8;
889280304Sjkim        equal &= equal << 4;
890280304Sjkim        equal &= equal << 2;
891280304Sjkim        equal &= equal << 1;
892280304Sjkim        equal = ((s64) equal) >> 63;
893238384Sjkim
894280304Sjkim        all_equal_so_far &= equal;
895280304Sjkim    }
896238384Sjkim
897280304Sjkim    /*
898280304Sjkim     * if all_equal_so_far is still all ones then the two values are equal
899280304Sjkim     * and so out >= kPrime is true.
900280304Sjkim     */
901280304Sjkim    result |= all_equal_so_far;
902238384Sjkim
903280304Sjkim    /* if out >= kPrime then we subtract kPrime. */
904280304Sjkim    subtract_u64(&out[0], &carry, result & kPrime[0]);
905280304Sjkim    subtract_u64(&out[1], &carry, carry);
906280304Sjkim    subtract_u64(&out[2], &carry, carry);
907280304Sjkim    subtract_u64(&out[3], &carry, carry);
908238384Sjkim
909280304Sjkim    subtract_u64(&out[1], &carry, result & kPrime[1]);
910280304Sjkim    subtract_u64(&out[2], &carry, carry);
911280304Sjkim    subtract_u64(&out[3], &carry, carry);
912238384Sjkim
913280304Sjkim    subtract_u64(&out[2], &carry, result & kPrime[2]);
914280304Sjkim    subtract_u64(&out[3], &carry, carry);
915238384Sjkim
916280304Sjkim    subtract_u64(&out[3], &carry, result & kPrime[3]);
917280304Sjkim}
918238384Sjkim
919238384Sjkimstatic void smallfelem_square_contract(smallfelem out, const smallfelem in)
920280304Sjkim{
921280304Sjkim    longfelem longtmp;
922280304Sjkim    felem tmp;
923238384Sjkim
924280304Sjkim    smallfelem_square(longtmp, in);
925280304Sjkim    felem_reduce(tmp, longtmp);
926280304Sjkim    felem_contract(out, tmp);
927280304Sjkim}
928238384Sjkim
929280304Sjkimstatic void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
930280304Sjkim                                    const smallfelem in2)
931280304Sjkim{
932280304Sjkim    longfelem longtmp;
933280304Sjkim    felem tmp;
934238384Sjkim
935280304Sjkim    smallfelem_mul(longtmp, in1, in2);
936280304Sjkim    felem_reduce(tmp, longtmp);
937280304Sjkim    felem_contract(out, tmp);
938280304Sjkim}
939238384Sjkim
940280304Sjkim/*-
941280304Sjkim * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
942238384Sjkim * otherwise.
943238384Sjkim * On entry:
944238384Sjkim *   small[i] < 2^64
945238384Sjkim */
946238384Sjkimstatic limb smallfelem_is_zero(const smallfelem small)
947280304Sjkim{
948280304Sjkim    limb result;
949280304Sjkim    u64 is_p;
950238384Sjkim
951280304Sjkim    u64 is_zero = small[0] | small[1] | small[2] | small[3];
952280304Sjkim    is_zero--;
953280304Sjkim    is_zero &= is_zero << 32;
954280304Sjkim    is_zero &= is_zero << 16;
955280304Sjkim    is_zero &= is_zero << 8;
956280304Sjkim    is_zero &= is_zero << 4;
957280304Sjkim    is_zero &= is_zero << 2;
958280304Sjkim    is_zero &= is_zero << 1;
959280304Sjkim    is_zero = ((s64) is_zero) >> 63;
960238384Sjkim
961280304Sjkim    is_p = (small[0] ^ kPrime[0]) |
962280304Sjkim        (small[1] ^ kPrime[1]) |
963280304Sjkim        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
964280304Sjkim    is_p--;
965280304Sjkim    is_p &= is_p << 32;
966280304Sjkim    is_p &= is_p << 16;
967280304Sjkim    is_p &= is_p << 8;
968280304Sjkim    is_p &= is_p << 4;
969280304Sjkim    is_p &= is_p << 2;
970280304Sjkim    is_p &= is_p << 1;
971280304Sjkim    is_p = ((s64) is_p) >> 63;
972238384Sjkim
973280304Sjkim    is_zero |= is_p;
974238384Sjkim
975280304Sjkim    result = is_zero;
976280304Sjkim    result |= ((limb) is_zero) << 64;
977280304Sjkim    return result;
978280304Sjkim}
979238384Sjkim
980238384Sjkimstatic int smallfelem_is_zero_int(const smallfelem small)
981280304Sjkim{
982280304Sjkim    return (int)(smallfelem_is_zero(small) & ((limb) 1));
983280304Sjkim}
984238384Sjkim
985280304Sjkim/*-
986280304Sjkim * felem_inv calculates |out| = |in|^{-1}
987238384Sjkim *
988238384Sjkim * Based on Fermat's Little Theorem:
989238384Sjkim *   a^p = a (mod p)
990238384Sjkim *   a^{p-1} = 1 (mod p)
991238384Sjkim *   a^{p-2} = a^{-1} (mod p)
992238384Sjkim */
993238384Sjkimstatic void felem_inv(felem out, const felem in)
994280304Sjkim{
995280304Sjkim    felem ftmp, ftmp2;
996280304Sjkim    /* each e_I will hold |in|^{2^I - 1} */
997280304Sjkim    felem e2, e4, e8, e16, e32, e64;
998280304Sjkim    longfelem tmp;
999280304Sjkim    unsigned i;
1000238384Sjkim
1001280304Sjkim    felem_square(tmp, in);
1002280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^1 */
1003280304Sjkim    felem_mul(tmp, in, ftmp);
1004280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1005280304Sjkim    felem_assign(e2, ftmp);
1006280304Sjkim    felem_square(tmp, ftmp);
1007280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1008280304Sjkim    felem_square(tmp, ftmp);
1009280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1010280304Sjkim    felem_mul(tmp, ftmp, e2);
1011280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1012280304Sjkim    felem_assign(e4, ftmp);
1013280304Sjkim    felem_square(tmp, ftmp);
1014280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1015280304Sjkim    felem_square(tmp, ftmp);
1016280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1017280304Sjkim    felem_square(tmp, ftmp);
1018280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1019280304Sjkim    felem_square(tmp, ftmp);
1020280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1021280304Sjkim    felem_mul(tmp, ftmp, e4);
1022280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1023280304Sjkim    felem_assign(e8, ftmp);
1024280304Sjkim    for (i = 0; i < 8; i++) {
1025280304Sjkim        felem_square(tmp, ftmp);
1026280304Sjkim        felem_reduce(ftmp, tmp);
1027280304Sjkim    }                           /* 2^16 - 2^8 */
1028280304Sjkim    felem_mul(tmp, ftmp, e8);
1029280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1030280304Sjkim    felem_assign(e16, ftmp);
1031280304Sjkim    for (i = 0; i < 16; i++) {
1032280304Sjkim        felem_square(tmp, ftmp);
1033280304Sjkim        felem_reduce(ftmp, tmp);
1034280304Sjkim    }                           /* 2^32 - 2^16 */
1035280304Sjkim    felem_mul(tmp, ftmp, e16);
1036280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1037280304Sjkim    felem_assign(e32, ftmp);
1038280304Sjkim    for (i = 0; i < 32; i++) {
1039280304Sjkim        felem_square(tmp, ftmp);
1040280304Sjkim        felem_reduce(ftmp, tmp);
1041280304Sjkim    }                           /* 2^64 - 2^32 */
1042280304Sjkim    felem_assign(e64, ftmp);
1043280304Sjkim    felem_mul(tmp, ftmp, in);
1044280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1045280304Sjkim    for (i = 0; i < 192; i++) {
1046280304Sjkim        felem_square(tmp, ftmp);
1047280304Sjkim        felem_reduce(ftmp, tmp);
1048280304Sjkim    }                           /* 2^256 - 2^224 + 2^192 */
1049238384Sjkim
1050280304Sjkim    felem_mul(tmp, e64, e32);
1051280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1052280304Sjkim    for (i = 0; i < 16; i++) {
1053280304Sjkim        felem_square(tmp, ftmp2);
1054280304Sjkim        felem_reduce(ftmp2, tmp);
1055280304Sjkim    }                           /* 2^80 - 2^16 */
1056280304Sjkim    felem_mul(tmp, ftmp2, e16);
1057280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1058280304Sjkim    for (i = 0; i < 8; i++) {
1059280304Sjkim        felem_square(tmp, ftmp2);
1060280304Sjkim        felem_reduce(ftmp2, tmp);
1061280304Sjkim    }                           /* 2^88 - 2^8 */
1062280304Sjkim    felem_mul(tmp, ftmp2, e8);
1063280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1064280304Sjkim    for (i = 0; i < 4; i++) {
1065280304Sjkim        felem_square(tmp, ftmp2);
1066280304Sjkim        felem_reduce(ftmp2, tmp);
1067280304Sjkim    }                           /* 2^92 - 2^4 */
1068280304Sjkim    felem_mul(tmp, ftmp2, e4);
1069280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1070280304Sjkim    felem_square(tmp, ftmp2);
1071280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1072280304Sjkim    felem_square(tmp, ftmp2);
1073280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1074280304Sjkim    felem_mul(tmp, ftmp2, e2);
1075280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1076280304Sjkim    felem_square(tmp, ftmp2);
1077280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1078280304Sjkim    felem_square(tmp, ftmp2);
1079280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1080280304Sjkim    felem_mul(tmp, ftmp2, in);
1081280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1082238384Sjkim
1083280304Sjkim    felem_mul(tmp, ftmp2, ftmp);
1084280304Sjkim    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1085280304Sjkim}
1086238384Sjkim
1087238384Sjkimstatic void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1088280304Sjkim{
1089280304Sjkim    felem tmp;
1090238384Sjkim
1091280304Sjkim    smallfelem_expand(tmp, in);
1092280304Sjkim    felem_inv(tmp, tmp);
1093280304Sjkim    felem_contract(out, tmp);
1094280304Sjkim}
1095238384Sjkim
1096280304Sjkim/*-
1097280304Sjkim * Group operations
1098238384Sjkim * ----------------
1099238384Sjkim *
1100238384Sjkim * Building on top of the field operations we have the operations on the
1101238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian
1102280304Sjkim * coordinates
1103280304Sjkim */
1104238384Sjkim
1105280304Sjkim/*-
1106280304Sjkim * point_double calculates 2*(x_in, y_in, z_in)
1107238384Sjkim *
1108238384Sjkim * The method is taken from:
1109238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1110238384Sjkim *
1111238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1112280304Sjkim * while x_out == y_in is not (maybe this works, but it's not tested).
1113280304Sjkim */
1114238384Sjkimstatic void
1115238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out,
1116280304Sjkim             const felem x_in, const felem y_in, const felem z_in)
1117280304Sjkim{
1118280304Sjkim    longfelem tmp, tmp2;
1119280304Sjkim    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1120280304Sjkim    smallfelem small1, small2;
1121238384Sjkim
1122280304Sjkim    felem_assign(ftmp, x_in);
1123280304Sjkim    /* ftmp[i] < 2^106 */
1124280304Sjkim    felem_assign(ftmp2, x_in);
1125280304Sjkim    /* ftmp2[i] < 2^106 */
1126238384Sjkim
1127280304Sjkim    /* delta = z^2 */
1128280304Sjkim    felem_square(tmp, z_in);
1129280304Sjkim    felem_reduce(delta, tmp);
1130280304Sjkim    /* delta[i] < 2^101 */
1131238384Sjkim
1132280304Sjkim    /* gamma = y^2 */
1133280304Sjkim    felem_square(tmp, y_in);
1134280304Sjkim    felem_reduce(gamma, tmp);
1135280304Sjkim    /* gamma[i] < 2^101 */
1136280304Sjkim    felem_shrink(small1, gamma);
1137238384Sjkim
1138280304Sjkim    /* beta = x*gamma */
1139280304Sjkim    felem_small_mul(tmp, small1, x_in);
1140280304Sjkim    felem_reduce(beta, tmp);
1141280304Sjkim    /* beta[i] < 2^101 */
1142238384Sjkim
1143280304Sjkim    /* alpha = 3*(x-delta)*(x+delta) */
1144280304Sjkim    felem_diff(ftmp, delta);
1145280304Sjkim    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1146280304Sjkim    felem_sum(ftmp2, delta);
1147280304Sjkim    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1148280304Sjkim    felem_scalar(ftmp2, 3);
1149280304Sjkim    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1150280304Sjkim    felem_mul(tmp, ftmp, ftmp2);
1151280304Sjkim    felem_reduce(alpha, tmp);
1152280304Sjkim    /* alpha[i] < 2^101 */
1153280304Sjkim    felem_shrink(small2, alpha);
1154238384Sjkim
1155280304Sjkim    /* x' = alpha^2 - 8*beta */
1156280304Sjkim    smallfelem_square(tmp, small2);
1157280304Sjkim    felem_reduce(x_out, tmp);
1158280304Sjkim    felem_assign(ftmp, beta);
1159280304Sjkim    felem_scalar(ftmp, 8);
1160280304Sjkim    /* ftmp[i] < 8 * 2^101 = 2^104 */
1161280304Sjkim    felem_diff(x_out, ftmp);
1162280304Sjkim    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1163238384Sjkim
1164280304Sjkim    /* z' = (y + z)^2 - gamma - delta */
1165280304Sjkim    felem_sum(delta, gamma);
1166280304Sjkim    /* delta[i] < 2^101 + 2^101 = 2^102 */
1167280304Sjkim    felem_assign(ftmp, y_in);
1168280304Sjkim    felem_sum(ftmp, z_in);
1169280304Sjkim    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1170280304Sjkim    felem_square(tmp, ftmp);
1171280304Sjkim    felem_reduce(z_out, tmp);
1172280304Sjkim    felem_diff(z_out, delta);
1173280304Sjkim    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1174238384Sjkim
1175280304Sjkim    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1176280304Sjkim    felem_scalar(beta, 4);
1177280304Sjkim    /* beta[i] < 4 * 2^101 = 2^103 */
1178280304Sjkim    felem_diff_zero107(beta, x_out);
1179280304Sjkim    /* beta[i] < 2^107 + 2^103 < 2^108 */
1180280304Sjkim    felem_small_mul(tmp, small2, beta);
1181280304Sjkim    /* tmp[i] < 7 * 2^64 < 2^67 */
1182280304Sjkim    smallfelem_square(tmp2, small1);
1183280304Sjkim    /* tmp2[i] < 7 * 2^64 */
1184280304Sjkim    longfelem_scalar(tmp2, 8);
1185280304Sjkim    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1186280304Sjkim    longfelem_diff(tmp, tmp2);
1187280304Sjkim    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1188280304Sjkim    felem_reduce_zero105(y_out, tmp);
1189280304Sjkim    /* y_out[i] < 2^106 */
1190280304Sjkim}
1191238384Sjkim
1192280304Sjkim/*
1193280304Sjkim * point_double_small is the same as point_double, except that it operates on
1194280304Sjkim * smallfelems
1195280304Sjkim */
1196238384Sjkimstatic void
1197238384Sjkimpoint_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1198280304Sjkim                   const smallfelem x_in, const smallfelem y_in,
1199280304Sjkim                   const smallfelem z_in)
1200280304Sjkim{
1201280304Sjkim    felem felem_x_out, felem_y_out, felem_z_out;
1202280304Sjkim    felem felem_x_in, felem_y_in, felem_z_in;
1203238384Sjkim
1204280304Sjkim    smallfelem_expand(felem_x_in, x_in);
1205280304Sjkim    smallfelem_expand(felem_y_in, y_in);
1206280304Sjkim    smallfelem_expand(felem_z_in, z_in);
1207280304Sjkim    point_double(felem_x_out, felem_y_out, felem_z_out,
1208280304Sjkim                 felem_x_in, felem_y_in, felem_z_in);
1209280304Sjkim    felem_shrink(x_out, felem_x_out);
1210280304Sjkim    felem_shrink(y_out, felem_y_out);
1211280304Sjkim    felem_shrink(z_out, felem_z_out);
1212280304Sjkim}
1213238384Sjkim
1214238384Sjkim/* copy_conditional copies in to out iff mask is all ones. */
1215280304Sjkimstatic void copy_conditional(felem out, const felem in, limb mask)
1216280304Sjkim{
1217280304Sjkim    unsigned i;
1218280304Sjkim    for (i = 0; i < NLIMBS; ++i) {
1219280304Sjkim        const limb tmp = mask & (in[i] ^ out[i]);
1220280304Sjkim        out[i] ^= tmp;
1221280304Sjkim    }
1222280304Sjkim}
1223238384Sjkim
1224238384Sjkim/* copy_small_conditional copies in to out iff mask is all ones. */
1225280304Sjkimstatic void copy_small_conditional(felem out, const smallfelem in, limb mask)
1226280304Sjkim{
1227280304Sjkim    unsigned i;
1228280304Sjkim    const u64 mask64 = mask;
1229280304Sjkim    for (i = 0; i < NLIMBS; ++i) {
1230280304Sjkim        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1231280304Sjkim    }
1232280304Sjkim}
1233238384Sjkim
1234280304Sjkim/*-
1235280304Sjkim * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1236238384Sjkim *
1237238384Sjkim * The method is taken from:
1238238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1239238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1240238384Sjkim *
1241238384Sjkim * This function includes a branch for checking whether the two input points
1242238384Sjkim * are equal, (while not equal to the point at infinity). This case never
1243238384Sjkim * happens during single point multiplication, so there is no timing leak for
1244280304Sjkim * ECDH or ECDSA signing.
1245280304Sjkim */
1246238384Sjkimstatic void point_add(felem x3, felem y3, felem z3,
1247280304Sjkim                      const felem x1, const felem y1, const felem z1,
1248280304Sjkim                      const int mixed, const smallfelem x2,
1249280304Sjkim                      const smallfelem y2, const smallfelem z2)
1250280304Sjkim{
1251280304Sjkim    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1252280304Sjkim    longfelem tmp, tmp2;
1253280304Sjkim    smallfelem small1, small2, small3, small4, small5;
1254280304Sjkim    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1255238384Sjkim
1256280304Sjkim    felem_shrink(small3, z1);
1257238384Sjkim
1258280304Sjkim    z1_is_zero = smallfelem_is_zero(small3);
1259280304Sjkim    z2_is_zero = smallfelem_is_zero(z2);
1260238384Sjkim
1261280304Sjkim    /* ftmp = z1z1 = z1**2 */
1262280304Sjkim    smallfelem_square(tmp, small3);
1263280304Sjkim    felem_reduce(ftmp, tmp);
1264280304Sjkim    /* ftmp[i] < 2^101 */
1265280304Sjkim    felem_shrink(small1, ftmp);
1266238384Sjkim
1267280304Sjkim    if (!mixed) {
1268280304Sjkim        /* ftmp2 = z2z2 = z2**2 */
1269280304Sjkim        smallfelem_square(tmp, z2);
1270280304Sjkim        felem_reduce(ftmp2, tmp);
1271280304Sjkim        /* ftmp2[i] < 2^101 */
1272280304Sjkim        felem_shrink(small2, ftmp2);
1273238384Sjkim
1274280304Sjkim        felem_shrink(small5, x1);
1275238384Sjkim
1276280304Sjkim        /* u1 = ftmp3 = x1*z2z2 */
1277280304Sjkim        smallfelem_mul(tmp, small5, small2);
1278280304Sjkim        felem_reduce(ftmp3, tmp);
1279280304Sjkim        /* ftmp3[i] < 2^101 */
1280238384Sjkim
1281280304Sjkim        /* ftmp5 = z1 + z2 */
1282280304Sjkim        felem_assign(ftmp5, z1);
1283280304Sjkim        felem_small_sum(ftmp5, z2);
1284280304Sjkim        /* ftmp5[i] < 2^107 */
1285238384Sjkim
1286280304Sjkim        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1287280304Sjkim        felem_square(tmp, ftmp5);
1288280304Sjkim        felem_reduce(ftmp5, tmp);
1289280304Sjkim        /* ftmp2 = z2z2 + z1z1 */
1290280304Sjkim        felem_sum(ftmp2, ftmp);
1291280304Sjkim        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1292280304Sjkim        felem_diff(ftmp5, ftmp2);
1293280304Sjkim        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1294238384Sjkim
1295280304Sjkim        /* ftmp2 = z2 * z2z2 */
1296280304Sjkim        smallfelem_mul(tmp, small2, z2);
1297280304Sjkim        felem_reduce(ftmp2, tmp);
1298238384Sjkim
1299280304Sjkim        /* s1 = ftmp2 = y1 * z2**3 */
1300280304Sjkim        felem_mul(tmp, y1, ftmp2);
1301280304Sjkim        felem_reduce(ftmp6, tmp);
1302280304Sjkim        /* ftmp6[i] < 2^101 */
1303280304Sjkim    } else {
1304280304Sjkim        /*
1305280304Sjkim         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1306280304Sjkim         */
1307238384Sjkim
1308280304Sjkim        /* u1 = ftmp3 = x1*z2z2 */
1309280304Sjkim        felem_assign(ftmp3, x1);
1310280304Sjkim        /* ftmp3[i] < 2^106 */
1311238384Sjkim
1312280304Sjkim        /* ftmp5 = 2z1z2 */
1313280304Sjkim        felem_assign(ftmp5, z1);
1314280304Sjkim        felem_scalar(ftmp5, 2);
1315280304Sjkim        /* ftmp5[i] < 2*2^106 = 2^107 */
1316238384Sjkim
1317280304Sjkim        /* s1 = ftmp2 = y1 * z2**3 */
1318280304Sjkim        felem_assign(ftmp6, y1);
1319280304Sjkim        /* ftmp6[i] < 2^106 */
1320280304Sjkim    }
1321238384Sjkim
1322280304Sjkim    /* u2 = x2*z1z1 */
1323280304Sjkim    smallfelem_mul(tmp, x2, small1);
1324280304Sjkim    felem_reduce(ftmp4, tmp);
1325238384Sjkim
1326280304Sjkim    /* h = ftmp4 = u2 - u1 */
1327280304Sjkim    felem_diff_zero107(ftmp4, ftmp3);
1328280304Sjkim    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1329280304Sjkim    felem_shrink(small4, ftmp4);
1330238384Sjkim
1331280304Sjkim    x_equal = smallfelem_is_zero(small4);
1332238384Sjkim
1333280304Sjkim    /* z_out = ftmp5 * h */
1334280304Sjkim    felem_small_mul(tmp, small4, ftmp5);
1335280304Sjkim    felem_reduce(z_out, tmp);
1336280304Sjkim    /* z_out[i] < 2^101 */
1337238384Sjkim
1338280304Sjkim    /* ftmp = z1 * z1z1 */
1339280304Sjkim    smallfelem_mul(tmp, small1, small3);
1340280304Sjkim    felem_reduce(ftmp, tmp);
1341238384Sjkim
1342280304Sjkim    /* s2 = tmp = y2 * z1**3 */
1343280304Sjkim    felem_small_mul(tmp, y2, ftmp);
1344280304Sjkim    felem_reduce(ftmp5, tmp);
1345238384Sjkim
1346280304Sjkim    /* r = ftmp5 = (s2 - s1)*2 */
1347280304Sjkim    felem_diff_zero107(ftmp5, ftmp6);
1348280304Sjkim    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1349280304Sjkim    felem_scalar(ftmp5, 2);
1350280304Sjkim    /* ftmp5[i] < 2^109 */
1351280304Sjkim    felem_shrink(small1, ftmp5);
1352280304Sjkim    y_equal = smallfelem_is_zero(small1);
1353238384Sjkim
1354280304Sjkim    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1355280304Sjkim        point_double(x3, y3, z3, x1, y1, z1);
1356280304Sjkim        return;
1357280304Sjkim    }
1358238384Sjkim
1359280304Sjkim    /* I = ftmp = (2h)**2 */
1360280304Sjkim    felem_assign(ftmp, ftmp4);
1361280304Sjkim    felem_scalar(ftmp, 2);
1362280304Sjkim    /* ftmp[i] < 2*2^108 = 2^109 */
1363280304Sjkim    felem_square(tmp, ftmp);
1364280304Sjkim    felem_reduce(ftmp, tmp);
1365238384Sjkim
1366280304Sjkim    /* J = ftmp2 = h * I */
1367280304Sjkim    felem_mul(tmp, ftmp4, ftmp);
1368280304Sjkim    felem_reduce(ftmp2, tmp);
1369238384Sjkim
1370280304Sjkim    /* V = ftmp4 = U1 * I */
1371280304Sjkim    felem_mul(tmp, ftmp3, ftmp);
1372280304Sjkim    felem_reduce(ftmp4, tmp);
1373238384Sjkim
1374280304Sjkim    /* x_out = r**2 - J - 2V */
1375280304Sjkim    smallfelem_square(tmp, small1);
1376280304Sjkim    felem_reduce(x_out, tmp);
1377280304Sjkim    felem_assign(ftmp3, ftmp4);
1378280304Sjkim    felem_scalar(ftmp4, 2);
1379280304Sjkim    felem_sum(ftmp4, ftmp2);
1380280304Sjkim    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1381280304Sjkim    felem_diff(x_out, ftmp4);
1382280304Sjkim    /* x_out[i] < 2^105 + 2^101 */
1383238384Sjkim
1384280304Sjkim    /* y_out = r(V-x_out) - 2 * s1 * J */
1385280304Sjkim    felem_diff_zero107(ftmp3, x_out);
1386280304Sjkim    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1387280304Sjkim    felem_small_mul(tmp, small1, ftmp3);
1388280304Sjkim    felem_mul(tmp2, ftmp6, ftmp2);
1389280304Sjkim    longfelem_scalar(tmp2, 2);
1390280304Sjkim    /* tmp2[i] < 2*2^67 = 2^68 */
1391280304Sjkim    longfelem_diff(tmp, tmp2);
1392280304Sjkim    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1393280304Sjkim    felem_reduce_zero105(y_out, tmp);
1394280304Sjkim    /* y_out[i] < 2^106 */
1395238384Sjkim
1396280304Sjkim    copy_small_conditional(x_out, x2, z1_is_zero);
1397280304Sjkim    copy_conditional(x_out, x1, z2_is_zero);
1398280304Sjkim    copy_small_conditional(y_out, y2, z1_is_zero);
1399280304Sjkim    copy_conditional(y_out, y1, z2_is_zero);
1400280304Sjkim    copy_small_conditional(z_out, z2, z1_is_zero);
1401280304Sjkim    copy_conditional(z_out, z1, z2_is_zero);
1402280304Sjkim    felem_assign(x3, x_out);
1403280304Sjkim    felem_assign(y3, y_out);
1404280304Sjkim    felem_assign(z3, z_out);
1405280304Sjkim}
1406238384Sjkim
1407280304Sjkim/*
1408280304Sjkim * point_add_small is the same as point_add, except that it operates on
1409280304Sjkim * smallfelems
1410280304Sjkim */
1411238384Sjkimstatic void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1412280304Sjkim                            smallfelem x1, smallfelem y1, smallfelem z1,
1413280304Sjkim                            smallfelem x2, smallfelem y2, smallfelem z2)
1414280304Sjkim{
1415280304Sjkim    felem felem_x3, felem_y3, felem_z3;
1416280304Sjkim    felem felem_x1, felem_y1, felem_z1;
1417280304Sjkim    smallfelem_expand(felem_x1, x1);
1418280304Sjkim    smallfelem_expand(felem_y1, y1);
1419280304Sjkim    smallfelem_expand(felem_z1, z1);
1420280304Sjkim    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1421280304Sjkim              x2, y2, z2);
1422280304Sjkim    felem_shrink(x3, felem_x3);
1423280304Sjkim    felem_shrink(y3, felem_y3);
1424280304Sjkim    felem_shrink(z3, felem_z3);
1425280304Sjkim}
1426238384Sjkim
1427280304Sjkim/*-
1428280304Sjkim * Base point pre computation
1429238384Sjkim * --------------------------
1430238384Sjkim *
1431238384Sjkim * Two different sorts of precomputed tables are used in the following code.
1432238384Sjkim * Each contain various points on the curve, where each point is three field
1433238384Sjkim * elements (x, y, z).
1434238384Sjkim *
1435238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity).
1436238384Sjkim * This table has 2 * 16 elements, starting with the following:
1437238384Sjkim * index | bits    | point
1438238384Sjkim * ------+---------+------------------------------
1439238384Sjkim *     0 | 0 0 0 0 | 0G
1440238384Sjkim *     1 | 0 0 0 1 | 1G
1441238384Sjkim *     2 | 0 0 1 0 | 2^64G
1442238384Sjkim *     3 | 0 0 1 1 | (2^64 + 1)G
1443238384Sjkim *     4 | 0 1 0 0 | 2^128G
1444238384Sjkim *     5 | 0 1 0 1 | (2^128 + 1)G
1445238384Sjkim *     6 | 0 1 1 0 | (2^128 + 2^64)G
1446238384Sjkim *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1447238384Sjkim *     8 | 1 0 0 0 | 2^192G
1448238384Sjkim *     9 | 1 0 0 1 | (2^192 + 1)G
1449238384Sjkim *    10 | 1 0 1 0 | (2^192 + 2^64)G
1450238384Sjkim *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1451238384Sjkim *    12 | 1 1 0 0 | (2^192 + 2^128)G
1452238384Sjkim *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1453238384Sjkim *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1454238384Sjkim *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1455238384Sjkim * followed by a copy of this with each element multiplied by 2^32.
1456238384Sjkim *
1457238384Sjkim * The reason for this is so that we can clock bits into four different
1458238384Sjkim * locations when doing simple scalar multiplies against the base point,
1459238384Sjkim * and then another four locations using the second 16 elements.
1460238384Sjkim *
1461238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */
1462238384Sjkim
1463238384Sjkim/* gmul is the table of precomputed base points */
1464280304Sjkimstatic const smallfelem gmul[2][16][3] = {
1465280304Sjkim    {{{0, 0, 0, 0},
1466280304Sjkim      {0, 0, 0, 0},
1467280304Sjkim      {0, 0, 0, 0}},
1468280304Sjkim     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1469280304Sjkim       0x6b17d1f2e12c4247},
1470280304Sjkim      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1471280304Sjkim       0x4fe342e2fe1a7f9b},
1472280304Sjkim      {1, 0, 0, 0}},
1473280304Sjkim     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1474280304Sjkim       0x0fa822bc2811aaa5},
1475280304Sjkim      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1476280304Sjkim       0xbff44ae8f5dba80d},
1477280304Sjkim      {1, 0, 0, 0}},
1478280304Sjkim     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1479280304Sjkim       0x300a4bbc89d6726f},
1480280304Sjkim      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1481280304Sjkim       0x72aac7e0d09b4644},
1482280304Sjkim      {1, 0, 0, 0}},
1483280304Sjkim     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1484280304Sjkim       0x447d739beedb5e67},
1485280304Sjkim      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1486280304Sjkim       0x2d4825ab834131ee},
1487280304Sjkim      {1, 0, 0, 0}},
1488280304Sjkim     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1489280304Sjkim       0xef9519328a9c72ff},
1490280304Sjkim      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1491280304Sjkim       0x611e9fc37dbb2c9b},
1492280304Sjkim      {1, 0, 0, 0}},
1493280304Sjkim     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1494280304Sjkim       0x550663797b51f5d8},
1495280304Sjkim      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1496280304Sjkim       0x157164848aecb851},
1497280304Sjkim      {1, 0, 0, 0}},
1498280304Sjkim     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1499280304Sjkim       0xeb5d7745b21141ea},
1500280304Sjkim      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1501280304Sjkim       0xeafd72ebdbecc17b},
1502280304Sjkim      {1, 0, 0, 0}},
1503280304Sjkim     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1504280304Sjkim       0xa6d39677a7849276},
1505280304Sjkim      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1506280304Sjkim       0x674f84749b0b8816},
1507280304Sjkim      {1, 0, 0, 0}},
1508280304Sjkim     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1509280304Sjkim       0x4e769e7672c9ddad},
1510280304Sjkim      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1511280304Sjkim       0x42b99082de830663},
1512280304Sjkim      {1, 0, 0, 0}},
1513280304Sjkim     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1514280304Sjkim       0x78878ef61c6ce04d},
1515280304Sjkim      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1516280304Sjkim       0xb6cb3f5d7b72c321},
1517280304Sjkim      {1, 0, 0, 0}},
1518280304Sjkim     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1519280304Sjkim       0x0c88bc4d716b1287},
1520280304Sjkim      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1521280304Sjkim       0xdd5ddea3f3901dc6},
1522280304Sjkim      {1, 0, 0, 0}},
1523280304Sjkim     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1524280304Sjkim       0x68f344af6b317466},
1525280304Sjkim      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1526280304Sjkim       0x31b9c405f8540a20},
1527280304Sjkim      {1, 0, 0, 0}},
1528280304Sjkim     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1529280304Sjkim       0x4052bf4b6f461db9},
1530280304Sjkim      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1531280304Sjkim       0xfecf4d5190b0fc61},
1532280304Sjkim      {1, 0, 0, 0}},
1533280304Sjkim     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1534280304Sjkim       0x1eddbae2c802e41a},
1535280304Sjkim      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1536280304Sjkim       0x43104d86560ebcfc},
1537280304Sjkim      {1, 0, 0, 0}},
1538280304Sjkim     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1539280304Sjkim       0xb48e26b484f7a21c},
1540280304Sjkim      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1541280304Sjkim       0xfac015404d4d3dab},
1542280304Sjkim      {1, 0, 0, 0}}},
1543280304Sjkim    {{{0, 0, 0, 0},
1544280304Sjkim      {0, 0, 0, 0},
1545280304Sjkim      {0, 0, 0, 0}},
1546280304Sjkim     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1547280304Sjkim       0x7fe36b40af22af89},
1548280304Sjkim      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1549280304Sjkim       0xe697d45825b63624},
1550280304Sjkim      {1, 0, 0, 0}},
1551280304Sjkim     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1552280304Sjkim       0x4a5b506612a677a6},
1553280304Sjkim      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1554280304Sjkim       0xeb13461ceac089f1},
1555280304Sjkim      {1, 0, 0, 0}},
1556280304Sjkim     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1557280304Sjkim       0x0781b8291c6a220a},
1558280304Sjkim      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1559280304Sjkim       0x690cde8df0151593},
1560280304Sjkim      {1, 0, 0, 0}},
1561280304Sjkim     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1562280304Sjkim       0x8a535f566ec73617},
1563280304Sjkim      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1564280304Sjkim       0x0455c08468b08bd7},
1565280304Sjkim      {1, 0, 0, 0}},
1566280304Sjkim     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1567280304Sjkim       0x06bada7ab77f8276},
1568280304Sjkim      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1569280304Sjkim       0x5b476dfd0e6cb18a},
1570280304Sjkim      {1, 0, 0, 0}},
1571280304Sjkim     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1572280304Sjkim       0x3e29864e8a2ec908},
1573280304Sjkim      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1574280304Sjkim       0x239b90ea3dc31e7e},
1575280304Sjkim      {1, 0, 0, 0}},
1576280304Sjkim     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1577280304Sjkim       0x820f4dd949f72ff7},
1578280304Sjkim      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1579280304Sjkim       0x140406ec783a05ec},
1580280304Sjkim      {1, 0, 0, 0}},
1581280304Sjkim     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1582280304Sjkim       0x68f6b8542783dfee},
1583280304Sjkim      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1584280304Sjkim       0xcbe1feba92e40ce6},
1585280304Sjkim      {1, 0, 0, 0}},
1586280304Sjkim     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1587280304Sjkim       0xd0b2f94d2f420109},
1588280304Sjkim      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1589280304Sjkim       0x971459828b0719e5},
1590280304Sjkim      {1, 0, 0, 0}},
1591280304Sjkim     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1592280304Sjkim       0x961610004a866aba},
1593280304Sjkim      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1594280304Sjkim       0x7acb9fadcee75e44},
1595280304Sjkim      {1, 0, 0, 0}},
1596280304Sjkim     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1597280304Sjkim       0x24eb9acca333bf5b},
1598280304Sjkim      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1599280304Sjkim       0x69f891c5acd079cc},
1600280304Sjkim      {1, 0, 0, 0}},
1601280304Sjkim     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1602280304Sjkim       0xe51f547c5972a107},
1603280304Sjkim      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1604280304Sjkim       0x1c309a2b25bb1387},
1605280304Sjkim      {1, 0, 0, 0}},
1606280304Sjkim     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1607280304Sjkim       0x20b87b8aa2c4e503},
1608280304Sjkim      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1609280304Sjkim       0xf5c6fa49919776be},
1610280304Sjkim      {1, 0, 0, 0}},
1611280304Sjkim     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1612280304Sjkim       0x1ed7d1b9332010b9},
1613280304Sjkim      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1614280304Sjkim       0x3a2b03f03217257a},
1615280304Sjkim      {1, 0, 0, 0}},
1616280304Sjkim     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1617280304Sjkim       0x15fee545c78dd9f6},
1618280304Sjkim      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1619280304Sjkim       0x4ab5b6b2b8753f81},
1620280304Sjkim      {1, 0, 0, 0}}}
1621280304Sjkim};
1622238384Sjkim
1623280304Sjkim/*
1624280304Sjkim * select_point selects the |idx|th point from a precomputation table and
1625280304Sjkim * copies it to out.
1626280304Sjkim */
1627280304Sjkimstatic void select_point(const u64 idx, unsigned int size,
1628280304Sjkim                         const smallfelem pre_comp[16][3], smallfelem out[3])
1629280304Sjkim{
1630280304Sjkim    unsigned i, j;
1631280304Sjkim    u64 *outlimbs = &out[0][0];
1632280304Sjkim    memset(outlimbs, 0, 3 * sizeof(smallfelem));
1633238384Sjkim
1634280304Sjkim    for (i = 0; i < size; i++) {
1635280304Sjkim        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1636280304Sjkim        u64 mask = i ^ idx;
1637280304Sjkim        mask |= mask >> 4;
1638280304Sjkim        mask |= mask >> 2;
1639280304Sjkim        mask |= mask >> 1;
1640280304Sjkim        mask &= 1;
1641280304Sjkim        mask--;
1642280304Sjkim        for (j = 0; j < NLIMBS * 3; j++)
1643280304Sjkim            outlimbs[j] |= inlimbs[j] & mask;
1644280304Sjkim    }
1645280304Sjkim}
1646238384Sjkim
1647238384Sjkim/* get_bit returns the |i|th bit in |in| */
1648238384Sjkimstatic char get_bit(const felem_bytearray in, int i)
1649280304Sjkim{
1650280304Sjkim    if ((i < 0) || (i >= 256))
1651280304Sjkim        return 0;
1652280304Sjkim    return (in[i >> 3] >> (i & 7)) & 1;
1653280304Sjkim}
1654238384Sjkim
1655280304Sjkim/*
1656280304Sjkim * Interleaved point multiplication using precomputed point multiples: The
1657280304Sjkim * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1658280304Sjkim * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1659280304Sjkim * generator, using certain (large) precomputed multiples in g_pre_comp.
1660280304Sjkim * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1661280304Sjkim */
1662238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out,
1663280304Sjkim                      const felem_bytearray scalars[],
1664280304Sjkim                      const unsigned num_points, const u8 *g_scalar,
1665280304Sjkim                      const int mixed, const smallfelem pre_comp[][17][3],
1666280304Sjkim                      const smallfelem g_pre_comp[2][16][3])
1667280304Sjkim{
1668280304Sjkim    int i, skip;
1669280304Sjkim    unsigned num, gen_mul = (g_scalar != NULL);
1670280304Sjkim    felem nq[3], ftmp;
1671280304Sjkim    smallfelem tmp[3];
1672280304Sjkim    u64 bits;
1673280304Sjkim    u8 sign, digit;
1674238384Sjkim
1675280304Sjkim    /* set nq to the point at infinity */
1676280304Sjkim    memset(nq, 0, 3 * sizeof(felem));
1677238384Sjkim
1678280304Sjkim    /*
1679280304Sjkim     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1680280304Sjkim     * of the generator (two in each of the last 32 rounds) and additions of
1681280304Sjkim     * other points multiples (every 5th round).
1682280304Sjkim     */
1683280304Sjkim    skip = 1;                   /* save two point operations in the first
1684280304Sjkim                                 * round */
1685280304Sjkim    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1686280304Sjkim        /* double */
1687280304Sjkim        if (!skip)
1688280304Sjkim            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1689238384Sjkim
1690280304Sjkim        /* add multiples of the generator */
1691280304Sjkim        if (gen_mul && (i <= 31)) {
1692280304Sjkim            /* first, look 32 bits upwards */
1693280304Sjkim            bits = get_bit(g_scalar, i + 224) << 3;
1694280304Sjkim            bits |= get_bit(g_scalar, i + 160) << 2;
1695280304Sjkim            bits |= get_bit(g_scalar, i + 96) << 1;
1696280304Sjkim            bits |= get_bit(g_scalar, i + 32);
1697280304Sjkim            /* select the point to add, in constant time */
1698280304Sjkim            select_point(bits, 16, g_pre_comp[1], tmp);
1699238384Sjkim
1700280304Sjkim            if (!skip) {
1701280304Sjkim                /* Arg 1 below is for "mixed" */
1702280304Sjkim                point_add(nq[0], nq[1], nq[2],
1703280304Sjkim                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1704280304Sjkim            } else {
1705280304Sjkim                smallfelem_expand(nq[0], tmp[0]);
1706280304Sjkim                smallfelem_expand(nq[1], tmp[1]);
1707280304Sjkim                smallfelem_expand(nq[2], tmp[2]);
1708280304Sjkim                skip = 0;
1709280304Sjkim            }
1710238384Sjkim
1711280304Sjkim            /* second, look at the current position */
1712280304Sjkim            bits = get_bit(g_scalar, i + 192) << 3;
1713280304Sjkim            bits |= get_bit(g_scalar, i + 128) << 2;
1714280304Sjkim            bits |= get_bit(g_scalar, i + 64) << 1;
1715280304Sjkim            bits |= get_bit(g_scalar, i);
1716280304Sjkim            /* select the point to add, in constant time */
1717280304Sjkim            select_point(bits, 16, g_pre_comp[0], tmp);
1718280304Sjkim            /* Arg 1 below is for "mixed" */
1719280304Sjkim            point_add(nq[0], nq[1], nq[2],
1720280304Sjkim                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1721280304Sjkim        }
1722238384Sjkim
1723280304Sjkim        /* do other additions every 5 doublings */
1724280304Sjkim        if (num_points && (i % 5 == 0)) {
1725280304Sjkim            /* loop over all scalars */
1726280304Sjkim            for (num = 0; num < num_points; ++num) {
1727280304Sjkim                bits = get_bit(scalars[num], i + 4) << 5;
1728280304Sjkim                bits |= get_bit(scalars[num], i + 3) << 4;
1729280304Sjkim                bits |= get_bit(scalars[num], i + 2) << 3;
1730280304Sjkim                bits |= get_bit(scalars[num], i + 1) << 2;
1731280304Sjkim                bits |= get_bit(scalars[num], i) << 1;
1732280304Sjkim                bits |= get_bit(scalars[num], i - 1);
1733280304Sjkim                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1734238384Sjkim
1735280304Sjkim                /*
1736280304Sjkim                 * select the point to add or subtract, in constant time
1737280304Sjkim                 */
1738280304Sjkim                select_point(digit, 17, pre_comp[num], tmp);
1739280304Sjkim                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1740280304Sjkim                                               * point */
1741280304Sjkim                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1742280304Sjkim                felem_contract(tmp[1], ftmp);
1743238384Sjkim
1744280304Sjkim                if (!skip) {
1745280304Sjkim                    point_add(nq[0], nq[1], nq[2],
1746280304Sjkim                              nq[0], nq[1], nq[2],
1747280304Sjkim                              mixed, tmp[0], tmp[1], tmp[2]);
1748280304Sjkim                } else {
1749280304Sjkim                    smallfelem_expand(nq[0], tmp[0]);
1750280304Sjkim                    smallfelem_expand(nq[1], tmp[1]);
1751280304Sjkim                    smallfelem_expand(nq[2], tmp[2]);
1752280304Sjkim                    skip = 0;
1753280304Sjkim                }
1754280304Sjkim            }
1755280304Sjkim        }
1756280304Sjkim    }
1757280304Sjkim    felem_assign(x_out, nq[0]);
1758280304Sjkim    felem_assign(y_out, nq[1]);
1759280304Sjkim    felem_assign(z_out, nq[2]);
1760280304Sjkim}
1761238384Sjkim
1762238384Sjkim/* Precomputation for the group generator. */
1763238384Sjkimtypedef struct {
1764280304Sjkim    smallfelem g_pre_comp[2][16][3];
1765280304Sjkim    int references;
1766238384Sjkim} NISTP256_PRE_COMP;
1767238384Sjkim
1768238384Sjkimconst EC_METHOD *EC_GFp_nistp256_method(void)
1769280304Sjkim{
1770280304Sjkim    static const EC_METHOD ret = {
1771280304Sjkim        EC_FLAGS_DEFAULT_OCT,
1772280304Sjkim        NID_X9_62_prime_field,
1773280304Sjkim        ec_GFp_nistp256_group_init,
1774280304Sjkim        ec_GFp_simple_group_finish,
1775280304Sjkim        ec_GFp_simple_group_clear_finish,
1776280304Sjkim        ec_GFp_nist_group_copy,
1777280304Sjkim        ec_GFp_nistp256_group_set_curve,
1778280304Sjkim        ec_GFp_simple_group_get_curve,
1779280304Sjkim        ec_GFp_simple_group_get_degree,
1780280304Sjkim        ec_GFp_simple_group_check_discriminant,
1781280304Sjkim        ec_GFp_simple_point_init,
1782280304Sjkim        ec_GFp_simple_point_finish,
1783280304Sjkim        ec_GFp_simple_point_clear_finish,
1784280304Sjkim        ec_GFp_simple_point_copy,
1785280304Sjkim        ec_GFp_simple_point_set_to_infinity,
1786280304Sjkim        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1787280304Sjkim        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1788280304Sjkim        ec_GFp_simple_point_set_affine_coordinates,
1789280304Sjkim        ec_GFp_nistp256_point_get_affine_coordinates,
1790280304Sjkim        0 /* point_set_compressed_coordinates */ ,
1791280304Sjkim        0 /* point2oct */ ,
1792280304Sjkim        0 /* oct2point */ ,
1793280304Sjkim        ec_GFp_simple_add,
1794280304Sjkim        ec_GFp_simple_dbl,
1795280304Sjkim        ec_GFp_simple_invert,
1796280304Sjkim        ec_GFp_simple_is_at_infinity,
1797280304Sjkim        ec_GFp_simple_is_on_curve,
1798280304Sjkim        ec_GFp_simple_cmp,
1799280304Sjkim        ec_GFp_simple_make_affine,
1800280304Sjkim        ec_GFp_simple_points_make_affine,
1801280304Sjkim        ec_GFp_nistp256_points_mul,
1802280304Sjkim        ec_GFp_nistp256_precompute_mult,
1803280304Sjkim        ec_GFp_nistp256_have_precompute_mult,
1804280304Sjkim        ec_GFp_nist_field_mul,
1805280304Sjkim        ec_GFp_nist_field_sqr,
1806280304Sjkim        0 /* field_div */ ,
1807280304Sjkim        0 /* field_encode */ ,
1808280304Sjkim        0 /* field_decode */ ,
1809280304Sjkim        0                       /* field_set_to_one */
1810280304Sjkim    };
1811238384Sjkim
1812280304Sjkim    return &ret;
1813280304Sjkim}
1814238384Sjkim
1815238384Sjkim/******************************************************************************/
1816280304Sjkim/*
1817280304Sjkim * FUNCTIONS TO MANAGE PRECOMPUTATION
1818238384Sjkim */
1819238384Sjkim
1820238384Sjkimstatic NISTP256_PRE_COMP *nistp256_pre_comp_new()
1821280304Sjkim{
1822280304Sjkim    NISTP256_PRE_COMP *ret = NULL;
1823280304Sjkim    ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
1824280304Sjkim    if (!ret) {
1825280304Sjkim        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1826280304Sjkim        return ret;
1827280304Sjkim    }
1828280304Sjkim    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1829280304Sjkim    ret->references = 1;
1830280304Sjkim    return ret;
1831280304Sjkim}
1832238384Sjkim
1833238384Sjkimstatic void *nistp256_pre_comp_dup(void *src_)
1834280304Sjkim{
1835280304Sjkim    NISTP256_PRE_COMP *src = src_;
1836238384Sjkim
1837280304Sjkim    /* no need to actually copy, these objects never change! */
1838280304Sjkim    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1839238384Sjkim
1840280304Sjkim    return src_;
1841280304Sjkim}
1842238384Sjkim
1843238384Sjkimstatic void nistp256_pre_comp_free(void *pre_)
1844280304Sjkim{
1845280304Sjkim    int i;
1846280304Sjkim    NISTP256_PRE_COMP *pre = pre_;
1847238384Sjkim
1848280304Sjkim    if (!pre)
1849280304Sjkim        return;
1850238384Sjkim
1851280304Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1852280304Sjkim    if (i > 0)
1853280304Sjkim        return;
1854238384Sjkim
1855280304Sjkim    OPENSSL_free(pre);
1856280304Sjkim}
1857238384Sjkim
1858238384Sjkimstatic void nistp256_pre_comp_clear_free(void *pre_)
1859280304Sjkim{
1860280304Sjkim    int i;
1861280304Sjkim    NISTP256_PRE_COMP *pre = pre_;
1862238384Sjkim
1863280304Sjkim    if (!pre)
1864280304Sjkim        return;
1865238384Sjkim
1866280304Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1867280304Sjkim    if (i > 0)
1868280304Sjkim        return;
1869238384Sjkim
1870280304Sjkim    OPENSSL_cleanse(pre, sizeof *pre);
1871280304Sjkim    OPENSSL_free(pre);
1872280304Sjkim}
1873238384Sjkim
1874238384Sjkim/******************************************************************************/
1875280304Sjkim/*
1876280304Sjkim * OPENSSL EC_METHOD FUNCTIONS
1877238384Sjkim */
1878238384Sjkim
1879238384Sjkimint ec_GFp_nistp256_group_init(EC_GROUP *group)
1880280304Sjkim{
1881280304Sjkim    int ret;
1882280304Sjkim    ret = ec_GFp_simple_group_init(group);
1883280304Sjkim    group->a_is_minus3 = 1;
1884280304Sjkim    return ret;
1885280304Sjkim}
1886238384Sjkim
1887238384Sjkimint ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1888280304Sjkim                                    const BIGNUM *a, const BIGNUM *b,
1889280304Sjkim                                    BN_CTX *ctx)
1890280304Sjkim{
1891280304Sjkim    int ret = 0;
1892280304Sjkim    BN_CTX *new_ctx = NULL;
1893280304Sjkim    BIGNUM *curve_p, *curve_a, *curve_b;
1894238384Sjkim
1895280304Sjkim    if (ctx == NULL)
1896280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1897280304Sjkim            return 0;
1898280304Sjkim    BN_CTX_start(ctx);
1899280304Sjkim    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1900280304Sjkim        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1901280304Sjkim        ((curve_b = BN_CTX_get(ctx)) == NULL))
1902280304Sjkim        goto err;
1903280304Sjkim    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1904280304Sjkim    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1905280304Sjkim    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1906280304Sjkim    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1907280304Sjkim        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1908280304Sjkim              EC_R_WRONG_CURVE_PARAMETERS);
1909280304Sjkim        goto err;
1910280304Sjkim    }
1911280304Sjkim    group->field_mod_func = BN_nist_mod_256;
1912280304Sjkim    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1913280304Sjkim err:
1914280304Sjkim    BN_CTX_end(ctx);
1915280304Sjkim    if (new_ctx != NULL)
1916280304Sjkim        BN_CTX_free(new_ctx);
1917280304Sjkim    return ret;
1918280304Sjkim}
1919238384Sjkim
1920280304Sjkim/*
1921280304Sjkim * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1922280304Sjkim * (X/Z^2, Y/Z^3)
1923280304Sjkim */
1924238384Sjkimint ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1925280304Sjkim                                                 const EC_POINT *point,
1926280304Sjkim                                                 BIGNUM *x, BIGNUM *y,
1927280304Sjkim                                                 BN_CTX *ctx)
1928280304Sjkim{
1929280304Sjkim    felem z1, z2, x_in, y_in;
1930280304Sjkim    smallfelem x_out, y_out;
1931280304Sjkim    longfelem tmp;
1932238384Sjkim
1933280304Sjkim    if (EC_POINT_is_at_infinity(group, point)) {
1934280304Sjkim        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1935280304Sjkim              EC_R_POINT_AT_INFINITY);
1936280304Sjkim        return 0;
1937280304Sjkim    }
1938280304Sjkim    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1939280304Sjkim        (!BN_to_felem(z1, &point->Z)))
1940280304Sjkim        return 0;
1941280304Sjkim    felem_inv(z2, z1);
1942280304Sjkim    felem_square(tmp, z2);
1943280304Sjkim    felem_reduce(z1, tmp);
1944280304Sjkim    felem_mul(tmp, x_in, z1);
1945280304Sjkim    felem_reduce(x_in, tmp);
1946280304Sjkim    felem_contract(x_out, x_in);
1947280304Sjkim    if (x != NULL) {
1948280304Sjkim        if (!smallfelem_to_BN(x, x_out)) {
1949280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1950280304Sjkim                  ERR_R_BN_LIB);
1951280304Sjkim            return 0;
1952280304Sjkim        }
1953280304Sjkim    }
1954280304Sjkim    felem_mul(tmp, z1, z2);
1955280304Sjkim    felem_reduce(z1, tmp);
1956280304Sjkim    felem_mul(tmp, y_in, z1);
1957280304Sjkim    felem_reduce(y_in, tmp);
1958280304Sjkim    felem_contract(y_out, y_in);
1959280304Sjkim    if (y != NULL) {
1960280304Sjkim        if (!smallfelem_to_BN(y, y_out)) {
1961280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1962280304Sjkim                  ERR_R_BN_LIB);
1963280304Sjkim            return 0;
1964280304Sjkim        }
1965280304Sjkim    }
1966280304Sjkim    return 1;
1967280304Sjkim}
1968238384Sjkim
1969280304Sjkim/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1970280304Sjkimstatic void make_points_affine(size_t num, smallfelem points[][3],
1971280304Sjkim                               smallfelem tmp_smallfelems[])
1972280304Sjkim{
1973280304Sjkim    /*
1974280304Sjkim     * Runs in constant time, unless an input is the point at infinity (which
1975280304Sjkim     * normally shouldn't happen).
1976280304Sjkim     */
1977280304Sjkim    ec_GFp_nistp_points_make_affine_internal(num,
1978280304Sjkim                                             points,
1979280304Sjkim                                             sizeof(smallfelem),
1980280304Sjkim                                             tmp_smallfelems,
1981280304Sjkim                                             (void (*)(void *))smallfelem_one,
1982280304Sjkim                                             (int (*)(const void *))
1983280304Sjkim                                             smallfelem_is_zero_int,
1984280304Sjkim                                             (void (*)(void *, const void *))
1985280304Sjkim                                             smallfelem_assign,
1986280304Sjkim                                             (void (*)(void *, const void *))
1987280304Sjkim                                             smallfelem_square_contract,
1988280304Sjkim                                             (void (*)
1989280304Sjkim                                              (void *, const void *,
1990280304Sjkim                                               const void *))
1991280304Sjkim                                             smallfelem_mul_contract,
1992280304Sjkim                                             (void (*)(void *, const void *))
1993280304Sjkim                                             smallfelem_inv_contract,
1994280304Sjkim                                             /* nothing to contract */
1995280304Sjkim                                             (void (*)(void *, const void *))
1996280304Sjkim                                             smallfelem_assign);
1997280304Sjkim}
1998238384Sjkim
1999280304Sjkim/*
2000280304Sjkim * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2001280304Sjkim * values Result is stored in r (r can equal one of the inputs).
2002280304Sjkim */
2003238384Sjkimint ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2004280304Sjkim                               const BIGNUM *scalar, size_t num,
2005280304Sjkim                               const EC_POINT *points[],
2006280304Sjkim                               const BIGNUM *scalars[], BN_CTX *ctx)
2007280304Sjkim{
2008280304Sjkim    int ret = 0;
2009280304Sjkim    int j;
2010280304Sjkim    int mixed = 0;
2011280304Sjkim    BN_CTX *new_ctx = NULL;
2012280304Sjkim    BIGNUM *x, *y, *z, *tmp_scalar;
2013280304Sjkim    felem_bytearray g_secret;
2014280304Sjkim    felem_bytearray *secrets = NULL;
2015280304Sjkim    smallfelem(*pre_comp)[17][3] = NULL;
2016280304Sjkim    smallfelem *tmp_smallfelems = NULL;
2017280304Sjkim    felem_bytearray tmp;
2018280304Sjkim    unsigned i, num_bytes;
2019280304Sjkim    int have_pre_comp = 0;
2020280304Sjkim    size_t num_points = num;
2021280304Sjkim    smallfelem x_in, y_in, z_in;
2022280304Sjkim    felem x_out, y_out, z_out;
2023280304Sjkim    NISTP256_PRE_COMP *pre = NULL;
2024280304Sjkim    const smallfelem(*g_pre_comp)[16][3] = NULL;
2025280304Sjkim    EC_POINT *generator = NULL;
2026280304Sjkim    const EC_POINT *p = NULL;
2027280304Sjkim    const BIGNUM *p_scalar = NULL;
2028238384Sjkim
2029280304Sjkim    if (ctx == NULL)
2030280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2031280304Sjkim            return 0;
2032280304Sjkim    BN_CTX_start(ctx);
2033280304Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) ||
2034280304Sjkim        ((y = BN_CTX_get(ctx)) == NULL) ||
2035280304Sjkim        ((z = BN_CTX_get(ctx)) == NULL) ||
2036280304Sjkim        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2037280304Sjkim        goto err;
2038238384Sjkim
2039280304Sjkim    if (scalar != NULL) {
2040280304Sjkim        pre = EC_EX_DATA_get_data(group->extra_data,
2041280304Sjkim                                  nistp256_pre_comp_dup,
2042280304Sjkim                                  nistp256_pre_comp_free,
2043280304Sjkim                                  nistp256_pre_comp_clear_free);
2044280304Sjkim        if (pre)
2045280304Sjkim            /* we have precomputation, try to use it */
2046280304Sjkim            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2047280304Sjkim        else
2048280304Sjkim            /* try to use the standard precomputation */
2049280304Sjkim            g_pre_comp = &gmul[0];
2050280304Sjkim        generator = EC_POINT_new(group);
2051280304Sjkim        if (generator == NULL)
2052280304Sjkim            goto err;
2053280304Sjkim        /* get the generator from precomputation */
2054280304Sjkim        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2055280304Sjkim            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2056280304Sjkim            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2057280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2058280304Sjkim            goto err;
2059280304Sjkim        }
2060280304Sjkim        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2061280304Sjkim                                                      generator, x, y, z,
2062280304Sjkim                                                      ctx))
2063280304Sjkim            goto err;
2064280304Sjkim        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2065280304Sjkim            /* precomputation matches generator */
2066280304Sjkim            have_pre_comp = 1;
2067280304Sjkim        else
2068280304Sjkim            /*
2069280304Sjkim             * we don't have valid precomputation: treat the generator as a
2070280304Sjkim             * random point
2071280304Sjkim             */
2072280304Sjkim            num_points++;
2073280304Sjkim    }
2074280304Sjkim    if (num_points > 0) {
2075280304Sjkim        if (num_points >= 3) {
2076280304Sjkim            /*
2077280304Sjkim             * unless we precompute multiples for just one or two points,
2078280304Sjkim             * converting those into affine form is time well spent
2079280304Sjkim             */
2080280304Sjkim            mixed = 1;
2081280304Sjkim        }
2082280304Sjkim        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2083280304Sjkim        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2084280304Sjkim        if (mixed)
2085280304Sjkim            tmp_smallfelems =
2086280304Sjkim                OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2087280304Sjkim        if ((secrets == NULL) || (pre_comp == NULL)
2088280304Sjkim            || (mixed && (tmp_smallfelems == NULL))) {
2089280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2090280304Sjkim            goto err;
2091280304Sjkim        }
2092238384Sjkim
2093280304Sjkim        /*
2094280304Sjkim         * we treat NULL scalars as 0, and NULL points as points at infinity,
2095280304Sjkim         * i.e., they contribute nothing to the linear combination
2096280304Sjkim         */
2097280304Sjkim        memset(secrets, 0, num_points * sizeof(felem_bytearray));
2098280304Sjkim        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2099280304Sjkim        for (i = 0; i < num_points; ++i) {
2100280304Sjkim            if (i == num)
2101280304Sjkim                /*
2102280304Sjkim                 * we didn't have a valid precomputation, so we pick the
2103280304Sjkim                 * generator
2104280304Sjkim                 */
2105280304Sjkim            {
2106280304Sjkim                p = EC_GROUP_get0_generator(group);
2107280304Sjkim                p_scalar = scalar;
2108280304Sjkim            } else
2109280304Sjkim                /* the i^th point */
2110280304Sjkim            {
2111280304Sjkim                p = points[i];
2112280304Sjkim                p_scalar = scalars[i];
2113280304Sjkim            }
2114280304Sjkim            if ((p_scalar != NULL) && (p != NULL)) {
2115280304Sjkim                /* reduce scalar to 0 <= scalar < 2^256 */
2116280304Sjkim                if ((BN_num_bits(p_scalar) > 256)
2117280304Sjkim                    || (BN_is_negative(p_scalar))) {
2118280304Sjkim                    /*
2119280304Sjkim                     * this is an unusual input, and we don't guarantee
2120280304Sjkim                     * constant-timeness
2121280304Sjkim                     */
2122280304Sjkim                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2123280304Sjkim                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2124280304Sjkim                        goto err;
2125280304Sjkim                    }
2126280304Sjkim                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
2127280304Sjkim                } else
2128280304Sjkim                    num_bytes = BN_bn2bin(p_scalar, tmp);
2129280304Sjkim                flip_endian(secrets[i], tmp, num_bytes);
2130280304Sjkim                /* precompute multiples */
2131280304Sjkim                if ((!BN_to_felem(x_out, &p->X)) ||
2132280304Sjkim                    (!BN_to_felem(y_out, &p->Y)) ||
2133280304Sjkim                    (!BN_to_felem(z_out, &p->Z)))
2134280304Sjkim                    goto err;
2135280304Sjkim                felem_shrink(pre_comp[i][1][0], x_out);
2136280304Sjkim                felem_shrink(pre_comp[i][1][1], y_out);
2137280304Sjkim                felem_shrink(pre_comp[i][1][2], z_out);
2138280304Sjkim                for (j = 2; j <= 16; ++j) {
2139280304Sjkim                    if (j & 1) {
2140280304Sjkim                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2141280304Sjkim                                        pre_comp[i][j][2], pre_comp[i][1][0],
2142280304Sjkim                                        pre_comp[i][1][1], pre_comp[i][1][2],
2143280304Sjkim                                        pre_comp[i][j - 1][0],
2144280304Sjkim                                        pre_comp[i][j - 1][1],
2145280304Sjkim                                        pre_comp[i][j - 1][2]);
2146280304Sjkim                    } else {
2147280304Sjkim                        point_double_small(pre_comp[i][j][0],
2148280304Sjkim                                           pre_comp[i][j][1],
2149280304Sjkim                                           pre_comp[i][j][2],
2150280304Sjkim                                           pre_comp[i][j / 2][0],
2151280304Sjkim                                           pre_comp[i][j / 2][1],
2152280304Sjkim                                           pre_comp[i][j / 2][2]);
2153280304Sjkim                    }
2154280304Sjkim                }
2155280304Sjkim            }
2156280304Sjkim        }
2157280304Sjkim        if (mixed)
2158280304Sjkim            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2159280304Sjkim    }
2160238384Sjkim
2161280304Sjkim    /* the scalar for the generator */
2162280304Sjkim    if ((scalar != NULL) && (have_pre_comp)) {
2163280304Sjkim        memset(g_secret, 0, sizeof(g_secret));
2164280304Sjkim        /* reduce scalar to 0 <= scalar < 2^256 */
2165280304Sjkim        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2166280304Sjkim            /*
2167280304Sjkim             * this is an unusual input, and we don't guarantee
2168280304Sjkim             * constant-timeness
2169280304Sjkim             */
2170280304Sjkim            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2171280304Sjkim                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2172280304Sjkim                goto err;
2173280304Sjkim            }
2174280304Sjkim            num_bytes = BN_bn2bin(tmp_scalar, tmp);
2175280304Sjkim        } else
2176280304Sjkim            num_bytes = BN_bn2bin(scalar, tmp);
2177280304Sjkim        flip_endian(g_secret, tmp, num_bytes);
2178280304Sjkim        /* do the multiplication with generator precomputation */
2179280304Sjkim        batch_mul(x_out, y_out, z_out,
2180280304Sjkim                  (const felem_bytearray(*))secrets, num_points,
2181280304Sjkim                  g_secret,
2182280304Sjkim                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2183280304Sjkim    } else
2184280304Sjkim        /* do the multiplication without generator precomputation */
2185280304Sjkim        batch_mul(x_out, y_out, z_out,
2186280304Sjkim                  (const felem_bytearray(*))secrets, num_points,
2187280304Sjkim                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2188280304Sjkim    /* reduce the output to its unique minimal representation */
2189280304Sjkim    felem_contract(x_in, x_out);
2190280304Sjkim    felem_contract(y_in, y_out);
2191280304Sjkim    felem_contract(z_in, z_out);
2192280304Sjkim    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2193280304Sjkim        (!smallfelem_to_BN(z, z_in))) {
2194280304Sjkim        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2195280304Sjkim        goto err;
2196280304Sjkim    }
2197280304Sjkim    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2198238384Sjkim
2199280304Sjkim err:
2200280304Sjkim    BN_CTX_end(ctx);
2201280304Sjkim    if (generator != NULL)
2202280304Sjkim        EC_POINT_free(generator);
2203280304Sjkim    if (new_ctx != NULL)
2204280304Sjkim        BN_CTX_free(new_ctx);
2205280304Sjkim    if (secrets != NULL)
2206280304Sjkim        OPENSSL_free(secrets);
2207280304Sjkim    if (pre_comp != NULL)
2208280304Sjkim        OPENSSL_free(pre_comp);
2209280304Sjkim    if (tmp_smallfelems != NULL)
2210280304Sjkim        OPENSSL_free(tmp_smallfelems);
2211280304Sjkim    return ret;
2212280304Sjkim}
2213238384Sjkim
2214238384Sjkimint ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2215280304Sjkim{
2216280304Sjkim    int ret = 0;
2217280304Sjkim    NISTP256_PRE_COMP *pre = NULL;
2218280304Sjkim    int i, j;
2219280304Sjkim    BN_CTX *new_ctx = NULL;
2220280304Sjkim    BIGNUM *x, *y;
2221280304Sjkim    EC_POINT *generator = NULL;
2222280304Sjkim    smallfelem tmp_smallfelems[32];
2223280304Sjkim    felem x_tmp, y_tmp, z_tmp;
2224238384Sjkim
2225280304Sjkim    /* throw away old precomputation */
2226280304Sjkim    EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2227280304Sjkim                         nistp256_pre_comp_free,
2228280304Sjkim                         nistp256_pre_comp_clear_free);
2229280304Sjkim    if (ctx == NULL)
2230280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2231280304Sjkim            return 0;
2232280304Sjkim    BN_CTX_start(ctx);
2233280304Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2234280304Sjkim        goto err;
2235280304Sjkim    /* get the generator */
2236280304Sjkim    if (group->generator == NULL)
2237280304Sjkim        goto err;
2238280304Sjkim    generator = EC_POINT_new(group);
2239280304Sjkim    if (generator == NULL)
2240280304Sjkim        goto err;
2241280304Sjkim    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2242280304Sjkim    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2243280304Sjkim    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2244280304Sjkim        goto err;
2245280304Sjkim    if ((pre = nistp256_pre_comp_new()) == NULL)
2246280304Sjkim        goto err;
2247280304Sjkim    /*
2248280304Sjkim     * if the generator is the standard one, use built-in precomputation
2249280304Sjkim     */
2250280304Sjkim    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2251280304Sjkim        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2252280304Sjkim        ret = 1;
2253280304Sjkim        goto err;
2254280304Sjkim    }
2255280304Sjkim    if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2256280304Sjkim        (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2257280304Sjkim        (!BN_to_felem(z_tmp, &group->generator->Z)))
2258280304Sjkim        goto err;
2259280304Sjkim    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2260280304Sjkim    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2261280304Sjkim    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2262280304Sjkim    /*
2263280304Sjkim     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2264280304Sjkim     * 2^160*G, 2^224*G for the second one
2265280304Sjkim     */
2266280304Sjkim    for (i = 1; i <= 8; i <<= 1) {
2267280304Sjkim        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2268280304Sjkim                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2269280304Sjkim                           pre->g_pre_comp[0][i][1],
2270280304Sjkim                           pre->g_pre_comp[0][i][2]);
2271280304Sjkim        for (j = 0; j < 31; ++j) {
2272280304Sjkim            point_double_small(pre->g_pre_comp[1][i][0],
2273280304Sjkim                               pre->g_pre_comp[1][i][1],
2274280304Sjkim                               pre->g_pre_comp[1][i][2],
2275280304Sjkim                               pre->g_pre_comp[1][i][0],
2276280304Sjkim                               pre->g_pre_comp[1][i][1],
2277280304Sjkim                               pre->g_pre_comp[1][i][2]);
2278280304Sjkim        }
2279280304Sjkim        if (i == 8)
2280280304Sjkim            break;
2281280304Sjkim        point_double_small(pre->g_pre_comp[0][2 * i][0],
2282280304Sjkim                           pre->g_pre_comp[0][2 * i][1],
2283280304Sjkim                           pre->g_pre_comp[0][2 * i][2],
2284280304Sjkim                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2285280304Sjkim                           pre->g_pre_comp[1][i][2]);
2286280304Sjkim        for (j = 0; j < 31; ++j) {
2287280304Sjkim            point_double_small(pre->g_pre_comp[0][2 * i][0],
2288280304Sjkim                               pre->g_pre_comp[0][2 * i][1],
2289280304Sjkim                               pre->g_pre_comp[0][2 * i][2],
2290280304Sjkim                               pre->g_pre_comp[0][2 * i][0],
2291280304Sjkim                               pre->g_pre_comp[0][2 * i][1],
2292280304Sjkim                               pre->g_pre_comp[0][2 * i][2]);
2293280304Sjkim        }
2294280304Sjkim    }
2295280304Sjkim    for (i = 0; i < 2; i++) {
2296280304Sjkim        /* g_pre_comp[i][0] is the point at infinity */
2297280304Sjkim        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2298280304Sjkim        /* the remaining multiples */
2299280304Sjkim        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2300280304Sjkim        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2301280304Sjkim                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2302280304Sjkim                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2303280304Sjkim                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2304280304Sjkim                        pre->g_pre_comp[i][2][2]);
2305280304Sjkim        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2306280304Sjkim        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2307280304Sjkim                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2308280304Sjkim                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2309280304Sjkim                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2310280304Sjkim                        pre->g_pre_comp[i][2][2]);
2311280304Sjkim        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2312280304Sjkim        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2313280304Sjkim                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2314280304Sjkim                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2315280304Sjkim                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2316280304Sjkim                        pre->g_pre_comp[i][4][2]);
2317280304Sjkim        /*
2318280304Sjkim         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2319280304Sjkim         */
2320280304Sjkim        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2321280304Sjkim                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2322280304Sjkim                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2323280304Sjkim                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2324280304Sjkim                        pre->g_pre_comp[i][2][2]);
2325280304Sjkim        for (j = 1; j < 8; ++j) {
2326280304Sjkim            /* odd multiples: add G resp. 2^32*G */
2327280304Sjkim            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2328280304Sjkim                            pre->g_pre_comp[i][2 * j + 1][1],
2329280304Sjkim                            pre->g_pre_comp[i][2 * j + 1][2],
2330280304Sjkim                            pre->g_pre_comp[i][2 * j][0],
2331280304Sjkim                            pre->g_pre_comp[i][2 * j][1],
2332280304Sjkim                            pre->g_pre_comp[i][2 * j][2],
2333280304Sjkim                            pre->g_pre_comp[i][1][0],
2334280304Sjkim                            pre->g_pre_comp[i][1][1],
2335280304Sjkim                            pre->g_pre_comp[i][1][2]);
2336280304Sjkim        }
2337280304Sjkim    }
2338280304Sjkim    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2339238384Sjkim
2340280304Sjkim    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2341280304Sjkim                             nistp256_pre_comp_free,
2342280304Sjkim                             nistp256_pre_comp_clear_free))
2343280304Sjkim        goto err;
2344280304Sjkim    ret = 1;
2345280304Sjkim    pre = NULL;
2346238384Sjkim err:
2347280304Sjkim    BN_CTX_end(ctx);
2348280304Sjkim    if (generator != NULL)
2349280304Sjkim        EC_POINT_free(generator);
2350280304Sjkim    if (new_ctx != NULL)
2351280304Sjkim        BN_CTX_free(new_ctx);
2352280304Sjkim    if (pre)
2353280304Sjkim        nistp256_pre_comp_free(pre);
2354280304Sjkim    return ret;
2355280304Sjkim}
2356238384Sjkim
2357238384Sjkimint ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2358280304Sjkim{
2359280304Sjkim    if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2360280304Sjkim                            nistp256_pre_comp_free,
2361280304Sjkim                            nistp256_pre_comp_clear_free)
2362280304Sjkim        != NULL)
2363280304Sjkim        return 1;
2364280304Sjkim    else
2365280304Sjkim        return 0;
2366280304Sjkim}
2367238384Sjkim#else
2368280304Sjkimstatic void *dummy = &dummy;
2369238384Sjkim#endif
2370