1238384Sjkim/* crypto/ec/ecp_nistp256.c */
2238384Sjkim/*
3238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project
4238384Sjkim */
5238384Sjkim/* Copyright 2011 Google Inc.
6238384Sjkim *
7238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License");
8238384Sjkim *
9238384Sjkim * you may not use this file except in compliance with the License.
10238384Sjkim * You may obtain a copy of the License at
11238384Sjkim *
12238384Sjkim *     http://www.apache.org/licenses/LICENSE-2.0
13238384Sjkim *
14238384Sjkim *  Unless required by applicable law or agreed to in writing, software
15238384Sjkim *  distributed under the License is distributed on an "AS IS" BASIS,
16238384Sjkim *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17238384Sjkim *  See the License for the specific language governing permissions and
18238384Sjkim *  limitations under the License.
19238384Sjkim */
20238384Sjkim
21238384Sjkim/*
22238384Sjkim * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
23238384Sjkim *
24238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same.
27238384Sjkim */
28238384Sjkim
29238384Sjkim#include <openssl/opensslconf.h>
30238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31238384Sjkim
32296341Sdelphij# ifndef OPENSSL_SYS_VMS
33296341Sdelphij#  include <stdint.h>
34296341Sdelphij# else
35296341Sdelphij#  include <inttypes.h>
36296341Sdelphij# endif
37238384Sjkim
38296341Sdelphij# include <string.h>
39296341Sdelphij# include <openssl/err.h>
40296341Sdelphij# include "ec_lcl.h"
41238384Sjkim
42296341Sdelphij# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43238384Sjkim  /* even with gcc, the typedef won't work for 32-bit platforms */
44296341Sdelphijtypedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45296341Sdelphij                                 * platforms */
46296341Sdelphijtypedef __int128_t int128_t;
47296341Sdelphij# else
48296341Sdelphij#  error "Need GCC 3.1 or later to define type uint128_t"
49296341Sdelphij# endif
50238384Sjkim
51238384Sjkimtypedef uint8_t u8;
52238384Sjkimtypedef uint32_t u32;
53238384Sjkimtypedef uint64_t u64;
54238384Sjkimtypedef int64_t s64;
55238384Sjkim
56296341Sdelphij/*
57296341Sdelphij * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
58296341Sdelphij * can serialise an element of this field into 32 bytes. We call this an
59296341Sdelphij * felem_bytearray.
60296341Sdelphij */
61238384Sjkim
62238384Sjkimtypedef u8 felem_bytearray[32];
63238384Sjkim
64296341Sdelphij/*
65296341Sdelphij * These are the parameters of P256, taken from FIPS 186-3, page 86. These
66296341Sdelphij * values are big-endian.
67296341Sdelphij */
68238384Sjkimstatic const felem_bytearray nistp256_curve_params[5] = {
69296341Sdelphij    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
70296341Sdelphij     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
71296341Sdelphij     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
72296341Sdelphij     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
73296341Sdelphij    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
74296341Sdelphij     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
75296341Sdelphij     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
76296341Sdelphij     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
77296341Sdelphij    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
78296341Sdelphij     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
79296341Sdelphij     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
80296341Sdelphij     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
81296341Sdelphij    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
82296341Sdelphij     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
83296341Sdelphij     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
84296341Sdelphij     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
85296341Sdelphij    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
86296341Sdelphij     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
87296341Sdelphij     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
88296341Sdelphij     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
89238384Sjkim};
90238384Sjkim
91296341Sdelphij/*-
92296341Sdelphij * The representation of field elements.
93238384Sjkim * ------------------------------------
94238384Sjkim *
95238384Sjkim * We represent field elements with either four 128-bit values, eight 128-bit
96238384Sjkim * values, or four 64-bit values. The field element represented is:
97238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
98238384Sjkim * or:
99238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
100238384Sjkim *
101238384Sjkim * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
102238384Sjkim * apart, but are 128-bits wide, the most significant bits of each limb overlap
103238384Sjkim * with the least significant bits of the next.
104238384Sjkim *
105238384Sjkim * A field element with four limbs is an 'felem'. One with eight limbs is a
106238384Sjkim * 'longfelem'
107238384Sjkim *
108238384Sjkim * A field element with four, 64-bit values is called a 'smallfelem'. Small
109238384Sjkim * values are used as intermediate values before multiplication.
110238384Sjkim */
111238384Sjkim
112296341Sdelphij# define NLIMBS 4
113238384Sjkim
114238384Sjkimtypedef uint128_t limb;
115238384Sjkimtypedef limb felem[NLIMBS];
116238384Sjkimtypedef limb longfelem[NLIMBS * 2];
117238384Sjkimtypedef u64 smallfelem[NLIMBS];
118238384Sjkim
119238384Sjkim/* This is the value of the prime as four 64-bit words, little-endian. */
120296341Sdelphijstatic const u64 kPrime[4] =
121296341Sdelphij    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
122238384Sjkimstatic const u64 bottom63bits = 0x7ffffffffffffffful;
123238384Sjkim
124296341Sdelphij/*
125296341Sdelphij * bin32_to_felem takes a little-endian byte array and converts it into felem
126296341Sdelphij * form. This assumes that the CPU is little-endian.
127296341Sdelphij */
128238384Sjkimstatic void bin32_to_felem(felem out, const u8 in[32])
129296341Sdelphij{
130296341Sdelphij    out[0] = *((u64 *)&in[0]);
131296341Sdelphij    out[1] = *((u64 *)&in[8]);
132296341Sdelphij    out[2] = *((u64 *)&in[16]);
133296341Sdelphij    out[3] = *((u64 *)&in[24]);
134296341Sdelphij}
135238384Sjkim
136296341Sdelphij/*
137296341Sdelphij * smallfelem_to_bin32 takes a smallfelem and serialises into a little
138296341Sdelphij * endian, 32 byte array. This assumes that the CPU is little-endian.
139296341Sdelphij */
140238384Sjkimstatic void smallfelem_to_bin32(u8 out[32], const smallfelem in)
141296341Sdelphij{
142296341Sdelphij    *((u64 *)&out[0]) = in[0];
143296341Sdelphij    *((u64 *)&out[8]) = in[1];
144296341Sdelphij    *((u64 *)&out[16]) = in[2];
145296341Sdelphij    *((u64 *)&out[24]) = in[3];
146296341Sdelphij}
147238384Sjkim
148238384Sjkim/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
149238384Sjkimstatic void flip_endian(u8 *out, const u8 *in, unsigned len)
150296341Sdelphij{
151296341Sdelphij    unsigned i;
152296341Sdelphij    for (i = 0; i < len; ++i)
153296341Sdelphij        out[i] = in[len - 1 - i];
154296341Sdelphij}
155238384Sjkim
156238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
157238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn)
158296341Sdelphij{
159296341Sdelphij    felem_bytearray b_in;
160296341Sdelphij    felem_bytearray b_out;
161296341Sdelphij    unsigned num_bytes;
162238384Sjkim
163296341Sdelphij    /* BN_bn2bin eats leading zeroes */
164296341Sdelphij    memset(b_out, 0, sizeof b_out);
165296341Sdelphij    num_bytes = BN_num_bytes(bn);
166296341Sdelphij    if (num_bytes > sizeof b_out) {
167296341Sdelphij        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
168296341Sdelphij        return 0;
169296341Sdelphij    }
170296341Sdelphij    if (BN_is_negative(bn)) {
171296341Sdelphij        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
172296341Sdelphij        return 0;
173296341Sdelphij    }
174296341Sdelphij    num_bytes = BN_bn2bin(bn, b_in);
175296341Sdelphij    flip_endian(b_out, b_in, num_bytes);
176296341Sdelphij    bin32_to_felem(out, b_out);
177296341Sdelphij    return 1;
178296341Sdelphij}
179238384Sjkim
180238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
181238384Sjkimstatic BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
182296341Sdelphij{
183296341Sdelphij    felem_bytearray b_in, b_out;
184296341Sdelphij    smallfelem_to_bin32(b_in, in);
185296341Sdelphij    flip_endian(b_out, b_in, sizeof b_out);
186296341Sdelphij    return BN_bin2bn(b_out, sizeof b_out, out);
187296341Sdelphij}
188238384Sjkim
189296341Sdelphij/*-
190296341Sdelphij * Field operations
191296341Sdelphij * ----------------
192296341Sdelphij */
193238384Sjkim
194238384Sjkimstatic void smallfelem_one(smallfelem out)
195296341Sdelphij{
196296341Sdelphij    out[0] = 1;
197296341Sdelphij    out[1] = 0;
198296341Sdelphij    out[2] = 0;
199296341Sdelphij    out[3] = 0;
200296341Sdelphij}
201238384Sjkim
202238384Sjkimstatic void smallfelem_assign(smallfelem out, const smallfelem in)
203296341Sdelphij{
204296341Sdelphij    out[0] = in[0];
205296341Sdelphij    out[1] = in[1];
206296341Sdelphij    out[2] = in[2];
207296341Sdelphij    out[3] = in[3];
208296341Sdelphij}
209238384Sjkim
210238384Sjkimstatic void felem_assign(felem out, const felem in)
211296341Sdelphij{
212296341Sdelphij    out[0] = in[0];
213296341Sdelphij    out[1] = in[1];
214296341Sdelphij    out[2] = in[2];
215296341Sdelphij    out[3] = in[3];
216296341Sdelphij}
217238384Sjkim
218238384Sjkim/* felem_sum sets out = out + in. */
219238384Sjkimstatic void felem_sum(felem out, const felem in)
220296341Sdelphij{
221296341Sdelphij    out[0] += in[0];
222296341Sdelphij    out[1] += in[1];
223296341Sdelphij    out[2] += in[2];
224296341Sdelphij    out[3] += in[3];
225296341Sdelphij}
226238384Sjkim
227238384Sjkim/* felem_small_sum sets out = out + in. */
228238384Sjkimstatic void felem_small_sum(felem out, const smallfelem in)
229296341Sdelphij{
230296341Sdelphij    out[0] += in[0];
231296341Sdelphij    out[1] += in[1];
232296341Sdelphij    out[2] += in[2];
233296341Sdelphij    out[3] += in[3];
234296341Sdelphij}
235238384Sjkim
236238384Sjkim/* felem_scalar sets out = out * scalar */
237238384Sjkimstatic void felem_scalar(felem out, const u64 scalar)
238296341Sdelphij{
239296341Sdelphij    out[0] *= scalar;
240296341Sdelphij    out[1] *= scalar;
241296341Sdelphij    out[2] *= scalar;
242296341Sdelphij    out[3] *= scalar;
243296341Sdelphij}
244238384Sjkim
245238384Sjkim/* longfelem_scalar sets out = out * scalar */
246238384Sjkimstatic void longfelem_scalar(longfelem out, const u64 scalar)
247296341Sdelphij{
248296341Sdelphij    out[0] *= scalar;
249296341Sdelphij    out[1] *= scalar;
250296341Sdelphij    out[2] *= scalar;
251296341Sdelphij    out[3] *= scalar;
252296341Sdelphij    out[4] *= scalar;
253296341Sdelphij    out[5] *= scalar;
254296341Sdelphij    out[6] *= scalar;
255296341Sdelphij    out[7] *= scalar;
256296341Sdelphij}
257238384Sjkim
258296341Sdelphij# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
259296341Sdelphij# define two105 (((limb)1) << 105)
260296341Sdelphij# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
261238384Sjkim
262238384Sjkim/* zero105 is 0 mod p */
263296341Sdelphijstatic const felem zero105 =
264296341Sdelphij    { two105m41m9, two105, two105m41p9, two105m41p9 };
265238384Sjkim
266296341Sdelphij/*-
267296341Sdelphij * smallfelem_neg sets |out| to |-small|
268238384Sjkim * On exit:
269238384Sjkim *   out[i] < out[i] + 2^105
270238384Sjkim */
271238384Sjkimstatic void smallfelem_neg(felem out, const smallfelem small)
272296341Sdelphij{
273296341Sdelphij    /* In order to prevent underflow, we subtract from 0 mod p. */
274296341Sdelphij    out[0] = zero105[0] - small[0];
275296341Sdelphij    out[1] = zero105[1] - small[1];
276296341Sdelphij    out[2] = zero105[2] - small[2];
277296341Sdelphij    out[3] = zero105[3] - small[3];
278296341Sdelphij}
279238384Sjkim
280296341Sdelphij/*-
281296341Sdelphij * felem_diff subtracts |in| from |out|
282238384Sjkim * On entry:
283238384Sjkim *   in[i] < 2^104
284238384Sjkim * On exit:
285238384Sjkim *   out[i] < out[i] + 2^105
286238384Sjkim */
287238384Sjkimstatic void felem_diff(felem out, const felem in)
288296341Sdelphij{
289296341Sdelphij    /*
290296341Sdelphij     * In order to prevent underflow, we add 0 mod p before subtracting.
291296341Sdelphij     */
292296341Sdelphij    out[0] += zero105[0];
293296341Sdelphij    out[1] += zero105[1];
294296341Sdelphij    out[2] += zero105[2];
295296341Sdelphij    out[3] += zero105[3];
296238384Sjkim
297296341Sdelphij    out[0] -= in[0];
298296341Sdelphij    out[1] -= in[1];
299296341Sdelphij    out[2] -= in[2];
300296341Sdelphij    out[3] -= in[3];
301296341Sdelphij}
302238384Sjkim
303296341Sdelphij# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
304296341Sdelphij# define two107 (((limb)1) << 107)
305296341Sdelphij# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
306238384Sjkim
307238384Sjkim/* zero107 is 0 mod p */
308296341Sdelphijstatic const felem zero107 =
309296341Sdelphij    { two107m43m11, two107, two107m43p11, two107m43p11 };
310238384Sjkim
311296341Sdelphij/*-
312296341Sdelphij * An alternative felem_diff for larger inputs |in|
313238384Sjkim * felem_diff_zero107 subtracts |in| from |out|
314238384Sjkim * On entry:
315238384Sjkim *   in[i] < 2^106
316238384Sjkim * On exit:
317238384Sjkim *   out[i] < out[i] + 2^107
318238384Sjkim */
319238384Sjkimstatic void felem_diff_zero107(felem out, const felem in)
320296341Sdelphij{
321296341Sdelphij    /*
322296341Sdelphij     * In order to prevent underflow, we add 0 mod p before subtracting.
323296341Sdelphij     */
324296341Sdelphij    out[0] += zero107[0];
325296341Sdelphij    out[1] += zero107[1];
326296341Sdelphij    out[2] += zero107[2];
327296341Sdelphij    out[3] += zero107[3];
328238384Sjkim
329296341Sdelphij    out[0] -= in[0];
330296341Sdelphij    out[1] -= in[1];
331296341Sdelphij    out[2] -= in[2];
332296341Sdelphij    out[3] -= in[3];
333296341Sdelphij}
334238384Sjkim
335296341Sdelphij/*-
336296341Sdelphij * longfelem_diff subtracts |in| from |out|
337238384Sjkim * On entry:
338238384Sjkim *   in[i] < 7*2^67
339238384Sjkim * On exit:
340238384Sjkim *   out[i] < out[i] + 2^70 + 2^40
341238384Sjkim */
342238384Sjkimstatic void longfelem_diff(longfelem out, const longfelem in)
343296341Sdelphij{
344296341Sdelphij    static const limb two70m8p6 =
345296341Sdelphij        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
346296341Sdelphij    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
347296341Sdelphij    static const limb two70 = (((limb) 1) << 70);
348296341Sdelphij    static const limb two70m40m38p6 =
349296341Sdelphij        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
350296341Sdelphij        (((limb) 1) << 6);
351296341Sdelphij    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
352238384Sjkim
353296341Sdelphij    /* add 0 mod p to avoid underflow */
354296341Sdelphij    out[0] += two70m8p6;
355296341Sdelphij    out[1] += two70p40;
356296341Sdelphij    out[2] += two70;
357296341Sdelphij    out[3] += two70m40m38p6;
358296341Sdelphij    out[4] += two70m6;
359296341Sdelphij    out[5] += two70m6;
360296341Sdelphij    out[6] += two70m6;
361296341Sdelphij    out[7] += two70m6;
362238384Sjkim
363296341Sdelphij    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
364296341Sdelphij    out[0] -= in[0];
365296341Sdelphij    out[1] -= in[1];
366296341Sdelphij    out[2] -= in[2];
367296341Sdelphij    out[3] -= in[3];
368296341Sdelphij    out[4] -= in[4];
369296341Sdelphij    out[5] -= in[5];
370296341Sdelphij    out[6] -= in[6];
371296341Sdelphij    out[7] -= in[7];
372296341Sdelphij}
373238384Sjkim
374296341Sdelphij# define two64m0 (((limb)1) << 64) - 1
375296341Sdelphij# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
376296341Sdelphij# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
377296341Sdelphij# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
378238384Sjkim
379238384Sjkim/* zero110 is 0 mod p */
380238384Sjkimstatic const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
381238384Sjkim
382296341Sdelphij/*-
383296341Sdelphij * felem_shrink converts an felem into a smallfelem. The result isn't quite
384238384Sjkim * minimal as the value may be greater than p.
385238384Sjkim *
386238384Sjkim * On entry:
387238384Sjkim *   in[i] < 2^109
388238384Sjkim * On exit:
389238384Sjkim *   out[i] < 2^64
390238384Sjkim */
391238384Sjkimstatic void felem_shrink(smallfelem out, const felem in)
392296341Sdelphij{
393296341Sdelphij    felem tmp;
394296341Sdelphij    u64 a, b, mask;
395296341Sdelphij    s64 high, low;
396296341Sdelphij    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
397238384Sjkim
398296341Sdelphij    /* Carry 2->3 */
399296341Sdelphij    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
400296341Sdelphij    /* tmp[3] < 2^110 */
401238384Sjkim
402296341Sdelphij    tmp[2] = zero110[2] + (u64)in[2];
403296341Sdelphij    tmp[0] = zero110[0] + in[0];
404296341Sdelphij    tmp[1] = zero110[1] + in[1];
405296341Sdelphij    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
406238384Sjkim
407296341Sdelphij    /*
408296341Sdelphij     * We perform two partial reductions where we eliminate the high-word of
409296341Sdelphij     * tmp[3]. We don't update the other words till the end.
410296341Sdelphij     */
411296341Sdelphij    a = tmp[3] >> 64;           /* a < 2^46 */
412296341Sdelphij    tmp[3] = (u64)tmp[3];
413296341Sdelphij    tmp[3] -= a;
414296341Sdelphij    tmp[3] += ((limb) a) << 32;
415296341Sdelphij    /* tmp[3] < 2^79 */
416238384Sjkim
417296341Sdelphij    b = a;
418296341Sdelphij    a = tmp[3] >> 64;           /* a < 2^15 */
419296341Sdelphij    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
420296341Sdelphij    tmp[3] = (u64)tmp[3];
421296341Sdelphij    tmp[3] -= a;
422296341Sdelphij    tmp[3] += ((limb) a) << 32;
423296341Sdelphij    /* tmp[3] < 2^64 + 2^47 */
424238384Sjkim
425296341Sdelphij    /*
426296341Sdelphij     * This adjusts the other two words to complete the two partial
427296341Sdelphij     * reductions.
428296341Sdelphij     */
429296341Sdelphij    tmp[0] += b;
430296341Sdelphij    tmp[1] -= (((limb) b) << 32);
431238384Sjkim
432296341Sdelphij    /*
433296341Sdelphij     * In order to make space in tmp[3] for the carry from 2 -> 3, we
434296341Sdelphij     * conditionally subtract kPrime if tmp[3] is large enough.
435296341Sdelphij     */
436296341Sdelphij    high = tmp[3] >> 64;
437296341Sdelphij    /* As tmp[3] < 2^65, high is either 1 or 0 */
438296341Sdelphij    high <<= 63;
439296341Sdelphij    high >>= 63;
440296341Sdelphij    /*-
441296341Sdelphij     * high is:
442296341Sdelphij     *   all ones   if the high word of tmp[3] is 1
443296341Sdelphij     *   all zeros  if the high word of tmp[3] if 0 */
444296341Sdelphij    low = tmp[3];
445296341Sdelphij    mask = low >> 63;
446296341Sdelphij    /*-
447296341Sdelphij     * mask is:
448296341Sdelphij     *   all ones   if the MSB of low is 1
449296341Sdelphij     *   all zeros  if the MSB of low if 0 */
450296341Sdelphij    low &= bottom63bits;
451296341Sdelphij    low -= kPrime3Test;
452296341Sdelphij    /* if low was greater than kPrime3Test then the MSB is zero */
453296341Sdelphij    low = ~low;
454296341Sdelphij    low >>= 63;
455296341Sdelphij    /*-
456296341Sdelphij     * low is:
457296341Sdelphij     *   all ones   if low was > kPrime3Test
458296341Sdelphij     *   all zeros  if low was <= kPrime3Test */
459296341Sdelphij    mask = (mask & low) | high;
460296341Sdelphij    tmp[0] -= mask & kPrime[0];
461296341Sdelphij    tmp[1] -= mask & kPrime[1];
462296341Sdelphij    /* kPrime[2] is zero, so omitted */
463296341Sdelphij    tmp[3] -= mask & kPrime[3];
464296341Sdelphij    /* tmp[3] < 2**64 - 2**32 + 1 */
465238384Sjkim
466296341Sdelphij    tmp[1] += ((u64)(tmp[0] >> 64));
467296341Sdelphij    tmp[0] = (u64)tmp[0];
468296341Sdelphij    tmp[2] += ((u64)(tmp[1] >> 64));
469296341Sdelphij    tmp[1] = (u64)tmp[1];
470296341Sdelphij    tmp[3] += ((u64)(tmp[2] >> 64));
471296341Sdelphij    tmp[2] = (u64)tmp[2];
472296341Sdelphij    /* tmp[i] < 2^64 */
473238384Sjkim
474296341Sdelphij    out[0] = tmp[0];
475296341Sdelphij    out[1] = tmp[1];
476296341Sdelphij    out[2] = tmp[2];
477296341Sdelphij    out[3] = tmp[3];
478296341Sdelphij}
479238384Sjkim
480238384Sjkim/* smallfelem_expand converts a smallfelem to an felem */
481238384Sjkimstatic void smallfelem_expand(felem out, const smallfelem in)
482296341Sdelphij{
483296341Sdelphij    out[0] = in[0];
484296341Sdelphij    out[1] = in[1];
485296341Sdelphij    out[2] = in[2];
486296341Sdelphij    out[3] = in[3];
487296341Sdelphij}
488238384Sjkim
489296341Sdelphij/*-
490296341Sdelphij * smallfelem_square sets |out| = |small|^2
491238384Sjkim * On entry:
492238384Sjkim *   small[i] < 2^64
493238384Sjkim * On exit:
494238384Sjkim *   out[i] < 7 * 2^64 < 2^67
495238384Sjkim */
496238384Sjkimstatic void smallfelem_square(longfelem out, const smallfelem small)
497296341Sdelphij{
498296341Sdelphij    limb a;
499296341Sdelphij    u64 high, low;
500238384Sjkim
501296341Sdelphij    a = ((uint128_t) small[0]) * small[0];
502296341Sdelphij    low = a;
503296341Sdelphij    high = a >> 64;
504296341Sdelphij    out[0] = low;
505296341Sdelphij    out[1] = high;
506238384Sjkim
507296341Sdelphij    a = ((uint128_t) small[0]) * small[1];
508296341Sdelphij    low = a;
509296341Sdelphij    high = a >> 64;
510296341Sdelphij    out[1] += low;
511296341Sdelphij    out[1] += low;
512296341Sdelphij    out[2] = high;
513238384Sjkim
514296341Sdelphij    a = ((uint128_t) small[0]) * small[2];
515296341Sdelphij    low = a;
516296341Sdelphij    high = a >> 64;
517296341Sdelphij    out[2] += low;
518296341Sdelphij    out[2] *= 2;
519296341Sdelphij    out[3] = high;
520238384Sjkim
521296341Sdelphij    a = ((uint128_t) small[0]) * small[3];
522296341Sdelphij    low = a;
523296341Sdelphij    high = a >> 64;
524296341Sdelphij    out[3] += low;
525296341Sdelphij    out[4] = high;
526238384Sjkim
527296341Sdelphij    a = ((uint128_t) small[1]) * small[2];
528296341Sdelphij    low = a;
529296341Sdelphij    high = a >> 64;
530296341Sdelphij    out[3] += low;
531296341Sdelphij    out[3] *= 2;
532296341Sdelphij    out[4] += high;
533238384Sjkim
534296341Sdelphij    a = ((uint128_t) small[1]) * small[1];
535296341Sdelphij    low = a;
536296341Sdelphij    high = a >> 64;
537296341Sdelphij    out[2] += low;
538296341Sdelphij    out[3] += high;
539238384Sjkim
540296341Sdelphij    a = ((uint128_t) small[1]) * small[3];
541296341Sdelphij    low = a;
542296341Sdelphij    high = a >> 64;
543296341Sdelphij    out[4] += low;
544296341Sdelphij    out[4] *= 2;
545296341Sdelphij    out[5] = high;
546238384Sjkim
547296341Sdelphij    a = ((uint128_t) small[2]) * small[3];
548296341Sdelphij    low = a;
549296341Sdelphij    high = a >> 64;
550296341Sdelphij    out[5] += low;
551296341Sdelphij    out[5] *= 2;
552296341Sdelphij    out[6] = high;
553296341Sdelphij    out[6] += high;
554238384Sjkim
555296341Sdelphij    a = ((uint128_t) small[2]) * small[2];
556296341Sdelphij    low = a;
557296341Sdelphij    high = a >> 64;
558296341Sdelphij    out[4] += low;
559296341Sdelphij    out[5] += high;
560238384Sjkim
561296341Sdelphij    a = ((uint128_t) small[3]) * small[3];
562296341Sdelphij    low = a;
563296341Sdelphij    high = a >> 64;
564296341Sdelphij    out[6] += low;
565296341Sdelphij    out[7] = high;
566296341Sdelphij}
567238384Sjkim
568296341Sdelphij/*-
569296341Sdelphij * felem_square sets |out| = |in|^2
570238384Sjkim * On entry:
571238384Sjkim *   in[i] < 2^109
572238384Sjkim * On exit:
573238384Sjkim *   out[i] < 7 * 2^64 < 2^67
574238384Sjkim */
575238384Sjkimstatic void felem_square(longfelem out, const felem in)
576296341Sdelphij{
577296341Sdelphij    u64 small[4];
578296341Sdelphij    felem_shrink(small, in);
579296341Sdelphij    smallfelem_square(out, small);
580296341Sdelphij}
581238384Sjkim
582296341Sdelphij/*-
583296341Sdelphij * smallfelem_mul sets |out| = |small1| * |small2|
584238384Sjkim * On entry:
585238384Sjkim *   small1[i] < 2^64
586238384Sjkim *   small2[i] < 2^64
587238384Sjkim * On exit:
588238384Sjkim *   out[i] < 7 * 2^64 < 2^67
589238384Sjkim */
590296341Sdelphijstatic void smallfelem_mul(longfelem out, const smallfelem small1,
591296341Sdelphij                           const smallfelem small2)
592296341Sdelphij{
593296341Sdelphij    limb a;
594296341Sdelphij    u64 high, low;
595238384Sjkim
596296341Sdelphij    a = ((uint128_t) small1[0]) * small2[0];
597296341Sdelphij    low = a;
598296341Sdelphij    high = a >> 64;
599296341Sdelphij    out[0] = low;
600296341Sdelphij    out[1] = high;
601238384Sjkim
602296341Sdelphij    a = ((uint128_t) small1[0]) * small2[1];
603296341Sdelphij    low = a;
604296341Sdelphij    high = a >> 64;
605296341Sdelphij    out[1] += low;
606296341Sdelphij    out[2] = high;
607238384Sjkim
608296341Sdelphij    a = ((uint128_t) small1[1]) * small2[0];
609296341Sdelphij    low = a;
610296341Sdelphij    high = a >> 64;
611296341Sdelphij    out[1] += low;
612296341Sdelphij    out[2] += high;
613238384Sjkim
614296341Sdelphij    a = ((uint128_t) small1[0]) * small2[2];
615296341Sdelphij    low = a;
616296341Sdelphij    high = a >> 64;
617296341Sdelphij    out[2] += low;
618296341Sdelphij    out[3] = high;
619238384Sjkim
620296341Sdelphij    a = ((uint128_t) small1[1]) * small2[1];
621296341Sdelphij    low = a;
622296341Sdelphij    high = a >> 64;
623296341Sdelphij    out[2] += low;
624296341Sdelphij    out[3] += high;
625238384Sjkim
626296341Sdelphij    a = ((uint128_t) small1[2]) * small2[0];
627296341Sdelphij    low = a;
628296341Sdelphij    high = a >> 64;
629296341Sdelphij    out[2] += low;
630296341Sdelphij    out[3] += high;
631238384Sjkim
632296341Sdelphij    a = ((uint128_t) small1[0]) * small2[3];
633296341Sdelphij    low = a;
634296341Sdelphij    high = a >> 64;
635296341Sdelphij    out[3] += low;
636296341Sdelphij    out[4] = high;
637238384Sjkim
638296341Sdelphij    a = ((uint128_t) small1[1]) * small2[2];
639296341Sdelphij    low = a;
640296341Sdelphij    high = a >> 64;
641296341Sdelphij    out[3] += low;
642296341Sdelphij    out[4] += high;
643238384Sjkim
644296341Sdelphij    a = ((uint128_t) small1[2]) * small2[1];
645296341Sdelphij    low = a;
646296341Sdelphij    high = a >> 64;
647296341Sdelphij    out[3] += low;
648296341Sdelphij    out[4] += high;
649238384Sjkim
650296341Sdelphij    a = ((uint128_t) small1[3]) * small2[0];
651296341Sdelphij    low = a;
652296341Sdelphij    high = a >> 64;
653296341Sdelphij    out[3] += low;
654296341Sdelphij    out[4] += high;
655238384Sjkim
656296341Sdelphij    a = ((uint128_t) small1[1]) * small2[3];
657296341Sdelphij    low = a;
658296341Sdelphij    high = a >> 64;
659296341Sdelphij    out[4] += low;
660296341Sdelphij    out[5] = high;
661238384Sjkim
662296341Sdelphij    a = ((uint128_t) small1[2]) * small2[2];
663296341Sdelphij    low = a;
664296341Sdelphij    high = a >> 64;
665296341Sdelphij    out[4] += low;
666296341Sdelphij    out[5] += high;
667238384Sjkim
668296341Sdelphij    a = ((uint128_t) small1[3]) * small2[1];
669296341Sdelphij    low = a;
670296341Sdelphij    high = a >> 64;
671296341Sdelphij    out[4] += low;
672296341Sdelphij    out[5] += high;
673238384Sjkim
674296341Sdelphij    a = ((uint128_t) small1[2]) * small2[3];
675296341Sdelphij    low = a;
676296341Sdelphij    high = a >> 64;
677296341Sdelphij    out[5] += low;
678296341Sdelphij    out[6] = high;
679238384Sjkim
680296341Sdelphij    a = ((uint128_t) small1[3]) * small2[2];
681296341Sdelphij    low = a;
682296341Sdelphij    high = a >> 64;
683296341Sdelphij    out[5] += low;
684296341Sdelphij    out[6] += high;
685238384Sjkim
686296341Sdelphij    a = ((uint128_t) small1[3]) * small2[3];
687296341Sdelphij    low = a;
688296341Sdelphij    high = a >> 64;
689296341Sdelphij    out[6] += low;
690296341Sdelphij    out[7] = high;
691296341Sdelphij}
692238384Sjkim
693296341Sdelphij/*-
694296341Sdelphij * felem_mul sets |out| = |in1| * |in2|
695238384Sjkim * On entry:
696238384Sjkim *   in1[i] < 2^109
697238384Sjkim *   in2[i] < 2^109
698238384Sjkim * On exit:
699238384Sjkim *   out[i] < 7 * 2^64 < 2^67
700238384Sjkim */
701238384Sjkimstatic void felem_mul(longfelem out, const felem in1, const felem in2)
702296341Sdelphij{
703296341Sdelphij    smallfelem small1, small2;
704296341Sdelphij    felem_shrink(small1, in1);
705296341Sdelphij    felem_shrink(small2, in2);
706296341Sdelphij    smallfelem_mul(out, small1, small2);
707296341Sdelphij}
708238384Sjkim
709296341Sdelphij/*-
710296341Sdelphij * felem_small_mul sets |out| = |small1| * |in2|
711238384Sjkim * On entry:
712238384Sjkim *   small1[i] < 2^64
713238384Sjkim *   in2[i] < 2^109
714238384Sjkim * On exit:
715238384Sjkim *   out[i] < 7 * 2^64 < 2^67
716238384Sjkim */
717296341Sdelphijstatic void felem_small_mul(longfelem out, const smallfelem small1,
718296341Sdelphij                            const felem in2)
719296341Sdelphij{
720296341Sdelphij    smallfelem small2;
721296341Sdelphij    felem_shrink(small2, in2);
722296341Sdelphij    smallfelem_mul(out, small1, small2);
723296341Sdelphij}
724238384Sjkim
725296341Sdelphij# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
726296341Sdelphij# define two100 (((limb)1) << 100)
727296341Sdelphij# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
728238384Sjkim/* zero100 is 0 mod p */
729296341Sdelphijstatic const felem zero100 =
730296341Sdelphij    { two100m36m4, two100, two100m36p4, two100m36p4 };
731238384Sjkim
732296341Sdelphij/*-
733296341Sdelphij * Internal function for the different flavours of felem_reduce.
734238384Sjkim * felem_reduce_ reduces the higher coefficients in[4]-in[7].
735238384Sjkim * On entry:
736296341Sdelphij *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
737238384Sjkim *   out[1] >= in[7] + 2^32*in[4]
738238384Sjkim *   out[2] >= in[5] + 2^32*in[5]
739238384Sjkim *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
740238384Sjkim * On exit:
741238384Sjkim *   out[0] <= out[0] + in[4] + 2^32*in[5]
742238384Sjkim *   out[1] <= out[1] + in[5] + 2^33*in[6]
743238384Sjkim *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
744238384Sjkim *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
745238384Sjkim */
746238384Sjkimstatic void felem_reduce_(felem out, const longfelem in)
747296341Sdelphij{
748296341Sdelphij    int128_t c;
749296341Sdelphij    /* combine common terms from below */
750296341Sdelphij    c = in[4] + (in[5] << 32);
751296341Sdelphij    out[0] += c;
752296341Sdelphij    out[3] -= c;
753238384Sjkim
754296341Sdelphij    c = in[5] - in[7];
755296341Sdelphij    out[1] += c;
756296341Sdelphij    out[2] -= c;
757238384Sjkim
758296341Sdelphij    /* the remaining terms */
759296341Sdelphij    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
760296341Sdelphij    out[1] -= (in[4] << 32);
761296341Sdelphij    out[3] += (in[4] << 32);
762238384Sjkim
763296341Sdelphij    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
764296341Sdelphij    out[2] -= (in[5] << 32);
765238384Sjkim
766296341Sdelphij    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
767296341Sdelphij    out[0] -= in[6];
768296341Sdelphij    out[0] -= (in[6] << 32);
769296341Sdelphij    out[1] += (in[6] << 33);
770296341Sdelphij    out[2] += (in[6] * 2);
771296341Sdelphij    out[3] -= (in[6] << 32);
772238384Sjkim
773296341Sdelphij    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
774296341Sdelphij    out[0] -= in[7];
775296341Sdelphij    out[0] -= (in[7] << 32);
776296341Sdelphij    out[2] += (in[7] << 33);
777296341Sdelphij    out[3] += (in[7] * 3);
778296341Sdelphij}
779238384Sjkim
780296341Sdelphij/*-
781296341Sdelphij * felem_reduce converts a longfelem into an felem.
782238384Sjkim * To be called directly after felem_square or felem_mul.
783238384Sjkim * On entry:
784238384Sjkim *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
785238384Sjkim *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
786238384Sjkim * On exit:
787238384Sjkim *   out[i] < 2^101
788238384Sjkim */
789238384Sjkimstatic void felem_reduce(felem out, const longfelem in)
790296341Sdelphij{
791296341Sdelphij    out[0] = zero100[0] + in[0];
792296341Sdelphij    out[1] = zero100[1] + in[1];
793296341Sdelphij    out[2] = zero100[2] + in[2];
794296341Sdelphij    out[3] = zero100[3] + in[3];
795238384Sjkim
796296341Sdelphij    felem_reduce_(out, in);
797238384Sjkim
798296341Sdelphij    /*-
799296341Sdelphij     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
800296341Sdelphij     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
801296341Sdelphij     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
802296341Sdelphij     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
803296341Sdelphij     *
804296341Sdelphij     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
805296341Sdelphij     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
806296341Sdelphij     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
807296341Sdelphij     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
808296341Sdelphij     */
809296341Sdelphij}
810238384Sjkim
811296341Sdelphij/*-
812296341Sdelphij * felem_reduce_zero105 converts a larger longfelem into an felem.
813238384Sjkim * On entry:
814238384Sjkim *   in[0] < 2^71
815238384Sjkim * On exit:
816238384Sjkim *   out[i] < 2^106
817238384Sjkim */
818238384Sjkimstatic void felem_reduce_zero105(felem out, const longfelem in)
819296341Sdelphij{
820296341Sdelphij    out[0] = zero105[0] + in[0];
821296341Sdelphij    out[1] = zero105[1] + in[1];
822296341Sdelphij    out[2] = zero105[2] + in[2];
823296341Sdelphij    out[3] = zero105[3] + in[3];
824238384Sjkim
825296341Sdelphij    felem_reduce_(out, in);
826238384Sjkim
827296341Sdelphij    /*-
828296341Sdelphij     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
829296341Sdelphij     * out[1] > 2^105 - 2^71 - 2^103 > 0
830296341Sdelphij     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
831296341Sdelphij     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
832296341Sdelphij     *
833296341Sdelphij     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
834296341Sdelphij     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
835296341Sdelphij     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
836296341Sdelphij     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
837296341Sdelphij     */
838296341Sdelphij}
839238384Sjkim
840296341Sdelphij/*
841296341Sdelphij * subtract_u64 sets *result = *result - v and *carry to one if the
842296341Sdelphij * subtraction underflowed.
843296341Sdelphij */
844296341Sdelphijstatic void subtract_u64(u64 *result, u64 *carry, u64 v)
845296341Sdelphij{
846296341Sdelphij    uint128_t r = *result;
847296341Sdelphij    r -= v;
848296341Sdelphij    *carry = (r >> 64) & 1;
849296341Sdelphij    *result = (u64)r;
850296341Sdelphij}
851238384Sjkim
852296341Sdelphij/*
853296341Sdelphij * felem_contract converts |in| to its unique, minimal representation. On
854296341Sdelphij * entry: in[i] < 2^109
855238384Sjkim */
856238384Sjkimstatic void felem_contract(smallfelem out, const felem in)
857296341Sdelphij{
858296341Sdelphij    unsigned i;
859296341Sdelphij    u64 all_equal_so_far = 0, result = 0, carry;
860238384Sjkim
861296341Sdelphij    felem_shrink(out, in);
862296341Sdelphij    /* small is minimal except that the value might be > p */
863238384Sjkim
864296341Sdelphij    all_equal_so_far--;
865296341Sdelphij    /*
866296341Sdelphij     * We are doing a constant time test if out >= kPrime. We need to compare
867296341Sdelphij     * each u64, from most-significant to least significant. For each one, if
868296341Sdelphij     * all words so far have been equal (m is all ones) then a non-equal
869296341Sdelphij     * result is the answer. Otherwise we continue.
870296341Sdelphij     */
871296341Sdelphij    for (i = 3; i < 4; i--) {
872296341Sdelphij        u64 equal;
873296341Sdelphij        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
874296341Sdelphij        /*
875296341Sdelphij         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
876296341Sdelphij         * will all be set.
877296341Sdelphij         */
878296341Sdelphij        result |= all_equal_so_far & ((u64)(a >> 64));
879238384Sjkim
880296341Sdelphij        /*
881296341Sdelphij         * if kPrime[i] == out[i] then |equal| will be all zeros and the
882296341Sdelphij         * decrement will make it all ones.
883296341Sdelphij         */
884296341Sdelphij        equal = kPrime[i] ^ out[i];
885296341Sdelphij        equal--;
886296341Sdelphij        equal &= equal << 32;
887296341Sdelphij        equal &= equal << 16;
888296341Sdelphij        equal &= equal << 8;
889296341Sdelphij        equal &= equal << 4;
890296341Sdelphij        equal &= equal << 2;
891296341Sdelphij        equal &= equal << 1;
892296341Sdelphij        equal = ((s64) equal) >> 63;
893238384Sjkim
894296341Sdelphij        all_equal_so_far &= equal;
895296341Sdelphij    }
896238384Sjkim
897296341Sdelphij    /*
898296341Sdelphij     * if all_equal_so_far is still all ones then the two values are equal
899296341Sdelphij     * and so out >= kPrime is true.
900296341Sdelphij     */
901296341Sdelphij    result |= all_equal_so_far;
902238384Sjkim
903296341Sdelphij    /* if out >= kPrime then we subtract kPrime. */
904296341Sdelphij    subtract_u64(&out[0], &carry, result & kPrime[0]);
905296341Sdelphij    subtract_u64(&out[1], &carry, carry);
906296341Sdelphij    subtract_u64(&out[2], &carry, carry);
907296341Sdelphij    subtract_u64(&out[3], &carry, carry);
908238384Sjkim
909296341Sdelphij    subtract_u64(&out[1], &carry, result & kPrime[1]);
910296341Sdelphij    subtract_u64(&out[2], &carry, carry);
911296341Sdelphij    subtract_u64(&out[3], &carry, carry);
912238384Sjkim
913296341Sdelphij    subtract_u64(&out[2], &carry, result & kPrime[2]);
914296341Sdelphij    subtract_u64(&out[3], &carry, carry);
915238384Sjkim
916296341Sdelphij    subtract_u64(&out[3], &carry, result & kPrime[3]);
917296341Sdelphij}
918238384Sjkim
919238384Sjkimstatic void smallfelem_square_contract(smallfelem out, const smallfelem in)
920296341Sdelphij{
921296341Sdelphij    longfelem longtmp;
922296341Sdelphij    felem tmp;
923238384Sjkim
924296341Sdelphij    smallfelem_square(longtmp, in);
925296341Sdelphij    felem_reduce(tmp, longtmp);
926296341Sdelphij    felem_contract(out, tmp);
927296341Sdelphij}
928238384Sjkim
929296341Sdelphijstatic void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
930296341Sdelphij                                    const smallfelem in2)
931296341Sdelphij{
932296341Sdelphij    longfelem longtmp;
933296341Sdelphij    felem tmp;
934238384Sjkim
935296341Sdelphij    smallfelem_mul(longtmp, in1, in2);
936296341Sdelphij    felem_reduce(tmp, longtmp);
937296341Sdelphij    felem_contract(out, tmp);
938296341Sdelphij}
939238384Sjkim
940296341Sdelphij/*-
941296341Sdelphij * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
942238384Sjkim * otherwise.
943238384Sjkim * On entry:
944238384Sjkim *   small[i] < 2^64
945238384Sjkim */
946238384Sjkimstatic limb smallfelem_is_zero(const smallfelem small)
947296341Sdelphij{
948296341Sdelphij    limb result;
949296341Sdelphij    u64 is_p;
950238384Sjkim
951296341Sdelphij    u64 is_zero = small[0] | small[1] | small[2] | small[3];
952296341Sdelphij    is_zero--;
953296341Sdelphij    is_zero &= is_zero << 32;
954296341Sdelphij    is_zero &= is_zero << 16;
955296341Sdelphij    is_zero &= is_zero << 8;
956296341Sdelphij    is_zero &= is_zero << 4;
957296341Sdelphij    is_zero &= is_zero << 2;
958296341Sdelphij    is_zero &= is_zero << 1;
959296341Sdelphij    is_zero = ((s64) is_zero) >> 63;
960238384Sjkim
961296341Sdelphij    is_p = (small[0] ^ kPrime[0]) |
962296341Sdelphij        (small[1] ^ kPrime[1]) |
963296341Sdelphij        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
964296341Sdelphij    is_p--;
965296341Sdelphij    is_p &= is_p << 32;
966296341Sdelphij    is_p &= is_p << 16;
967296341Sdelphij    is_p &= is_p << 8;
968296341Sdelphij    is_p &= is_p << 4;
969296341Sdelphij    is_p &= is_p << 2;
970296341Sdelphij    is_p &= is_p << 1;
971296341Sdelphij    is_p = ((s64) is_p) >> 63;
972238384Sjkim
973296341Sdelphij    is_zero |= is_p;
974238384Sjkim
975296341Sdelphij    result = is_zero;
976296341Sdelphij    result |= ((limb) is_zero) << 64;
977296341Sdelphij    return result;
978296341Sdelphij}
979238384Sjkim
980238384Sjkimstatic int smallfelem_is_zero_int(const smallfelem small)
981296341Sdelphij{
982296341Sdelphij    return (int)(smallfelem_is_zero(small) & ((limb) 1));
983296341Sdelphij}
984238384Sjkim
985296341Sdelphij/*-
986296341Sdelphij * felem_inv calculates |out| = |in|^{-1}
987238384Sjkim *
988238384Sjkim * Based on Fermat's Little Theorem:
989238384Sjkim *   a^p = a (mod p)
990238384Sjkim *   a^{p-1} = 1 (mod p)
991238384Sjkim *   a^{p-2} = a^{-1} (mod p)
992238384Sjkim */
993238384Sjkimstatic void felem_inv(felem out, const felem in)
994296341Sdelphij{
995296341Sdelphij    felem ftmp, ftmp2;
996296341Sdelphij    /* each e_I will hold |in|^{2^I - 1} */
997296341Sdelphij    felem e2, e4, e8, e16, e32, e64;
998296341Sdelphij    longfelem tmp;
999296341Sdelphij    unsigned i;
1000238384Sjkim
1001296341Sdelphij    felem_square(tmp, in);
1002296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^1 */
1003296341Sdelphij    felem_mul(tmp, in, ftmp);
1004296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1005296341Sdelphij    felem_assign(e2, ftmp);
1006296341Sdelphij    felem_square(tmp, ftmp);
1007296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1008296341Sdelphij    felem_square(tmp, ftmp);
1009296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1010296341Sdelphij    felem_mul(tmp, ftmp, e2);
1011296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1012296341Sdelphij    felem_assign(e4, ftmp);
1013296341Sdelphij    felem_square(tmp, ftmp);
1014296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1015296341Sdelphij    felem_square(tmp, ftmp);
1016296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1017296341Sdelphij    felem_square(tmp, ftmp);
1018296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1019296341Sdelphij    felem_square(tmp, ftmp);
1020296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1021296341Sdelphij    felem_mul(tmp, ftmp, e4);
1022296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1023296341Sdelphij    felem_assign(e8, ftmp);
1024296341Sdelphij    for (i = 0; i < 8; i++) {
1025296341Sdelphij        felem_square(tmp, ftmp);
1026296341Sdelphij        felem_reduce(ftmp, tmp);
1027296341Sdelphij    }                           /* 2^16 - 2^8 */
1028296341Sdelphij    felem_mul(tmp, ftmp, e8);
1029296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1030296341Sdelphij    felem_assign(e16, ftmp);
1031296341Sdelphij    for (i = 0; i < 16; i++) {
1032296341Sdelphij        felem_square(tmp, ftmp);
1033296341Sdelphij        felem_reduce(ftmp, tmp);
1034296341Sdelphij    }                           /* 2^32 - 2^16 */
1035296341Sdelphij    felem_mul(tmp, ftmp, e16);
1036296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1037296341Sdelphij    felem_assign(e32, ftmp);
1038296341Sdelphij    for (i = 0; i < 32; i++) {
1039296341Sdelphij        felem_square(tmp, ftmp);
1040296341Sdelphij        felem_reduce(ftmp, tmp);
1041296341Sdelphij    }                           /* 2^64 - 2^32 */
1042296341Sdelphij    felem_assign(e64, ftmp);
1043296341Sdelphij    felem_mul(tmp, ftmp, in);
1044296341Sdelphij    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1045296341Sdelphij    for (i = 0; i < 192; i++) {
1046296341Sdelphij        felem_square(tmp, ftmp);
1047296341Sdelphij        felem_reduce(ftmp, tmp);
1048296341Sdelphij    }                           /* 2^256 - 2^224 + 2^192 */
1049238384Sjkim
1050296341Sdelphij    felem_mul(tmp, e64, e32);
1051296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1052296341Sdelphij    for (i = 0; i < 16; i++) {
1053296341Sdelphij        felem_square(tmp, ftmp2);
1054296341Sdelphij        felem_reduce(ftmp2, tmp);
1055296341Sdelphij    }                           /* 2^80 - 2^16 */
1056296341Sdelphij    felem_mul(tmp, ftmp2, e16);
1057296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1058296341Sdelphij    for (i = 0; i < 8; i++) {
1059296341Sdelphij        felem_square(tmp, ftmp2);
1060296341Sdelphij        felem_reduce(ftmp2, tmp);
1061296341Sdelphij    }                           /* 2^88 - 2^8 */
1062296341Sdelphij    felem_mul(tmp, ftmp2, e8);
1063296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1064296341Sdelphij    for (i = 0; i < 4; i++) {
1065296341Sdelphij        felem_square(tmp, ftmp2);
1066296341Sdelphij        felem_reduce(ftmp2, tmp);
1067296341Sdelphij    }                           /* 2^92 - 2^4 */
1068296341Sdelphij    felem_mul(tmp, ftmp2, e4);
1069296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1070296341Sdelphij    felem_square(tmp, ftmp2);
1071296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1072296341Sdelphij    felem_square(tmp, ftmp2);
1073296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1074296341Sdelphij    felem_mul(tmp, ftmp2, e2);
1075296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1076296341Sdelphij    felem_square(tmp, ftmp2);
1077296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1078296341Sdelphij    felem_square(tmp, ftmp2);
1079296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1080296341Sdelphij    felem_mul(tmp, ftmp2, in);
1081296341Sdelphij    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1082238384Sjkim
1083296341Sdelphij    felem_mul(tmp, ftmp2, ftmp);
1084296341Sdelphij    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1085296341Sdelphij}
1086238384Sjkim
1087238384Sjkimstatic void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1088296341Sdelphij{
1089296341Sdelphij    felem tmp;
1090238384Sjkim
1091296341Sdelphij    smallfelem_expand(tmp, in);
1092296341Sdelphij    felem_inv(tmp, tmp);
1093296341Sdelphij    felem_contract(out, tmp);
1094296341Sdelphij}
1095238384Sjkim
1096296341Sdelphij/*-
1097296341Sdelphij * Group operations
1098238384Sjkim * ----------------
1099238384Sjkim *
1100238384Sjkim * Building on top of the field operations we have the operations on the
1101238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian
1102296341Sdelphij * coordinates
1103296341Sdelphij */
1104238384Sjkim
1105296341Sdelphij/*-
1106296341Sdelphij * point_double calculates 2*(x_in, y_in, z_in)
1107238384Sjkim *
1108238384Sjkim * The method is taken from:
1109238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1110238384Sjkim *
1111238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1112296341Sdelphij * while x_out == y_in is not (maybe this works, but it's not tested).
1113296341Sdelphij */
1114238384Sjkimstatic void
1115238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out,
1116296341Sdelphij             const felem x_in, const felem y_in, const felem z_in)
1117296341Sdelphij{
1118296341Sdelphij    longfelem tmp, tmp2;
1119296341Sdelphij    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1120296341Sdelphij    smallfelem small1, small2;
1121238384Sjkim
1122296341Sdelphij    felem_assign(ftmp, x_in);
1123296341Sdelphij    /* ftmp[i] < 2^106 */
1124296341Sdelphij    felem_assign(ftmp2, x_in);
1125296341Sdelphij    /* ftmp2[i] < 2^106 */
1126238384Sjkim
1127296341Sdelphij    /* delta = z^2 */
1128296341Sdelphij    felem_square(tmp, z_in);
1129296341Sdelphij    felem_reduce(delta, tmp);
1130296341Sdelphij    /* delta[i] < 2^101 */
1131238384Sjkim
1132296341Sdelphij    /* gamma = y^2 */
1133296341Sdelphij    felem_square(tmp, y_in);
1134296341Sdelphij    felem_reduce(gamma, tmp);
1135296341Sdelphij    /* gamma[i] < 2^101 */
1136296341Sdelphij    felem_shrink(small1, gamma);
1137238384Sjkim
1138296341Sdelphij    /* beta = x*gamma */
1139296341Sdelphij    felem_small_mul(tmp, small1, x_in);
1140296341Sdelphij    felem_reduce(beta, tmp);
1141296341Sdelphij    /* beta[i] < 2^101 */
1142238384Sjkim
1143296341Sdelphij    /* alpha = 3*(x-delta)*(x+delta) */
1144296341Sdelphij    felem_diff(ftmp, delta);
1145296341Sdelphij    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1146296341Sdelphij    felem_sum(ftmp2, delta);
1147296341Sdelphij    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1148296341Sdelphij    felem_scalar(ftmp2, 3);
1149296341Sdelphij    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1150296341Sdelphij    felem_mul(tmp, ftmp, ftmp2);
1151296341Sdelphij    felem_reduce(alpha, tmp);
1152296341Sdelphij    /* alpha[i] < 2^101 */
1153296341Sdelphij    felem_shrink(small2, alpha);
1154238384Sjkim
1155296341Sdelphij    /* x' = alpha^2 - 8*beta */
1156296341Sdelphij    smallfelem_square(tmp, small2);
1157296341Sdelphij    felem_reduce(x_out, tmp);
1158296341Sdelphij    felem_assign(ftmp, beta);
1159296341Sdelphij    felem_scalar(ftmp, 8);
1160296341Sdelphij    /* ftmp[i] < 8 * 2^101 = 2^104 */
1161296341Sdelphij    felem_diff(x_out, ftmp);
1162296341Sdelphij    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1163238384Sjkim
1164296341Sdelphij    /* z' = (y + z)^2 - gamma - delta */
1165296341Sdelphij    felem_sum(delta, gamma);
1166296341Sdelphij    /* delta[i] < 2^101 + 2^101 = 2^102 */
1167296341Sdelphij    felem_assign(ftmp, y_in);
1168296341Sdelphij    felem_sum(ftmp, z_in);
1169296341Sdelphij    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1170296341Sdelphij    felem_square(tmp, ftmp);
1171296341Sdelphij    felem_reduce(z_out, tmp);
1172296341Sdelphij    felem_diff(z_out, delta);
1173296341Sdelphij    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1174238384Sjkim
1175296341Sdelphij    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1176296341Sdelphij    felem_scalar(beta, 4);
1177296341Sdelphij    /* beta[i] < 4 * 2^101 = 2^103 */
1178296341Sdelphij    felem_diff_zero107(beta, x_out);
1179296341Sdelphij    /* beta[i] < 2^107 + 2^103 < 2^108 */
1180296341Sdelphij    felem_small_mul(tmp, small2, beta);
1181296341Sdelphij    /* tmp[i] < 7 * 2^64 < 2^67 */
1182296341Sdelphij    smallfelem_square(tmp2, small1);
1183296341Sdelphij    /* tmp2[i] < 7 * 2^64 */
1184296341Sdelphij    longfelem_scalar(tmp2, 8);
1185296341Sdelphij    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1186296341Sdelphij    longfelem_diff(tmp, tmp2);
1187296341Sdelphij    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1188296341Sdelphij    felem_reduce_zero105(y_out, tmp);
1189296341Sdelphij    /* y_out[i] < 2^106 */
1190296341Sdelphij}
1191238384Sjkim
1192296341Sdelphij/*
1193296341Sdelphij * point_double_small is the same as point_double, except that it operates on
1194296341Sdelphij * smallfelems
1195296341Sdelphij */
1196238384Sjkimstatic void
1197238384Sjkimpoint_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1198296341Sdelphij                   const smallfelem x_in, const smallfelem y_in,
1199296341Sdelphij                   const smallfelem z_in)
1200296341Sdelphij{
1201296341Sdelphij    felem felem_x_out, felem_y_out, felem_z_out;
1202296341Sdelphij    felem felem_x_in, felem_y_in, felem_z_in;
1203238384Sjkim
1204296341Sdelphij    smallfelem_expand(felem_x_in, x_in);
1205296341Sdelphij    smallfelem_expand(felem_y_in, y_in);
1206296341Sdelphij    smallfelem_expand(felem_z_in, z_in);
1207296341Sdelphij    point_double(felem_x_out, felem_y_out, felem_z_out,
1208296341Sdelphij                 felem_x_in, felem_y_in, felem_z_in);
1209296341Sdelphij    felem_shrink(x_out, felem_x_out);
1210296341Sdelphij    felem_shrink(y_out, felem_y_out);
1211296341Sdelphij    felem_shrink(z_out, felem_z_out);
1212296341Sdelphij}
1213238384Sjkim
1214238384Sjkim/* copy_conditional copies in to out iff mask is all ones. */
1215296341Sdelphijstatic void copy_conditional(felem out, const felem in, limb mask)
1216296341Sdelphij{
1217296341Sdelphij    unsigned i;
1218296341Sdelphij    for (i = 0; i < NLIMBS; ++i) {
1219296341Sdelphij        const limb tmp = mask & (in[i] ^ out[i]);
1220296341Sdelphij        out[i] ^= tmp;
1221296341Sdelphij    }
1222296341Sdelphij}
1223238384Sjkim
1224238384Sjkim/* copy_small_conditional copies in to out iff mask is all ones. */
1225296341Sdelphijstatic void copy_small_conditional(felem out, const smallfelem in, limb mask)
1226296341Sdelphij{
1227296341Sdelphij    unsigned i;
1228296341Sdelphij    const u64 mask64 = mask;
1229296341Sdelphij    for (i = 0; i < NLIMBS; ++i) {
1230296341Sdelphij        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1231296341Sdelphij    }
1232296341Sdelphij}
1233238384Sjkim
1234296341Sdelphij/*-
1235296341Sdelphij * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1236238384Sjkim *
1237238384Sjkim * The method is taken from:
1238238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1239238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1240238384Sjkim *
1241238384Sjkim * This function includes a branch for checking whether the two input points
1242238384Sjkim * are equal, (while not equal to the point at infinity). This case never
1243238384Sjkim * happens during single point multiplication, so there is no timing leak for
1244296341Sdelphij * ECDH or ECDSA signing.
1245296341Sdelphij */
1246238384Sjkimstatic void point_add(felem x3, felem y3, felem z3,
1247296341Sdelphij                      const felem x1, const felem y1, const felem z1,
1248296341Sdelphij                      const int mixed, const smallfelem x2,
1249296341Sdelphij                      const smallfelem y2, const smallfelem z2)
1250296341Sdelphij{
1251296341Sdelphij    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1252296341Sdelphij    longfelem tmp, tmp2;
1253296341Sdelphij    smallfelem small1, small2, small3, small4, small5;
1254296341Sdelphij    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1255238384Sjkim
1256296341Sdelphij    felem_shrink(small3, z1);
1257238384Sjkim
1258296341Sdelphij    z1_is_zero = smallfelem_is_zero(small3);
1259296341Sdelphij    z2_is_zero = smallfelem_is_zero(z2);
1260238384Sjkim
1261296341Sdelphij    /* ftmp = z1z1 = z1**2 */
1262296341Sdelphij    smallfelem_square(tmp, small3);
1263296341Sdelphij    felem_reduce(ftmp, tmp);
1264296341Sdelphij    /* ftmp[i] < 2^101 */
1265296341Sdelphij    felem_shrink(small1, ftmp);
1266238384Sjkim
1267296341Sdelphij    if (!mixed) {
1268296341Sdelphij        /* ftmp2 = z2z2 = z2**2 */
1269296341Sdelphij        smallfelem_square(tmp, z2);
1270296341Sdelphij        felem_reduce(ftmp2, tmp);
1271296341Sdelphij        /* ftmp2[i] < 2^101 */
1272296341Sdelphij        felem_shrink(small2, ftmp2);
1273238384Sjkim
1274296341Sdelphij        felem_shrink(small5, x1);
1275238384Sjkim
1276296341Sdelphij        /* u1 = ftmp3 = x1*z2z2 */
1277296341Sdelphij        smallfelem_mul(tmp, small5, small2);
1278296341Sdelphij        felem_reduce(ftmp3, tmp);
1279296341Sdelphij        /* ftmp3[i] < 2^101 */
1280238384Sjkim
1281296341Sdelphij        /* ftmp5 = z1 + z2 */
1282296341Sdelphij        felem_assign(ftmp5, z1);
1283296341Sdelphij        felem_small_sum(ftmp5, z2);
1284296341Sdelphij        /* ftmp5[i] < 2^107 */
1285238384Sjkim
1286296341Sdelphij        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1287296341Sdelphij        felem_square(tmp, ftmp5);
1288296341Sdelphij        felem_reduce(ftmp5, tmp);
1289296341Sdelphij        /* ftmp2 = z2z2 + z1z1 */
1290296341Sdelphij        felem_sum(ftmp2, ftmp);
1291296341Sdelphij        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1292296341Sdelphij        felem_diff(ftmp5, ftmp2);
1293296341Sdelphij        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1294238384Sjkim
1295296341Sdelphij        /* ftmp2 = z2 * z2z2 */
1296296341Sdelphij        smallfelem_mul(tmp, small2, z2);
1297296341Sdelphij        felem_reduce(ftmp2, tmp);
1298238384Sjkim
1299296341Sdelphij        /* s1 = ftmp2 = y1 * z2**3 */
1300296341Sdelphij        felem_mul(tmp, y1, ftmp2);
1301296341Sdelphij        felem_reduce(ftmp6, tmp);
1302296341Sdelphij        /* ftmp6[i] < 2^101 */
1303296341Sdelphij    } else {
1304296341Sdelphij        /*
1305296341Sdelphij         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1306296341Sdelphij         */
1307238384Sjkim
1308296341Sdelphij        /* u1 = ftmp3 = x1*z2z2 */
1309296341Sdelphij        felem_assign(ftmp3, x1);
1310296341Sdelphij        /* ftmp3[i] < 2^106 */
1311238384Sjkim
1312296341Sdelphij        /* ftmp5 = 2z1z2 */
1313296341Sdelphij        felem_assign(ftmp5, z1);
1314296341Sdelphij        felem_scalar(ftmp5, 2);
1315296341Sdelphij        /* ftmp5[i] < 2*2^106 = 2^107 */
1316238384Sjkim
1317296341Sdelphij        /* s1 = ftmp2 = y1 * z2**3 */
1318296341Sdelphij        felem_assign(ftmp6, y1);
1319296341Sdelphij        /* ftmp6[i] < 2^106 */
1320296341Sdelphij    }
1321238384Sjkim
1322296341Sdelphij    /* u2 = x2*z1z1 */
1323296341Sdelphij    smallfelem_mul(tmp, x2, small1);
1324296341Sdelphij    felem_reduce(ftmp4, tmp);
1325238384Sjkim
1326296341Sdelphij    /* h = ftmp4 = u2 - u1 */
1327296341Sdelphij    felem_diff_zero107(ftmp4, ftmp3);
1328296341Sdelphij    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1329296341Sdelphij    felem_shrink(small4, ftmp4);
1330238384Sjkim
1331296341Sdelphij    x_equal = smallfelem_is_zero(small4);
1332238384Sjkim
1333296341Sdelphij    /* z_out = ftmp5 * h */
1334296341Sdelphij    felem_small_mul(tmp, small4, ftmp5);
1335296341Sdelphij    felem_reduce(z_out, tmp);
1336296341Sdelphij    /* z_out[i] < 2^101 */
1337238384Sjkim
1338296341Sdelphij    /* ftmp = z1 * z1z1 */
1339296341Sdelphij    smallfelem_mul(tmp, small1, small3);
1340296341Sdelphij    felem_reduce(ftmp, tmp);
1341238384Sjkim
1342296341Sdelphij    /* s2 = tmp = y2 * z1**3 */
1343296341Sdelphij    felem_small_mul(tmp, y2, ftmp);
1344296341Sdelphij    felem_reduce(ftmp5, tmp);
1345238384Sjkim
1346296341Sdelphij    /* r = ftmp5 = (s2 - s1)*2 */
1347296341Sdelphij    felem_diff_zero107(ftmp5, ftmp6);
1348296341Sdelphij    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1349296341Sdelphij    felem_scalar(ftmp5, 2);
1350296341Sdelphij    /* ftmp5[i] < 2^109 */
1351296341Sdelphij    felem_shrink(small1, ftmp5);
1352296341Sdelphij    y_equal = smallfelem_is_zero(small1);
1353238384Sjkim
1354296341Sdelphij    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1355296341Sdelphij        point_double(x3, y3, z3, x1, y1, z1);
1356296341Sdelphij        return;
1357296341Sdelphij    }
1358238384Sjkim
1359296341Sdelphij    /* I = ftmp = (2h)**2 */
1360296341Sdelphij    felem_assign(ftmp, ftmp4);
1361296341Sdelphij    felem_scalar(ftmp, 2);
1362296341Sdelphij    /* ftmp[i] < 2*2^108 = 2^109 */
1363296341Sdelphij    felem_square(tmp, ftmp);
1364296341Sdelphij    felem_reduce(ftmp, tmp);
1365238384Sjkim
1366296341Sdelphij    /* J = ftmp2 = h * I */
1367296341Sdelphij    felem_mul(tmp, ftmp4, ftmp);
1368296341Sdelphij    felem_reduce(ftmp2, tmp);
1369238384Sjkim
1370296341Sdelphij    /* V = ftmp4 = U1 * I */
1371296341Sdelphij    felem_mul(tmp, ftmp3, ftmp);
1372296341Sdelphij    felem_reduce(ftmp4, tmp);
1373238384Sjkim
1374296341Sdelphij    /* x_out = r**2 - J - 2V */
1375296341Sdelphij    smallfelem_square(tmp, small1);
1376296341Sdelphij    felem_reduce(x_out, tmp);
1377296341Sdelphij    felem_assign(ftmp3, ftmp4);
1378296341Sdelphij    felem_scalar(ftmp4, 2);
1379296341Sdelphij    felem_sum(ftmp4, ftmp2);
1380296341Sdelphij    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1381296341Sdelphij    felem_diff(x_out, ftmp4);
1382296341Sdelphij    /* x_out[i] < 2^105 + 2^101 */
1383238384Sjkim
1384296341Sdelphij    /* y_out = r(V-x_out) - 2 * s1 * J */
1385296341Sdelphij    felem_diff_zero107(ftmp3, x_out);
1386296341Sdelphij    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1387296341Sdelphij    felem_small_mul(tmp, small1, ftmp3);
1388296341Sdelphij    felem_mul(tmp2, ftmp6, ftmp2);
1389296341Sdelphij    longfelem_scalar(tmp2, 2);
1390296341Sdelphij    /* tmp2[i] < 2*2^67 = 2^68 */
1391296341Sdelphij    longfelem_diff(tmp, tmp2);
1392296341Sdelphij    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1393296341Sdelphij    felem_reduce_zero105(y_out, tmp);
1394296341Sdelphij    /* y_out[i] < 2^106 */
1395238384Sjkim
1396296341Sdelphij    copy_small_conditional(x_out, x2, z1_is_zero);
1397296341Sdelphij    copy_conditional(x_out, x1, z2_is_zero);
1398296341Sdelphij    copy_small_conditional(y_out, y2, z1_is_zero);
1399296341Sdelphij    copy_conditional(y_out, y1, z2_is_zero);
1400296341Sdelphij    copy_small_conditional(z_out, z2, z1_is_zero);
1401296341Sdelphij    copy_conditional(z_out, z1, z2_is_zero);
1402296341Sdelphij    felem_assign(x3, x_out);
1403296341Sdelphij    felem_assign(y3, y_out);
1404296341Sdelphij    felem_assign(z3, z_out);
1405296341Sdelphij}
1406238384Sjkim
1407296341Sdelphij/*
1408296341Sdelphij * point_add_small is the same as point_add, except that it operates on
1409296341Sdelphij * smallfelems
1410296341Sdelphij */
1411238384Sjkimstatic void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1412296341Sdelphij                            smallfelem x1, smallfelem y1, smallfelem z1,
1413296341Sdelphij                            smallfelem x2, smallfelem y2, smallfelem z2)
1414296341Sdelphij{
1415296341Sdelphij    felem felem_x3, felem_y3, felem_z3;
1416296341Sdelphij    felem felem_x1, felem_y1, felem_z1;
1417296341Sdelphij    smallfelem_expand(felem_x1, x1);
1418296341Sdelphij    smallfelem_expand(felem_y1, y1);
1419296341Sdelphij    smallfelem_expand(felem_z1, z1);
1420296341Sdelphij    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1421296341Sdelphij              x2, y2, z2);
1422296341Sdelphij    felem_shrink(x3, felem_x3);
1423296341Sdelphij    felem_shrink(y3, felem_y3);
1424296341Sdelphij    felem_shrink(z3, felem_z3);
1425296341Sdelphij}
1426238384Sjkim
1427296341Sdelphij/*-
1428296341Sdelphij * Base point pre computation
1429238384Sjkim * --------------------------
1430238384Sjkim *
1431238384Sjkim * Two different sorts of precomputed tables are used in the following code.
1432238384Sjkim * Each contain various points on the curve, where each point is three field
1433238384Sjkim * elements (x, y, z).
1434238384Sjkim *
1435238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity).
1436238384Sjkim * This table has 2 * 16 elements, starting with the following:
1437238384Sjkim * index | bits    | point
1438238384Sjkim * ------+---------+------------------------------
1439238384Sjkim *     0 | 0 0 0 0 | 0G
1440238384Sjkim *     1 | 0 0 0 1 | 1G
1441238384Sjkim *     2 | 0 0 1 0 | 2^64G
1442238384Sjkim *     3 | 0 0 1 1 | (2^64 + 1)G
1443238384Sjkim *     4 | 0 1 0 0 | 2^128G
1444238384Sjkim *     5 | 0 1 0 1 | (2^128 + 1)G
1445238384Sjkim *     6 | 0 1 1 0 | (2^128 + 2^64)G
1446238384Sjkim *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1447238384Sjkim *     8 | 1 0 0 0 | 2^192G
1448238384Sjkim *     9 | 1 0 0 1 | (2^192 + 1)G
1449238384Sjkim *    10 | 1 0 1 0 | (2^192 + 2^64)G
1450238384Sjkim *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1451238384Sjkim *    12 | 1 1 0 0 | (2^192 + 2^128)G
1452238384Sjkim *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1453238384Sjkim *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1454238384Sjkim *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1455238384Sjkim * followed by a copy of this with each element multiplied by 2^32.
1456238384Sjkim *
1457238384Sjkim * The reason for this is so that we can clock bits into four different
1458238384Sjkim * locations when doing simple scalar multiplies against the base point,
1459238384Sjkim * and then another four locations using the second 16 elements.
1460238384Sjkim *
1461238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */
1462238384Sjkim
1463238384Sjkim/* gmul is the table of precomputed base points */
1464296341Sdelphijstatic const smallfelem gmul[2][16][3] = {
1465296341Sdelphij    {{{0, 0, 0, 0},
1466296341Sdelphij      {0, 0, 0, 0},
1467296341Sdelphij      {0, 0, 0, 0}},
1468296341Sdelphij     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1469296341Sdelphij       0x6b17d1f2e12c4247},
1470296341Sdelphij      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1471296341Sdelphij       0x4fe342e2fe1a7f9b},
1472296341Sdelphij      {1, 0, 0, 0}},
1473296341Sdelphij     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1474296341Sdelphij       0x0fa822bc2811aaa5},
1475296341Sdelphij      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1476296341Sdelphij       0xbff44ae8f5dba80d},
1477296341Sdelphij      {1, 0, 0, 0}},
1478296341Sdelphij     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1479296341Sdelphij       0x300a4bbc89d6726f},
1480296341Sdelphij      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1481296341Sdelphij       0x72aac7e0d09b4644},
1482296341Sdelphij      {1, 0, 0, 0}},
1483296341Sdelphij     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1484296341Sdelphij       0x447d739beedb5e67},
1485296341Sdelphij      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1486296341Sdelphij       0x2d4825ab834131ee},
1487296341Sdelphij      {1, 0, 0, 0}},
1488296341Sdelphij     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1489296341Sdelphij       0xef9519328a9c72ff},
1490296341Sdelphij      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1491296341Sdelphij       0x611e9fc37dbb2c9b},
1492296341Sdelphij      {1, 0, 0, 0}},
1493296341Sdelphij     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1494296341Sdelphij       0x550663797b51f5d8},
1495296341Sdelphij      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1496296341Sdelphij       0x157164848aecb851},
1497296341Sdelphij      {1, 0, 0, 0}},
1498296341Sdelphij     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1499296341Sdelphij       0xeb5d7745b21141ea},
1500296341Sdelphij      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1501296341Sdelphij       0xeafd72ebdbecc17b},
1502296341Sdelphij      {1, 0, 0, 0}},
1503296341Sdelphij     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1504296341Sdelphij       0xa6d39677a7849276},
1505296341Sdelphij      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1506296341Sdelphij       0x674f84749b0b8816},
1507296341Sdelphij      {1, 0, 0, 0}},
1508296341Sdelphij     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1509296341Sdelphij       0x4e769e7672c9ddad},
1510296341Sdelphij      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1511296341Sdelphij       0x42b99082de830663},
1512296341Sdelphij      {1, 0, 0, 0}},
1513296341Sdelphij     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1514296341Sdelphij       0x78878ef61c6ce04d},
1515296341Sdelphij      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1516296341Sdelphij       0xb6cb3f5d7b72c321},
1517296341Sdelphij      {1, 0, 0, 0}},
1518296341Sdelphij     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1519296341Sdelphij       0x0c88bc4d716b1287},
1520296341Sdelphij      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1521296341Sdelphij       0xdd5ddea3f3901dc6},
1522296341Sdelphij      {1, 0, 0, 0}},
1523296341Sdelphij     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1524296341Sdelphij       0x68f344af6b317466},
1525296341Sdelphij      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1526296341Sdelphij       0x31b9c405f8540a20},
1527296341Sdelphij      {1, 0, 0, 0}},
1528296341Sdelphij     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1529296341Sdelphij       0x4052bf4b6f461db9},
1530296341Sdelphij      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1531296341Sdelphij       0xfecf4d5190b0fc61},
1532296341Sdelphij      {1, 0, 0, 0}},
1533296341Sdelphij     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1534296341Sdelphij       0x1eddbae2c802e41a},
1535296341Sdelphij      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1536296341Sdelphij       0x43104d86560ebcfc},
1537296341Sdelphij      {1, 0, 0, 0}},
1538296341Sdelphij     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1539296341Sdelphij       0xb48e26b484f7a21c},
1540296341Sdelphij      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1541296341Sdelphij       0xfac015404d4d3dab},
1542296341Sdelphij      {1, 0, 0, 0}}},
1543296341Sdelphij    {{{0, 0, 0, 0},
1544296341Sdelphij      {0, 0, 0, 0},
1545296341Sdelphij      {0, 0, 0, 0}},
1546296341Sdelphij     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1547296341Sdelphij       0x7fe36b40af22af89},
1548296341Sdelphij      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1549296341Sdelphij       0xe697d45825b63624},
1550296341Sdelphij      {1, 0, 0, 0}},
1551296341Sdelphij     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1552296341Sdelphij       0x4a5b506612a677a6},
1553296341Sdelphij      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1554296341Sdelphij       0xeb13461ceac089f1},
1555296341Sdelphij      {1, 0, 0, 0}},
1556296341Sdelphij     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1557296341Sdelphij       0x0781b8291c6a220a},
1558296341Sdelphij      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1559296341Sdelphij       0x690cde8df0151593},
1560296341Sdelphij      {1, 0, 0, 0}},
1561296341Sdelphij     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1562296341Sdelphij       0x8a535f566ec73617},
1563296341Sdelphij      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1564296341Sdelphij       0x0455c08468b08bd7},
1565296341Sdelphij      {1, 0, 0, 0}},
1566296341Sdelphij     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1567296341Sdelphij       0x06bada7ab77f8276},
1568296341Sdelphij      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1569296341Sdelphij       0x5b476dfd0e6cb18a},
1570296341Sdelphij      {1, 0, 0, 0}},
1571296341Sdelphij     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1572296341Sdelphij       0x3e29864e8a2ec908},
1573296341Sdelphij      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1574296341Sdelphij       0x239b90ea3dc31e7e},
1575296341Sdelphij      {1, 0, 0, 0}},
1576296341Sdelphij     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1577296341Sdelphij       0x820f4dd949f72ff7},
1578296341Sdelphij      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1579296341Sdelphij       0x140406ec783a05ec},
1580296341Sdelphij      {1, 0, 0, 0}},
1581296341Sdelphij     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1582296341Sdelphij       0x68f6b8542783dfee},
1583296341Sdelphij      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1584296341Sdelphij       0xcbe1feba92e40ce6},
1585296341Sdelphij      {1, 0, 0, 0}},
1586296341Sdelphij     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1587296341Sdelphij       0xd0b2f94d2f420109},
1588296341Sdelphij      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1589296341Sdelphij       0x971459828b0719e5},
1590296341Sdelphij      {1, 0, 0, 0}},
1591296341Sdelphij     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1592296341Sdelphij       0x961610004a866aba},
1593296341Sdelphij      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1594296341Sdelphij       0x7acb9fadcee75e44},
1595296341Sdelphij      {1, 0, 0, 0}},
1596296341Sdelphij     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1597296341Sdelphij       0x24eb9acca333bf5b},
1598296341Sdelphij      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1599296341Sdelphij       0x69f891c5acd079cc},
1600296341Sdelphij      {1, 0, 0, 0}},
1601296341Sdelphij     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1602296341Sdelphij       0xe51f547c5972a107},
1603296341Sdelphij      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1604296341Sdelphij       0x1c309a2b25bb1387},
1605296341Sdelphij      {1, 0, 0, 0}},
1606296341Sdelphij     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1607296341Sdelphij       0x20b87b8aa2c4e503},
1608296341Sdelphij      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1609296341Sdelphij       0xf5c6fa49919776be},
1610296341Sdelphij      {1, 0, 0, 0}},
1611296341Sdelphij     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1612296341Sdelphij       0x1ed7d1b9332010b9},
1613296341Sdelphij      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1614296341Sdelphij       0x3a2b03f03217257a},
1615296341Sdelphij      {1, 0, 0, 0}},
1616296341Sdelphij     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1617296341Sdelphij       0x15fee545c78dd9f6},
1618296341Sdelphij      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1619296341Sdelphij       0x4ab5b6b2b8753f81},
1620296341Sdelphij      {1, 0, 0, 0}}}
1621296341Sdelphij};
1622238384Sjkim
1623296341Sdelphij/*
1624296341Sdelphij * select_point selects the |idx|th point from a precomputation table and
1625296341Sdelphij * copies it to out.
1626296341Sdelphij */
1627296341Sdelphijstatic void select_point(const u64 idx, unsigned int size,
1628296341Sdelphij                         const smallfelem pre_comp[16][3], smallfelem out[3])
1629296341Sdelphij{
1630296341Sdelphij    unsigned i, j;
1631296341Sdelphij    u64 *outlimbs = &out[0][0];
1632296341Sdelphij    memset(outlimbs, 0, 3 * sizeof(smallfelem));
1633238384Sjkim
1634296341Sdelphij    for (i = 0; i < size; i++) {
1635296341Sdelphij        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1636296341Sdelphij        u64 mask = i ^ idx;
1637296341Sdelphij        mask |= mask >> 4;
1638296341Sdelphij        mask |= mask >> 2;
1639296341Sdelphij        mask |= mask >> 1;
1640296341Sdelphij        mask &= 1;
1641296341Sdelphij        mask--;
1642296341Sdelphij        for (j = 0; j < NLIMBS * 3; j++)
1643296341Sdelphij            outlimbs[j] |= inlimbs[j] & mask;
1644296341Sdelphij    }
1645296341Sdelphij}
1646238384Sjkim
1647238384Sjkim/* get_bit returns the |i|th bit in |in| */
1648238384Sjkimstatic char get_bit(const felem_bytearray in, int i)
1649296341Sdelphij{
1650296341Sdelphij    if ((i < 0) || (i >= 256))
1651296341Sdelphij        return 0;
1652296341Sdelphij    return (in[i >> 3] >> (i & 7)) & 1;
1653296341Sdelphij}
1654238384Sjkim
1655296341Sdelphij/*
1656296341Sdelphij * Interleaved point multiplication using precomputed point multiples: The
1657296341Sdelphij * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1658296341Sdelphij * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1659296341Sdelphij * generator, using certain (large) precomputed multiples in g_pre_comp.
1660296341Sdelphij * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1661296341Sdelphij */
1662238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out,
1663296341Sdelphij                      const felem_bytearray scalars[],
1664296341Sdelphij                      const unsigned num_points, const u8 *g_scalar,
1665296341Sdelphij                      const int mixed, const smallfelem pre_comp[][17][3],
1666296341Sdelphij                      const smallfelem g_pre_comp[2][16][3])
1667296341Sdelphij{
1668296341Sdelphij    int i, skip;
1669296341Sdelphij    unsigned num, gen_mul = (g_scalar != NULL);
1670296341Sdelphij    felem nq[3], ftmp;
1671296341Sdelphij    smallfelem tmp[3];
1672296341Sdelphij    u64 bits;
1673296341Sdelphij    u8 sign, digit;
1674238384Sjkim
1675296341Sdelphij    /* set nq to the point at infinity */
1676296341Sdelphij    memset(nq, 0, 3 * sizeof(felem));
1677238384Sjkim
1678296341Sdelphij    /*
1679296341Sdelphij     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1680296341Sdelphij     * of the generator (two in each of the last 32 rounds) and additions of
1681296341Sdelphij     * other points multiples (every 5th round).
1682296341Sdelphij     */
1683296341Sdelphij    skip = 1;                   /* save two point operations in the first
1684296341Sdelphij                                 * round */
1685296341Sdelphij    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1686296341Sdelphij        /* double */
1687296341Sdelphij        if (!skip)
1688296341Sdelphij            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1689238384Sjkim
1690296341Sdelphij        /* add multiples of the generator */
1691296341Sdelphij        if (gen_mul && (i <= 31)) {
1692296341Sdelphij            /* first, look 32 bits upwards */
1693296341Sdelphij            bits = get_bit(g_scalar, i + 224) << 3;
1694296341Sdelphij            bits |= get_bit(g_scalar, i + 160) << 2;
1695296341Sdelphij            bits |= get_bit(g_scalar, i + 96) << 1;
1696296341Sdelphij            bits |= get_bit(g_scalar, i + 32);
1697296341Sdelphij            /* select the point to add, in constant time */
1698296341Sdelphij            select_point(bits, 16, g_pre_comp[1], tmp);
1699238384Sjkim
1700296341Sdelphij            if (!skip) {
1701296341Sdelphij                /* Arg 1 below is for "mixed" */
1702296341Sdelphij                point_add(nq[0], nq[1], nq[2],
1703296341Sdelphij                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1704296341Sdelphij            } else {
1705296341Sdelphij                smallfelem_expand(nq[0], tmp[0]);
1706296341Sdelphij                smallfelem_expand(nq[1], tmp[1]);
1707296341Sdelphij                smallfelem_expand(nq[2], tmp[2]);
1708296341Sdelphij                skip = 0;
1709296341Sdelphij            }
1710238384Sjkim
1711296341Sdelphij            /* second, look at the current position */
1712296341Sdelphij            bits = get_bit(g_scalar, i + 192) << 3;
1713296341Sdelphij            bits |= get_bit(g_scalar, i + 128) << 2;
1714296341Sdelphij            bits |= get_bit(g_scalar, i + 64) << 1;
1715296341Sdelphij            bits |= get_bit(g_scalar, i);
1716296341Sdelphij            /* select the point to add, in constant time */
1717296341Sdelphij            select_point(bits, 16, g_pre_comp[0], tmp);
1718296341Sdelphij            /* Arg 1 below is for "mixed" */
1719296341Sdelphij            point_add(nq[0], nq[1], nq[2],
1720296341Sdelphij                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1721296341Sdelphij        }
1722238384Sjkim
1723296341Sdelphij        /* do other additions every 5 doublings */
1724296341Sdelphij        if (num_points && (i % 5 == 0)) {
1725296341Sdelphij            /* loop over all scalars */
1726296341Sdelphij            for (num = 0; num < num_points; ++num) {
1727296341Sdelphij                bits = get_bit(scalars[num], i + 4) << 5;
1728296341Sdelphij                bits |= get_bit(scalars[num], i + 3) << 4;
1729296341Sdelphij                bits |= get_bit(scalars[num], i + 2) << 3;
1730296341Sdelphij                bits |= get_bit(scalars[num], i + 1) << 2;
1731296341Sdelphij                bits |= get_bit(scalars[num], i) << 1;
1732296341Sdelphij                bits |= get_bit(scalars[num], i - 1);
1733296341Sdelphij                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1734238384Sjkim
1735296341Sdelphij                /*
1736296341Sdelphij                 * select the point to add or subtract, in constant time
1737296341Sdelphij                 */
1738296341Sdelphij                select_point(digit, 17, pre_comp[num], tmp);
1739296341Sdelphij                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1740296341Sdelphij                                               * point */
1741296341Sdelphij                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1742296341Sdelphij                felem_contract(tmp[1], ftmp);
1743238384Sjkim
1744296341Sdelphij                if (!skip) {
1745296341Sdelphij                    point_add(nq[0], nq[1], nq[2],
1746296341Sdelphij                              nq[0], nq[1], nq[2],
1747296341Sdelphij                              mixed, tmp[0], tmp[1], tmp[2]);
1748296341Sdelphij                } else {
1749296341Sdelphij                    smallfelem_expand(nq[0], tmp[0]);
1750296341Sdelphij                    smallfelem_expand(nq[1], tmp[1]);
1751296341Sdelphij                    smallfelem_expand(nq[2], tmp[2]);
1752296341Sdelphij                    skip = 0;
1753296341Sdelphij                }
1754296341Sdelphij            }
1755296341Sdelphij        }
1756296341Sdelphij    }
1757296341Sdelphij    felem_assign(x_out, nq[0]);
1758296341Sdelphij    felem_assign(y_out, nq[1]);
1759296341Sdelphij    felem_assign(z_out, nq[2]);
1760296341Sdelphij}
1761238384Sjkim
1762238384Sjkim/* Precomputation for the group generator. */
1763238384Sjkimtypedef struct {
1764296341Sdelphij    smallfelem g_pre_comp[2][16][3];
1765296341Sdelphij    int references;
1766238384Sjkim} NISTP256_PRE_COMP;
1767238384Sjkim
1768238384Sjkimconst EC_METHOD *EC_GFp_nistp256_method(void)
1769296341Sdelphij{
1770296341Sdelphij    static const EC_METHOD ret = {
1771296341Sdelphij        EC_FLAGS_DEFAULT_OCT,
1772296341Sdelphij        NID_X9_62_prime_field,
1773296341Sdelphij        ec_GFp_nistp256_group_init,
1774296341Sdelphij        ec_GFp_simple_group_finish,
1775296341Sdelphij        ec_GFp_simple_group_clear_finish,
1776296341Sdelphij        ec_GFp_nist_group_copy,
1777296341Sdelphij        ec_GFp_nistp256_group_set_curve,
1778296341Sdelphij        ec_GFp_simple_group_get_curve,
1779296341Sdelphij        ec_GFp_simple_group_get_degree,
1780296341Sdelphij        ec_GFp_simple_group_check_discriminant,
1781296341Sdelphij        ec_GFp_simple_point_init,
1782296341Sdelphij        ec_GFp_simple_point_finish,
1783296341Sdelphij        ec_GFp_simple_point_clear_finish,
1784296341Sdelphij        ec_GFp_simple_point_copy,
1785296341Sdelphij        ec_GFp_simple_point_set_to_infinity,
1786296341Sdelphij        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1787296341Sdelphij        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1788296341Sdelphij        ec_GFp_simple_point_set_affine_coordinates,
1789296341Sdelphij        ec_GFp_nistp256_point_get_affine_coordinates,
1790296341Sdelphij        0 /* point_set_compressed_coordinates */ ,
1791296341Sdelphij        0 /* point2oct */ ,
1792296341Sdelphij        0 /* oct2point */ ,
1793296341Sdelphij        ec_GFp_simple_add,
1794296341Sdelphij        ec_GFp_simple_dbl,
1795296341Sdelphij        ec_GFp_simple_invert,
1796296341Sdelphij        ec_GFp_simple_is_at_infinity,
1797296341Sdelphij        ec_GFp_simple_is_on_curve,
1798296341Sdelphij        ec_GFp_simple_cmp,
1799296341Sdelphij        ec_GFp_simple_make_affine,
1800296341Sdelphij        ec_GFp_simple_points_make_affine,
1801296341Sdelphij        ec_GFp_nistp256_points_mul,
1802296341Sdelphij        ec_GFp_nistp256_precompute_mult,
1803296341Sdelphij        ec_GFp_nistp256_have_precompute_mult,
1804296341Sdelphij        ec_GFp_nist_field_mul,
1805296341Sdelphij        ec_GFp_nist_field_sqr,
1806296341Sdelphij        0 /* field_div */ ,
1807296341Sdelphij        0 /* field_encode */ ,
1808296341Sdelphij        0 /* field_decode */ ,
1809296341Sdelphij        0                       /* field_set_to_one */
1810296341Sdelphij    };
1811238384Sjkim
1812296341Sdelphij    return &ret;
1813296341Sdelphij}
1814238384Sjkim
1815238384Sjkim/******************************************************************************/
1816296341Sdelphij/*
1817296341Sdelphij * FUNCTIONS TO MANAGE PRECOMPUTATION
1818238384Sjkim */
1819238384Sjkim
1820238384Sjkimstatic NISTP256_PRE_COMP *nistp256_pre_comp_new()
1821296341Sdelphij{
1822296341Sdelphij    NISTP256_PRE_COMP *ret = NULL;
1823296341Sdelphij    ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
1824296341Sdelphij    if (!ret) {
1825296341Sdelphij        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1826296341Sdelphij        return ret;
1827296341Sdelphij    }
1828296341Sdelphij    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1829296341Sdelphij    ret->references = 1;
1830296341Sdelphij    return ret;
1831296341Sdelphij}
1832238384Sjkim
1833238384Sjkimstatic void *nistp256_pre_comp_dup(void *src_)
1834296341Sdelphij{
1835296341Sdelphij    NISTP256_PRE_COMP *src = src_;
1836238384Sjkim
1837296341Sdelphij    /* no need to actually copy, these objects never change! */
1838296341Sdelphij    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1839238384Sjkim
1840296341Sdelphij    return src_;
1841296341Sdelphij}
1842238384Sjkim
1843238384Sjkimstatic void nistp256_pre_comp_free(void *pre_)
1844296341Sdelphij{
1845296341Sdelphij    int i;
1846296341Sdelphij    NISTP256_PRE_COMP *pre = pre_;
1847238384Sjkim
1848296341Sdelphij    if (!pre)
1849296341Sdelphij        return;
1850238384Sjkim
1851296341Sdelphij    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1852296341Sdelphij    if (i > 0)
1853296341Sdelphij        return;
1854238384Sjkim
1855296341Sdelphij    OPENSSL_free(pre);
1856296341Sdelphij}
1857238384Sjkim
1858238384Sjkimstatic void nistp256_pre_comp_clear_free(void *pre_)
1859296341Sdelphij{
1860296341Sdelphij    int i;
1861296341Sdelphij    NISTP256_PRE_COMP *pre = pre_;
1862238384Sjkim
1863296341Sdelphij    if (!pre)
1864296341Sdelphij        return;
1865238384Sjkim
1866296341Sdelphij    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1867296341Sdelphij    if (i > 0)
1868296341Sdelphij        return;
1869238384Sjkim
1870296341Sdelphij    OPENSSL_cleanse(pre, sizeof *pre);
1871296341Sdelphij    OPENSSL_free(pre);
1872296341Sdelphij}
1873238384Sjkim
1874238384Sjkim/******************************************************************************/
1875296341Sdelphij/*
1876296341Sdelphij * OPENSSL EC_METHOD FUNCTIONS
1877238384Sjkim */
1878238384Sjkim
1879238384Sjkimint ec_GFp_nistp256_group_init(EC_GROUP *group)
1880296341Sdelphij{
1881296341Sdelphij    int ret;
1882296341Sdelphij    ret = ec_GFp_simple_group_init(group);
1883296341Sdelphij    group->a_is_minus3 = 1;
1884296341Sdelphij    return ret;
1885296341Sdelphij}
1886238384Sjkim
1887238384Sjkimint ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1888296341Sdelphij                                    const BIGNUM *a, const BIGNUM *b,
1889296341Sdelphij                                    BN_CTX *ctx)
1890296341Sdelphij{
1891296341Sdelphij    int ret = 0;
1892296341Sdelphij    BN_CTX *new_ctx = NULL;
1893296341Sdelphij    BIGNUM *curve_p, *curve_a, *curve_b;
1894238384Sjkim
1895296341Sdelphij    if (ctx == NULL)
1896296341Sdelphij        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1897296341Sdelphij            return 0;
1898296341Sdelphij    BN_CTX_start(ctx);
1899296341Sdelphij    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1900296341Sdelphij        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1901296341Sdelphij        ((curve_b = BN_CTX_get(ctx)) == NULL))
1902296341Sdelphij        goto err;
1903296341Sdelphij    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1904296341Sdelphij    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1905296341Sdelphij    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1906296341Sdelphij    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1907296341Sdelphij        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1908296341Sdelphij              EC_R_WRONG_CURVE_PARAMETERS);
1909296341Sdelphij        goto err;
1910296341Sdelphij    }
1911296341Sdelphij    group->field_mod_func = BN_nist_mod_256;
1912296341Sdelphij    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1913296341Sdelphij err:
1914296341Sdelphij    BN_CTX_end(ctx);
1915296341Sdelphij    if (new_ctx != NULL)
1916296341Sdelphij        BN_CTX_free(new_ctx);
1917296341Sdelphij    return ret;
1918296341Sdelphij}
1919238384Sjkim
1920296341Sdelphij/*
1921296341Sdelphij * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1922296341Sdelphij * (X/Z^2, Y/Z^3)
1923296341Sdelphij */
1924238384Sjkimint ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1925296341Sdelphij                                                 const EC_POINT *point,
1926296341Sdelphij                                                 BIGNUM *x, BIGNUM *y,
1927296341Sdelphij                                                 BN_CTX *ctx)
1928296341Sdelphij{
1929296341Sdelphij    felem z1, z2, x_in, y_in;
1930296341Sdelphij    smallfelem x_out, y_out;
1931296341Sdelphij    longfelem tmp;
1932238384Sjkim
1933296341Sdelphij    if (EC_POINT_is_at_infinity(group, point)) {
1934296341Sdelphij        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1935296341Sdelphij              EC_R_POINT_AT_INFINITY);
1936296341Sdelphij        return 0;
1937296341Sdelphij    }
1938296341Sdelphij    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1939296341Sdelphij        (!BN_to_felem(z1, &point->Z)))
1940296341Sdelphij        return 0;
1941296341Sdelphij    felem_inv(z2, z1);
1942296341Sdelphij    felem_square(tmp, z2);
1943296341Sdelphij    felem_reduce(z1, tmp);
1944296341Sdelphij    felem_mul(tmp, x_in, z1);
1945296341Sdelphij    felem_reduce(x_in, tmp);
1946296341Sdelphij    felem_contract(x_out, x_in);
1947296341Sdelphij    if (x != NULL) {
1948296341Sdelphij        if (!smallfelem_to_BN(x, x_out)) {
1949296341Sdelphij            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1950296341Sdelphij                  ERR_R_BN_LIB);
1951296341Sdelphij            return 0;
1952296341Sdelphij        }
1953296341Sdelphij    }
1954296341Sdelphij    felem_mul(tmp, z1, z2);
1955296341Sdelphij    felem_reduce(z1, tmp);
1956296341Sdelphij    felem_mul(tmp, y_in, z1);
1957296341Sdelphij    felem_reduce(y_in, tmp);
1958296341Sdelphij    felem_contract(y_out, y_in);
1959296341Sdelphij    if (y != NULL) {
1960296341Sdelphij        if (!smallfelem_to_BN(y, y_out)) {
1961296341Sdelphij            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1962296341Sdelphij                  ERR_R_BN_LIB);
1963296341Sdelphij            return 0;
1964296341Sdelphij        }
1965296341Sdelphij    }
1966296341Sdelphij    return 1;
1967296341Sdelphij}
1968238384Sjkim
1969296341Sdelphij/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1970296341Sdelphijstatic void make_points_affine(size_t num, smallfelem points[][3],
1971296341Sdelphij                               smallfelem tmp_smallfelems[])
1972296341Sdelphij{
1973296341Sdelphij    /*
1974296341Sdelphij     * Runs in constant time, unless an input is the point at infinity (which
1975296341Sdelphij     * normally shouldn't happen).
1976296341Sdelphij     */
1977296341Sdelphij    ec_GFp_nistp_points_make_affine_internal(num,
1978296341Sdelphij                                             points,
1979296341Sdelphij                                             sizeof(smallfelem),
1980296341Sdelphij                                             tmp_smallfelems,
1981296341Sdelphij                                             (void (*)(void *))smallfelem_one,
1982296341Sdelphij                                             (int (*)(const void *))
1983296341Sdelphij                                             smallfelem_is_zero_int,
1984296341Sdelphij                                             (void (*)(void *, const void *))
1985296341Sdelphij                                             smallfelem_assign,
1986296341Sdelphij                                             (void (*)(void *, const void *))
1987296341Sdelphij                                             smallfelem_square_contract,
1988296341Sdelphij                                             (void (*)
1989296341Sdelphij                                              (void *, const void *,
1990296341Sdelphij                                               const void *))
1991296341Sdelphij                                             smallfelem_mul_contract,
1992296341Sdelphij                                             (void (*)(void *, const void *))
1993296341Sdelphij                                             smallfelem_inv_contract,
1994296341Sdelphij                                             /* nothing to contract */
1995296341Sdelphij                                             (void (*)(void *, const void *))
1996296341Sdelphij                                             smallfelem_assign);
1997296341Sdelphij}
1998238384Sjkim
1999296341Sdelphij/*
2000296341Sdelphij * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2001296341Sdelphij * values Result is stored in r (r can equal one of the inputs).
2002296341Sdelphij */
2003238384Sjkimint ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2004296341Sdelphij                               const BIGNUM *scalar, size_t num,
2005296341Sdelphij                               const EC_POINT *points[],
2006296341Sdelphij                               const BIGNUM *scalars[], BN_CTX *ctx)
2007296341Sdelphij{
2008296341Sdelphij    int ret = 0;
2009296341Sdelphij    int j;
2010296341Sdelphij    int mixed = 0;
2011296341Sdelphij    BN_CTX *new_ctx = NULL;
2012296341Sdelphij    BIGNUM *x, *y, *z, *tmp_scalar;
2013296341Sdelphij    felem_bytearray g_secret;
2014296341Sdelphij    felem_bytearray *secrets = NULL;
2015296341Sdelphij    smallfelem(*pre_comp)[17][3] = NULL;
2016296341Sdelphij    smallfelem *tmp_smallfelems = NULL;
2017296341Sdelphij    felem_bytearray tmp;
2018296341Sdelphij    unsigned i, num_bytes;
2019296341Sdelphij    int have_pre_comp = 0;
2020296341Sdelphij    size_t num_points = num;
2021296341Sdelphij    smallfelem x_in, y_in, z_in;
2022296341Sdelphij    felem x_out, y_out, z_out;
2023296341Sdelphij    NISTP256_PRE_COMP *pre = NULL;
2024296341Sdelphij    const smallfelem(*g_pre_comp)[16][3] = NULL;
2025296341Sdelphij    EC_POINT *generator = NULL;
2026296341Sdelphij    const EC_POINT *p = NULL;
2027296341Sdelphij    const BIGNUM *p_scalar = NULL;
2028238384Sjkim
2029296341Sdelphij    if (ctx == NULL)
2030296341Sdelphij        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2031296341Sdelphij            return 0;
2032296341Sdelphij    BN_CTX_start(ctx);
2033296341Sdelphij    if (((x = BN_CTX_get(ctx)) == NULL) ||
2034296341Sdelphij        ((y = BN_CTX_get(ctx)) == NULL) ||
2035296341Sdelphij        ((z = BN_CTX_get(ctx)) == NULL) ||
2036296341Sdelphij        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2037296341Sdelphij        goto err;
2038238384Sjkim
2039296341Sdelphij    if (scalar != NULL) {
2040296341Sdelphij        pre = EC_EX_DATA_get_data(group->extra_data,
2041296341Sdelphij                                  nistp256_pre_comp_dup,
2042296341Sdelphij                                  nistp256_pre_comp_free,
2043296341Sdelphij                                  nistp256_pre_comp_clear_free);
2044296341Sdelphij        if (pre)
2045296341Sdelphij            /* we have precomputation, try to use it */
2046296341Sdelphij            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2047296341Sdelphij        else
2048296341Sdelphij            /* try to use the standard precomputation */
2049296341Sdelphij            g_pre_comp = &gmul[0];
2050296341Sdelphij        generator = EC_POINT_new(group);
2051296341Sdelphij        if (generator == NULL)
2052296341Sdelphij            goto err;
2053296341Sdelphij        /* get the generator from precomputation */
2054296341Sdelphij        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2055296341Sdelphij            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2056296341Sdelphij            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2057296341Sdelphij            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2058296341Sdelphij            goto err;
2059296341Sdelphij        }
2060296341Sdelphij        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2061296341Sdelphij                                                      generator, x, y, z,
2062296341Sdelphij                                                      ctx))
2063296341Sdelphij            goto err;
2064296341Sdelphij        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2065296341Sdelphij            /* precomputation matches generator */
2066296341Sdelphij            have_pre_comp = 1;
2067296341Sdelphij        else
2068296341Sdelphij            /*
2069296341Sdelphij             * we don't have valid precomputation: treat the generator as a
2070296341Sdelphij             * random point
2071296341Sdelphij             */
2072296341Sdelphij            num_points++;
2073296341Sdelphij    }
2074296341Sdelphij    if (num_points > 0) {
2075296341Sdelphij        if (num_points >= 3) {
2076296341Sdelphij            /*
2077296341Sdelphij             * unless we precompute multiples for just one or two points,
2078296341Sdelphij             * converting those into affine form is time well spent
2079296341Sdelphij             */
2080296341Sdelphij            mixed = 1;
2081296341Sdelphij        }
2082296341Sdelphij        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2083296341Sdelphij        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2084296341Sdelphij        if (mixed)
2085296341Sdelphij            tmp_smallfelems =
2086296341Sdelphij                OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2087296341Sdelphij        if ((secrets == NULL) || (pre_comp == NULL)
2088296341Sdelphij            || (mixed && (tmp_smallfelems == NULL))) {
2089296341Sdelphij            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2090296341Sdelphij            goto err;
2091296341Sdelphij        }
2092238384Sjkim
2093296341Sdelphij        /*
2094296341Sdelphij         * we treat NULL scalars as 0, and NULL points as points at infinity,
2095296341Sdelphij         * i.e., they contribute nothing to the linear combination
2096296341Sdelphij         */
2097296341Sdelphij        memset(secrets, 0, num_points * sizeof(felem_bytearray));
2098296341Sdelphij        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2099296341Sdelphij        for (i = 0; i < num_points; ++i) {
2100296341Sdelphij            if (i == num)
2101296341Sdelphij                /*
2102296341Sdelphij                 * we didn't have a valid precomputation, so we pick the
2103296341Sdelphij                 * generator
2104296341Sdelphij                 */
2105296341Sdelphij            {
2106296341Sdelphij                p = EC_GROUP_get0_generator(group);
2107296341Sdelphij                p_scalar = scalar;
2108296341Sdelphij            } else
2109296341Sdelphij                /* the i^th point */
2110296341Sdelphij            {
2111296341Sdelphij                p = points[i];
2112296341Sdelphij                p_scalar = scalars[i];
2113296341Sdelphij            }
2114296341Sdelphij            if ((p_scalar != NULL) && (p != NULL)) {
2115296341Sdelphij                /* reduce scalar to 0 <= scalar < 2^256 */
2116296341Sdelphij                if ((BN_num_bits(p_scalar) > 256)
2117296341Sdelphij                    || (BN_is_negative(p_scalar))) {
2118296341Sdelphij                    /*
2119296341Sdelphij                     * this is an unusual input, and we don't guarantee
2120296341Sdelphij                     * constant-timeness
2121296341Sdelphij                     */
2122296341Sdelphij                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2123296341Sdelphij                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2124296341Sdelphij                        goto err;
2125296341Sdelphij                    }
2126296341Sdelphij                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
2127296341Sdelphij                } else
2128296341Sdelphij                    num_bytes = BN_bn2bin(p_scalar, tmp);
2129296341Sdelphij                flip_endian(secrets[i], tmp, num_bytes);
2130296341Sdelphij                /* precompute multiples */
2131296341Sdelphij                if ((!BN_to_felem(x_out, &p->X)) ||
2132296341Sdelphij                    (!BN_to_felem(y_out, &p->Y)) ||
2133296341Sdelphij                    (!BN_to_felem(z_out, &p->Z)))
2134296341Sdelphij                    goto err;
2135296341Sdelphij                felem_shrink(pre_comp[i][1][0], x_out);
2136296341Sdelphij                felem_shrink(pre_comp[i][1][1], y_out);
2137296341Sdelphij                felem_shrink(pre_comp[i][1][2], z_out);
2138296341Sdelphij                for (j = 2; j <= 16; ++j) {
2139296341Sdelphij                    if (j & 1) {
2140296341Sdelphij                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2141296341Sdelphij                                        pre_comp[i][j][2], pre_comp[i][1][0],
2142296341Sdelphij                                        pre_comp[i][1][1], pre_comp[i][1][2],
2143296341Sdelphij                                        pre_comp[i][j - 1][0],
2144296341Sdelphij                                        pre_comp[i][j - 1][1],
2145296341Sdelphij                                        pre_comp[i][j - 1][2]);
2146296341Sdelphij                    } else {
2147296341Sdelphij                        point_double_small(pre_comp[i][j][0],
2148296341Sdelphij                                           pre_comp[i][j][1],
2149296341Sdelphij                                           pre_comp[i][j][2],
2150296341Sdelphij                                           pre_comp[i][j / 2][0],
2151296341Sdelphij                                           pre_comp[i][j / 2][1],
2152296341Sdelphij                                           pre_comp[i][j / 2][2]);
2153296341Sdelphij                    }
2154296341Sdelphij                }
2155296341Sdelphij            }
2156296341Sdelphij        }
2157296341Sdelphij        if (mixed)
2158296341Sdelphij            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2159296341Sdelphij    }
2160238384Sjkim
2161296341Sdelphij    /* the scalar for the generator */
2162296341Sdelphij    if ((scalar != NULL) && (have_pre_comp)) {
2163296341Sdelphij        memset(g_secret, 0, sizeof(g_secret));
2164296341Sdelphij        /* reduce scalar to 0 <= scalar < 2^256 */
2165296341Sdelphij        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2166296341Sdelphij            /*
2167296341Sdelphij             * this is an unusual input, and we don't guarantee
2168296341Sdelphij             * constant-timeness
2169296341Sdelphij             */
2170296341Sdelphij            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2171296341Sdelphij                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2172296341Sdelphij                goto err;
2173296341Sdelphij            }
2174296341Sdelphij            num_bytes = BN_bn2bin(tmp_scalar, tmp);
2175296341Sdelphij        } else
2176296341Sdelphij            num_bytes = BN_bn2bin(scalar, tmp);
2177296341Sdelphij        flip_endian(g_secret, tmp, num_bytes);
2178296341Sdelphij        /* do the multiplication with generator precomputation */
2179296341Sdelphij        batch_mul(x_out, y_out, z_out,
2180296341Sdelphij                  (const felem_bytearray(*))secrets, num_points,
2181296341Sdelphij                  g_secret,
2182296341Sdelphij                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2183296341Sdelphij    } else
2184296341Sdelphij        /* do the multiplication without generator precomputation */
2185296341Sdelphij        batch_mul(x_out, y_out, z_out,
2186296341Sdelphij                  (const felem_bytearray(*))secrets, num_points,
2187296341Sdelphij                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2188296341Sdelphij    /* reduce the output to its unique minimal representation */
2189296341Sdelphij    felem_contract(x_in, x_out);
2190296341Sdelphij    felem_contract(y_in, y_out);
2191296341Sdelphij    felem_contract(z_in, z_out);
2192296341Sdelphij    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2193296341Sdelphij        (!smallfelem_to_BN(z, z_in))) {
2194296341Sdelphij        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2195296341Sdelphij        goto err;
2196296341Sdelphij    }
2197296341Sdelphij    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2198238384Sjkim
2199296341Sdelphij err:
2200296341Sdelphij    BN_CTX_end(ctx);
2201296341Sdelphij    if (generator != NULL)
2202296341Sdelphij        EC_POINT_free(generator);
2203296341Sdelphij    if (new_ctx != NULL)
2204296341Sdelphij        BN_CTX_free(new_ctx);
2205296341Sdelphij    if (secrets != NULL)
2206296341Sdelphij        OPENSSL_free(secrets);
2207296341Sdelphij    if (pre_comp != NULL)
2208296341Sdelphij        OPENSSL_free(pre_comp);
2209296341Sdelphij    if (tmp_smallfelems != NULL)
2210296341Sdelphij        OPENSSL_free(tmp_smallfelems);
2211296341Sdelphij    return ret;
2212296341Sdelphij}
2213238384Sjkim
2214238384Sjkimint ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2215296341Sdelphij{
2216296341Sdelphij    int ret = 0;
2217296341Sdelphij    NISTP256_PRE_COMP *pre = NULL;
2218296341Sdelphij    int i, j;
2219296341Sdelphij    BN_CTX *new_ctx = NULL;
2220296341Sdelphij    BIGNUM *x, *y;
2221296341Sdelphij    EC_POINT *generator = NULL;
2222296341Sdelphij    smallfelem tmp_smallfelems[32];
2223296341Sdelphij    felem x_tmp, y_tmp, z_tmp;
2224238384Sjkim
2225296341Sdelphij    /* throw away old precomputation */
2226296341Sdelphij    EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2227296341Sdelphij                         nistp256_pre_comp_free,
2228296341Sdelphij                         nistp256_pre_comp_clear_free);
2229296341Sdelphij    if (ctx == NULL)
2230296341Sdelphij        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2231296341Sdelphij            return 0;
2232296341Sdelphij    BN_CTX_start(ctx);
2233296341Sdelphij    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2234296341Sdelphij        goto err;
2235296341Sdelphij    /* get the generator */
2236296341Sdelphij    if (group->generator == NULL)
2237296341Sdelphij        goto err;
2238296341Sdelphij    generator = EC_POINT_new(group);
2239296341Sdelphij    if (generator == NULL)
2240296341Sdelphij        goto err;
2241296341Sdelphij    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2242296341Sdelphij    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2243296341Sdelphij    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2244296341Sdelphij        goto err;
2245296341Sdelphij    if ((pre = nistp256_pre_comp_new()) == NULL)
2246296341Sdelphij        goto err;
2247296341Sdelphij    /*
2248296341Sdelphij     * if the generator is the standard one, use built-in precomputation
2249296341Sdelphij     */
2250296341Sdelphij    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2251296341Sdelphij        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2252296341Sdelphij        ret = 1;
2253296341Sdelphij        goto err;
2254296341Sdelphij    }
2255296341Sdelphij    if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2256296341Sdelphij        (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2257296341Sdelphij        (!BN_to_felem(z_tmp, &group->generator->Z)))
2258296341Sdelphij        goto err;
2259296341Sdelphij    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2260296341Sdelphij    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2261296341Sdelphij    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2262296341Sdelphij    /*
2263296341Sdelphij     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2264296341Sdelphij     * 2^160*G, 2^224*G for the second one
2265296341Sdelphij     */
2266296341Sdelphij    for (i = 1; i <= 8; i <<= 1) {
2267296341Sdelphij        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2268296341Sdelphij                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2269296341Sdelphij                           pre->g_pre_comp[0][i][1],
2270296341Sdelphij                           pre->g_pre_comp[0][i][2]);
2271296341Sdelphij        for (j = 0; j < 31; ++j) {
2272296341Sdelphij            point_double_small(pre->g_pre_comp[1][i][0],
2273296341Sdelphij                               pre->g_pre_comp[1][i][1],
2274296341Sdelphij                               pre->g_pre_comp[1][i][2],
2275296341Sdelphij                               pre->g_pre_comp[1][i][0],
2276296341Sdelphij                               pre->g_pre_comp[1][i][1],
2277296341Sdelphij                               pre->g_pre_comp[1][i][2]);
2278296341Sdelphij        }
2279296341Sdelphij        if (i == 8)
2280296341Sdelphij            break;
2281296341Sdelphij        point_double_small(pre->g_pre_comp[0][2 * i][0],
2282296341Sdelphij                           pre->g_pre_comp[0][2 * i][1],
2283296341Sdelphij                           pre->g_pre_comp[0][2 * i][2],
2284296341Sdelphij                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2285296341Sdelphij                           pre->g_pre_comp[1][i][2]);
2286296341Sdelphij        for (j = 0; j < 31; ++j) {
2287296341Sdelphij            point_double_small(pre->g_pre_comp[0][2 * i][0],
2288296341Sdelphij                               pre->g_pre_comp[0][2 * i][1],
2289296341Sdelphij                               pre->g_pre_comp[0][2 * i][2],
2290296341Sdelphij                               pre->g_pre_comp[0][2 * i][0],
2291296341Sdelphij                               pre->g_pre_comp[0][2 * i][1],
2292296341Sdelphij                               pre->g_pre_comp[0][2 * i][2]);
2293296341Sdelphij        }
2294296341Sdelphij    }
2295296341Sdelphij    for (i = 0; i < 2; i++) {
2296296341Sdelphij        /* g_pre_comp[i][0] is the point at infinity */
2297296341Sdelphij        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2298296341Sdelphij        /* the remaining multiples */
2299296341Sdelphij        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2300296341Sdelphij        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2301296341Sdelphij                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2302296341Sdelphij                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2303296341Sdelphij                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2304296341Sdelphij                        pre->g_pre_comp[i][2][2]);
2305296341Sdelphij        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2306296341Sdelphij        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2307296341Sdelphij                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2308296341Sdelphij                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2309296341Sdelphij                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2310296341Sdelphij                        pre->g_pre_comp[i][2][2]);
2311296341Sdelphij        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2312296341Sdelphij        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2313296341Sdelphij                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2314296341Sdelphij                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2315296341Sdelphij                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2316296341Sdelphij                        pre->g_pre_comp[i][4][2]);
2317296341Sdelphij        /*
2318296341Sdelphij         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2319296341Sdelphij         */
2320296341Sdelphij        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2321296341Sdelphij                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2322296341Sdelphij                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2323296341Sdelphij                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2324296341Sdelphij                        pre->g_pre_comp[i][2][2]);
2325296341Sdelphij        for (j = 1; j < 8; ++j) {
2326296341Sdelphij            /* odd multiples: add G resp. 2^32*G */
2327296341Sdelphij            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2328296341Sdelphij                            pre->g_pre_comp[i][2 * j + 1][1],
2329296341Sdelphij                            pre->g_pre_comp[i][2 * j + 1][2],
2330296341Sdelphij                            pre->g_pre_comp[i][2 * j][0],
2331296341Sdelphij                            pre->g_pre_comp[i][2 * j][1],
2332296341Sdelphij                            pre->g_pre_comp[i][2 * j][2],
2333296341Sdelphij                            pre->g_pre_comp[i][1][0],
2334296341Sdelphij                            pre->g_pre_comp[i][1][1],
2335296341Sdelphij                            pre->g_pre_comp[i][1][2]);
2336296341Sdelphij        }
2337296341Sdelphij    }
2338296341Sdelphij    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2339238384Sjkim
2340296341Sdelphij    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2341296341Sdelphij                             nistp256_pre_comp_free,
2342296341Sdelphij                             nistp256_pre_comp_clear_free))
2343296341Sdelphij        goto err;
2344296341Sdelphij    ret = 1;
2345296341Sdelphij    pre = NULL;
2346238384Sjkim err:
2347296341Sdelphij    BN_CTX_end(ctx);
2348296341Sdelphij    if (generator != NULL)
2349296341Sdelphij        EC_POINT_free(generator);
2350296341Sdelphij    if (new_ctx != NULL)
2351296341Sdelphij        BN_CTX_free(new_ctx);
2352296341Sdelphij    if (pre)
2353296341Sdelphij        nistp256_pre_comp_free(pre);
2354296341Sdelphij    return ret;
2355296341Sdelphij}
2356238384Sjkim
2357238384Sjkimint ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2358296341Sdelphij{
2359296341Sdelphij    if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2360296341Sdelphij                            nistp256_pre_comp_free,
2361296341Sdelphij                            nistp256_pre_comp_clear_free)
2362296341Sdelphij        != NULL)
2363296341Sdelphij        return 1;
2364296341Sdelphij    else
2365296341Sdelphij        return 0;
2366296341Sdelphij}
2367238384Sjkim#else
2368296341Sdelphijstatic void *dummy = &dummy;
2369238384Sjkim#endif
2370