1/*
2 * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the OpenSSL license (the "License").  You may not use
5 * this file except in compliance with the License.  You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10/* Copyright 2011 Google Inc.
11 *
12 * Licensed under the Apache License, Version 2.0 (the "License");
13 *
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 *     http://www.apache.org/licenses/LICENSE-2.0
18 *
19 *  Unless required by applicable law or agreed to in writing, software
20 *  distributed under the License is distributed on an "AS IS" BASIS,
21 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 *  See the License for the specific language governing permissions and
23 *  limitations under the License.
24 */
25
26/*
27 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
28 *
29 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31 * work which got its smarts from Daniel J. Bernstein's work on the same.
32 */
33
34#include <openssl/opensslconf.h>
35#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36NON_EMPTY_TRANSLATION_UNIT
37#else
38
39# include <stdint.h>
40# include <string.h>
41# include <openssl/err.h>
42# include "ec_local.h"
43
44# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
45  /* even with gcc, the typedef won't work for 32-bit platforms */
46typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
47                                 * platforms */
48typedef __int128_t int128_t;
49# else
50#  error "Your compiler doesn't appear to support 128-bit integer types"
51# endif
52
53typedef uint8_t u8;
54typedef uint32_t u32;
55typedef uint64_t u64;
56
57/*
58 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
59 * can serialise an element of this field into 32 bytes. We call this an
60 * felem_bytearray.
61 */
62
63typedef u8 felem_bytearray[32];
64
65/*
66 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
67 * values are big-endian.
68 */
69static const felem_bytearray nistp256_curve_params[5] = {
70    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
71     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
72     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
74    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
75     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
78    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
79     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
80     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
81     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
82    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
83     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
84     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
85     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
86    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
87     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
88     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
89     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
90};
91
92/*-
93 * The representation of field elements.
94 * ------------------------------------
95 *
96 * We represent field elements with either four 128-bit values, eight 128-bit
97 * values, or four 64-bit values. The field element represented is:
98 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
99 * or:
100 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
101 *
102 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
103 * apart, but are 128-bits wide, the most significant bits of each limb overlap
104 * with the least significant bits of the next.
105 *
106 * A field element with four limbs is an 'felem'. One with eight limbs is a
107 * 'longfelem'
108 *
109 * A field element with four, 64-bit values is called a 'smallfelem'. Small
110 * values are used as intermediate values before multiplication.
111 */
112
113# define NLIMBS 4
114
115typedef uint128_t limb;
116typedef limb felem[NLIMBS];
117typedef limb longfelem[NLIMBS * 2];
118typedef u64 smallfelem[NLIMBS];
119
120/* This is the value of the prime as four 64-bit words, little-endian. */
121static const u64 kPrime[4] =
122    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
123static const u64 bottom63bits = 0x7ffffffffffffffful;
124
125/*
126 * bin32_to_felem takes a little-endian byte array and converts it into felem
127 * form. This assumes that the CPU is little-endian.
128 */
129static void bin32_to_felem(felem out, const u8 in[32])
130{
131    out[0] = *((u64 *)&in[0]);
132    out[1] = *((u64 *)&in[8]);
133    out[2] = *((u64 *)&in[16]);
134    out[3] = *((u64 *)&in[24]);
135}
136
137/*
138 * smallfelem_to_bin32 takes a smallfelem and serialises into a little
139 * endian, 32 byte array. This assumes that the CPU is little-endian.
140 */
141static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
142{
143    *((u64 *)&out[0]) = in[0];
144    *((u64 *)&out[8]) = in[1];
145    *((u64 *)&out[16]) = in[2];
146    *((u64 *)&out[24]) = in[3];
147}
148
149/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
150static int BN_to_felem(felem out, const BIGNUM *bn)
151{
152    felem_bytearray b_out;
153    int num_bytes;
154
155    if (BN_is_negative(bn)) {
156        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
157        return 0;
158    }
159    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
160    if (num_bytes < 0) {
161        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
162        return 0;
163    }
164    bin32_to_felem(out, b_out);
165    return 1;
166}
167
168/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
169static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
170{
171    felem_bytearray b_out;
172    smallfelem_to_bin32(b_out, in);
173    return BN_lebin2bn(b_out, sizeof(b_out), out);
174}
175
176/*-
177 * Field operations
178 * ----------------
179 */
180
181static void smallfelem_one(smallfelem out)
182{
183    out[0] = 1;
184    out[1] = 0;
185    out[2] = 0;
186    out[3] = 0;
187}
188
189static void smallfelem_assign(smallfelem out, const smallfelem in)
190{
191    out[0] = in[0];
192    out[1] = in[1];
193    out[2] = in[2];
194    out[3] = in[3];
195}
196
197static void felem_assign(felem out, const felem in)
198{
199    out[0] = in[0];
200    out[1] = in[1];
201    out[2] = in[2];
202    out[3] = in[3];
203}
204
205/* felem_sum sets out = out + in. */
206static void felem_sum(felem out, const felem in)
207{
208    out[0] += in[0];
209    out[1] += in[1];
210    out[2] += in[2];
211    out[3] += in[3];
212}
213
214/* felem_small_sum sets out = out + in. */
215static void felem_small_sum(felem out, const smallfelem in)
216{
217    out[0] += in[0];
218    out[1] += in[1];
219    out[2] += in[2];
220    out[3] += in[3];
221}
222
223/* felem_scalar sets out = out * scalar */
224static void felem_scalar(felem out, const u64 scalar)
225{
226    out[0] *= scalar;
227    out[1] *= scalar;
228    out[2] *= scalar;
229    out[3] *= scalar;
230}
231
232/* longfelem_scalar sets out = out * scalar */
233static void longfelem_scalar(longfelem out, const u64 scalar)
234{
235    out[0] *= scalar;
236    out[1] *= scalar;
237    out[2] *= scalar;
238    out[3] *= scalar;
239    out[4] *= scalar;
240    out[5] *= scalar;
241    out[6] *= scalar;
242    out[7] *= scalar;
243}
244
245# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
246# define two105 (((limb)1) << 105)
247# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
248
249/* zero105 is 0 mod p */
250static const felem zero105 =
251    { two105m41m9, two105, two105m41p9, two105m41p9 };
252
253/*-
254 * smallfelem_neg sets |out| to |-small|
255 * On exit:
256 *   out[i] < out[i] + 2^105
257 */
258static void smallfelem_neg(felem out, const smallfelem small)
259{
260    /* In order to prevent underflow, we subtract from 0 mod p. */
261    out[0] = zero105[0] - small[0];
262    out[1] = zero105[1] - small[1];
263    out[2] = zero105[2] - small[2];
264    out[3] = zero105[3] - small[3];
265}
266
267/*-
268 * felem_diff subtracts |in| from |out|
269 * On entry:
270 *   in[i] < 2^104
271 * On exit:
272 *   out[i] < out[i] + 2^105
273 */
274static void felem_diff(felem out, const felem in)
275{
276    /*
277     * In order to prevent underflow, we add 0 mod p before subtracting.
278     */
279    out[0] += zero105[0];
280    out[1] += zero105[1];
281    out[2] += zero105[2];
282    out[3] += zero105[3];
283
284    out[0] -= in[0];
285    out[1] -= in[1];
286    out[2] -= in[2];
287    out[3] -= in[3];
288}
289
290# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
291# define two107 (((limb)1) << 107)
292# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
293
294/* zero107 is 0 mod p */
295static const felem zero107 =
296    { two107m43m11, two107, two107m43p11, two107m43p11 };
297
298/*-
299 * An alternative felem_diff for larger inputs |in|
300 * felem_diff_zero107 subtracts |in| from |out|
301 * On entry:
302 *   in[i] < 2^106
303 * On exit:
304 *   out[i] < out[i] + 2^107
305 */
306static void felem_diff_zero107(felem out, const felem in)
307{
308    /*
309     * In order to prevent underflow, we add 0 mod p before subtracting.
310     */
311    out[0] += zero107[0];
312    out[1] += zero107[1];
313    out[2] += zero107[2];
314    out[3] += zero107[3];
315
316    out[0] -= in[0];
317    out[1] -= in[1];
318    out[2] -= in[2];
319    out[3] -= in[3];
320}
321
322/*-
323 * longfelem_diff subtracts |in| from |out|
324 * On entry:
325 *   in[i] < 7*2^67
326 * On exit:
327 *   out[i] < out[i] + 2^70 + 2^40
328 */
329static void longfelem_diff(longfelem out, const longfelem in)
330{
331    static const limb two70m8p6 =
332        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
333    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
334    static const limb two70 = (((limb) 1) << 70);
335    static const limb two70m40m38p6 =
336        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
337        (((limb) 1) << 6);
338    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
339
340    /* add 0 mod p to avoid underflow */
341    out[0] += two70m8p6;
342    out[1] += two70p40;
343    out[2] += two70;
344    out[3] += two70m40m38p6;
345    out[4] += two70m6;
346    out[5] += two70m6;
347    out[6] += two70m6;
348    out[7] += two70m6;
349
350    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
351    out[0] -= in[0];
352    out[1] -= in[1];
353    out[2] -= in[2];
354    out[3] -= in[3];
355    out[4] -= in[4];
356    out[5] -= in[5];
357    out[6] -= in[6];
358    out[7] -= in[7];
359}
360
361# define two64m0 (((limb)1) << 64) - 1
362# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
363# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
364# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
365
366/* zero110 is 0 mod p */
367static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
368
369/*-
370 * felem_shrink converts an felem into a smallfelem. The result isn't quite
371 * minimal as the value may be greater than p.
372 *
373 * On entry:
374 *   in[i] < 2^109
375 * On exit:
376 *   out[i] < 2^64
377 */
378static void felem_shrink(smallfelem out, const felem in)
379{
380    felem tmp;
381    u64 a, b, mask;
382    u64 high, low;
383    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
384
385    /* Carry 2->3 */
386    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
387    /* tmp[3] < 2^110 */
388
389    tmp[2] = zero110[2] + (u64)in[2];
390    tmp[0] = zero110[0] + in[0];
391    tmp[1] = zero110[1] + in[1];
392    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
393
394    /*
395     * We perform two partial reductions where we eliminate the high-word of
396     * tmp[3]. We don't update the other words till the end.
397     */
398    a = tmp[3] >> 64;           /* a < 2^46 */
399    tmp[3] = (u64)tmp[3];
400    tmp[3] -= a;
401    tmp[3] += ((limb) a) << 32;
402    /* tmp[3] < 2^79 */
403
404    b = a;
405    a = tmp[3] >> 64;           /* a < 2^15 */
406    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
407    tmp[3] = (u64)tmp[3];
408    tmp[3] -= a;
409    tmp[3] += ((limb) a) << 32;
410    /* tmp[3] < 2^64 + 2^47 */
411
412    /*
413     * This adjusts the other two words to complete the two partial
414     * reductions.
415     */
416    tmp[0] += b;
417    tmp[1] -= (((limb) b) << 32);
418
419    /*
420     * In order to make space in tmp[3] for the carry from 2 -> 3, we
421     * conditionally subtract kPrime if tmp[3] is large enough.
422     */
423    high = (u64)(tmp[3] >> 64);
424    /* As tmp[3] < 2^65, high is either 1 or 0 */
425    high = 0 - high;
426    /*-
427     * high is:
428     *   all ones   if the high word of tmp[3] is 1
429     *   all zeros  if the high word of tmp[3] if 0
430     */
431    low = (u64)tmp[3];
432    mask = 0 - (low >> 63);
433    /*-
434     * mask is:
435     *   all ones   if the MSB of low is 1
436     *   all zeros  if the MSB of low if 0
437     */
438    low &= bottom63bits;
439    low -= kPrime3Test;
440    /* if low was greater than kPrime3Test then the MSB is zero */
441    low = ~low;
442    low = 0 - (low >> 63);
443    /*-
444     * low is:
445     *   all ones   if low was > kPrime3Test
446     *   all zeros  if low was <= kPrime3Test
447     */
448    mask = (mask & low) | high;
449    tmp[0] -= mask & kPrime[0];
450    tmp[1] -= mask & kPrime[1];
451    /* kPrime[2] is zero, so omitted */
452    tmp[3] -= mask & kPrime[3];
453    /* tmp[3] < 2**64 - 2**32 + 1 */
454
455    tmp[1] += ((u64)(tmp[0] >> 64));
456    tmp[0] = (u64)tmp[0];
457    tmp[2] += ((u64)(tmp[1] >> 64));
458    tmp[1] = (u64)tmp[1];
459    tmp[3] += ((u64)(tmp[2] >> 64));
460    tmp[2] = (u64)tmp[2];
461    /* tmp[i] < 2^64 */
462
463    out[0] = tmp[0];
464    out[1] = tmp[1];
465    out[2] = tmp[2];
466    out[3] = tmp[3];
467}
468
469/* smallfelem_expand converts a smallfelem to an felem */
470static void smallfelem_expand(felem out, const smallfelem in)
471{
472    out[0] = in[0];
473    out[1] = in[1];
474    out[2] = in[2];
475    out[3] = in[3];
476}
477
478/*-
479 * smallfelem_square sets |out| = |small|^2
480 * On entry:
481 *   small[i] < 2^64
482 * On exit:
483 *   out[i] < 7 * 2^64 < 2^67
484 */
485static void smallfelem_square(longfelem out, const smallfelem small)
486{
487    limb a;
488    u64 high, low;
489
490    a = ((uint128_t) small[0]) * small[0];
491    low = a;
492    high = a >> 64;
493    out[0] = low;
494    out[1] = high;
495
496    a = ((uint128_t) small[0]) * small[1];
497    low = a;
498    high = a >> 64;
499    out[1] += low;
500    out[1] += low;
501    out[2] = high;
502
503    a = ((uint128_t) small[0]) * small[2];
504    low = a;
505    high = a >> 64;
506    out[2] += low;
507    out[2] *= 2;
508    out[3] = high;
509
510    a = ((uint128_t) small[0]) * small[3];
511    low = a;
512    high = a >> 64;
513    out[3] += low;
514    out[4] = high;
515
516    a = ((uint128_t) small[1]) * small[2];
517    low = a;
518    high = a >> 64;
519    out[3] += low;
520    out[3] *= 2;
521    out[4] += high;
522
523    a = ((uint128_t) small[1]) * small[1];
524    low = a;
525    high = a >> 64;
526    out[2] += low;
527    out[3] += high;
528
529    a = ((uint128_t) small[1]) * small[3];
530    low = a;
531    high = a >> 64;
532    out[4] += low;
533    out[4] *= 2;
534    out[5] = high;
535
536    a = ((uint128_t) small[2]) * small[3];
537    low = a;
538    high = a >> 64;
539    out[5] += low;
540    out[5] *= 2;
541    out[6] = high;
542    out[6] += high;
543
544    a = ((uint128_t) small[2]) * small[2];
545    low = a;
546    high = a >> 64;
547    out[4] += low;
548    out[5] += high;
549
550    a = ((uint128_t) small[3]) * small[3];
551    low = a;
552    high = a >> 64;
553    out[6] += low;
554    out[7] = high;
555}
556
557/*-
558 * felem_square sets |out| = |in|^2
559 * On entry:
560 *   in[i] < 2^109
561 * On exit:
562 *   out[i] < 7 * 2^64 < 2^67
563 */
564static void felem_square(longfelem out, const felem in)
565{
566    u64 small[4];
567    felem_shrink(small, in);
568    smallfelem_square(out, small);
569}
570
571/*-
572 * smallfelem_mul sets |out| = |small1| * |small2|
573 * On entry:
574 *   small1[i] < 2^64
575 *   small2[i] < 2^64
576 * On exit:
577 *   out[i] < 7 * 2^64 < 2^67
578 */
579static void smallfelem_mul(longfelem out, const smallfelem small1,
580                           const smallfelem small2)
581{
582    limb a;
583    u64 high, low;
584
585    a = ((uint128_t) small1[0]) * small2[0];
586    low = a;
587    high = a >> 64;
588    out[0] = low;
589    out[1] = high;
590
591    a = ((uint128_t) small1[0]) * small2[1];
592    low = a;
593    high = a >> 64;
594    out[1] += low;
595    out[2] = high;
596
597    a = ((uint128_t) small1[1]) * small2[0];
598    low = a;
599    high = a >> 64;
600    out[1] += low;
601    out[2] += high;
602
603    a = ((uint128_t) small1[0]) * small2[2];
604    low = a;
605    high = a >> 64;
606    out[2] += low;
607    out[3] = high;
608
609    a = ((uint128_t) small1[1]) * small2[1];
610    low = a;
611    high = a >> 64;
612    out[2] += low;
613    out[3] += high;
614
615    a = ((uint128_t) small1[2]) * small2[0];
616    low = a;
617    high = a >> 64;
618    out[2] += low;
619    out[3] += high;
620
621    a = ((uint128_t) small1[0]) * small2[3];
622    low = a;
623    high = a >> 64;
624    out[3] += low;
625    out[4] = high;
626
627    a = ((uint128_t) small1[1]) * small2[2];
628    low = a;
629    high = a >> 64;
630    out[3] += low;
631    out[4] += high;
632
633    a = ((uint128_t) small1[2]) * small2[1];
634    low = a;
635    high = a >> 64;
636    out[3] += low;
637    out[4] += high;
638
639    a = ((uint128_t) small1[3]) * small2[0];
640    low = a;
641    high = a >> 64;
642    out[3] += low;
643    out[4] += high;
644
645    a = ((uint128_t) small1[1]) * small2[3];
646    low = a;
647    high = a >> 64;
648    out[4] += low;
649    out[5] = high;
650
651    a = ((uint128_t) small1[2]) * small2[2];
652    low = a;
653    high = a >> 64;
654    out[4] += low;
655    out[5] += high;
656
657    a = ((uint128_t) small1[3]) * small2[1];
658    low = a;
659    high = a >> 64;
660    out[4] += low;
661    out[5] += high;
662
663    a = ((uint128_t) small1[2]) * small2[3];
664    low = a;
665    high = a >> 64;
666    out[5] += low;
667    out[6] = high;
668
669    a = ((uint128_t) small1[3]) * small2[2];
670    low = a;
671    high = a >> 64;
672    out[5] += low;
673    out[6] += high;
674
675    a = ((uint128_t) small1[3]) * small2[3];
676    low = a;
677    high = a >> 64;
678    out[6] += low;
679    out[7] = high;
680}
681
682/*-
683 * felem_mul sets |out| = |in1| * |in2|
684 * On entry:
685 *   in1[i] < 2^109
686 *   in2[i] < 2^109
687 * On exit:
688 *   out[i] < 7 * 2^64 < 2^67
689 */
690static void felem_mul(longfelem out, const felem in1, const felem in2)
691{
692    smallfelem small1, small2;
693    felem_shrink(small1, in1);
694    felem_shrink(small2, in2);
695    smallfelem_mul(out, small1, small2);
696}
697
698/*-
699 * felem_small_mul sets |out| = |small1| * |in2|
700 * On entry:
701 *   small1[i] < 2^64
702 *   in2[i] < 2^109
703 * On exit:
704 *   out[i] < 7 * 2^64 < 2^67
705 */
706static void felem_small_mul(longfelem out, const smallfelem small1,
707                            const felem in2)
708{
709    smallfelem small2;
710    felem_shrink(small2, in2);
711    smallfelem_mul(out, small1, small2);
712}
713
714# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
715# define two100 (((limb)1) << 100)
716# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
717/* zero100 is 0 mod p */
718static const felem zero100 =
719    { two100m36m4, two100, two100m36p4, two100m36p4 };
720
721/*-
722 * Internal function for the different flavours of felem_reduce.
723 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
724 * On entry:
725 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
726 *   out[1] >= in[7] + 2^32*in[4]
727 *   out[2] >= in[5] + 2^32*in[5]
728 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
729 * On exit:
730 *   out[0] <= out[0] + in[4] + 2^32*in[5]
731 *   out[1] <= out[1] + in[5] + 2^33*in[6]
732 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
733 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
734 */
735static void felem_reduce_(felem out, const longfelem in)
736{
737    int128_t c;
738    /* combine common terms from below */
739    c = in[4] + (in[5] << 32);
740    out[0] += c;
741    out[3] -= c;
742
743    c = in[5] - in[7];
744    out[1] += c;
745    out[2] -= c;
746
747    /* the remaining terms */
748    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
749    out[1] -= (in[4] << 32);
750    out[3] += (in[4] << 32);
751
752    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
753    out[2] -= (in[5] << 32);
754
755    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
756    out[0] -= in[6];
757    out[0] -= (in[6] << 32);
758    out[1] += (in[6] << 33);
759    out[2] += (in[6] * 2);
760    out[3] -= (in[6] << 32);
761
762    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
763    out[0] -= in[7];
764    out[0] -= (in[7] << 32);
765    out[2] += (in[7] << 33);
766    out[3] += (in[7] * 3);
767}
768
769/*-
770 * felem_reduce converts a longfelem into an felem.
771 * To be called directly after felem_square or felem_mul.
772 * On entry:
773 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
774 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
775 * On exit:
776 *   out[i] < 2^101
777 */
778static void felem_reduce(felem out, const longfelem in)
779{
780    out[0] = zero100[0] + in[0];
781    out[1] = zero100[1] + in[1];
782    out[2] = zero100[2] + in[2];
783    out[3] = zero100[3] + in[3];
784
785    felem_reduce_(out, in);
786
787    /*-
788     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
789     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
790     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
791     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
792     *
793     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
794     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
795     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
796     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
797     */
798}
799
800/*-
801 * felem_reduce_zero105 converts a larger longfelem into an felem.
802 * On entry:
803 *   in[0] < 2^71
804 * On exit:
805 *   out[i] < 2^106
806 */
807static void felem_reduce_zero105(felem out, const longfelem in)
808{
809    out[0] = zero105[0] + in[0];
810    out[1] = zero105[1] + in[1];
811    out[2] = zero105[2] + in[2];
812    out[3] = zero105[3] + in[3];
813
814    felem_reduce_(out, in);
815
816    /*-
817     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
818     * out[1] > 2^105 - 2^71 - 2^103 > 0
819     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
820     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
821     *
822     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
823     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
824     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
825     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
826     */
827}
828
829/*
830 * subtract_u64 sets *result = *result - v and *carry to one if the
831 * subtraction underflowed.
832 */
833static void subtract_u64(u64 *result, u64 *carry, u64 v)
834{
835    uint128_t r = *result;
836    r -= v;
837    *carry = (r >> 64) & 1;
838    *result = (u64)r;
839}
840
841/*
842 * felem_contract converts |in| to its unique, minimal representation. On
843 * entry: in[i] < 2^109
844 */
845static void felem_contract(smallfelem out, const felem in)
846{
847    unsigned i;
848    u64 all_equal_so_far = 0, result = 0, carry;
849
850    felem_shrink(out, in);
851    /* small is minimal except that the value might be > p */
852
853    all_equal_so_far--;
854    /*
855     * We are doing a constant time test if out >= kPrime. We need to compare
856     * each u64, from most-significant to least significant. For each one, if
857     * all words so far have been equal (m is all ones) then a non-equal
858     * result is the answer. Otherwise we continue.
859     */
860    for (i = 3; i < 4; i--) {
861        u64 equal;
862        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
863        /*
864         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
865         * will all be set.
866         */
867        result |= all_equal_so_far & ((u64)(a >> 64));
868
869        /*
870         * if kPrime[i] == out[i] then |equal| will be all zeros and the
871         * decrement will make it all ones.
872         */
873        equal = kPrime[i] ^ out[i];
874        equal--;
875        equal &= equal << 32;
876        equal &= equal << 16;
877        equal &= equal << 8;
878        equal &= equal << 4;
879        equal &= equal << 2;
880        equal &= equal << 1;
881        equal = 0 - (equal >> 63);
882
883        all_equal_so_far &= equal;
884    }
885
886    /*
887     * if all_equal_so_far is still all ones then the two values are equal
888     * and so out >= kPrime is true.
889     */
890    result |= all_equal_so_far;
891
892    /* if out >= kPrime then we subtract kPrime. */
893    subtract_u64(&out[0], &carry, result & kPrime[0]);
894    subtract_u64(&out[1], &carry, carry);
895    subtract_u64(&out[2], &carry, carry);
896    subtract_u64(&out[3], &carry, carry);
897
898    subtract_u64(&out[1], &carry, result & kPrime[1]);
899    subtract_u64(&out[2], &carry, carry);
900    subtract_u64(&out[3], &carry, carry);
901
902    subtract_u64(&out[2], &carry, result & kPrime[2]);
903    subtract_u64(&out[3], &carry, carry);
904
905    subtract_u64(&out[3], &carry, result & kPrime[3]);
906}
907
908static void smallfelem_square_contract(smallfelem out, const smallfelem in)
909{
910    longfelem longtmp;
911    felem tmp;
912
913    smallfelem_square(longtmp, in);
914    felem_reduce(tmp, longtmp);
915    felem_contract(out, tmp);
916}
917
918static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
919                                    const smallfelem in2)
920{
921    longfelem longtmp;
922    felem tmp;
923
924    smallfelem_mul(longtmp, in1, in2);
925    felem_reduce(tmp, longtmp);
926    felem_contract(out, tmp);
927}
928
929/*-
930 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
931 * otherwise.
932 * On entry:
933 *   small[i] < 2^64
934 */
935static limb smallfelem_is_zero(const smallfelem small)
936{
937    limb result;
938    u64 is_p;
939
940    u64 is_zero = small[0] | small[1] | small[2] | small[3];
941    is_zero--;
942    is_zero &= is_zero << 32;
943    is_zero &= is_zero << 16;
944    is_zero &= is_zero << 8;
945    is_zero &= is_zero << 4;
946    is_zero &= is_zero << 2;
947    is_zero &= is_zero << 1;
948    is_zero = 0 - (is_zero >> 63);
949
950    is_p = (small[0] ^ kPrime[0]) |
951        (small[1] ^ kPrime[1]) |
952        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
953    is_p--;
954    is_p &= is_p << 32;
955    is_p &= is_p << 16;
956    is_p &= is_p << 8;
957    is_p &= is_p << 4;
958    is_p &= is_p << 2;
959    is_p &= is_p << 1;
960    is_p = 0 - (is_p >> 63);
961
962    is_zero |= is_p;
963
964    result = is_zero;
965    result |= ((limb) is_zero) << 64;
966    return result;
967}
968
969static int smallfelem_is_zero_int(const void *small)
970{
971    return (int)(smallfelem_is_zero(small) & ((limb) 1));
972}
973
974/*-
975 * felem_inv calculates |out| = |in|^{-1}
976 *
977 * Based on Fermat's Little Theorem:
978 *   a^p = a (mod p)
979 *   a^{p-1} = 1 (mod p)
980 *   a^{p-2} = a^{-1} (mod p)
981 */
982static void felem_inv(felem out, const felem in)
983{
984    felem ftmp, ftmp2;
985    /* each e_I will hold |in|^{2^I - 1} */
986    felem e2, e4, e8, e16, e32, e64;
987    longfelem tmp;
988    unsigned i;
989
990    felem_square(tmp, in);
991    felem_reduce(ftmp, tmp);    /* 2^1 */
992    felem_mul(tmp, in, ftmp);
993    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
994    felem_assign(e2, ftmp);
995    felem_square(tmp, ftmp);
996    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
997    felem_square(tmp, ftmp);
998    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
999    felem_mul(tmp, ftmp, e2);
1000    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1001    felem_assign(e4, ftmp);
1002    felem_square(tmp, ftmp);
1003    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1004    felem_square(tmp, ftmp);
1005    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1006    felem_square(tmp, ftmp);
1007    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1008    felem_square(tmp, ftmp);
1009    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1010    felem_mul(tmp, ftmp, e4);
1011    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1012    felem_assign(e8, ftmp);
1013    for (i = 0; i < 8; i++) {
1014        felem_square(tmp, ftmp);
1015        felem_reduce(ftmp, tmp);
1016    }                           /* 2^16 - 2^8 */
1017    felem_mul(tmp, ftmp, e8);
1018    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1019    felem_assign(e16, ftmp);
1020    for (i = 0; i < 16; i++) {
1021        felem_square(tmp, ftmp);
1022        felem_reduce(ftmp, tmp);
1023    }                           /* 2^32 - 2^16 */
1024    felem_mul(tmp, ftmp, e16);
1025    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1026    felem_assign(e32, ftmp);
1027    for (i = 0; i < 32; i++) {
1028        felem_square(tmp, ftmp);
1029        felem_reduce(ftmp, tmp);
1030    }                           /* 2^64 - 2^32 */
1031    felem_assign(e64, ftmp);
1032    felem_mul(tmp, ftmp, in);
1033    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1034    for (i = 0; i < 192; i++) {
1035        felem_square(tmp, ftmp);
1036        felem_reduce(ftmp, tmp);
1037    }                           /* 2^256 - 2^224 + 2^192 */
1038
1039    felem_mul(tmp, e64, e32);
1040    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1041    for (i = 0; i < 16; i++) {
1042        felem_square(tmp, ftmp2);
1043        felem_reduce(ftmp2, tmp);
1044    }                           /* 2^80 - 2^16 */
1045    felem_mul(tmp, ftmp2, e16);
1046    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1047    for (i = 0; i < 8; i++) {
1048        felem_square(tmp, ftmp2);
1049        felem_reduce(ftmp2, tmp);
1050    }                           /* 2^88 - 2^8 */
1051    felem_mul(tmp, ftmp2, e8);
1052    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1053    for (i = 0; i < 4; i++) {
1054        felem_square(tmp, ftmp2);
1055        felem_reduce(ftmp2, tmp);
1056    }                           /* 2^92 - 2^4 */
1057    felem_mul(tmp, ftmp2, e4);
1058    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1059    felem_square(tmp, ftmp2);
1060    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1061    felem_square(tmp, ftmp2);
1062    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1063    felem_mul(tmp, ftmp2, e2);
1064    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1065    felem_square(tmp, ftmp2);
1066    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1067    felem_square(tmp, ftmp2);
1068    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1069    felem_mul(tmp, ftmp2, in);
1070    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1071
1072    felem_mul(tmp, ftmp2, ftmp);
1073    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1074}
1075
1076static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1077{
1078    felem tmp;
1079
1080    smallfelem_expand(tmp, in);
1081    felem_inv(tmp, tmp);
1082    felem_contract(out, tmp);
1083}
1084
1085/*-
1086 * Group operations
1087 * ----------------
1088 *
1089 * Building on top of the field operations we have the operations on the
1090 * elliptic curve group itself. Points on the curve are represented in Jacobian
1091 * coordinates
1092 */
1093
1094/*-
1095 * point_double calculates 2*(x_in, y_in, z_in)
1096 *
1097 * The method is taken from:
1098 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1099 *
1100 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1101 * while x_out == y_in is not (maybe this works, but it's not tested).
1102 */
1103static void
1104point_double(felem x_out, felem y_out, felem z_out,
1105             const felem x_in, const felem y_in, const felem z_in)
1106{
1107    longfelem tmp, tmp2;
1108    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1109    smallfelem small1, small2;
1110
1111    felem_assign(ftmp, x_in);
1112    /* ftmp[i] < 2^106 */
1113    felem_assign(ftmp2, x_in);
1114    /* ftmp2[i] < 2^106 */
1115
1116    /* delta = z^2 */
1117    felem_square(tmp, z_in);
1118    felem_reduce(delta, tmp);
1119    /* delta[i] < 2^101 */
1120
1121    /* gamma = y^2 */
1122    felem_square(tmp, y_in);
1123    felem_reduce(gamma, tmp);
1124    /* gamma[i] < 2^101 */
1125    felem_shrink(small1, gamma);
1126
1127    /* beta = x*gamma */
1128    felem_small_mul(tmp, small1, x_in);
1129    felem_reduce(beta, tmp);
1130    /* beta[i] < 2^101 */
1131
1132    /* alpha = 3*(x-delta)*(x+delta) */
1133    felem_diff(ftmp, delta);
1134    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1135    felem_sum(ftmp2, delta);
1136    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1137    felem_scalar(ftmp2, 3);
1138    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1139    felem_mul(tmp, ftmp, ftmp2);
1140    felem_reduce(alpha, tmp);
1141    /* alpha[i] < 2^101 */
1142    felem_shrink(small2, alpha);
1143
1144    /* x' = alpha^2 - 8*beta */
1145    smallfelem_square(tmp, small2);
1146    felem_reduce(x_out, tmp);
1147    felem_assign(ftmp, beta);
1148    felem_scalar(ftmp, 8);
1149    /* ftmp[i] < 8 * 2^101 = 2^104 */
1150    felem_diff(x_out, ftmp);
1151    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1152
1153    /* z' = (y + z)^2 - gamma - delta */
1154    felem_sum(delta, gamma);
1155    /* delta[i] < 2^101 + 2^101 = 2^102 */
1156    felem_assign(ftmp, y_in);
1157    felem_sum(ftmp, z_in);
1158    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1159    felem_square(tmp, ftmp);
1160    felem_reduce(z_out, tmp);
1161    felem_diff(z_out, delta);
1162    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1163
1164    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1165    felem_scalar(beta, 4);
1166    /* beta[i] < 4 * 2^101 = 2^103 */
1167    felem_diff_zero107(beta, x_out);
1168    /* beta[i] < 2^107 + 2^103 < 2^108 */
1169    felem_small_mul(tmp, small2, beta);
1170    /* tmp[i] < 7 * 2^64 < 2^67 */
1171    smallfelem_square(tmp2, small1);
1172    /* tmp2[i] < 7 * 2^64 */
1173    longfelem_scalar(tmp2, 8);
1174    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1175    longfelem_diff(tmp, tmp2);
1176    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1177    felem_reduce_zero105(y_out, tmp);
1178    /* y_out[i] < 2^106 */
1179}
1180
1181/*
1182 * point_double_small is the same as point_double, except that it operates on
1183 * smallfelems
1184 */
1185static void
1186point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1187                   const smallfelem x_in, const smallfelem y_in,
1188                   const smallfelem z_in)
1189{
1190    felem felem_x_out, felem_y_out, felem_z_out;
1191    felem felem_x_in, felem_y_in, felem_z_in;
1192
1193    smallfelem_expand(felem_x_in, x_in);
1194    smallfelem_expand(felem_y_in, y_in);
1195    smallfelem_expand(felem_z_in, z_in);
1196    point_double(felem_x_out, felem_y_out, felem_z_out,
1197                 felem_x_in, felem_y_in, felem_z_in);
1198    felem_shrink(x_out, felem_x_out);
1199    felem_shrink(y_out, felem_y_out);
1200    felem_shrink(z_out, felem_z_out);
1201}
1202
1203/* copy_conditional copies in to out iff mask is all ones. */
1204static void copy_conditional(felem out, const felem in, limb mask)
1205{
1206    unsigned i;
1207    for (i = 0; i < NLIMBS; ++i) {
1208        const limb tmp = mask & (in[i] ^ out[i]);
1209        out[i] ^= tmp;
1210    }
1211}
1212
1213/* copy_small_conditional copies in to out iff mask is all ones. */
1214static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1215{
1216    unsigned i;
1217    const u64 mask64 = mask;
1218    for (i = 0; i < NLIMBS; ++i) {
1219        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1220    }
1221}
1222
1223/*-
1224 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1225 *
1226 * The method is taken from:
1227 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1228 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1229 *
1230 * This function includes a branch for checking whether the two input points
1231 * are equal, (while not equal to the point at infinity). This case never
1232 * happens during single point multiplication, so there is no timing leak for
1233 * ECDH or ECDSA signing.
1234 */
1235static void point_add(felem x3, felem y3, felem z3,
1236                      const felem x1, const felem y1, const felem z1,
1237                      const int mixed, const smallfelem x2,
1238                      const smallfelem y2, const smallfelem z2)
1239{
1240    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1241    longfelem tmp, tmp2;
1242    smallfelem small1, small2, small3, small4, small5;
1243    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1244    limb points_equal;
1245
1246    felem_shrink(small3, z1);
1247
1248    z1_is_zero = smallfelem_is_zero(small3);
1249    z2_is_zero = smallfelem_is_zero(z2);
1250
1251    /* ftmp = z1z1 = z1**2 */
1252    smallfelem_square(tmp, small3);
1253    felem_reduce(ftmp, tmp);
1254    /* ftmp[i] < 2^101 */
1255    felem_shrink(small1, ftmp);
1256
1257    if (!mixed) {
1258        /* ftmp2 = z2z2 = z2**2 */
1259        smallfelem_square(tmp, z2);
1260        felem_reduce(ftmp2, tmp);
1261        /* ftmp2[i] < 2^101 */
1262        felem_shrink(small2, ftmp2);
1263
1264        felem_shrink(small5, x1);
1265
1266        /* u1 = ftmp3 = x1*z2z2 */
1267        smallfelem_mul(tmp, small5, small2);
1268        felem_reduce(ftmp3, tmp);
1269        /* ftmp3[i] < 2^101 */
1270
1271        /* ftmp5 = z1 + z2 */
1272        felem_assign(ftmp5, z1);
1273        felem_small_sum(ftmp5, z2);
1274        /* ftmp5[i] < 2^107 */
1275
1276        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1277        felem_square(tmp, ftmp5);
1278        felem_reduce(ftmp5, tmp);
1279        /* ftmp2 = z2z2 + z1z1 */
1280        felem_sum(ftmp2, ftmp);
1281        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1282        felem_diff(ftmp5, ftmp2);
1283        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1284
1285        /* ftmp2 = z2 * z2z2 */
1286        smallfelem_mul(tmp, small2, z2);
1287        felem_reduce(ftmp2, tmp);
1288
1289        /* s1 = ftmp2 = y1 * z2**3 */
1290        felem_mul(tmp, y1, ftmp2);
1291        felem_reduce(ftmp6, tmp);
1292        /* ftmp6[i] < 2^101 */
1293    } else {
1294        /*
1295         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1296         */
1297
1298        /* u1 = ftmp3 = x1*z2z2 */
1299        felem_assign(ftmp3, x1);
1300        /* ftmp3[i] < 2^106 */
1301
1302        /* ftmp5 = 2z1z2 */
1303        felem_assign(ftmp5, z1);
1304        felem_scalar(ftmp5, 2);
1305        /* ftmp5[i] < 2*2^106 = 2^107 */
1306
1307        /* s1 = ftmp2 = y1 * z2**3 */
1308        felem_assign(ftmp6, y1);
1309        /* ftmp6[i] < 2^106 */
1310    }
1311
1312    /* u2 = x2*z1z1 */
1313    smallfelem_mul(tmp, x2, small1);
1314    felem_reduce(ftmp4, tmp);
1315
1316    /* h = ftmp4 = u2 - u1 */
1317    felem_diff_zero107(ftmp4, ftmp3);
1318    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1319    felem_shrink(small4, ftmp4);
1320
1321    x_equal = smallfelem_is_zero(small4);
1322
1323    /* z_out = ftmp5 * h */
1324    felem_small_mul(tmp, small4, ftmp5);
1325    felem_reduce(z_out, tmp);
1326    /* z_out[i] < 2^101 */
1327
1328    /* ftmp = z1 * z1z1 */
1329    smallfelem_mul(tmp, small1, small3);
1330    felem_reduce(ftmp, tmp);
1331
1332    /* s2 = tmp = y2 * z1**3 */
1333    felem_small_mul(tmp, y2, ftmp);
1334    felem_reduce(ftmp5, tmp);
1335
1336    /* r = ftmp5 = (s2 - s1)*2 */
1337    felem_diff_zero107(ftmp5, ftmp6);
1338    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1339    felem_scalar(ftmp5, 2);
1340    /* ftmp5[i] < 2^109 */
1341    felem_shrink(small1, ftmp5);
1342    y_equal = smallfelem_is_zero(small1);
1343
1344    /*
1345     * The formulae are incorrect if the points are equal, in affine coordinates
1346     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1347     * happens.
1348     *
1349     * We use bitwise operations to avoid potential side-channels introduced by
1350     * the short-circuiting behaviour of boolean operators.
1351     *
1352     * The special case of either point being the point at infinity (z1 and/or
1353     * z2 are zero), is handled separately later on in this function, so we
1354     * avoid jumping to point_double here in those special cases.
1355     */
1356    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1357
1358    if (points_equal) {
1359        /*
1360         * This is obviously not constant-time but, as mentioned before, this
1361         * case never happens during single point multiplication, so there is no
1362         * timing leak for ECDH or ECDSA signing.
1363         */
1364        point_double(x3, y3, z3, x1, y1, z1);
1365        return;
1366    }
1367
1368    /* I = ftmp = (2h)**2 */
1369    felem_assign(ftmp, ftmp4);
1370    felem_scalar(ftmp, 2);
1371    /* ftmp[i] < 2*2^108 = 2^109 */
1372    felem_square(tmp, ftmp);
1373    felem_reduce(ftmp, tmp);
1374
1375    /* J = ftmp2 = h * I */
1376    felem_mul(tmp, ftmp4, ftmp);
1377    felem_reduce(ftmp2, tmp);
1378
1379    /* V = ftmp4 = U1 * I */
1380    felem_mul(tmp, ftmp3, ftmp);
1381    felem_reduce(ftmp4, tmp);
1382
1383    /* x_out = r**2 - J - 2V */
1384    smallfelem_square(tmp, small1);
1385    felem_reduce(x_out, tmp);
1386    felem_assign(ftmp3, ftmp4);
1387    felem_scalar(ftmp4, 2);
1388    felem_sum(ftmp4, ftmp2);
1389    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1390    felem_diff(x_out, ftmp4);
1391    /* x_out[i] < 2^105 + 2^101 */
1392
1393    /* y_out = r(V-x_out) - 2 * s1 * J */
1394    felem_diff_zero107(ftmp3, x_out);
1395    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1396    felem_small_mul(tmp, small1, ftmp3);
1397    felem_mul(tmp2, ftmp6, ftmp2);
1398    longfelem_scalar(tmp2, 2);
1399    /* tmp2[i] < 2*2^67 = 2^68 */
1400    longfelem_diff(tmp, tmp2);
1401    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1402    felem_reduce_zero105(y_out, tmp);
1403    /* y_out[i] < 2^106 */
1404
1405    copy_small_conditional(x_out, x2, z1_is_zero);
1406    copy_conditional(x_out, x1, z2_is_zero);
1407    copy_small_conditional(y_out, y2, z1_is_zero);
1408    copy_conditional(y_out, y1, z2_is_zero);
1409    copy_small_conditional(z_out, z2, z1_is_zero);
1410    copy_conditional(z_out, z1, z2_is_zero);
1411    felem_assign(x3, x_out);
1412    felem_assign(y3, y_out);
1413    felem_assign(z3, z_out);
1414}
1415
1416/*
1417 * point_add_small is the same as point_add, except that it operates on
1418 * smallfelems
1419 */
1420static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1421                            smallfelem x1, smallfelem y1, smallfelem z1,
1422                            smallfelem x2, smallfelem y2, smallfelem z2)
1423{
1424    felem felem_x3, felem_y3, felem_z3;
1425    felem felem_x1, felem_y1, felem_z1;
1426    smallfelem_expand(felem_x1, x1);
1427    smallfelem_expand(felem_y1, y1);
1428    smallfelem_expand(felem_z1, z1);
1429    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1430              x2, y2, z2);
1431    felem_shrink(x3, felem_x3);
1432    felem_shrink(y3, felem_y3);
1433    felem_shrink(z3, felem_z3);
1434}
1435
1436/*-
1437 * Base point pre computation
1438 * --------------------------
1439 *
1440 * Two different sorts of precomputed tables are used in the following code.
1441 * Each contain various points on the curve, where each point is three field
1442 * elements (x, y, z).
1443 *
1444 * For the base point table, z is usually 1 (0 for the point at infinity).
1445 * This table has 2 * 16 elements, starting with the following:
1446 * index | bits    | point
1447 * ------+---------+------------------------------
1448 *     0 | 0 0 0 0 | 0G
1449 *     1 | 0 0 0 1 | 1G
1450 *     2 | 0 0 1 0 | 2^64G
1451 *     3 | 0 0 1 1 | (2^64 + 1)G
1452 *     4 | 0 1 0 0 | 2^128G
1453 *     5 | 0 1 0 1 | (2^128 + 1)G
1454 *     6 | 0 1 1 0 | (2^128 + 2^64)G
1455 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1456 *     8 | 1 0 0 0 | 2^192G
1457 *     9 | 1 0 0 1 | (2^192 + 1)G
1458 *    10 | 1 0 1 0 | (2^192 + 2^64)G
1459 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1460 *    12 | 1 1 0 0 | (2^192 + 2^128)G
1461 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1462 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1463 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1464 * followed by a copy of this with each element multiplied by 2^32.
1465 *
1466 * The reason for this is so that we can clock bits into four different
1467 * locations when doing simple scalar multiplies against the base point,
1468 * and then another four locations using the second 16 elements.
1469 *
1470 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1471
1472/* gmul is the table of precomputed base points */
1473static const smallfelem gmul[2][16][3] = {
1474    {{{0, 0, 0, 0},
1475      {0, 0, 0, 0},
1476      {0, 0, 0, 0}},
1477     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1478       0x6b17d1f2e12c4247},
1479      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1480       0x4fe342e2fe1a7f9b},
1481      {1, 0, 0, 0}},
1482     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1483       0x0fa822bc2811aaa5},
1484      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1485       0xbff44ae8f5dba80d},
1486      {1, 0, 0, 0}},
1487     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1488       0x300a4bbc89d6726f},
1489      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1490       0x72aac7e0d09b4644},
1491      {1, 0, 0, 0}},
1492     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1493       0x447d739beedb5e67},
1494      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1495       0x2d4825ab834131ee},
1496      {1, 0, 0, 0}},
1497     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1498       0xef9519328a9c72ff},
1499      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1500       0x611e9fc37dbb2c9b},
1501      {1, 0, 0, 0}},
1502     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1503       0x550663797b51f5d8},
1504      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1505       0x157164848aecb851},
1506      {1, 0, 0, 0}},
1507     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1508       0xeb5d7745b21141ea},
1509      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1510       0xeafd72ebdbecc17b},
1511      {1, 0, 0, 0}},
1512     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1513       0xa6d39677a7849276},
1514      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1515       0x674f84749b0b8816},
1516      {1, 0, 0, 0}},
1517     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1518       0x4e769e7672c9ddad},
1519      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1520       0x42b99082de830663},
1521      {1, 0, 0, 0}},
1522     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1523       0x78878ef61c6ce04d},
1524      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1525       0xb6cb3f5d7b72c321},
1526      {1, 0, 0, 0}},
1527     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1528       0x0c88bc4d716b1287},
1529      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1530       0xdd5ddea3f3901dc6},
1531      {1, 0, 0, 0}},
1532     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1533       0x68f344af6b317466},
1534      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1535       0x31b9c405f8540a20},
1536      {1, 0, 0, 0}},
1537     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1538       0x4052bf4b6f461db9},
1539      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1540       0xfecf4d5190b0fc61},
1541      {1, 0, 0, 0}},
1542     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1543       0x1eddbae2c802e41a},
1544      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1545       0x43104d86560ebcfc},
1546      {1, 0, 0, 0}},
1547     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1548       0xb48e26b484f7a21c},
1549      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1550       0xfac015404d4d3dab},
1551      {1, 0, 0, 0}}},
1552    {{{0, 0, 0, 0},
1553      {0, 0, 0, 0},
1554      {0, 0, 0, 0}},
1555     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1556       0x7fe36b40af22af89},
1557      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1558       0xe697d45825b63624},
1559      {1, 0, 0, 0}},
1560     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1561       0x4a5b506612a677a6},
1562      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1563       0xeb13461ceac089f1},
1564      {1, 0, 0, 0}},
1565     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1566       0x0781b8291c6a220a},
1567      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1568       0x690cde8df0151593},
1569      {1, 0, 0, 0}},
1570     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1571       0x8a535f566ec73617},
1572      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1573       0x0455c08468b08bd7},
1574      {1, 0, 0, 0}},
1575     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1576       0x06bada7ab77f8276},
1577      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1578       0x5b476dfd0e6cb18a},
1579      {1, 0, 0, 0}},
1580     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1581       0x3e29864e8a2ec908},
1582      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1583       0x239b90ea3dc31e7e},
1584      {1, 0, 0, 0}},
1585     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1586       0x820f4dd949f72ff7},
1587      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1588       0x140406ec783a05ec},
1589      {1, 0, 0, 0}},
1590     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1591       0x68f6b8542783dfee},
1592      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1593       0xcbe1feba92e40ce6},
1594      {1, 0, 0, 0}},
1595     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1596       0xd0b2f94d2f420109},
1597      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1598       0x971459828b0719e5},
1599      {1, 0, 0, 0}},
1600     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1601       0x961610004a866aba},
1602      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1603       0x7acb9fadcee75e44},
1604      {1, 0, 0, 0}},
1605     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1606       0x24eb9acca333bf5b},
1607      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1608       0x69f891c5acd079cc},
1609      {1, 0, 0, 0}},
1610     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1611       0xe51f547c5972a107},
1612      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1613       0x1c309a2b25bb1387},
1614      {1, 0, 0, 0}},
1615     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1616       0x20b87b8aa2c4e503},
1617      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1618       0xf5c6fa49919776be},
1619      {1, 0, 0, 0}},
1620     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1621       0x1ed7d1b9332010b9},
1622      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1623       0x3a2b03f03217257a},
1624      {1, 0, 0, 0}},
1625     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1626       0x15fee545c78dd9f6},
1627      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1628       0x4ab5b6b2b8753f81},
1629      {1, 0, 0, 0}}}
1630};
1631
1632/*
1633 * select_point selects the |idx|th point from a precomputation table and
1634 * copies it to out.
1635 */
1636static void select_point(const u64 idx, unsigned int size,
1637                         const smallfelem pre_comp[16][3], smallfelem out[3])
1638{
1639    unsigned i, j;
1640    u64 *outlimbs = &out[0][0];
1641
1642    memset(out, 0, sizeof(*out) * 3);
1643
1644    for (i = 0; i < size; i++) {
1645        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1646        u64 mask = i ^ idx;
1647        mask |= mask >> 4;
1648        mask |= mask >> 2;
1649        mask |= mask >> 1;
1650        mask &= 1;
1651        mask--;
1652        for (j = 0; j < NLIMBS * 3; j++)
1653            outlimbs[j] |= inlimbs[j] & mask;
1654    }
1655}
1656
1657/* get_bit returns the |i|th bit in |in| */
1658static char get_bit(const felem_bytearray in, int i)
1659{
1660    if ((i < 0) || (i >= 256))
1661        return 0;
1662    return (in[i >> 3] >> (i & 7)) & 1;
1663}
1664
1665/*
1666 * Interleaved point multiplication using precomputed point multiples: The
1667 * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1668 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1669 * generator, using certain (large) precomputed multiples in g_pre_comp.
1670 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1671 */
1672static void batch_mul(felem x_out, felem y_out, felem z_out,
1673                      const felem_bytearray scalars[],
1674                      const unsigned num_points, const u8 *g_scalar,
1675                      const int mixed, const smallfelem pre_comp[][17][3],
1676                      const smallfelem g_pre_comp[2][16][3])
1677{
1678    int i, skip;
1679    unsigned num, gen_mul = (g_scalar != NULL);
1680    felem nq[3], ftmp;
1681    smallfelem tmp[3];
1682    u64 bits;
1683    u8 sign, digit;
1684
1685    /* set nq to the point at infinity */
1686    memset(nq, 0, sizeof(nq));
1687
1688    /*
1689     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1690     * of the generator (two in each of the last 32 rounds) and additions of
1691     * other points multiples (every 5th round).
1692     */
1693    skip = 1;                   /* save two point operations in the first
1694                                 * round */
1695    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1696        /* double */
1697        if (!skip)
1698            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1699
1700        /* add multiples of the generator */
1701        if (gen_mul && (i <= 31)) {
1702            /* first, look 32 bits upwards */
1703            bits = get_bit(g_scalar, i + 224) << 3;
1704            bits |= get_bit(g_scalar, i + 160) << 2;
1705            bits |= get_bit(g_scalar, i + 96) << 1;
1706            bits |= get_bit(g_scalar, i + 32);
1707            /* select the point to add, in constant time */
1708            select_point(bits, 16, g_pre_comp[1], tmp);
1709
1710            if (!skip) {
1711                /* Arg 1 below is for "mixed" */
1712                point_add(nq[0], nq[1], nq[2],
1713                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1714            } else {
1715                smallfelem_expand(nq[0], tmp[0]);
1716                smallfelem_expand(nq[1], tmp[1]);
1717                smallfelem_expand(nq[2], tmp[2]);
1718                skip = 0;
1719            }
1720
1721            /* second, look at the current position */
1722            bits = get_bit(g_scalar, i + 192) << 3;
1723            bits |= get_bit(g_scalar, i + 128) << 2;
1724            bits |= get_bit(g_scalar, i + 64) << 1;
1725            bits |= get_bit(g_scalar, i);
1726            /* select the point to add, in constant time */
1727            select_point(bits, 16, g_pre_comp[0], tmp);
1728            /* Arg 1 below is for "mixed" */
1729            point_add(nq[0], nq[1], nq[2],
1730                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1731        }
1732
1733        /* do other additions every 5 doublings */
1734        if (num_points && (i % 5 == 0)) {
1735            /* loop over all scalars */
1736            for (num = 0; num < num_points; ++num) {
1737                bits = get_bit(scalars[num], i + 4) << 5;
1738                bits |= get_bit(scalars[num], i + 3) << 4;
1739                bits |= get_bit(scalars[num], i + 2) << 3;
1740                bits |= get_bit(scalars[num], i + 1) << 2;
1741                bits |= get_bit(scalars[num], i) << 1;
1742                bits |= get_bit(scalars[num], i - 1);
1743                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1744
1745                /*
1746                 * select the point to add or subtract, in constant time
1747                 */
1748                select_point(digit, 17, pre_comp[num], tmp);
1749                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1750                                               * point */
1751                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1752                felem_contract(tmp[1], ftmp);
1753
1754                if (!skip) {
1755                    point_add(nq[0], nq[1], nq[2],
1756                              nq[0], nq[1], nq[2],
1757                              mixed, tmp[0], tmp[1], tmp[2]);
1758                } else {
1759                    smallfelem_expand(nq[0], tmp[0]);
1760                    smallfelem_expand(nq[1], tmp[1]);
1761                    smallfelem_expand(nq[2], tmp[2]);
1762                    skip = 0;
1763                }
1764            }
1765        }
1766    }
1767    felem_assign(x_out, nq[0]);
1768    felem_assign(y_out, nq[1]);
1769    felem_assign(z_out, nq[2]);
1770}
1771
1772/* Precomputation for the group generator. */
1773struct nistp256_pre_comp_st {
1774    smallfelem g_pre_comp[2][16][3];
1775    CRYPTO_REF_COUNT references;
1776    CRYPTO_RWLOCK *lock;
1777};
1778
1779const EC_METHOD *EC_GFp_nistp256_method(void)
1780{
1781    static const EC_METHOD ret = {
1782        EC_FLAGS_DEFAULT_OCT,
1783        NID_X9_62_prime_field,
1784        ec_GFp_nistp256_group_init,
1785        ec_GFp_simple_group_finish,
1786        ec_GFp_simple_group_clear_finish,
1787        ec_GFp_nist_group_copy,
1788        ec_GFp_nistp256_group_set_curve,
1789        ec_GFp_simple_group_get_curve,
1790        ec_GFp_simple_group_get_degree,
1791        ec_group_simple_order_bits,
1792        ec_GFp_simple_group_check_discriminant,
1793        ec_GFp_simple_point_init,
1794        ec_GFp_simple_point_finish,
1795        ec_GFp_simple_point_clear_finish,
1796        ec_GFp_simple_point_copy,
1797        ec_GFp_simple_point_set_to_infinity,
1798        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1799        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1800        ec_GFp_simple_point_set_affine_coordinates,
1801        ec_GFp_nistp256_point_get_affine_coordinates,
1802        0 /* point_set_compressed_coordinates */ ,
1803        0 /* point2oct */ ,
1804        0 /* oct2point */ ,
1805        ec_GFp_simple_add,
1806        ec_GFp_simple_dbl,
1807        ec_GFp_simple_invert,
1808        ec_GFp_simple_is_at_infinity,
1809        ec_GFp_simple_is_on_curve,
1810        ec_GFp_simple_cmp,
1811        ec_GFp_simple_make_affine,
1812        ec_GFp_simple_points_make_affine,
1813        ec_GFp_nistp256_points_mul,
1814        ec_GFp_nistp256_precompute_mult,
1815        ec_GFp_nistp256_have_precompute_mult,
1816        ec_GFp_nist_field_mul,
1817        ec_GFp_nist_field_sqr,
1818        0 /* field_div */ ,
1819        ec_GFp_simple_field_inv,
1820        0 /* field_encode */ ,
1821        0 /* field_decode */ ,
1822        0,                      /* field_set_to_one */
1823        ec_key_simple_priv2oct,
1824        ec_key_simple_oct2priv,
1825        0, /* set private */
1826        ec_key_simple_generate_key,
1827        ec_key_simple_check_key,
1828        ec_key_simple_generate_public_key,
1829        0, /* keycopy */
1830        0, /* keyfinish */
1831        ecdh_simple_compute_key,
1832        0, /* field_inverse_mod_ord */
1833        0, /* blind_coordinates */
1834        0, /* ladder_pre */
1835        0, /* ladder_step */
1836        0  /* ladder_post */
1837    };
1838
1839    return &ret;
1840}
1841
1842/******************************************************************************/
1843/*
1844 * FUNCTIONS TO MANAGE PRECOMPUTATION
1845 */
1846
1847static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1848{
1849    NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1850
1851    if (ret == NULL) {
1852        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1853        return ret;
1854    }
1855
1856    ret->references = 1;
1857
1858    ret->lock = CRYPTO_THREAD_lock_new();
1859    if (ret->lock == NULL) {
1860        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1861        OPENSSL_free(ret);
1862        return NULL;
1863    }
1864    return ret;
1865}
1866
1867NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1868{
1869    int i;
1870    if (p != NULL)
1871        CRYPTO_UP_REF(&p->references, &i, p->lock);
1872    return p;
1873}
1874
1875void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1876{
1877    int i;
1878
1879    if (pre == NULL)
1880        return;
1881
1882    CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
1883    REF_PRINT_COUNT("EC_nistp256", x);
1884    if (i > 0)
1885        return;
1886    REF_ASSERT_ISNT(i < 0);
1887
1888    CRYPTO_THREAD_lock_free(pre->lock);
1889    OPENSSL_free(pre);
1890}
1891
1892/******************************************************************************/
1893/*
1894 * OPENSSL EC_METHOD FUNCTIONS
1895 */
1896
1897int ec_GFp_nistp256_group_init(EC_GROUP *group)
1898{
1899    int ret;
1900    ret = ec_GFp_simple_group_init(group);
1901    group->a_is_minus3 = 1;
1902    return ret;
1903}
1904
1905int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1906                                    const BIGNUM *a, const BIGNUM *b,
1907                                    BN_CTX *ctx)
1908{
1909    int ret = 0;
1910    BN_CTX *new_ctx = NULL;
1911    BIGNUM *curve_p, *curve_a, *curve_b;
1912
1913    if (ctx == NULL)
1914        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1915            return 0;
1916    BN_CTX_start(ctx);
1917    curve_p = BN_CTX_get(ctx);
1918    curve_a = BN_CTX_get(ctx);
1919    curve_b = BN_CTX_get(ctx);
1920    if (curve_b == NULL)
1921        goto err;
1922    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1923    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1924    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1925    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1926        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1927              EC_R_WRONG_CURVE_PARAMETERS);
1928        goto err;
1929    }
1930    group->field_mod_func = BN_nist_mod_256;
1931    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1932 err:
1933    BN_CTX_end(ctx);
1934    BN_CTX_free(new_ctx);
1935    return ret;
1936}
1937
1938/*
1939 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1940 * (X/Z^2, Y/Z^3)
1941 */
1942int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1943                                                 const EC_POINT *point,
1944                                                 BIGNUM *x, BIGNUM *y,
1945                                                 BN_CTX *ctx)
1946{
1947    felem z1, z2, x_in, y_in;
1948    smallfelem x_out, y_out;
1949    longfelem tmp;
1950
1951    if (EC_POINT_is_at_infinity(group, point)) {
1952        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1953              EC_R_POINT_AT_INFINITY);
1954        return 0;
1955    }
1956    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1957        (!BN_to_felem(z1, point->Z)))
1958        return 0;
1959    felem_inv(z2, z1);
1960    felem_square(tmp, z2);
1961    felem_reduce(z1, tmp);
1962    felem_mul(tmp, x_in, z1);
1963    felem_reduce(x_in, tmp);
1964    felem_contract(x_out, x_in);
1965    if (x != NULL) {
1966        if (!smallfelem_to_BN(x, x_out)) {
1967            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1968                  ERR_R_BN_LIB);
1969            return 0;
1970        }
1971    }
1972    felem_mul(tmp, z1, z2);
1973    felem_reduce(z1, tmp);
1974    felem_mul(tmp, y_in, z1);
1975    felem_reduce(y_in, tmp);
1976    felem_contract(y_out, y_in);
1977    if (y != NULL) {
1978        if (!smallfelem_to_BN(y, y_out)) {
1979            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1980                  ERR_R_BN_LIB);
1981            return 0;
1982        }
1983    }
1984    return 1;
1985}
1986
1987/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1988static void make_points_affine(size_t num, smallfelem points[][3],
1989                               smallfelem tmp_smallfelems[])
1990{
1991    /*
1992     * Runs in constant time, unless an input is the point at infinity (which
1993     * normally shouldn't happen).
1994     */
1995    ec_GFp_nistp_points_make_affine_internal(num,
1996                                             points,
1997                                             sizeof(smallfelem),
1998                                             tmp_smallfelems,
1999                                             (void (*)(void *))smallfelem_one,
2000                                             smallfelem_is_zero_int,
2001                                             (void (*)(void *, const void *))
2002                                             smallfelem_assign,
2003                                             (void (*)(void *, const void *))
2004                                             smallfelem_square_contract,
2005                                             (void (*)
2006                                              (void *, const void *,
2007                                               const void *))
2008                                             smallfelem_mul_contract,
2009                                             (void (*)(void *, const void *))
2010                                             smallfelem_inv_contract,
2011                                             /* nothing to contract */
2012                                             (void (*)(void *, const void *))
2013                                             smallfelem_assign);
2014}
2015
2016/*
2017 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2018 * values Result is stored in r (r can equal one of the inputs).
2019 */
2020int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2021                               const BIGNUM *scalar, size_t num,
2022                               const EC_POINT *points[],
2023                               const BIGNUM *scalars[], BN_CTX *ctx)
2024{
2025    int ret = 0;
2026    int j;
2027    int mixed = 0;
2028    BIGNUM *x, *y, *z, *tmp_scalar;
2029    felem_bytearray g_secret;
2030    felem_bytearray *secrets = NULL;
2031    smallfelem (*pre_comp)[17][3] = NULL;
2032    smallfelem *tmp_smallfelems = NULL;
2033    unsigned i;
2034    int num_bytes;
2035    int have_pre_comp = 0;
2036    size_t num_points = num;
2037    smallfelem x_in, y_in, z_in;
2038    felem x_out, y_out, z_out;
2039    NISTP256_PRE_COMP *pre = NULL;
2040    const smallfelem(*g_pre_comp)[16][3] = NULL;
2041    EC_POINT *generator = NULL;
2042    const EC_POINT *p = NULL;
2043    const BIGNUM *p_scalar = NULL;
2044
2045    BN_CTX_start(ctx);
2046    x = BN_CTX_get(ctx);
2047    y = BN_CTX_get(ctx);
2048    z = BN_CTX_get(ctx);
2049    tmp_scalar = BN_CTX_get(ctx);
2050    if (tmp_scalar == NULL)
2051        goto err;
2052
2053    if (scalar != NULL) {
2054        pre = group->pre_comp.nistp256;
2055        if (pre)
2056            /* we have precomputation, try to use it */
2057            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2058        else
2059            /* try to use the standard precomputation */
2060            g_pre_comp = &gmul[0];
2061        generator = EC_POINT_new(group);
2062        if (generator == NULL)
2063            goto err;
2064        /* get the generator from precomputation */
2065        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2066            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2067            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2068            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2069            goto err;
2070        }
2071        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2072                                                      generator, x, y, z,
2073                                                      ctx))
2074            goto err;
2075        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2076            /* precomputation matches generator */
2077            have_pre_comp = 1;
2078        else
2079            /*
2080             * we don't have valid precomputation: treat the generator as a
2081             * random point
2082             */
2083            num_points++;
2084    }
2085    if (num_points > 0) {
2086        if (num_points >= 3) {
2087            /*
2088             * unless we precompute multiples for just one or two points,
2089             * converting those into affine form is time well spent
2090             */
2091            mixed = 1;
2092        }
2093        secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2094        pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2095        if (mixed)
2096            tmp_smallfelems =
2097              OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2098        if ((secrets == NULL) || (pre_comp == NULL)
2099            || (mixed && (tmp_smallfelems == NULL))) {
2100            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2101            goto err;
2102        }
2103
2104        /*
2105         * we treat NULL scalars as 0, and NULL points as points at infinity,
2106         * i.e., they contribute nothing to the linear combination
2107         */
2108        memset(secrets, 0, sizeof(*secrets) * num_points);
2109        memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2110        for (i = 0; i < num_points; ++i) {
2111            if (i == num) {
2112                /*
2113                 * we didn't have a valid precomputation, so we pick the
2114                 * generator
2115                 */
2116                p = EC_GROUP_get0_generator(group);
2117                p_scalar = scalar;
2118            } else {
2119                /* the i^th point */
2120                p = points[i];
2121                p_scalar = scalars[i];
2122            }
2123            if ((p_scalar != NULL) && (p != NULL)) {
2124                /* reduce scalar to 0 <= scalar < 2^256 */
2125                if ((BN_num_bits(p_scalar) > 256)
2126                    || (BN_is_negative(p_scalar))) {
2127                    /*
2128                     * this is an unusual input, and we don't guarantee
2129                     * constant-timeness
2130                     */
2131                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2132                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2133                        goto err;
2134                    }
2135                    num_bytes = BN_bn2lebinpad(tmp_scalar,
2136                                               secrets[i], sizeof(secrets[i]));
2137                } else {
2138                    num_bytes = BN_bn2lebinpad(p_scalar,
2139                                               secrets[i], sizeof(secrets[i]));
2140                }
2141                if (num_bytes < 0) {
2142                    ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2143                    goto err;
2144                }
2145                /* precompute multiples */
2146                if ((!BN_to_felem(x_out, p->X)) ||
2147                    (!BN_to_felem(y_out, p->Y)) ||
2148                    (!BN_to_felem(z_out, p->Z)))
2149                    goto err;
2150                felem_shrink(pre_comp[i][1][0], x_out);
2151                felem_shrink(pre_comp[i][1][1], y_out);
2152                felem_shrink(pre_comp[i][1][2], z_out);
2153                for (j = 2; j <= 16; ++j) {
2154                    if (j & 1) {
2155                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2156                                        pre_comp[i][j][2], pre_comp[i][1][0],
2157                                        pre_comp[i][1][1], pre_comp[i][1][2],
2158                                        pre_comp[i][j - 1][0],
2159                                        pre_comp[i][j - 1][1],
2160                                        pre_comp[i][j - 1][2]);
2161                    } else {
2162                        point_double_small(pre_comp[i][j][0],
2163                                           pre_comp[i][j][1],
2164                                           pre_comp[i][j][2],
2165                                           pre_comp[i][j / 2][0],
2166                                           pre_comp[i][j / 2][1],
2167                                           pre_comp[i][j / 2][2]);
2168                    }
2169                }
2170            }
2171        }
2172        if (mixed)
2173            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2174    }
2175
2176    /* the scalar for the generator */
2177    if ((scalar != NULL) && (have_pre_comp)) {
2178        memset(g_secret, 0, sizeof(g_secret));
2179        /* reduce scalar to 0 <= scalar < 2^256 */
2180        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2181            /*
2182             * this is an unusual input, and we don't guarantee
2183             * constant-timeness
2184             */
2185            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2186                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2187                goto err;
2188            }
2189            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2190        } else {
2191            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2192        }
2193        /* do the multiplication with generator precomputation */
2194        batch_mul(x_out, y_out, z_out,
2195                  (const felem_bytearray(*))secrets, num_points,
2196                  g_secret,
2197                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2198    } else {
2199        /* do the multiplication without generator precomputation */
2200        batch_mul(x_out, y_out, z_out,
2201                  (const felem_bytearray(*))secrets, num_points,
2202                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2203    }
2204    /* reduce the output to its unique minimal representation */
2205    felem_contract(x_in, x_out);
2206    felem_contract(y_in, y_out);
2207    felem_contract(z_in, z_out);
2208    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2209        (!smallfelem_to_BN(z, z_in))) {
2210        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2211        goto err;
2212    }
2213    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2214
2215 err:
2216    BN_CTX_end(ctx);
2217    EC_POINT_free(generator);
2218    OPENSSL_free(secrets);
2219    OPENSSL_free(pre_comp);
2220    OPENSSL_free(tmp_smallfelems);
2221    return ret;
2222}
2223
2224int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2225{
2226    int ret = 0;
2227    NISTP256_PRE_COMP *pre = NULL;
2228    int i, j;
2229    BN_CTX *new_ctx = NULL;
2230    BIGNUM *x, *y;
2231    EC_POINT *generator = NULL;
2232    smallfelem tmp_smallfelems[32];
2233    felem x_tmp, y_tmp, z_tmp;
2234
2235    /* throw away old precomputation */
2236    EC_pre_comp_free(group);
2237    if (ctx == NULL)
2238        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2239            return 0;
2240    BN_CTX_start(ctx);
2241    x = BN_CTX_get(ctx);
2242    y = BN_CTX_get(ctx);
2243    if (y == NULL)
2244        goto err;
2245    /* get the generator */
2246    if (group->generator == NULL)
2247        goto err;
2248    generator = EC_POINT_new(group);
2249    if (generator == NULL)
2250        goto err;
2251    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2252    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2253    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2254        goto err;
2255    if ((pre = nistp256_pre_comp_new()) == NULL)
2256        goto err;
2257    /*
2258     * if the generator is the standard one, use built-in precomputation
2259     */
2260    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2261        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2262        goto done;
2263    }
2264    if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2265        (!BN_to_felem(y_tmp, group->generator->Y)) ||
2266        (!BN_to_felem(z_tmp, group->generator->Z)))
2267        goto err;
2268    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2269    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2270    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2271    /*
2272     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2273     * 2^160*G, 2^224*G for the second one
2274     */
2275    for (i = 1; i <= 8; i <<= 1) {
2276        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2277                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2278                           pre->g_pre_comp[0][i][1],
2279                           pre->g_pre_comp[0][i][2]);
2280        for (j = 0; j < 31; ++j) {
2281            point_double_small(pre->g_pre_comp[1][i][0],
2282                               pre->g_pre_comp[1][i][1],
2283                               pre->g_pre_comp[1][i][2],
2284                               pre->g_pre_comp[1][i][0],
2285                               pre->g_pre_comp[1][i][1],
2286                               pre->g_pre_comp[1][i][2]);
2287        }
2288        if (i == 8)
2289            break;
2290        point_double_small(pre->g_pre_comp[0][2 * i][0],
2291                           pre->g_pre_comp[0][2 * i][1],
2292                           pre->g_pre_comp[0][2 * i][2],
2293                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2294                           pre->g_pre_comp[1][i][2]);
2295        for (j = 0; j < 31; ++j) {
2296            point_double_small(pre->g_pre_comp[0][2 * i][0],
2297                               pre->g_pre_comp[0][2 * i][1],
2298                               pre->g_pre_comp[0][2 * i][2],
2299                               pre->g_pre_comp[0][2 * i][0],
2300                               pre->g_pre_comp[0][2 * i][1],
2301                               pre->g_pre_comp[0][2 * i][2]);
2302        }
2303    }
2304    for (i = 0; i < 2; i++) {
2305        /* g_pre_comp[i][0] is the point at infinity */
2306        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2307        /* the remaining multiples */
2308        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2309        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2310                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2311                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2312                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2313                        pre->g_pre_comp[i][2][2]);
2314        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2315        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2316                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2317                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2318                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2319                        pre->g_pre_comp[i][2][2]);
2320        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2321        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2322                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2323                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2324                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2325                        pre->g_pre_comp[i][4][2]);
2326        /*
2327         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2328         */
2329        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2330                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2331                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2332                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2333                        pre->g_pre_comp[i][2][2]);
2334        for (j = 1; j < 8; ++j) {
2335            /* odd multiples: add G resp. 2^32*G */
2336            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2337                            pre->g_pre_comp[i][2 * j + 1][1],
2338                            pre->g_pre_comp[i][2 * j + 1][2],
2339                            pre->g_pre_comp[i][2 * j][0],
2340                            pre->g_pre_comp[i][2 * j][1],
2341                            pre->g_pre_comp[i][2 * j][2],
2342                            pre->g_pre_comp[i][1][0],
2343                            pre->g_pre_comp[i][1][1],
2344                            pre->g_pre_comp[i][1][2]);
2345        }
2346    }
2347    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2348
2349 done:
2350    SETPRECOMP(group, nistp256, pre);
2351    pre = NULL;
2352    ret = 1;
2353
2354 err:
2355    BN_CTX_end(ctx);
2356    EC_POINT_free(generator);
2357    BN_CTX_free(new_ctx);
2358    EC_nistp256_pre_comp_free(pre);
2359    return ret;
2360}
2361
2362int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2363{
2364    return HAVEPRECOMP(group, nistp256);
2365}
2366#endif
2367