ecp_nistp256.c revision 331638
1/* crypto/ec/ecp_nistp256.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41
42# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45                                 * platforms */
46typedef __int128_t int128_t;
47# else
48#  error "Need GCC 3.1 or later to define type uint128_t"
49# endif
50
51typedef uint8_t u8;
52typedef uint32_t u32;
53typedef uint64_t u64;
54
55/*
56 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
57 * can serialise an element of this field into 32 bytes. We call this an
58 * felem_bytearray.
59 */
60
61typedef u8 felem_bytearray[32];
62
63/*
64 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
65 * values are big-endian.
66 */
67static const felem_bytearray nistp256_curve_params[5] = {
68    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
69     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
70     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
71     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
72    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
73     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
74     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
75     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
76    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
77     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
78     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
79     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
80    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
81     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
82     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
83     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
84    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
85     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
86     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
87     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
88};
89
90/*-
91 * The representation of field elements.
92 * ------------------------------------
93 *
94 * We represent field elements with either four 128-bit values, eight 128-bit
95 * values, or four 64-bit values. The field element represented is:
96 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
97 * or:
98 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
99 *
100 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
101 * apart, but are 128 bits wide, the most significant bits of each limb overlap
102 * with the least significant bits of the next.
103 *
104 * A field element with four limbs is an 'felem'. One with eight limbs is a
105 * 'longfelem'.
106 *
107 * A field element with four 64-bit values is called a 'smallfelem'. Small
108 * values are used as intermediate values before multiplication.
109 */
110
111# define NLIMBS 4
112
113typedef uint128_t limb;
114typedef limb felem[NLIMBS];
115typedef limb longfelem[NLIMBS * 2];
116typedef u64 smallfelem[NLIMBS];
117
118/* This is the value of the prime as four 64-bit words, little-endian. */
119static const u64 kPrime[4] =
120    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
121static const u64 bottom63bits = 0x7ffffffffffffffful;
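
/*
 * Illustrative sketch (editor's addition, not upstream code): kPrime stores
 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 as four little-endian 64-bit words,
 * i.e. the value kPrime[0] + kPrime[1]*2^64 + kPrime[2]*2^128 + kPrime[3]*2^192.
 * One way to sanity-check it against the big-endian curve parameters above
 * with OpenSSL's BIGNUM API (the helper name is hypothetical and the
 * BN_add_word call assumes a 64-bit BN_ULONG):
 */
# if 0
static int kprime_matches_curve_params(void)
{
    BIGNUM *p_be = BN_new(), *p_le = BN_new();
    int i, ok = 0;

    if (p_be == NULL || p_le == NULL)
        goto done;
    BN_bin2bn(nistp256_curve_params[0], 32, p_be); /* big-endian p */
    BN_zero(p_le);
    for (i = 3; i >= 0; i--) {  /* most significant word first */
        BN_lshift(p_le, p_le, 64);
        BN_add_word(p_le, kPrime[i]);
    }
    ok = (BN_cmp(p_be, p_le) == 0);
 done:
    BN_free(p_be);
    BN_free(p_le);
    return ok;
}
# endif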
122
123/*
124 * bin32_to_felem takes a little-endian byte array and converts it into felem
125 * form. This assumes that the CPU is little-endian.
126 */
127static void bin32_to_felem(felem out, const u8 in[32])
128{
129    out[0] = *((u64 *)&in[0]);
130    out[1] = *((u64 *)&in[8]);
131    out[2] = *((u64 *)&in[16]);
132    out[3] = *((u64 *)&in[24]);
133}
134
135/*
136 * smallfelem_to_bin32 takes a smallfelem and serialises into a little
137 * endian, 32 byte array. This assumes that the CPU is little-endian.
138 */
139static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
140{
141    *((u64 *)&out[0]) = in[0];
142    *((u64 *)&out[8]) = in[1];
143    *((u64 *)&out[16]) = in[2];
144    *((u64 *)&out[24]) = in[3];
145}
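
/*
 * Illustrative sketch (editor's addition, not upstream code): the two
 * helpers above reinterpret the byte array as u64s, which relies on the
 * stated little-endian assumption and on suitable alignment. An equivalent,
 * alignment-safe variant (still little-endian only) could use memcpy; the
 * names below are hypothetical:
 */
# if 0
static void bin32_to_felem_memcpy(felem out, const u8 in[32])
{
    u64 tmp;
    unsigned i;

    for (i = 0; i < NLIMBS; i++) {
        memcpy(&tmp, in + 8 * i, 8); /* copy 8 little-endian bytes */
        out[i] = tmp;
    }
}

static void smallfelem_to_bin32_memcpy(u8 out[32], const smallfelem in)
{
    unsigned i;

    for (i = 0; i < NLIMBS; i++)
        memcpy(out + 8 * i, &in[i], 8);
}
# endif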
146
147/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
148static void flip_endian(u8 *out, const u8 *in, unsigned len)
149{
150    unsigned i;
151    for (i = 0; i < len; ++i)
152        out[i] = in[len - 1 - i];
153}
154
155/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
156static int BN_to_felem(felem out, const BIGNUM *bn)
157{
158    felem_bytearray b_in;
159    felem_bytearray b_out;
160    unsigned num_bytes;
161
162    /* BN_bn2bin eats leading zeroes */
163    memset(b_out, 0, sizeof(b_out));
164    num_bytes = BN_num_bytes(bn);
165    if (num_bytes > sizeof(b_out)) {
166        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
167        return 0;
168    }
169    if (BN_is_negative(bn)) {
170        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
171        return 0;
172    }
173    num_bytes = BN_bn2bin(bn, b_in);
174    flip_endian(b_out, b_in, num_bytes);
175    bin32_to_felem(out, b_out);
176    return 1;
177}
178
179/* smallfelem_to_BN converts a smallfelem into an OpenSSL BIGNUM */
180static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
181{
182    felem_bytearray b_in, b_out;
183    smallfelem_to_bin32(b_in, in);
184    flip_endian(b_out, b_in, sizeof(b_out));
185    return BN_bin2bn(b_out, sizeof(b_out), out);
186}
187
188/*-
189 * Field operations
190 * ----------------
191 */
192
193static void smallfelem_one(smallfelem out)
194{
195    out[0] = 1;
196    out[1] = 0;
197    out[2] = 0;
198    out[3] = 0;
199}
200
201static void smallfelem_assign(smallfelem out, const smallfelem in)
202{
203    out[0] = in[0];
204    out[1] = in[1];
205    out[2] = in[2];
206    out[3] = in[3];
207}
208
209static void felem_assign(felem out, const felem in)
210{
211    out[0] = in[0];
212    out[1] = in[1];
213    out[2] = in[2];
214    out[3] = in[3];
215}
216
217/* felem_sum sets out = out + in. */
218static void felem_sum(felem out, const felem in)
219{
220    out[0] += in[0];
221    out[1] += in[1];
222    out[2] += in[2];
223    out[3] += in[3];
224}
225
226/* felem_small_sum sets out = out + in. */
227static void felem_small_sum(felem out, const smallfelem in)
228{
229    out[0] += in[0];
230    out[1] += in[1];
231    out[2] += in[2];
232    out[3] += in[3];
233}
234
235/* felem_scalar sets out = out * scalar */
236static void felem_scalar(felem out, const u64 scalar)
237{
238    out[0] *= scalar;
239    out[1] *= scalar;
240    out[2] *= scalar;
241    out[3] *= scalar;
242}
243
244/* longfelem_scalar sets out = out * scalar */
245static void longfelem_scalar(longfelem out, const u64 scalar)
246{
247    out[0] *= scalar;
248    out[1] *= scalar;
249    out[2] *= scalar;
250    out[3] *= scalar;
251    out[4] *= scalar;
252    out[5] *= scalar;
253    out[6] *= scalar;
254    out[7] *= scalar;
255}
256
257# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
258# define two105 (((limb)1) << 105)
259# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
260
261/* zero105 is 0 mod p */
262static const felem zero105 =
263    { two105m41m9, two105, two105m41p9, two105m41p9 };
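
/*
 * For reference: the value represented by zero105,
 *   (2^105-2^41-2^9) + 2^105*2^64 + (2^105-2^41+2^9)*2^128
 *                    + (2^105-2^41+2^9)*2^192,
 * equals (2^41 + 2^9) * p, so it really is 0 mod p. Adding it before a
 * subtraction therefore changes nothing modulo p while keeping every limb
 * non-negative.
 */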
264
265/*-
266 * smallfelem_neg sets |out| to |-small|
267 * On exit:
268 *   out[i] <= 2^105
269 */
270static void smallfelem_neg(felem out, const smallfelem small)
271{
272    /* In order to prevent underflow, we subtract from 0 mod p. */
273    out[0] = zero105[0] - small[0];
274    out[1] = zero105[1] - small[1];
275    out[2] = zero105[2] - small[2];
276    out[3] = zero105[3] - small[3];
277}
278
279/*-
280 * felem_diff subtracts |in| from |out|
281 * On entry:
282 *   in[i] < 2^104
283 * On exit:
284 *   out[i] < out[i] + 2^105
285 */
286static void felem_diff(felem out, const felem in)
287{
288    /*
289     * In order to prevent underflow, we add 0 mod p before subtracting.
290     */
291    out[0] += zero105[0];
292    out[1] += zero105[1];
293    out[2] += zero105[2];
294    out[3] += zero105[3];
295
296    out[0] -= in[0];
297    out[1] -= in[1];
298    out[2] -= in[2];
299    out[3] -= in[3];
300}
301
302# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
303# define two107 (((limb)1) << 107)
304# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
305
306/* zero107 is 0 mod p */
307static const felem zero107 =
308    { two107m43m11, two107, two107m43p11, two107m43p11 };
309
310/*-
311 * An alternative felem_diff for larger inputs |in|
312 * felem_diff_zero107 subtracts |in| from |out|
313 * On entry:
314 *   in[i] < 2^106
315 * On exit:
316 *   out[i] < out[i] + 2^107
317 */
318static void felem_diff_zero107(felem out, const felem in)
319{
320    /*
321     * In order to prevent underflow, we add 0 mod p before subtracting.
322     */
323    out[0] += zero107[0];
324    out[1] += zero107[1];
325    out[2] += zero107[2];
326    out[3] += zero107[3];
327
328    out[0] -= in[0];
329    out[1] -= in[1];
330    out[2] -= in[2];
331    out[3] -= in[3];
332}
333
334/*-
335 * longfelem_diff subtracts |in| from |out|
336 * On entry:
337 *   in[i] < 7*2^67
338 * On exit:
339 *   out[i] < out[i] + 2^70 + 2^40
340 */
341static void longfelem_diff(longfelem out, const longfelem in)
342{
343    static const limb two70m8p6 =
344        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
345    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
346    static const limb two70 = (((limb) 1) << 70);
347    static const limb two70m40m38p6 =
348        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
349        (((limb) 1) << 6);
350    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
351
352    /* add 0 mod p to avoid underflow */
353    out[0] += two70m8p6;
354    out[1] += two70p40;
355    out[2] += two70;
356    out[3] += two70m40m38p6;
357    out[4] += two70m6;
358    out[5] += two70m6;
359    out[6] += two70m6;
360    out[7] += two70m6;
361
362    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
363    out[0] -= in[0];
364    out[1] -= in[1];
365    out[2] -= in[2];
366    out[3] -= in[3];
367    out[4] -= in[4];
368    out[5] -= in[5];
369    out[6] -= in[6];
370    out[7] -= in[7];
371}
372
373# define two64m0 (((limb)1) << 64) - 1
374# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
375# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
376# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
377
378/* zero110 is 0 mod p */
379static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
380
381/*-
382 * felem_shrink converts an felem into a smallfelem. The result isn't quite
383 * minimal as the value may be greater than p.
384 *
385 * On entry:
386 *   in[i] < 2^109
387 * On exit:
388 *   out[i] < 2^64
389 */
390static void felem_shrink(smallfelem out, const felem in)
391{
392    felem tmp;
393    u64 a, b, mask;
394    u64 high, low;
395    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
396
397    /* Carry 2->3 */
398    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
399    /* tmp[3] < 2^110 */
400
401    tmp[2] = zero110[2] + (u64)in[2];
402    tmp[0] = zero110[0] + in[0];
403    tmp[1] = zero110[1] + in[1];
404    /* tmp[0] < 2^110, tmp[1] < 2^111, tmp[2] < 2^65 */
405
406    /*
407     * We perform two partial reductions where we eliminate the high-word of
408     * tmp[3]. We don't update the other words till the end.
409     */
410    a = tmp[3] >> 64;           /* a < 2^46 */
411    tmp[3] = (u64)tmp[3];
412    tmp[3] -= a;
413    tmp[3] += ((limb) a) << 32;
414    /* tmp[3] < 2^79 */
415
416    b = a;
417    a = tmp[3] >> 64;           /* a < 2^15 */
418    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
419    tmp[3] = (u64)tmp[3];
420    tmp[3] -= a;
421    tmp[3] += ((limb) a) << 32;
422    /* tmp[3] < 2^64 + 2^47 */
423
424    /*
425     * This adjusts the other two words to complete the two partial
426     * reductions.
427     */
428    tmp[0] += b;
429    tmp[1] -= (((limb) b) << 32);
430
431    /*
432     * In order to make space in tmp[3] for the carry from 2 -> 3, we
433     * conditionally subtract kPrime if tmp[3] is large enough.
434     */
435    high = (u64)(tmp[3] >> 64);
436    /* As tmp[3] < 2^65, high is either 1 or 0 */
437    high = 0 - high;
438    /*-
439     * high is:
440     *   all ones   if the high word of tmp[3] is 1
441     *   all zeros  if the high word of tmp[3] is 0
442     */
443    low = (u64)tmp[3];
444    mask = 0 - (low >> 63);
445    /*-
446     * mask is:
447     *   all ones   if the MSB of low is 1
448     *   all zeros  if the MSB of low is 0
449     */
450    low &= bottom63bits;
451    low -= kPrime3Test;
452    /* if low was >= kPrime3Test then the MSB is zero */
453    low = ~low;
454    low = 0 - (low >> 63);
455    /*-
456     * low is:
457     *   all ones   if low was >= kPrime3Test
458     *   all zeros  if low was < kPrime3Test
459     */
460    mask = (mask & low) | high;
461    tmp[0] -= mask & kPrime[0];
462    tmp[1] -= mask & kPrime[1];
463    /* kPrime[2] is zero, so omitted */
464    tmp[3] -= mask & kPrime[3];
465    /* tmp[3] < 2^64 - 2^32 + 1 */
466
467    tmp[1] += ((u64)(tmp[0] >> 64));
468    tmp[0] = (u64)tmp[0];
469    tmp[2] += ((u64)(tmp[1] >> 64));
470    tmp[1] = (u64)tmp[1];
471    tmp[3] += ((u64)(tmp[2] >> 64));
472    tmp[2] = (u64)tmp[2];
473    /* tmp[i] < 2^64 */
474
475    out[0] = tmp[0];
476    out[1] = tmp[1];
477    out[2] = tmp[2];
478    out[3] = tmp[3];
479}
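
/*
 * Illustrative sketch (editor's addition): felem_shrink builds its masks
 * with the constant-time idiom "0 - (x >> 63)", which maps a word whose top
 * bit is set to all-ones and any other word to all-zeros:
 */
# if 0
static u64 msb_mask(u64 x)
{
    return 0 - (x >> 63);   /* all-ones if bit 63 of x is set, else 0 */
}
/*
 * msb_mask(0x8000000000000000ul) == 0xfffffffffffffffful
 * msb_mask(0x7ffffffffffffffful) == 0
 */
# endif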
480
481/* smallfelem_expand converts a smallfelem to an felem */
482static void smallfelem_expand(felem out, const smallfelem in)
483{
484    out[0] = in[0];
485    out[1] = in[1];
486    out[2] = in[2];
487    out[3] = in[3];
488}
489
490/*-
491 * smallfelem_square sets |out| = |small|^2
492 * On entry:
493 *   small[i] < 2^64
494 * On exit:
495 *   out[i] < 7 * 2^64 < 2^67
496 */
497static void smallfelem_square(longfelem out, const smallfelem small)
498{
499    limb a;
500    u64 high, low;
501
502    a = ((uint128_t) small[0]) * small[0];
503    low = a;
504    high = a >> 64;
505    out[0] = low;
506    out[1] = high;
507
508    a = ((uint128_t) small[0]) * small[1];
509    low = a;
510    high = a >> 64;
511    out[1] += low;
512    out[1] += low;
513    out[2] = high;
514
515    a = ((uint128_t) small[0]) * small[2];
516    low = a;
517    high = a >> 64;
518    out[2] += low;
519    out[2] *= 2;
520    out[3] = high;
521
522    a = ((uint128_t) small[0]) * small[3];
523    low = a;
524    high = a >> 64;
525    out[3] += low;
526    out[4] = high;
527
528    a = ((uint128_t) small[1]) * small[2];
529    low = a;
530    high = a >> 64;
531    out[3] += low;
532    out[3] *= 2;
533    out[4] += high;
534
535    a = ((uint128_t) small[1]) * small[1];
536    low = a;
537    high = a >> 64;
538    out[2] += low;
539    out[3] += high;
540
541    a = ((uint128_t) small[1]) * small[3];
542    low = a;
543    high = a >> 64;
544    out[4] += low;
545    out[4] *= 2;
546    out[5] = high;
547
548    a = ((uint128_t) small[2]) * small[3];
549    low = a;
550    high = a >> 64;
551    out[5] += low;
552    out[5] *= 2;
553    out[6] = high;
554    out[6] += high;
555
556    a = ((uint128_t) small[2]) * small[2];
557    low = a;
558    high = a >> 64;
559    out[4] += low;
560    out[5] += high;
561
562    a = ((uint128_t) small[3]) * small[3];
563    low = a;
564    high = a >> 64;
565    out[6] += low;
566    out[7] = high;
567}
568
569/*-
570 * felem_square sets |out| = |in|^2
571 * On entry:
572 *   in[i] < 2^109
573 * On exit:
574 *   out[i] < 7 * 2^64 < 2^67
575 */
576static void felem_square(longfelem out, const felem in)
577{
578    u64 small[4];
579    felem_shrink(small, in);
580    smallfelem_square(out, small);
581}
582
583/*-
584 * smallfelem_mul sets |out| = |small1| * |small2|
585 * On entry:
586 *   small1[i] < 2^64
587 *   small2[i] < 2^64
588 * On exit:
589 *   out[i] < 7 * 2^64 < 2^67
590 */
591static void smallfelem_mul(longfelem out, const smallfelem small1,
592                           const smallfelem small2)
593{
594    limb a;
595    u64 high, low;
596
597    a = ((uint128_t) small1[0]) * small2[0];
598    low = a;
599    high = a >> 64;
600    out[0] = low;
601    out[1] = high;
602
603    a = ((uint128_t) small1[0]) * small2[1];
604    low = a;
605    high = a >> 64;
606    out[1] += low;
607    out[2] = high;
608
609    a = ((uint128_t) small1[1]) * small2[0];
610    low = a;
611    high = a >> 64;
612    out[1] += low;
613    out[2] += high;
614
615    a = ((uint128_t) small1[0]) * small2[2];
616    low = a;
617    high = a >> 64;
618    out[2] += low;
619    out[3] = high;
620
621    a = ((uint128_t) small1[1]) * small2[1];
622    low = a;
623    high = a >> 64;
624    out[2] += low;
625    out[3] += high;
626
627    a = ((uint128_t) small1[2]) * small2[0];
628    low = a;
629    high = a >> 64;
630    out[2] += low;
631    out[3] += high;
632
633    a = ((uint128_t) small1[0]) * small2[3];
634    low = a;
635    high = a >> 64;
636    out[3] += low;
637    out[4] = high;
638
639    a = ((uint128_t) small1[1]) * small2[2];
640    low = a;
641    high = a >> 64;
642    out[3] += low;
643    out[4] += high;
644
645    a = ((uint128_t) small1[2]) * small2[1];
646    low = a;
647    high = a >> 64;
648    out[3] += low;
649    out[4] += high;
650
651    a = ((uint128_t) small1[3]) * small2[0];
652    low = a;
653    high = a >> 64;
654    out[3] += low;
655    out[4] += high;
656
657    a = ((uint128_t) small1[1]) * small2[3];
658    low = a;
659    high = a >> 64;
660    out[4] += low;
661    out[5] = high;
662
663    a = ((uint128_t) small1[2]) * small2[2];
664    low = a;
665    high = a >> 64;
666    out[4] += low;
667    out[5] += high;
668
669    a = ((uint128_t) small1[3]) * small2[1];
670    low = a;
671    high = a >> 64;
672    out[4] += low;
673    out[5] += high;
674
675    a = ((uint128_t) small1[2]) * small2[3];
676    low = a;
677    high = a >> 64;
678    out[5] += low;
679    out[6] = high;
680
681    a = ((uint128_t) small1[3]) * small2[2];
682    low = a;
683    high = a >> 64;
684    out[5] += low;
685    out[6] += high;
686
687    a = ((uint128_t) small1[3]) * small2[3];
688    low = a;
689    high = a >> 64;
690    out[6] += low;
691    out[7] = high;
692}
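
/*
 * A quick way to see the 7*2^64 bound quoted for smallfelem_mul: out[k]
 * collects the low halves of the products small1[i]*small2[j] with
 * i+j == k and the high halves of those with i+j == k-1. At most seven
 * such 64-bit halves contribute to any one column (the worst cases are
 * k == 3 and k == 4), so out[k] < 7*2^64 < 2^67.
 */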
693
694/*-
695 * felem_mul sets |out| = |in1| * |in2|
696 * On entry:
697 *   in1[i] < 2^109
698 *   in2[i] < 2^109
699 * On exit:
700 *   out[i] < 7 * 2^64 < 2^67
701 */
702static void felem_mul(longfelem out, const felem in1, const felem in2)
703{
704    smallfelem small1, small2;
705    felem_shrink(small1, in1);
706    felem_shrink(small2, in2);
707    smallfelem_mul(out, small1, small2);
708}
709
710/*-
711 * felem_small_mul sets |out| = |small1| * |in2|
712 * On entry:
713 *   small1[i] < 2^64
714 *   in2[i] < 2^109
715 * On exit:
716 *   out[i] < 7 * 2^64 < 2^67
717 */
718static void felem_small_mul(longfelem out, const smallfelem small1,
719                            const felem in2)
720{
721    smallfelem small2;
722    felem_shrink(small2, in2);
723    smallfelem_mul(out, small1, small2);
724}
725
726# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
727# define two100 (((limb)1) << 100)
728# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
729/* zero100 is 0 mod p */
730static const felem zero100 =
731    { two100m36m4, two100, two100m36p4, two100m36p4 };
732
733/*-
734 * Internal function for the different flavours of felem_reduce.
735 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
736 * On entry:
737 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
738 *   out[1] >= in[7] + 2^32*in[4]
739 *   out[2] >= in[5] + 2^32*in[5]
740 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
741 * On exit:
742 *   out[0] <= out[0] + in[4] + 2^32*in[5]
743 *   out[1] <= out[1] + in[5] + 2^33*in[6]
744 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
745 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
746 */
747static void felem_reduce_(felem out, const longfelem in)
748{
749    int128_t c;
750    /* combine common terms from below */
751    c = in[4] + (in[5] << 32);
752    out[0] += c;
753    out[3] -= c;
754
755    c = in[5] - in[7];
756    out[1] += c;
757    out[2] -= c;
758
759    /* the remaining terms */
760    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
761    out[1] -= (in[4] << 32);
762    out[3] += (in[4] << 32);
763
764    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
765    out[2] -= (in[5] << 32);
766
767    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
768    out[0] -= in[6];
769    out[0] -= (in[6] << 32);
770    out[1] += (in[6] << 33);
771    out[2] += (in[6] * 2);
772    out[3] -= (in[6] << 32);
773
774    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
775    out[0] -= in[7];
776    out[0] -= (in[7] << 32);
777    out[2] += (in[7] << 33);
778    out[3] += (in[7] * 3);
779}
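
/*
 * For reference, the bracketed annotations above use the shorthand
 *   d: [(e1,c1),(e2,c2),...]  meaning  2^d == c1*2^e1 + c2*2^e2 + ... (mod p).
 * The base case follows directly from p = 2^256 - 2^224 + 2^192 + 2^96 - 1:
 *   2^256 == 2^224 - 2^192 - 2^96 + 1 (mod p),
 * which is exactly "256: [(0,1),(96,-1),(192,-1),(224,1)]"; the 320, 384 and
 * 448 lines follow by multiplying this congruence by 2^64, 2^128 and 2^192
 * and reducing again.
 */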
780
781/*-
782 * felem_reduce converts a longfelem into an felem.
783 * To be called directly after felem_square or felem_mul.
784 * On entry:
785 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
786 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
787 * On exit:
788 *   out[i] < 2^101
789 */
790static void felem_reduce(felem out, const longfelem in)
791{
792    out[0] = zero100[0] + in[0];
793    out[1] = zero100[1] + in[1];
794    out[2] = zero100[2] + in[2];
795    out[3] = zero100[3] + in[3];
796
797    felem_reduce_(out, in);
798
799    /*-
800     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
801     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
802     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
803     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
804     *
805     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
806     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
807     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
808     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
809     */
810}
811
812/*-
813 * felem_reduce_zero105 converts a larger longfelem into an felem.
814 * On entry:
815 *   in[i] < 2^71
816 * On exit:
817 *   out[i] < 2^106
818 */
819static void felem_reduce_zero105(felem out, const longfelem in)
820{
821    out[0] = zero105[0] + in[0];
822    out[1] = zero105[1] + in[1];
823    out[2] = zero105[2] + in[2];
824    out[3] = zero105[3] + in[3];
825
826    felem_reduce_(out, in);
827
828    /*-
829     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
830     * out[1] > 2^105 - 2^71 - 2^103 > 0
831     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
832     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
833     *
834     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
835     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
836     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
837     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
838     */
839}
840
841/*
842 * subtract_u64 sets *result = *result - v and *carry to one if the
843 * subtraction underflowed.
844 */
845static void subtract_u64(u64 *result, u64 *carry, u64 v)
846{
847    uint128_t r = *result;
848    r -= v;
849    *carry = (r >> 64) & 1;
850    *result = (u64)r;
851}
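
/*
 * subtract_u64 is used below to propagate a borrow across the four words of
 * a smallfelem: subtracting kPrime[0] from out[0] may underflow, the
 * resulting carry is then subtracted from out[1], and so on up to out[3].
 */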
852
853/*
854 * felem_contract converts |in| to its unique, minimal representation. On
855 * entry: in[i] < 2^109
856 */
857static void felem_contract(smallfelem out, const felem in)
858{
859    unsigned i;
860    u64 all_equal_so_far = 0, result = 0, carry;
861
862    felem_shrink(out, in);
863    /* out is now minimal except that the value might be > p */
864
865    all_equal_so_far--;
866    /*
867     * We are doing a constant-time test of whether out >= kPrime. We compare
868     * each u64, from most significant to least significant. For each one, if
869     * all words so far have been equal (all_equal_so_far is all ones) then a
870     * non-equal result is the answer. Otherwise we continue.
871     */
872    for (i = 3; i < 4; i--) {
873        u64 equal;
874        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
875        /*
876         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
877         * will all be set.
878         */
879        result |= all_equal_so_far & ((u64)(a >> 64));
880
881        /*
882         * if kPrime[i] == out[i] then |equal| will be all zeros and the
883         * decrement will make it all ones.
884         */
885        equal = kPrime[i] ^ out[i];
886        equal--;
887        equal &= equal << 32;
888        equal &= equal << 16;
889        equal &= equal << 8;
890        equal &= equal << 4;
891        equal &= equal << 2;
892        equal &= equal << 1;
893        equal = 0 - (equal >> 63);
894
895        all_equal_so_far &= equal;
896    }
897
898    /*
899     * if all_equal_so_far is still all ones then the two values are equal
900     * and so out >= kPrime is true.
901     */
902    result |= all_equal_so_far;
903
904    /* if out >= kPrime then we subtract kPrime. */
905    subtract_u64(&out[0], &carry, result & kPrime[0]);
906    subtract_u64(&out[1], &carry, carry);
907    subtract_u64(&out[2], &carry, carry);
908    subtract_u64(&out[3], &carry, carry);
909
910    subtract_u64(&out[1], &carry, result & kPrime[1]);
911    subtract_u64(&out[2], &carry, carry);
912    subtract_u64(&out[3], &carry, carry);
913
914    subtract_u64(&out[2], &carry, result & kPrime[2]);
915    subtract_u64(&out[3], &carry, carry);
916
917    subtract_u64(&out[3], &carry, result & kPrime[3]);
918}
919
920static void smallfelem_square_contract(smallfelem out, const smallfelem in)
921{
922    longfelem longtmp;
923    felem tmp;
924
925    smallfelem_square(longtmp, in);
926    felem_reduce(tmp, longtmp);
927    felem_contract(out, tmp);
928}
929
930static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
931                                    const smallfelem in2)
932{
933    longfelem longtmp;
934    felem tmp;
935
936    smallfelem_mul(longtmp, in1, in2);
937    felem_reduce(tmp, longtmp);
938    felem_contract(out, tmp);
939}
940
941/*-
942 * smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p)
943 * and 0 otherwise.
944 * On entry:
945 *   small[i] < 2^64
946 */
947static limb smallfelem_is_zero(const smallfelem small)
948{
949    limb result;
950    u64 is_p;
951
952    u64 is_zero = small[0] | small[1] | small[2] | small[3];
953    is_zero--;
954    is_zero &= is_zero << 32;
955    is_zero &= is_zero << 16;
956    is_zero &= is_zero << 8;
957    is_zero &= is_zero << 4;
958    is_zero &= is_zero << 2;
959    is_zero &= is_zero << 1;
960    is_zero = 0 - (is_zero >> 63);
961
962    is_p = (small[0] ^ kPrime[0]) |
963        (small[1] ^ kPrime[1]) |
964        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
965    is_p--;
966    is_p &= is_p << 32;
967    is_p &= is_p << 16;
968    is_p &= is_p << 8;
969    is_p &= is_p << 4;
970    is_p &= is_p << 2;
971    is_p &= is_p << 1;
972    is_p = 0 - (is_p >> 63);
973
974    is_zero |= is_p;
975
976    result = is_zero;
977    result |= ((limb) is_zero) << 64;
978    return result;
979}
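
/*
 * Illustrative sketch (editor's addition): the decrement-and-fold pattern
 * above is a constant-time zero test. x-1 is all-ones only when x == 0, and
 * the chain "t &= t << 32; ... t &= t << 1;" leaves the AND of all 64 bits
 * of t in bit 63, so "0 - (t >> 63)" is all-ones exactly when x was zero.
 * As a stand-alone helper (hypothetical name):
 */
# if 0
static u64 is_zero_mask(u64 x)
{
    u64 t = x - 1;              /* all-ones iff x == 0 */
    t &= t << 32;
    t &= t << 16;
    t &= t << 8;
    t &= t << 4;
    t &= t << 2;
    t &= t << 1;
    return 0 - (t >> 63);       /* all-ones iff x == 0, else 0 */
}
# endif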
980
981static int smallfelem_is_zero_int(const void *small)
982{
983    return (int)(smallfelem_is_zero(small) & ((limb) 1));
984}
985
986/*-
987 * felem_inv calculates |out| = |in|^{-1}
988 *
989 * Based on Fermat's Little Theorem:
990 *   a^p = a (mod p)
991 *   a^{p-1} = 1 (mod p)
992 *   a^{p-2} = a^{-1} (mod p)
993 */
994static void felem_inv(felem out, const felem in)
995{
996    felem ftmp, ftmp2;
997    /* each e_I will hold |in|^{2^I - 1} */
998    felem e2, e4, e8, e16, e32, e64;
999    longfelem tmp;
1000    unsigned i;
1001
1002    felem_square(tmp, in);
1003    felem_reduce(ftmp, tmp);    /* 2^1 */
1004    felem_mul(tmp, in, ftmp);
1005    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1006    felem_assign(e2, ftmp);
1007    felem_square(tmp, ftmp);
1008    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1009    felem_square(tmp, ftmp);
1010    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1011    felem_mul(tmp, ftmp, e2);
1012    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1013    felem_assign(e4, ftmp);
1014    felem_square(tmp, ftmp);
1015    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1016    felem_square(tmp, ftmp);
1017    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1018    felem_square(tmp, ftmp);
1019    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1020    felem_square(tmp, ftmp);
1021    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1022    felem_mul(tmp, ftmp, e4);
1023    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1024    felem_assign(e8, ftmp);
1025    for (i = 0; i < 8; i++) {
1026        felem_square(tmp, ftmp);
1027        felem_reduce(ftmp, tmp);
1028    }                           /* 2^16 - 2^8 */
1029    felem_mul(tmp, ftmp, e8);
1030    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1031    felem_assign(e16, ftmp);
1032    for (i = 0; i < 16; i++) {
1033        felem_square(tmp, ftmp);
1034        felem_reduce(ftmp, tmp);
1035    }                           /* 2^32 - 2^16 */
1036    felem_mul(tmp, ftmp, e16);
1037    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1038    felem_assign(e32, ftmp);
1039    for (i = 0; i < 32; i++) {
1040        felem_square(tmp, ftmp);
1041        felem_reduce(ftmp, tmp);
1042    }                           /* 2^64 - 2^32 */
1043    felem_assign(e64, ftmp);
1044    felem_mul(tmp, ftmp, in);
1045    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1046    for (i = 0; i < 192; i++) {
1047        felem_square(tmp, ftmp);
1048        felem_reduce(ftmp, tmp);
1049    }                           /* 2^256 - 2^224 + 2^192 */
1050
1051    felem_mul(tmp, e64, e32);
1052    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1053    for (i = 0; i < 16; i++) {
1054        felem_square(tmp, ftmp2);
1055        felem_reduce(ftmp2, tmp);
1056    }                           /* 2^80 - 2^16 */
1057    felem_mul(tmp, ftmp2, e16);
1058    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1059    for (i = 0; i < 8; i++) {
1060        felem_square(tmp, ftmp2);
1061        felem_reduce(ftmp2, tmp);
1062    }                           /* 2^88 - 2^8 */
1063    felem_mul(tmp, ftmp2, e8);
1064    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1065    for (i = 0; i < 4; i++) {
1066        felem_square(tmp, ftmp2);
1067        felem_reduce(ftmp2, tmp);
1068    }                           /* 2^92 - 2^4 */
1069    felem_mul(tmp, ftmp2, e4);
1070    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1071    felem_square(tmp, ftmp2);
1072    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1073    felem_square(tmp, ftmp2);
1074    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1075    felem_mul(tmp, ftmp2, e2);
1076    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1077    felem_square(tmp, ftmp2);
1078    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1079    felem_square(tmp, ftmp2);
1080    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1081    felem_mul(tmp, ftmp2, in);
1082    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1083
1084    felem_mul(tmp, ftmp2, ftmp);
1085    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1086}
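
/*
 * For reference, the chain above computes in^(p-2) by splitting the exponent
 *   p - 2 = (2^256 - 2^224 + 2^192) + (2^96 - 3)
 * into the two pieces held just before the final multiplication:
 * ftmp = in^(2^256 - 2^224 + 2^192) and ftmp2 = in^(2^96 - 3), so that
 * ftmp * ftmp2 = in^(p-2) = in^-1 (mod p).
 */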
1087
1088static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1089{
1090    felem tmp;
1091
1092    smallfelem_expand(tmp, in);
1093    felem_inv(tmp, tmp);
1094    felem_contract(out, tmp);
1095}
1096
1097/*-
1098 * Group operations
1099 * ----------------
1100 *
1101 * Building on top of the field operations we have the operations on the
1102 * elliptic curve group itself. Points on the curve are represented in Jacobian
1103 * coordinates
1104 */
1105
1106/*-
1107 * point_double calculates 2*(x_in, y_in, z_in)
1108 *
1109 * The method is taken from:
1110 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1111 *
1112 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1113 * while x_out == y_in is not (maybe this works, but it's not tested).
1114 */
1115static void
1116point_double(felem x_out, felem y_out, felem z_out,
1117             const felem x_in, const felem y_in, const felem z_in)
1118{
1119    longfelem tmp, tmp2;
1120    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1121    smallfelem small1, small2;
1122
1123    felem_assign(ftmp, x_in);
1124    /* ftmp[i] < 2^106 */
1125    felem_assign(ftmp2, x_in);
1126    /* ftmp2[i] < 2^106 */
1127
1128    /* delta = z^2 */
1129    felem_square(tmp, z_in);
1130    felem_reduce(delta, tmp);
1131    /* delta[i] < 2^101 */
1132
1133    /* gamma = y^2 */
1134    felem_square(tmp, y_in);
1135    felem_reduce(gamma, tmp);
1136    /* gamma[i] < 2^101 */
1137    felem_shrink(small1, gamma);
1138
1139    /* beta = x*gamma */
1140    felem_small_mul(tmp, small1, x_in);
1141    felem_reduce(beta, tmp);
1142    /* beta[i] < 2^101 */
1143
1144    /* alpha = 3*(x-delta)*(x+delta) */
1145    felem_diff(ftmp, delta);
1146    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1147    felem_sum(ftmp2, delta);
1148    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1149    felem_scalar(ftmp2, 3);
1150    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1151    felem_mul(tmp, ftmp, ftmp2);
1152    felem_reduce(alpha, tmp);
1153    /* alpha[i] < 2^101 */
1154    felem_shrink(small2, alpha);
1155
1156    /* x' = alpha^2 - 8*beta */
1157    smallfelem_square(tmp, small2);
1158    felem_reduce(x_out, tmp);
1159    felem_assign(ftmp, beta);
1160    felem_scalar(ftmp, 8);
1161    /* ftmp[i] < 8 * 2^101 = 2^104 */
1162    felem_diff(x_out, ftmp);
1163    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1164
1165    /* z' = (y + z)^2 - gamma - delta */
1166    felem_sum(delta, gamma);
1167    /* delta[i] < 2^101 + 2^101 = 2^102 */
1168    felem_assign(ftmp, y_in);
1169    felem_sum(ftmp, z_in);
1170    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1171    felem_square(tmp, ftmp);
1172    felem_reduce(z_out, tmp);
1173    felem_diff(z_out, delta);
1174    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1175
1176    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1177    felem_scalar(beta, 4);
1178    /* beta[i] < 4 * 2^101 = 2^103 */
1179    felem_diff_zero107(beta, x_out);
1180    /* beta[i] < 2^107 + 2^103 < 2^108 */
1181    felem_small_mul(tmp, small2, beta);
1182    /* tmp[i] < 7 * 2^64 < 2^67 */
1183    smallfelem_square(tmp2, small1);
1184    /* tmp2[i] < 7 * 2^64 */
1185    longfelem_scalar(tmp2, 8);
1186    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1187    longfelem_diff(tmp, tmp2);
1188    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1189    felem_reduce_zero105(y_out, tmp);
1190    /* y_out[i] < 2^106 */
1191}
1192
1193/*
1194 * point_double_small is the same as point_double, except that it operates on
1195 * smallfelems
1196 */
1197static void
1198point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1199                   const smallfelem x_in, const smallfelem y_in,
1200                   const smallfelem z_in)
1201{
1202    felem felem_x_out, felem_y_out, felem_z_out;
1203    felem felem_x_in, felem_y_in, felem_z_in;
1204
1205    smallfelem_expand(felem_x_in, x_in);
1206    smallfelem_expand(felem_y_in, y_in);
1207    smallfelem_expand(felem_z_in, z_in);
1208    point_double(felem_x_out, felem_y_out, felem_z_out,
1209                 felem_x_in, felem_y_in, felem_z_in);
1210    felem_shrink(x_out, felem_x_out);
1211    felem_shrink(y_out, felem_y_out);
1212    felem_shrink(z_out, felem_z_out);
1213}
1214
1215/* copy_conditional copies in to out iff mask is all ones. */
1216static void copy_conditional(felem out, const felem in, limb mask)
1217{
1218    unsigned i;
1219    for (i = 0; i < NLIMBS; ++i) {
1220        const limb tmp = mask & (in[i] ^ out[i]);
1221        out[i] ^= tmp;
1222    }
1223}
1224
1225/* copy_small_conditional copies in to out iff mask is all ones. */
1226static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1227{
1228    unsigned i;
1229    const u64 mask64 = mask;
1230    for (i = 0; i < NLIMBS; ++i) {
1231        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1232    }
1233}
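
/*
 * Illustrative sketch (editor's addition): copy_conditional is a
 * constant-time select; with mask all-ones the XOR trick replaces out by in,
 * with mask zero it leaves out unchanged, and the same instructions run
 * either way. A hypothetical wrapper choosing between two field elements:
 */
# if 0
static void felem_select(felem out, const felem a, const felem b, limb mask)
{
    /* out = a if mask is all-ones, b if mask is all-zeros */
    felem_assign(out, b);
    copy_conditional(out, a, mask);
}
# endif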
1234
1235/*-
1236 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1237 *
1238 * The method is taken from:
1239 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1240 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1241 *
1242 * This function includes a branch for checking whether the two input points
1243 * are equal (while not equal to the point at infinity). This case never
1244 * happens during single point multiplication, so there is no timing leak for
1245 * ECDH or ECDSA signing.
1246 */
1247static void point_add(felem x3, felem y3, felem z3,
1248                      const felem x1, const felem y1, const felem z1,
1249                      const int mixed, const smallfelem x2,
1250                      const smallfelem y2, const smallfelem z2)
1251{
1252    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1253    longfelem tmp, tmp2;
1254    smallfelem small1, small2, small3, small4, small5;
1255    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1256
1257    felem_shrink(small3, z1);
1258
1259    z1_is_zero = smallfelem_is_zero(small3);
1260    z2_is_zero = smallfelem_is_zero(z2);
1261
1262    /* ftmp = z1z1 = z1**2 */
1263    smallfelem_square(tmp, small3);
1264    felem_reduce(ftmp, tmp);
1265    /* ftmp[i] < 2^101 */
1266    felem_shrink(small1, ftmp);
1267
1268    if (!mixed) {
1269        /* ftmp2 = z2z2 = z2**2 */
1270        smallfelem_square(tmp, z2);
1271        felem_reduce(ftmp2, tmp);
1272        /* ftmp2[i] < 2^101 */
1273        felem_shrink(small2, ftmp2);
1274
1275        felem_shrink(small5, x1);
1276
1277        /* u1 = ftmp3 = x1*z2z2 */
1278        smallfelem_mul(tmp, small5, small2);
1279        felem_reduce(ftmp3, tmp);
1280        /* ftmp3[i] < 2^101 */
1281
1282        /* ftmp5 = z1 + z2 */
1283        felem_assign(ftmp5, z1);
1284        felem_small_sum(ftmp5, z2);
1285        /* ftmp5[i] < 2^107 */
1286
1287        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1288        felem_square(tmp, ftmp5);
1289        felem_reduce(ftmp5, tmp);
1290        /* ftmp2 = z2z2 + z1z1 */
1291        felem_sum(ftmp2, ftmp);
1292        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1293        felem_diff(ftmp5, ftmp2);
1294        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1295
1296        /* ftmp2 = z2 * z2z2 */
1297        smallfelem_mul(tmp, small2, z2);
1298        felem_reduce(ftmp2, tmp);
1299
1300        /* s1 = ftmp6 = y1 * z2**3 */
1301        felem_mul(tmp, y1, ftmp2);
1302        felem_reduce(ftmp6, tmp);
1303        /* ftmp6[i] < 2^101 */
1304    } else {
1305        /*
1306         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1307         */
1308
1309        /* u1 = ftmp3 = x1*z2z2 */
1310        felem_assign(ftmp3, x1);
1311        /* ftmp3[i] < 2^106 */
1312
1313        /* ftmp5 = 2z1z2 */
1314        felem_assign(ftmp5, z1);
1315        felem_scalar(ftmp5, 2);
1316        /* ftmp5[i] < 2*2^106 = 2^107 */
1317
1318        /* s1 = ftmp6 = y1 * z2**3 */
1319        felem_assign(ftmp6, y1);
1320        /* ftmp6[i] < 2^106 */
1321    }
1322
1323    /* u2 = x2*z1z1 */
1324    smallfelem_mul(tmp, x2, small1);
1325    felem_reduce(ftmp4, tmp);
1326
1327    /* h = ftmp4 = u2 - u1 */
1328    felem_diff_zero107(ftmp4, ftmp3);
1329    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1330    felem_shrink(small4, ftmp4);
1331
1332    x_equal = smallfelem_is_zero(small4);
1333
1334    /* z_out = ftmp5 * h */
1335    felem_small_mul(tmp, small4, ftmp5);
1336    felem_reduce(z_out, tmp);
1337    /* z_out[i] < 2^101 */
1338
1339    /* ftmp = z1 * z1z1 */
1340    smallfelem_mul(tmp, small1, small3);
1341    felem_reduce(ftmp, tmp);
1342
1343    /* s2 = tmp = y2 * z1**3 */
1344    felem_small_mul(tmp, y2, ftmp);
1345    felem_reduce(ftmp5, tmp);
1346
1347    /* r = ftmp5 = (s2 - s1)*2 */
1348    felem_diff_zero107(ftmp5, ftmp6);
1349    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1350    felem_scalar(ftmp5, 2);
1351    /* ftmp5[i] < 2^109 */
1352    felem_shrink(small1, ftmp5);
1353    y_equal = smallfelem_is_zero(small1);
1354
1355    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1356        point_double(x3, y3, z3, x1, y1, z1);
1357        return;
1358    }
1359
1360    /* I = ftmp = (2h)**2 */
1361    felem_assign(ftmp, ftmp4);
1362    felem_scalar(ftmp, 2);
1363    /* ftmp[i] < 2*2^108 = 2^109 */
1364    felem_square(tmp, ftmp);
1365    felem_reduce(ftmp, tmp);
1366
1367    /* J = ftmp2 = h * I */
1368    felem_mul(tmp, ftmp4, ftmp);
1369    felem_reduce(ftmp2, tmp);
1370
1371    /* V = ftmp4 = U1 * I */
1372    felem_mul(tmp, ftmp3, ftmp);
1373    felem_reduce(ftmp4, tmp);
1374
1375    /* x_out = r**2 - J - 2V */
1376    smallfelem_square(tmp, small1);
1377    felem_reduce(x_out, tmp);
1378    felem_assign(ftmp3, ftmp4);
1379    felem_scalar(ftmp4, 2);
1380    felem_sum(ftmp4, ftmp2);
1381    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1382    felem_diff(x_out, ftmp4);
1383    /* x_out[i] < 2^105 + 2^101 */
1384
1385    /* y_out = r(V-x_out) - 2 * s1 * J */
1386    felem_diff_zero107(ftmp3, x_out);
1387    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1388    felem_small_mul(tmp, small1, ftmp3);
1389    felem_mul(tmp2, ftmp6, ftmp2);
1390    longfelem_scalar(tmp2, 2);
1391    /* tmp2[i] < 2*2^67 = 2^68 */
1392    longfelem_diff(tmp, tmp2);
1393    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1394    felem_reduce_zero105(y_out, tmp);
1395    /* y_out[i] < 2^106 */
1396
1397    copy_small_conditional(x_out, x2, z1_is_zero);
1398    copy_conditional(x_out, x1, z2_is_zero);
1399    copy_small_conditional(y_out, y2, z1_is_zero);
1400    copy_conditional(y_out, y1, z2_is_zero);
1401    copy_small_conditional(z_out, z2, z1_is_zero);
1402    copy_conditional(z_out, z1, z2_is_zero);
1403    felem_assign(x3, x_out);
1404    felem_assign(y3, y_out);
1405    felem_assign(z3, z_out);
1406}
1407
1408/*
1409 * point_add_small is the same as point_add, except that it operates on
1410 * smallfelems
1411 */
1412static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1413                            smallfelem x1, smallfelem y1, smallfelem z1,
1414                            smallfelem x2, smallfelem y2, smallfelem z2)
1415{
1416    felem felem_x3, felem_y3, felem_z3;
1417    felem felem_x1, felem_y1, felem_z1;
1418    smallfelem_expand(felem_x1, x1);
1419    smallfelem_expand(felem_y1, y1);
1420    smallfelem_expand(felem_z1, z1);
1421    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1422              x2, y2, z2);
1423    felem_shrink(x3, felem_x3);
1424    felem_shrink(y3, felem_y3);
1425    felem_shrink(z3, felem_z3);
1426}
1427
1428/*-
1429 * Base point pre computation
1430 * --------------------------
1431 *
1432 * Two different sorts of precomputed tables are used in the following code.
1433 * Each contain various points on the curve, where each point is three field
1434 * elements (x, y, z).
1435 *
1436 * For the base point table, z is usually 1 (0 for the point at infinity).
1437 * This table has 2 * 16 elements, starting with the following:
1438 * index | bits    | point
1439 * ------+---------+------------------------------
1440 *     0 | 0 0 0 0 | 0G
1441 *     1 | 0 0 0 1 | 1G
1442 *     2 | 0 0 1 0 | 2^64G
1443 *     3 | 0 0 1 1 | (2^64 + 1)G
1444 *     4 | 0 1 0 0 | 2^128G
1445 *     5 | 0 1 0 1 | (2^128 + 1)G
1446 *     6 | 0 1 1 0 | (2^128 + 2^64)G
1447 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1448 *     8 | 1 0 0 0 | 2^192G
1449 *     9 | 1 0 0 1 | (2^192 + 1)G
1450 *    10 | 1 0 1 0 | (2^192 + 2^64)G
1451 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1452 *    12 | 1 1 0 0 | (2^192 + 2^128)G
1453 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1454 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1455 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1456 * followed by a copy of this with each element multiplied by 2^32.
1457 *
1458 * The reason for this is so that we can clock bits into four different
1459 * locations when doing simple scalar multiplies against the base point,
1460 * and then another four locations using the second 16 elements.
1461 *
1462 * Tables for other points have table[i] = iG for i in 0 .. 16. */
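
/*
 * Worked example (editor's addition): in batch_mul below, round i of the
 * last 32 rounds builds one 4-bit index from scalar bits (i+192, i+128,
 * i+64, i) to select from the first sub-table, and another from bits
 * (i+224, i+160, i+96, i+32) to select from the second (already multiplied
 * by 2^32) sub-table. For instance, index 0101 selects (2^128 + 1)G from
 * the first table and (2^128 + 1)*2^32*G from the second.
 */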
1463
1464/* gmul is the table of precomputed base points */
1465static const smallfelem gmul[2][16][3] = {
1466    {{{0, 0, 0, 0},
1467      {0, 0, 0, 0},
1468      {0, 0, 0, 0}},
1469     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1470       0x6b17d1f2e12c4247},
1471      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1472       0x4fe342e2fe1a7f9b},
1473      {1, 0, 0, 0}},
1474     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1475       0x0fa822bc2811aaa5},
1476      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1477       0xbff44ae8f5dba80d},
1478      {1, 0, 0, 0}},
1479     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1480       0x300a4bbc89d6726f},
1481      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1482       0x72aac7e0d09b4644},
1483      {1, 0, 0, 0}},
1484     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1485       0x447d739beedb5e67},
1486      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1487       0x2d4825ab834131ee},
1488      {1, 0, 0, 0}},
1489     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1490       0xef9519328a9c72ff},
1491      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1492       0x611e9fc37dbb2c9b},
1493      {1, 0, 0, 0}},
1494     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1495       0x550663797b51f5d8},
1496      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1497       0x157164848aecb851},
1498      {1, 0, 0, 0}},
1499     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1500       0xeb5d7745b21141ea},
1501      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1502       0xeafd72ebdbecc17b},
1503      {1, 0, 0, 0}},
1504     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1505       0xa6d39677a7849276},
1506      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1507       0x674f84749b0b8816},
1508      {1, 0, 0, 0}},
1509     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1510       0x4e769e7672c9ddad},
1511      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1512       0x42b99082de830663},
1513      {1, 0, 0, 0}},
1514     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1515       0x78878ef61c6ce04d},
1516      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1517       0xb6cb3f5d7b72c321},
1518      {1, 0, 0, 0}},
1519     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1520       0x0c88bc4d716b1287},
1521      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1522       0xdd5ddea3f3901dc6},
1523      {1, 0, 0, 0}},
1524     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1525       0x68f344af6b317466},
1526      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1527       0x31b9c405f8540a20},
1528      {1, 0, 0, 0}},
1529     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1530       0x4052bf4b6f461db9},
1531      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1532       0xfecf4d5190b0fc61},
1533      {1, 0, 0, 0}},
1534     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1535       0x1eddbae2c802e41a},
1536      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1537       0x43104d86560ebcfc},
1538      {1, 0, 0, 0}},
1539     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1540       0xb48e26b484f7a21c},
1541      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1542       0xfac015404d4d3dab},
1543      {1, 0, 0, 0}}},
1544    {{{0, 0, 0, 0},
1545      {0, 0, 0, 0},
1546      {0, 0, 0, 0}},
1547     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1548       0x7fe36b40af22af89},
1549      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1550       0xe697d45825b63624},
1551      {1, 0, 0, 0}},
1552     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1553       0x4a5b506612a677a6},
1554      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1555       0xeb13461ceac089f1},
1556      {1, 0, 0, 0}},
1557     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1558       0x0781b8291c6a220a},
1559      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1560       0x690cde8df0151593},
1561      {1, 0, 0, 0}},
1562     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1563       0x8a535f566ec73617},
1564      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1565       0x0455c08468b08bd7},
1566      {1, 0, 0, 0}},
1567     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1568       0x06bada7ab77f8276},
1569      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1570       0x5b476dfd0e6cb18a},
1571      {1, 0, 0, 0}},
1572     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1573       0x3e29864e8a2ec908},
1574      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1575       0x239b90ea3dc31e7e},
1576      {1, 0, 0, 0}},
1577     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1578       0x820f4dd949f72ff7},
1579      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1580       0x140406ec783a05ec},
1581      {1, 0, 0, 0}},
1582     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1583       0x68f6b8542783dfee},
1584      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1585       0xcbe1feba92e40ce6},
1586      {1, 0, 0, 0}},
1587     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1588       0xd0b2f94d2f420109},
1589      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1590       0x971459828b0719e5},
1591      {1, 0, 0, 0}},
1592     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1593       0x961610004a866aba},
1594      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1595       0x7acb9fadcee75e44},
1596      {1, 0, 0, 0}},
1597     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1598       0x24eb9acca333bf5b},
1599      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1600       0x69f891c5acd079cc},
1601      {1, 0, 0, 0}},
1602     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1603       0xe51f547c5972a107},
1604      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1605       0x1c309a2b25bb1387},
1606      {1, 0, 0, 0}},
1607     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1608       0x20b87b8aa2c4e503},
1609      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1610       0xf5c6fa49919776be},
1611      {1, 0, 0, 0}},
1612     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1613       0x1ed7d1b9332010b9},
1614      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1615       0x3a2b03f03217257a},
1616      {1, 0, 0, 0}},
1617     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1618       0x15fee545c78dd9f6},
1619      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1620       0x4ab5b6b2b8753f81},
1621      {1, 0, 0, 0}}}
1622};
1623
1624/*
1625 * select_point selects the |idx|th point from a precomputation table and
1626 * copies it to out.
1627 */
1628static void select_point(const u64 idx, unsigned int size,
1629                         const smallfelem pre_comp[16][3], smallfelem out[3])
1630{
1631    unsigned i, j;
1632    u64 *outlimbs = &out[0][0];
1633    memset(outlimbs, 0, 3 * sizeof(smallfelem));
1634
1635    for (i = 0; i < size; i++) {
1636        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1637        u64 mask = i ^ idx;
1638        mask |= mask >> 4;
1639        mask |= mask >> 2;
1640        mask |= mask >> 1;
1641        mask &= 1;
1642        mask--;
1643        for (j = 0; j < NLIMBS * 3; j++)
1644            outlimbs[j] |= inlimbs[j] & mask;
1645    }
1646}
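
/*
 * For reference: in select_point, "i ^ idx" is zero only for the wanted
 * entry; OR-folding its bits into bit 0 and then subtracting one turns that
 * into mask == all-ones for i == idx and mask == 0 otherwise, so every
 * table entry is read but only the selected one is kept.
 */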
1647
1648/* get_bit returns the |i|th bit in |in| */
1649static char get_bit(const felem_bytearray in, int i)
1650{
1651    if ((i < 0) || (i >= 256))
1652        return 0;
1653    return (in[i >> 3] >> (i & 7)) & 1;
1654}
1655
1656/*
1657 * Interleaved point multiplication using precomputed point multiples: The
1658 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1659 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1660 * generator, using certain (large) precomputed multiples in g_pre_comp.
1661 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1662 */
1663static void batch_mul(felem x_out, felem y_out, felem z_out,
1664                      const felem_bytearray scalars[],
1665                      const unsigned num_points, const u8 *g_scalar,
1666                      const int mixed, const smallfelem pre_comp[][17][3],
1667                      const smallfelem g_pre_comp[2][16][3])
1668{
1669    int i, skip;
1670    unsigned num, gen_mul = (g_scalar != NULL);
1671    felem nq[3], ftmp;
1672    smallfelem tmp[3];
1673    u64 bits;
1674    u8 sign, digit;
1675
1676    /* set nq to the point at infinity */
1677    memset(nq, 0, 3 * sizeof(felem));
1678
1679    /*
1680     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1681     * of the generator (two in each of the last 32 rounds) and additions of
1682     * other points multiples (every 5th round).
1683     */
1684    skip = 1;                   /* save two point operations in the first
1685                                 * round */
1686    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1687        /* double */
1688        if (!skip)
1689            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1690
1691        /* add multiples of the generator */
1692        if (gen_mul && (i <= 31)) {
1693            /* first, look 32 bits upwards */
1694            bits = get_bit(g_scalar, i + 224) << 3;
1695            bits |= get_bit(g_scalar, i + 160) << 2;
1696            bits |= get_bit(g_scalar, i + 96) << 1;
1697            bits |= get_bit(g_scalar, i + 32);
1698            /* select the point to add, in constant time */
1699            select_point(bits, 16, g_pre_comp[1], tmp);
1700
1701            if (!skip) {
1702                /* Arg 1 below is for "mixed" */
1703                point_add(nq[0], nq[1], nq[2],
1704                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1705            } else {
1706                smallfelem_expand(nq[0], tmp[0]);
1707                smallfelem_expand(nq[1], tmp[1]);
1708                smallfelem_expand(nq[2], tmp[2]);
1709                skip = 0;
1710            }
1711
1712            /* second, look at the current position */
1713            bits = get_bit(g_scalar, i + 192) << 3;
1714            bits |= get_bit(g_scalar, i + 128) << 2;
1715            bits |= get_bit(g_scalar, i + 64) << 1;
1716            bits |= get_bit(g_scalar, i);
1717            /* select the point to add, in constant time */
1718            select_point(bits, 16, g_pre_comp[0], tmp);
1719            /* Arg 1 below is for "mixed" */
1720            point_add(nq[0], nq[1], nq[2],
1721                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1722        }
1723
1724        /* do other additions every 5 doublings */
1725        if (num_points && (i % 5 == 0)) {
1726            /* loop over all scalars */
1727            for (num = 0; num < num_points; ++num) {
1728                bits = get_bit(scalars[num], i + 4) << 5;
1729                bits |= get_bit(scalars[num], i + 3) << 4;
1730                bits |= get_bit(scalars[num], i + 2) << 3;
1731                bits |= get_bit(scalars[num], i + 1) << 2;
1732                bits |= get_bit(scalars[num], i) << 1;
1733                bits |= get_bit(scalars[num], i - 1);
1734                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1735
1736                /*
1737                 * select the point to add or subtract, in constant time
1738                 */
1739                select_point(digit, 17, pre_comp[num], tmp);
1740                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1741                                               * point */
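                /* if sign == 0, copy +Y back so only negative digits use -Y */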
1742                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1743                felem_contract(tmp[1], ftmp);
1744
1745                if (!skip) {
1746                    point_add(nq[0], nq[1], nq[2],
1747                              nq[0], nq[1], nq[2],
1748                              mixed, tmp[0], tmp[1], tmp[2]);
1749                } else {
1750                    smallfelem_expand(nq[0], tmp[0]);
1751                    smallfelem_expand(nq[1], tmp[1]);
1752                    smallfelem_expand(nq[2], tmp[2]);
1753                    skip = 0;
1754                }
1755            }
1756        }
1757    }
1758    felem_assign(x_out, nq[0]);
1759    felem_assign(y_out, nq[1]);
1760    felem_assign(z_out, nq[2]);
1761}
1762
1763/*-
 * Precomputation for the group generator: g_pre_comp[0][i] is the sum of the
 * multiples G, 2^64*G, 2^128*G, 2^192*G selected by the bits of i, and
 * g_pre_comp[1][i] is the analogous combination of 2^32*G, 2^96*G, 2^160*G
 * and 2^224*G; index 0 is the point at infinity.
 */
1764typedef struct {
1765    smallfelem g_pre_comp[2][16][3];
1766    int references;
1767} NISTP256_PRE_COMP;
1768
1769const EC_METHOD *EC_GFp_nistp256_method(void)
1770{
1771    static const EC_METHOD ret = {
1772        EC_FLAGS_DEFAULT_OCT,
1773        NID_X9_62_prime_field,
1774        ec_GFp_nistp256_group_init,
1775        ec_GFp_simple_group_finish,
1776        ec_GFp_simple_group_clear_finish,
1777        ec_GFp_nist_group_copy,
1778        ec_GFp_nistp256_group_set_curve,
1779        ec_GFp_simple_group_get_curve,
1780        ec_GFp_simple_group_get_degree,
1781        ec_GFp_simple_group_check_discriminant,
1782        ec_GFp_simple_point_init,
1783        ec_GFp_simple_point_finish,
1784        ec_GFp_simple_point_clear_finish,
1785        ec_GFp_simple_point_copy,
1786        ec_GFp_simple_point_set_to_infinity,
1787        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1788        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1789        ec_GFp_simple_point_set_affine_coordinates,
1790        ec_GFp_nistp256_point_get_affine_coordinates,
1791        0 /* point_set_compressed_coordinates */ ,
1792        0 /* point2oct */ ,
1793        0 /* oct2point */ ,
1794        ec_GFp_simple_add,
1795        ec_GFp_simple_dbl,
1796        ec_GFp_simple_invert,
1797        ec_GFp_simple_is_at_infinity,
1798        ec_GFp_simple_is_on_curve,
1799        ec_GFp_simple_cmp,
1800        ec_GFp_simple_make_affine,
1801        ec_GFp_simple_points_make_affine,
1802        ec_GFp_nistp256_points_mul,
1803        ec_GFp_nistp256_precompute_mult,
1804        ec_GFp_nistp256_have_precompute_mult,
1805        ec_GFp_nist_field_mul,
1806        ec_GFp_nist_field_sqr,
1807        0 /* field_div */ ,
1808        0 /* field_encode */ ,
1809        0 /* field_decode */ ,
1810        0                       /* field_set_to_one */
1811    };
1812
1813    return &ret;
1814}
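/*-
 * Illustrative use (a sketch, not part of this translation unit): callers
 * normally reach this method indirectly, e.g. via
 * EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1) in builds where
 * OPENSSL_NO_EC_NISTP_64_GCC_128 is not defined, but a group can also be
 * built from it explicitly.  |p|, |a|, |b|, |gen|, |order| and |ctx| below
 * are assumed to already hold the P-256 parameters:
 *
 *     EC_GROUP *grp = EC_GROUP_new(EC_GFp_nistp256_method());
 *     if (grp == NULL
 *         || !EC_GROUP_set_curve_GFp(grp, p, a, b, ctx)
 *         || !EC_GROUP_set_generator(grp, gen, order, BN_value_one()))
 *         goto err;
 */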
1815
1816/******************************************************************************/
1817/*
1818 * FUNCTIONS TO MANAGE PRECOMPUTATION
1819 */
1820
1821static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1822{
1823    NISTP256_PRE_COMP *ret = NULL;
1824    ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof(*ret));
1825    if (!ret) {
1826        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1827        return ret;
1828    }
1829    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1830    ret->references = 1;
1831    return ret;
1832}
1833
1834static void *nistp256_pre_comp_dup(void *src_)
1835{
1836    NISTP256_PRE_COMP *src = src_;
1837
1838    /* no need to actually copy, these objects never change! */
1839    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1840
1841    return src_;
1842}
1843
1844static void nistp256_pre_comp_free(void *pre_)
1845{
1846    int i;
1847    NISTP256_PRE_COMP *pre = pre_;
1848
1849    if (!pre)
1850        return;
1851
1852    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1853    if (i > 0)
1854        return;
1855
1856    OPENSSL_free(pre);
1857}
1858
1859static void nistp256_pre_comp_clear_free(void *pre_)
1860{
1861    int i;
1862    NISTP256_PRE_COMP *pre = pre_;
1863
1864    if (!pre)
1865        return;
1866
1867    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1868    if (i > 0)
1869        return;
1870
1871    OPENSSL_cleanse(pre, sizeof(*pre));
1872    OPENSSL_free(pre);
1873}
1874
1875/******************************************************************************/
1876/*
1877 * OPENSSL EC_METHOD FUNCTIONS
1878 */
1879
1880int ec_GFp_nistp256_group_init(EC_GROUP *group)
1881{
1882    int ret;
1883    ret = ec_GFp_simple_group_init(group);
1884    group->a_is_minus3 = 1;
1885    return ret;
1886}
1887
1888int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1889                                    const BIGNUM *a, const BIGNUM *b,
1890                                    BN_CTX *ctx)
1891{
1892    int ret = 0;
1893    BN_CTX *new_ctx = NULL;
1894    BIGNUM *curve_p, *curve_a, *curve_b;
1895
1896    if (ctx == NULL)
1897        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1898            return 0;
1899    BN_CTX_start(ctx);
1900    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1901        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1902        ((curve_b = BN_CTX_get(ctx)) == NULL))
1903        goto err;
1904    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1905    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1906    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1907    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1908        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1909              EC_R_WRONG_CURVE_PARAMETERS);
1910        goto err;
1911    }
1912    group->field_mod_func = BN_nist_mod_256;
1913    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1914 err:
1915    BN_CTX_end(ctx);
1916    if (new_ctx != NULL)
1917        BN_CTX_free(new_ctx);
1918    return ret;
1919}
1920
1921/*
1922 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1923 * (X/Z^2, Y/Z^3)
1924 */
1925int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1926                                                 const EC_POINT *point,
1927                                                 BIGNUM *x, BIGNUM *y,
1928                                                 BN_CTX *ctx)
1929{
1930    felem z1, z2, x_in, y_in;
1931    smallfelem x_out, y_out;
1932    longfelem tmp;
1933
1934    if (EC_POINT_is_at_infinity(group, point)) {
1935        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1936              EC_R_POINT_AT_INFINITY);
1937        return 0;
1938    }
1939    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1940        (!BN_to_felem(z1, &point->Z)))
1941        return 0;
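    /* z2 = Z^-1; then z1 = Z^-2 and x = X * Z^-2 */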
1942    felem_inv(z2, z1);
1943    felem_square(tmp, z2);
1944    felem_reduce(z1, tmp);
1945    felem_mul(tmp, x_in, z1);
1946    felem_reduce(x_in, tmp);
1947    felem_contract(x_out, x_in);
1948    if (x != NULL) {
1949        if (!smallfelem_to_BN(x, x_out)) {
1950            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1951                  ERR_R_BN_LIB);
1952            return 0;
1953        }
1954    }
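    /* z1 = Z^-2 * Z^-1 = Z^-3; then y = Y * Z^-3 */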
1955    felem_mul(tmp, z1, z2);
1956    felem_reduce(z1, tmp);
1957    felem_mul(tmp, y_in, z1);
1958    felem_reduce(y_in, tmp);
1959    felem_contract(y_out, y_in);
1960    if (y != NULL) {
1961        if (!smallfelem_to_BN(y, y_out)) {
1962            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1963                  ERR_R_BN_LIB);
1964            return 0;
1965        }
1966    }
1967    return 1;
1968}
1969
1970/* |points| below is of size |num|, and |tmp_smallfelems| is of size |num+1| */
1971static void make_points_affine(size_t num, smallfelem points[][3],
1972                               smallfelem tmp_smallfelems[])
1973{
1974    /*
1975     * Runs in constant time, unless an input is the point at infinity (which
1976     * normally shouldn't happen).
1977     */
1978    ec_GFp_nistp_points_make_affine_internal(num,
1979                                             points,
1980                                             sizeof(smallfelem),
1981                                             tmp_smallfelems,
1982                                             (void (*)(void *))smallfelem_one,
1983                                             smallfelem_is_zero_int,
1984                                             (void (*)(void *, const void *))
1985                                             smallfelem_assign,
1986                                             (void (*)(void *, const void *))
1987                                             smallfelem_square_contract,
1988                                             (void (*)
1989                                              (void *, const void *,
1990                                               const void *))
1991                                             smallfelem_mul_contract,
1992                                             (void (*)(void *, const void *))
1993                                             smallfelem_inv_contract,
1994                                             /* nothing to contract */
1995                                             (void (*)(void *, const void *))
1996                                             smallfelem_assign);
1997}
1998
1999/*
2000 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2001 * values. The result is stored in r (r may equal one of the inputs).
2002 */
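/*-
 * Minimal usage sketch (not part of this file): with a group that uses this
 * method, the public API dispatches to this function, e.g.
 *
 *     const EC_POINT *pts[1] = { point };
 *     const BIGNUM *scs[1] = { k };
 *     if (!EC_POINTs_mul(group, result, g_scalar, 1, pts, scs, ctx))
 *         goto err;
 *
 * computes g_scalar*generator + k*point; |point|, |k|, |g_scalar|, |result|
 * and |ctx| are assumed to be set up by the caller (the scalars may be NULL).
 */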
2003int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2004                               const BIGNUM *scalar, size_t num,
2005                               const EC_POINT *points[],
2006                               const BIGNUM *scalars[], BN_CTX *ctx)
2007{
2008    int ret = 0;
2009    int j;
2010    int mixed = 0;
2011    BN_CTX *new_ctx = NULL;
2012    BIGNUM *x, *y, *z, *tmp_scalar;
2013    felem_bytearray g_secret;
2014    felem_bytearray *secrets = NULL;
2015    smallfelem(*pre_comp)[17][3] = NULL;
2016    smallfelem *tmp_smallfelems = NULL;
2017    felem_bytearray tmp;
2018    unsigned i, num_bytes;
2019    int have_pre_comp = 0;
2020    size_t num_points = num;
2021    smallfelem x_in, y_in, z_in;
2022    felem x_out, y_out, z_out;
2023    NISTP256_PRE_COMP *pre = NULL;
2024    const smallfelem(*g_pre_comp)[16][3] = NULL;
2025    EC_POINT *generator = NULL;
2026    const EC_POINT *p = NULL;
2027    const BIGNUM *p_scalar = NULL;
2028
2029    if (ctx == NULL)
2030        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2031            return 0;
2032    BN_CTX_start(ctx);
2033    if (((x = BN_CTX_get(ctx)) == NULL) ||
2034        ((y = BN_CTX_get(ctx)) == NULL) ||
2035        ((z = BN_CTX_get(ctx)) == NULL) ||
2036        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2037        goto err;
2038
2039    if (scalar != NULL) {
2040        pre = EC_EX_DATA_get_data(group->extra_data,
2041                                  nistp256_pre_comp_dup,
2042                                  nistp256_pre_comp_free,
2043                                  nistp256_pre_comp_clear_free);
2044        if (pre)
2045            /* we have precomputation, try to use it */
2046            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2047        else
2048            /* try to use the standard precomputation */
2049            g_pre_comp = &gmul[0];
2050        generator = EC_POINT_new(group);
2051        if (generator == NULL)
2052            goto err;
2053        /* get the generator from precomputation */
2054        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2055            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2056            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2057            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2058            goto err;
2059        }
2060        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2061                                                      generator, x, y, z,
2062                                                      ctx))
2063            goto err;
2064        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2065            /* precomputation matches generator */
2066            have_pre_comp = 1;
2067        else
2068            /*
2069             * we don't have valid precomputation: treat the generator as a
2070             * random point
2071             */
2072            num_points++;
2073    }
2074    if (num_points > 0) {
2075        if (num_points >= 3) {
2076            /*
2077             * unless we precompute multiples for just one or two points,
2078             * converting those into affine form is time well spent
2079             */
2080            mixed = 1;
2081        }
2082        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2083        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2084        if (mixed)
2085            tmp_smallfelems =
2086                OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2087        if ((secrets == NULL) || (pre_comp == NULL)
2088            || (mixed && (tmp_smallfelems == NULL))) {
2089            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2090            goto err;
2091        }
2092
2093        /*
2094         * we treat NULL scalars as 0, and NULL points as points at infinity,
2095         * i.e., they contribute nothing to the linear combination
2096         */
2097        memset(secrets, 0, num_points * sizeof(felem_bytearray));
2098        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2099        for (i = 0; i < num_points; ++i) {
2100            if (i == num) {
2101                /*
2102                 * we didn't have a valid precomputation, so we pick the
2103                 * generator
2104                 */
2105                p = EC_GROUP_get0_generator(group);
2106                p_scalar = scalar;
2107            } else {
2108                /* the i^th point */
2109                p = points[i];
2110                p_scalar = scalars[i];
2111            }
2114            if ((p_scalar != NULL) && (p != NULL)) {
2115                /* reduce scalar to 0 <= scalar < 2^256 */
2116                if ((BN_num_bits(p_scalar) > 256)
2117                    || (BN_is_negative(p_scalar))) {
2118                    /*
2119                     * this is an unusual input, and we don't guarantee
2120                     * constant-timeness
2121                     */
2122                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2123                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2124                        goto err;
2125                    }
2126                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
2127                } else
2128                    num_bytes = BN_bn2bin(p_scalar, tmp);
2129                flip_endian(secrets[i], tmp, num_bytes);
2130                /* precompute multiples */
2131                if ((!BN_to_felem(x_out, &p->X)) ||
2132                    (!BN_to_felem(y_out, &p->Y)) ||
2133                    (!BN_to_felem(z_out, &p->Z)))
2134                    goto err;
2135                felem_shrink(pre_comp[i][1][0], x_out);
2136                felem_shrink(pre_comp[i][1][1], y_out);
2137                felem_shrink(pre_comp[i][1][2], z_out);
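                /*
                 * even j: pre_comp[i][j] = 2 * pre_comp[i][j/2];
                 * odd j:  pre_comp[i][j] = pre_comp[i][j-1] + pre_comp[i][1]
                 */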
2138                for (j = 2; j <= 16; ++j) {
2139                    if (j & 1) {
2140                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2141                                        pre_comp[i][j][2], pre_comp[i][1][0],
2142                                        pre_comp[i][1][1], pre_comp[i][1][2],
2143                                        pre_comp[i][j - 1][0],
2144                                        pre_comp[i][j - 1][1],
2145                                        pre_comp[i][j - 1][2]);
2146                    } else {
2147                        point_double_small(pre_comp[i][j][0],
2148                                           pre_comp[i][j][1],
2149                                           pre_comp[i][j][2],
2150                                           pre_comp[i][j / 2][0],
2151                                           pre_comp[i][j / 2][1],
2152                                           pre_comp[i][j / 2][2]);
2153                    }
2154                }
2155            }
2156        }
2157        if (mixed)
2158            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2159    }
2160
2161    /* the scalar for the generator */
2162    if ((scalar != NULL) && (have_pre_comp)) {
2163        memset(g_secret, 0, sizeof(g_secret));
2164        /* reduce scalar to 0 <= scalar < 2^256 */
2165        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2166            /*
2167             * this is an unusual input, and we don't guarantee
2168             * constant-timeness
2169             */
2170            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2171                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2172                goto err;
2173            }
2174            num_bytes = BN_bn2bin(tmp_scalar, tmp);
2175        } else
2176            num_bytes = BN_bn2bin(scalar, tmp);
2177        flip_endian(g_secret, tmp, num_bytes);
2178        /* do the multiplication with generator precomputation */
2179        batch_mul(x_out, y_out, z_out,
2180                  (const felem_bytearray(*))secrets, num_points,
2181                  g_secret,
2182                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2183    } else
2184        /* do the multiplication without generator precomputation */
2185        batch_mul(x_out, y_out, z_out,
2186                  (const felem_bytearray(*))secrets, num_points,
2187                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2188    /* reduce the output to its unique minimal representation */
2189    felem_contract(x_in, x_out);
2190    felem_contract(y_in, y_out);
2191    felem_contract(z_in, z_out);
2192    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2193        (!smallfelem_to_BN(z, z_in))) {
2194        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2195        goto err;
2196    }
2197    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2198
2199 err:
2200    BN_CTX_end(ctx);
2201    if (generator != NULL)
2202        EC_POINT_free(generator);
2203    if (new_ctx != NULL)
2204        BN_CTX_free(new_ctx);
2205    if (secrets != NULL)
2206        OPENSSL_free(secrets);
2207    if (pre_comp != NULL)
2208        OPENSSL_free(pre_comp);
2209    if (tmp_smallfelems != NULL)
2210        OPENSSL_free(tmp_smallfelems);
2211    return ret;
2212}
2213
2214int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2215{
2216    int ret = 0;
2217    NISTP256_PRE_COMP *pre = NULL;
2218    int i, j;
2219    BN_CTX *new_ctx = NULL;
2220    BIGNUM *x, *y;
2221    EC_POINT *generator = NULL;
2222    smallfelem tmp_smallfelems[32];
2223    felem x_tmp, y_tmp, z_tmp;
2224
2225    /* throw away old precomputation */
2226    EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2227                         nistp256_pre_comp_free,
2228                         nistp256_pre_comp_clear_free);
2229    if (ctx == NULL)
2230        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2231            return 0;
2232    BN_CTX_start(ctx);
2233    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2234        goto err;
2235    /* get the generator */
2236    if (group->generator == NULL)
2237        goto err;
2238    generator = EC_POINT_new(group);
2239    if (generator == NULL)
2240        goto err;
2241    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2242    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2243    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2244        goto err;
2245    if ((pre = nistp256_pre_comp_new()) == NULL)
2246        goto err;
2247    /*
2248     * if the generator is the standard one, use built-in precomputation
2249     */
2250    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2251        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2252        goto done;
2253    }
2254    if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2255        (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2256        (!BN_to_felem(z_tmp, &group->generator->Z)))
2257        goto err;
2258    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2259    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2260    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2261    /*
2262     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2263     * 2^160*G, 2^224*G for the second one
2264     */
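    /*
     * each pass: 32 doublings take g_pre_comp[0][i] to g_pre_comp[1][i],
     * and another 32 take g_pre_comp[1][i] to g_pre_comp[0][2*i]
     */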
2265    for (i = 1; i <= 8; i <<= 1) {
2266        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2267                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2268                           pre->g_pre_comp[0][i][1],
2269                           pre->g_pre_comp[0][i][2]);
2270        for (j = 0; j < 31; ++j) {
2271            point_double_small(pre->g_pre_comp[1][i][0],
2272                               pre->g_pre_comp[1][i][1],
2273                               pre->g_pre_comp[1][i][2],
2274                               pre->g_pre_comp[1][i][0],
2275                               pre->g_pre_comp[1][i][1],
2276                               pre->g_pre_comp[1][i][2]);
2277        }
2278        if (i == 8)
2279            break;
2280        point_double_small(pre->g_pre_comp[0][2 * i][0],
2281                           pre->g_pre_comp[0][2 * i][1],
2282                           pre->g_pre_comp[0][2 * i][2],
2283                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2284                           pre->g_pre_comp[1][i][2]);
2285        for (j = 0; j < 31; ++j) {
2286            point_double_small(pre->g_pre_comp[0][2 * i][0],
2287                               pre->g_pre_comp[0][2 * i][1],
2288                               pre->g_pre_comp[0][2 * i][2],
2289                               pre->g_pre_comp[0][2 * i][0],
2290                               pre->g_pre_comp[0][2 * i][1],
2291                               pre->g_pre_comp[0][2 * i][2]);
2292        }
2293    }
2294    for (i = 0; i < 2; i++) {
2295        /* g_pre_comp[i][0] is the point at infinity */
2296        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2297        /* the remaining multiples */
2298        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2299        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2300                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2301                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2302                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2303                        pre->g_pre_comp[i][2][2]);
2304        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2305        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2306                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2307                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2308                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2309                        pre->g_pre_comp[i][2][2]);
2310        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2311        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2312                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2313                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2314                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2315                        pre->g_pre_comp[i][4][2]);
2316        /*
2317         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2318         */
2319        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2320                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2321                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2322                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2323                        pre->g_pre_comp[i][2][2]);
2324        for (j = 1; j < 8; ++j) {
2325            /* odd multiples: add G resp. 2^32*G */
2326            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2327                            pre->g_pre_comp[i][2 * j + 1][1],
2328                            pre->g_pre_comp[i][2 * j + 1][2],
2329                            pre->g_pre_comp[i][2 * j][0],
2330                            pre->g_pre_comp[i][2 * j][1],
2331                            pre->g_pre_comp[i][2 * j][2],
2332                            pre->g_pre_comp[i][1][0],
2333                            pre->g_pre_comp[i][1][1],
2334                            pre->g_pre_comp[i][1][2]);
2335        }
2336    }
2337    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2338
2339 done:
2340    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2341                             nistp256_pre_comp_free,
2342                             nistp256_pre_comp_clear_free))
2343        goto err;
2344    ret = 1;
2345    pre = NULL;
2346 err:
2347    BN_CTX_end(ctx);
2348    if (generator != NULL)
2349        EC_POINT_free(generator);
2350    if (new_ctx != NULL)
2351        BN_CTX_free(new_ctx);
2352    if (pre)
2353        nistp256_pre_comp_free(pre);
2354    return ret;
2355}
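/*-
 * Usage sketch (not part of this file): the precomputation is normally
 * triggered through the method table, e.g.
 *
 *     if (!EC_GROUP_precompute_mult(group, ctx))
 *         goto err;
 *
 * after which EC_GROUP_have_precompute_mult(group) returns 1 and
 * ec_GFp_nistp256_points_mul() finds the stored NISTP256_PRE_COMP in
 * group->extra_data.
 */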
2356
2357int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2358{
2359    if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2360                            nistp256_pre_comp_free,
2361                            nistp256_pre_comp_clear_free)
2362        != NULL)
2363        return 1;
2364    else
2365        return 0;
2366}
2367#else
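/* avoid an empty translation unit when this implementation is compiled out */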
2368static void *dummy = &dummy;
2369#endif
2370