ecp_nistp256.c revision 280304
1/* crypto/ec/ecp_nistp256.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41
42# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
45                                 * platforms */
46typedef __int128_t int128_t;
47# else
48#  error "Need GCC 3.1 or later to define type uint128_t"
49# endif
50
51typedef uint8_t u8;
52typedef uint32_t u32;
53typedef uint64_t u64;
54typedef int64_t s64;
55
56/*
57 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
58 * can serialise an element of this field into 32 bytes. We call this an
59 * felem_bytearray.
60 */
61
62typedef u8 felem_bytearray[32];
63
64/*
65 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
66 * values are big-endian.
67 */
68static const felem_bytearray nistp256_curve_params[5] = {
69    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
70     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
71     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
73    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
74     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
75     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
76     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
77    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
78     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
79     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
80     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
81    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
82     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
83     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
84     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
85    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
86     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
87     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
88     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
89};
90
91/*-
92 * The representation of field elements.
93 * ------------------------------------
94 *
95 * We represent field elements with either four 128-bit values, eight 128-bit
96 * values, or four 64-bit values. The field element represented is:
97 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
98 * or:
99 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
100 *
101 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
102 * apart, but are 128-bits wide, the most significant bits of each limb overlap
103 * with the least significant bits of the next.
104 *
105 * A field element with four limbs is an 'felem'. One with eight limbs is a
106 * 'longfelem'
107 *
108 * A field element with four 64-bit values is called a 'smallfelem'. Small
109 * values are used as intermediate values before multiplication.
110 */
111
112# define NLIMBS 4
113
114typedef uint128_t limb;
115typedef limb felem[NLIMBS];
116typedef limb longfelem[NLIMBS * 2];
117typedef u64 smallfelem[NLIMBS];
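/*-
 * For intuition, a worked example of this representation: the value
 * 2^130 + 5 could be held in an felem as
 *   {5, 0, 4, 0}                  since 4*2^128 = 2^130,
 * or, exploiting the 64 bits of overlap between adjacent limbs, as
 *   {5, ((limb)1) << 66, 0, 0}    since 2^66 * 2^64 = 2^130.
 * Both denote the same field element; the reduction routines below move
 * such redundant forms back towards a canonical one.
 */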
118
119/* This is the value of the prime as four 64-bit words, little-endian. */
120static const u64 kPrime[4] =
121    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
122static const u64 bottom63bits = 0x7ffffffffffffffful;
123
124/*
125 * bin32_to_felem takes a little-endian byte array and converts it into felem
126 * form. This assumes that the CPU is little-endian.
127 */
128static void bin32_to_felem(felem out, const u8 in[32])
129{
130    out[0] = *((u64 *)&in[0]);
131    out[1] = *((u64 *)&in[8]);
132    out[2] = *((u64 *)&in[16]);
133    out[3] = *((u64 *)&in[24]);
134}
135
136/*
137 * smallfelem_to_bin32 takes a smallfelem and serialises it into a little
138 * endian, 32 byte array. This assumes that the CPU is little-endian.
139 */
140static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
141{
142    *((u64 *)&out[0]) = in[0];
143    *((u64 *)&out[8]) = in[1];
144    *((u64 *)&out[16]) = in[2];
145    *((u64 *)&out[24]) = in[3];
146}
147
148/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
149static void flip_endian(u8 *out, const u8 *in, unsigned len)
150{
151    unsigned i;
152    for (i = 0; i < len; ++i)
153        out[i] = in[len - 1 - i];
154}
155
156/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
157static int BN_to_felem(felem out, const BIGNUM *bn)
158{
159    felem_bytearray b_in;
160    felem_bytearray b_out;
161    unsigned num_bytes;
162
163    /* BN_bn2bin eats leading zeroes */
164    memset(b_out, 0, sizeof b_out);
165    num_bytes = BN_num_bytes(bn);
166    if (num_bytes > sizeof b_out) {
167        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
168        return 0;
169    }
170    if (BN_is_negative(bn)) {
171        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
172        return 0;
173    }
174    num_bytes = BN_bn2bin(bn, b_in);
175    flip_endian(b_out, b_in, num_bytes);
176    bin32_to_felem(out, b_out);
177    return 1;
178}
179
180/* smallfelem_to_BN converts a smallfelem into an OpenSSL BIGNUM */
181static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
182{
183    felem_bytearray b_in, b_out;
184    smallfelem_to_bin32(b_in, in);
185    flip_endian(b_out, b_in, sizeof b_out);
186    return BN_bin2bn(b_out, sizeof b_out, out);
187}
188
189/*-
190 * Field operations
191 * ----------------
192 */
193
194static void smallfelem_one(smallfelem out)
195{
196    out[0] = 1;
197    out[1] = 0;
198    out[2] = 0;
199    out[3] = 0;
200}
201
202static void smallfelem_assign(smallfelem out, const smallfelem in)
203{
204    out[0] = in[0];
205    out[1] = in[1];
206    out[2] = in[2];
207    out[3] = in[3];
208}
209
210static void felem_assign(felem out, const felem in)
211{
212    out[0] = in[0];
213    out[1] = in[1];
214    out[2] = in[2];
215    out[3] = in[3];
216}
217
218/* felem_sum sets out = out + in. */
219static void felem_sum(felem out, const felem in)
220{
221    out[0] += in[0];
222    out[1] += in[1];
223    out[2] += in[2];
224    out[3] += in[3];
225}
226
227/* felem_small_sum sets out = out + in. */
228static void felem_small_sum(felem out, const smallfelem in)
229{
230    out[0] += in[0];
231    out[1] += in[1];
232    out[2] += in[2];
233    out[3] += in[3];
234}
235
236/* felem_scalar sets out = out * scalar */
237static void felem_scalar(felem out, const u64 scalar)
238{
239    out[0] *= scalar;
240    out[1] *= scalar;
241    out[2] *= scalar;
242    out[3] *= scalar;
243}
244
245/* longfelem_scalar sets out = out * scalar */
246static void longfelem_scalar(longfelem out, const u64 scalar)
247{
248    out[0] *= scalar;
249    out[1] *= scalar;
250    out[2] *= scalar;
251    out[3] *= scalar;
252    out[4] *= scalar;
253    out[5] *= scalar;
254    out[6] *= scalar;
255    out[7] *= scalar;
256}
257
258# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
259# define two105 (((limb)1) << 105)
260# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
261
262/* zero105 is 0 mod p */
263static const felem zero105 =
264    { two105m41m9, two105, two105m41p9, two105m41p9 };
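/*-
 * To see that zero105 is indeed 0 mod p: summing its limbs at their
 * 2^64-spaced positions gives
 *   2^297 + 2^201 + 2^137 + 2^105 - 2^41 - 2^9
 * which equals 2^9 * (2^32 + 1) * p, i.e. a multiple of the prime.
 */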
265
266/*-
267 * smallfelem_neg sets |out| to |-small|
268 * On exit:
269 *   out[i] <= 2^105
270 */
271static void smallfelem_neg(felem out, const smallfelem small)
272{
273    /* In order to prevent underflow, we subtract from 0 mod p. */
274    out[0] = zero105[0] - small[0];
275    out[1] = zero105[1] - small[1];
276    out[2] = zero105[2] - small[2];
277    out[3] = zero105[3] - small[3];
278}
279
280/*-
281 * felem_diff subtracts |in| from |out|
282 * On entry:
283 *   in[i] < 2^104
284 * On exit:
285 *   out[i] < out[i] + 2^105
286 */
287static void felem_diff(felem out, const felem in)
288{
289    /*
290     * In order to prevent underflow, we add 0 mod p before subtracting.
291     */
292    out[0] += zero105[0];
293    out[1] += zero105[1];
294    out[2] += zero105[2];
295    out[3] += zero105[3];
296
297    out[0] -= in[0];
298    out[1] -= in[1];
299    out[2] -= in[2];
300    out[3] -= in[3];
301}
302
303# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
304# define two107 (((limb)1) << 107)
305# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
306
307/* zero107 is 0 mod p */
308static const felem zero107 =
309    { two107m43m11, two107, two107m43p11, two107m43p11 };
310
311/*-
312 * An alternative felem_diff for larger inputs |in|
313 * felem_diff_zero107 subtracts |in| from |out|
314 * On entry:
315 *   in[i] < 2^106
316 * On exit:
317 *   out[i] < out[i] + 2^107
318 */
319static void felem_diff_zero107(felem out, const felem in)
320{
321    /*
322     * In order to prevent underflow, we add 0 mod p before subtracting.
323     */
324    out[0] += zero107[0];
325    out[1] += zero107[1];
326    out[2] += zero107[2];
327    out[3] += zero107[3];
328
329    out[0] -= in[0];
330    out[1] -= in[1];
331    out[2] -= in[2];
332    out[3] -= in[3];
333}
334
335/*-
336 * longfelem_diff subtracts |in| from |out|
337 * On entry:
338 *   in[i] < 7*2^67
339 * On exit:
340 *   out[i] < out[i] + 2^70 + 2^40
341 */
342static void longfelem_diff(longfelem out, const longfelem in)
343{
344    static const limb two70m8p6 =
345        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
346    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
347    static const limb two70 = (((limb) 1) << 70);
348    static const limb two70m40m38p6 =
349        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
350        (((limb) 1) << 6);
351    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
352
353    /* add 0 mod p to avoid underflow */
354    out[0] += two70m8p6;
355    out[1] += two70p40;
356    out[2] += two70;
357    out[3] += two70m40m38p6;
358    out[4] += two70m6;
359    out[5] += two70m6;
360    out[6] += two70m6;
361    out[7] += two70m6;
362
363    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
364    out[0] -= in[0];
365    out[1] -= in[1];
366    out[2] -= in[2];
367    out[3] -= in[3];
368    out[4] -= in[4];
369    out[5] -= in[5];
370    out[6] -= in[6];
371    out[7] -= in[7];
372}
373
374# define two64m0 (((limb)1) << 64) - 1
375# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
376# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
377# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
378
379/* zero110 is 0 mod p */
380static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
381
382/*-
383 * felem_shrink converts an felem into a smallfelem. The result isn't quite
384 * minimal as the value may be greater than p.
385 *
386 * On entry:
387 *   in[i] < 2^109
388 * On exit:
389 *   out[i] < 2^64
390 */
391static void felem_shrink(smallfelem out, const felem in)
392{
393    felem tmp;
394    u64 a, b, mask;
395    s64 high, low;
396    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
397
398    /* Carry 2->3 */
399    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
400    /* tmp[3] < 2^110 */
401
402    tmp[2] = zero110[2] + (u64)in[2];
403    tmp[0] = zero110[0] + in[0];
404    tmp[1] = zero110[1] + in[1];
405     /* tmp[0] < 2^110, tmp[1] < 2^111, tmp[2] < 2^65 */
406
407    /*
408     * We perform two partial reductions where we eliminate the high-word of
409     * tmp[3]. We don't update the other words till the end.
410     */
411    a = tmp[3] >> 64;           /* a < 2^46 */
412    tmp[3] = (u64)tmp[3];
413    tmp[3] -= a;
414    tmp[3] += ((limb) a) << 32;
415    /* tmp[3] < 2^79 */
416
417    b = a;
418    a = tmp[3] >> 64;           /* a < 2^15 */
419    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
420    tmp[3] = (u64)tmp[3];
421    tmp[3] -= a;
422    tmp[3] += ((limb) a) << 32;
423    /* tmp[3] < 2^64 + 2^47 */
424
425    /*
426     * This adjusts the other two words to complete the two partial
427     * reductions.
428     */
429    tmp[0] += b;
430    tmp[1] -= (((limb) b) << 32);
431
432    /*
433     * In order to make space in tmp[3] for the carry from 2 -> 3, we
434     * conditionally subtract kPrime if tmp[3] is large enough.
435     */
436    high = tmp[3] >> 64;
437    /* As tmp[3] < 2^65, high is either 1 or 0 */
438    high <<= 63;
439    high >>= 63;
440    /*-
441     * high is:
442     *   all ones   if the high word of tmp[3] is 1
443      *   all zeros  if the high word of tmp[3] is 0 */
444    low = tmp[3];
445    mask = low >> 63;
446    /*-
447     * mask is:
448     *   all ones   if the MSB of low is 1
449      *   all zeros  if the MSB of low is 0 */
450    low &= bottom63bits;
451    low -= kPrime3Test;
452    /* if low was greater than kPrime3Test then the MSB is zero */
453    low = ~low;
454    low >>= 63;
455    /*-
456     * low is:
457     *   all ones   if low was > kPrime3Test
458     *   all zeros  if low was <= kPrime3Test */
459    mask = (mask & low) | high;
460    tmp[0] -= mask & kPrime[0];
461    tmp[1] -= mask & kPrime[1];
462    /* kPrime[2] is zero, so omitted */
463    tmp[3] -= mask & kPrime[3];
464     /* tmp[3] < 2^64 - 2^32 + 1 */
465
466    tmp[1] += ((u64)(tmp[0] >> 64));
467    tmp[0] = (u64)tmp[0];
468    tmp[2] += ((u64)(tmp[1] >> 64));
469    tmp[1] = (u64)tmp[1];
470    tmp[3] += ((u64)(tmp[2] >> 64));
471    tmp[2] = (u64)tmp[2];
472    /* tmp[i] < 2^64 */
473
474    out[0] = tmp[0];
475    out[1] = tmp[1];
476    out[2] = tmp[2];
477    out[3] = tmp[3];
478}
479
480/* smallfelem_expand converts a smallfelem to an felem */
481static void smallfelem_expand(felem out, const smallfelem in)
482{
483    out[0] = in[0];
484    out[1] = in[1];
485    out[2] = in[2];
486    out[3] = in[3];
487}
488
489/*-
490 * smallfelem_square sets |out| = |small|^2
491 * On entry:
492 *   small[i] < 2^64
493 * On exit:
494 *   out[i] < 7 * 2^64 < 2^67
495 */
496static void smallfelem_square(longfelem out, const smallfelem small)
497{
498    limb a;
499    u64 high, low;
500
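    /*
     * Every off-diagonal product small[i]*small[j] (i < j) contributes twice
     * to the square. Below, such a product is either added in twice directly
     * (as with out[1], which already holds a diagonal contribution) or added
     * once and then doubled together with its accumulator word via
     * "out[k] *= 2" before any diagonal product is folded into that word.
     */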
501    a = ((uint128_t) small[0]) * small[0];
502    low = a;
503    high = a >> 64;
504    out[0] = low;
505    out[1] = high;
506
507    a = ((uint128_t) small[0]) * small[1];
508    low = a;
509    high = a >> 64;
510    out[1] += low;
511    out[1] += low;
512    out[2] = high;
513
514    a = ((uint128_t) small[0]) * small[2];
515    low = a;
516    high = a >> 64;
517    out[2] += low;
518    out[2] *= 2;
519    out[3] = high;
520
521    a = ((uint128_t) small[0]) * small[3];
522    low = a;
523    high = a >> 64;
524    out[3] += low;
525    out[4] = high;
526
527    a = ((uint128_t) small[1]) * small[2];
528    low = a;
529    high = a >> 64;
530    out[3] += low;
531    out[3] *= 2;
532    out[4] += high;
533
534    a = ((uint128_t) small[1]) * small[1];
535    low = a;
536    high = a >> 64;
537    out[2] += low;
538    out[3] += high;
539
540    a = ((uint128_t) small[1]) * small[3];
541    low = a;
542    high = a >> 64;
543    out[4] += low;
544    out[4] *= 2;
545    out[5] = high;
546
547    a = ((uint128_t) small[2]) * small[3];
548    low = a;
549    high = a >> 64;
550    out[5] += low;
551    out[5] *= 2;
552    out[6] = high;
553    out[6] += high;
554
555    a = ((uint128_t) small[2]) * small[2];
556    low = a;
557    high = a >> 64;
558    out[4] += low;
559    out[5] += high;
560
561    a = ((uint128_t) small[3]) * small[3];
562    low = a;
563    high = a >> 64;
564    out[6] += low;
565    out[7] = high;
566}
567
568/*-
569 * felem_square sets |out| = |in|^2
570 * On entry:
571 *   in[i] < 2^109
572 * On exit:
573 *   out[i] < 7 * 2^64 < 2^67
574 */
575static void felem_square(longfelem out, const felem in)
576{
577    u64 small[4];
578    felem_shrink(small, in);
579    smallfelem_square(out, small);
580}
581
582/*-
583 * smallfelem_mul sets |out| = |small1| * |small2|
584 * On entry:
585 *   small1[i] < 2^64
586 *   small2[i] < 2^64
587 * On exit:
588 *   out[i] < 7 * 2^64 < 2^67
589 */
590static void smallfelem_mul(longfelem out, const smallfelem small1,
591                           const smallfelem small2)
592{
593    limb a;
594    u64 high, low;
595
596    a = ((uint128_t) small1[0]) * small2[0];
597    low = a;
598    high = a >> 64;
599    out[0] = low;
600    out[1] = high;
601
602    a = ((uint128_t) small1[0]) * small2[1];
603    low = a;
604    high = a >> 64;
605    out[1] += low;
606    out[2] = high;
607
608    a = ((uint128_t) small1[1]) * small2[0];
609    low = a;
610    high = a >> 64;
611    out[1] += low;
612    out[2] += high;
613
614    a = ((uint128_t) small1[0]) * small2[2];
615    low = a;
616    high = a >> 64;
617    out[2] += low;
618    out[3] = high;
619
620    a = ((uint128_t) small1[1]) * small2[1];
621    low = a;
622    high = a >> 64;
623    out[2] += low;
624    out[3] += high;
625
626    a = ((uint128_t) small1[2]) * small2[0];
627    low = a;
628    high = a >> 64;
629    out[2] += low;
630    out[3] += high;
631
632    a = ((uint128_t) small1[0]) * small2[3];
633    low = a;
634    high = a >> 64;
635    out[3] += low;
636    out[4] = high;
637
638    a = ((uint128_t) small1[1]) * small2[2];
639    low = a;
640    high = a >> 64;
641    out[3] += low;
642    out[4] += high;
643
644    a = ((uint128_t) small1[2]) * small2[1];
645    low = a;
646    high = a >> 64;
647    out[3] += low;
648    out[4] += high;
649
650    a = ((uint128_t) small1[3]) * small2[0];
651    low = a;
652    high = a >> 64;
653    out[3] += low;
654    out[4] += high;
655
656    a = ((uint128_t) small1[1]) * small2[3];
657    low = a;
658    high = a >> 64;
659    out[4] += low;
660    out[5] = high;
661
662    a = ((uint128_t) small1[2]) * small2[2];
663    low = a;
664    high = a >> 64;
665    out[4] += low;
666    out[5] += high;
667
668    a = ((uint128_t) small1[3]) * small2[1];
669    low = a;
670    high = a >> 64;
671    out[4] += low;
672    out[5] += high;
673
674    a = ((uint128_t) small1[2]) * small2[3];
675    low = a;
676    high = a >> 64;
677    out[5] += low;
678    out[6] = high;
679
680    a = ((uint128_t) small1[3]) * small2[2];
681    low = a;
682    high = a >> 64;
683    out[5] += low;
684    out[6] += high;
685
686    a = ((uint128_t) small1[3]) * small2[3];
687    low = a;
688    high = a >> 64;
689    out[6] += low;
690    out[7] = high;
691}
692
693/*-
694 * felem_mul sets |out| = |in1| * |in2|
695 * On entry:
696 *   in1[i] < 2^109
697 *   in2[i] < 2^109
698 * On exit:
699 *   out[i] < 7 * 2^64 < 2^67
700 */
701static void felem_mul(longfelem out, const felem in1, const felem in2)
702{
703    smallfelem small1, small2;
704    felem_shrink(small1, in1);
705    felem_shrink(small2, in2);
706    smallfelem_mul(out, small1, small2);
707}
708
709/*-
710 * felem_small_mul sets |out| = |small1| * |in2|
711 * On entry:
712 *   small1[i] < 2^64
713 *   in2[i] < 2^109
714 * On exit:
715 *   out[i] < 7 * 2^64 < 2^67
716 */
717static void felem_small_mul(longfelem out, const smallfelem small1,
718                            const felem in2)
719{
720    smallfelem small2;
721    felem_shrink(small2, in2);
722    smallfelem_mul(out, small1, small2);
723}
724
725# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
726# define two100 (((limb)1) << 100)
727# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
728/* zero100 is 0 mod p */
729static const felem zero100 =
730    { two100m36m4, two100, two100m36p4, two100m36p4 };
731
732/*-
733 * Internal function for the different flavours of felem_reduce.
734 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
735 * On entry:
736 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
737 *   out[1] >= in[7] + 2^32*in[4]
738 *   out[2] >= in[5] + 2^32*in[5]
739 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
740 * On exit:
741 *   out[0] <= out[0] + in[4] + 2^32*in[5]
742 *   out[1] <= out[1] + in[5] + 2^33*in[6]
743 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
744 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
745 */
746static void felem_reduce_(felem out, const longfelem in)
747{
748    int128_t c;
749    /* combine common terms from below */
750    c = in[4] + (in[5] << 32);
751    out[0] += c;
752    out[3] -= c;
753
754    c = in[5] - in[7];
755    out[1] += c;
756    out[2] -= c;
757
758    /* the remaining terms */
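    /*
     * The bracketed lists below record, for each input weight 2^k
     * (k = 256, 320, 384, 448), the (bit position, sign) pairs of its
     * reduction modulo p. For example 2^256 = 2^224 - 2^192 - 2^96 + 1
     * (mod p), written as [(0,1),(96,-1),(192,-1),(224,1)].
     */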
759    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
760    out[1] -= (in[4] << 32);
761    out[3] += (in[4] << 32);
762
763    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
764    out[2] -= (in[5] << 32);
765
766    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
767    out[0] -= in[6];
768    out[0] -= (in[6] << 32);
769    out[1] += (in[6] << 33);
770    out[2] += (in[6] * 2);
771    out[3] -= (in[6] << 32);
772
773    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
774    out[0] -= in[7];
775    out[0] -= (in[7] << 32);
776    out[2] += (in[7] << 33);
777    out[3] += (in[7] * 3);
778}
779
780/*-
781 * felem_reduce converts a longfelem into an felem.
782 * To be called directly after felem_square or felem_mul.
783 * On entry:
784 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
785 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
786 * On exit:
787 *   out[i] < 2^101
788 */
789static void felem_reduce(felem out, const longfelem in)
790{
791    out[0] = zero100[0] + in[0];
792    out[1] = zero100[1] + in[1];
793    out[2] = zero100[2] + in[2];
794    out[3] = zero100[3] + in[3];
795
796    felem_reduce_(out, in);
797
798    /*-
799     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
800     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
801     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
802     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
803     *
804     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
805     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
806     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
807     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
808     */
809}
810
811/*-
812 * felem_reduce_zero105 converts a larger longfelem into an felem.
813 * On entry:
814 *   in[i] < 2^71
815 * On exit:
816 *   out[i] < 2^106
817 */
818static void felem_reduce_zero105(felem out, const longfelem in)
819{
820    out[0] = zero105[0] + in[0];
821    out[1] = zero105[1] + in[1];
822    out[2] = zero105[2] + in[2];
823    out[3] = zero105[3] + in[3];
824
825    felem_reduce_(out, in);
826
827    /*-
828     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
829     * out[1] > 2^105 - 2^71 - 2^103 > 0
830     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
831     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
832     *
833     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
834     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
835     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
836     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
837     */
838}
839
840/*
841 * subtract_u64 sets *result = *result - v and *carry to one if the
842 * subtraction underflowed.
843 */
844static void subtract_u64(u64 *result, u64 *carry, u64 v)
845{
846    uint128_t r = *result;
847    r -= v;
848    *carry = (r >> 64) & 1;
849    *result = (u64)r;
850}
851
852/*
853 * felem_contract converts |in| to its unique, minimal representation. On
854 * entry: in[i] < 2^109
855 */
856static void felem_contract(smallfelem out, const felem in)
857{
858    unsigned i;
859    u64 all_equal_so_far = 0, result = 0, carry;
860
861    felem_shrink(out, in);
862     /* out is minimal except that the value might be > p */
863
864    all_equal_so_far--;
865    /*
866     * We are doing a constant time test if out >= kPrime. We need to compare
867     * each u64, from most-significant to least significant. For each one, if
868     * all words so far have been equal (m is all ones) then a non-equal
869     * result is the answer. Otherwise we continue.
870     */
871    for (i = 3; i < 4; i--) {
872        u64 equal;
873        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
874        /*
875         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
876         * will all be set.
877         */
878        result |= all_equal_so_far & ((u64)(a >> 64));
879
880        /*
881         * if kPrime[i] == out[i] then |equal| will be all zeros and the
882         * decrement will make it all ones.
883         */
884        equal = kPrime[i] ^ out[i];
885        equal--;
886        equal &= equal << 32;
887        equal &= equal << 16;
888        equal &= equal << 8;
889        equal &= equal << 4;
890        equal &= equal << 2;
891        equal &= equal << 1;
892        equal = ((s64) equal) >> 63;
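        /*
         * The shift-and-AND cascade folds all 64 bits of |equal| into bit 63
         * and the arithmetic shift broadcasts that bit, so |equal| is now
         * all ones if kPrime[i] == out[i] and all zeros otherwise.
         */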
893
894        all_equal_so_far &= equal;
895    }
896
897    /*
898     * if all_equal_so_far is still all ones then the two values are equal
899     * and so out >= kPrime is true.
900     */
901    result |= all_equal_so_far;
902
903    /* if out >= kPrime then we subtract kPrime. */
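    /*
     * The cascaded subtract_u64 calls below implement a single 256-bit
     * subtraction of (result & kPrime): after each word of kPrime is
     * subtracted, the borrow is propagated through the more significant
     * words before the next word of kPrime is handled.
     */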
904    subtract_u64(&out[0], &carry, result & kPrime[0]);
905    subtract_u64(&out[1], &carry, carry);
906    subtract_u64(&out[2], &carry, carry);
907    subtract_u64(&out[3], &carry, carry);
908
909    subtract_u64(&out[1], &carry, result & kPrime[1]);
910    subtract_u64(&out[2], &carry, carry);
911    subtract_u64(&out[3], &carry, carry);
912
913    subtract_u64(&out[2], &carry, result & kPrime[2]);
914    subtract_u64(&out[3], &carry, carry);
915
916    subtract_u64(&out[3], &carry, result & kPrime[3]);
917}
918
919static void smallfelem_square_contract(smallfelem out, const smallfelem in)
920{
921    longfelem longtmp;
922    felem tmp;
923
924    smallfelem_square(longtmp, in);
925    felem_reduce(tmp, longtmp);
926    felem_contract(out, tmp);
927}
928
929static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
930                                    const smallfelem in2)
931{
932    longfelem longtmp;
933    felem tmp;
934
935    smallfelem_mul(longtmp, in1, in2);
936    felem_reduce(tmp, longtmp);
937    felem_contract(out, tmp);
938}
939
940/*-
941 * smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p)
942 * and 0 otherwise.
943 * On entry:
944 *   small[i] < 2^64
945 */
946static limb smallfelem_is_zero(const smallfelem small)
947{
948    limb result;
949    u64 is_p;
950
951    u64 is_zero = small[0] | small[1] | small[2] | small[3];
952    is_zero--;
953    is_zero &= is_zero << 32;
954    is_zero &= is_zero << 16;
955    is_zero &= is_zero << 8;
956    is_zero &= is_zero << 4;
957    is_zero &= is_zero << 2;
958    is_zero &= is_zero << 1;
959    is_zero = ((s64) is_zero) >> 63;
960
961    is_p = (small[0] ^ kPrime[0]) |
962        (small[1] ^ kPrime[1]) |
963        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
964    is_p--;
965    is_p &= is_p << 32;
966    is_p &= is_p << 16;
967    is_p &= is_p << 8;
968    is_p &= is_p << 4;
969    is_p &= is_p << 2;
970    is_p &= is_p << 1;
971    is_p = ((s64) is_p) >> 63;
972
973    is_zero |= is_p;
974
975    result = is_zero;
976    result |= ((limb) is_zero) << 64;
977    return result;
978}
979
980static int smallfelem_is_zero_int(const smallfelem small)
981{
982    return (int)(smallfelem_is_zero(small) & ((limb) 1));
983}
984
985/*-
986 * felem_inv calculates |out| = |in|^{-1}
987 *
988 * Based on Fermat's Little Theorem:
989 *   a^p = a (mod p)
990 *   a^{p-1} = 1 (mod p)
991 *   a^{p-2} = a^{-1} (mod p)
992 */
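/*-
 * Concretely, the chain below raises |in| to
 *   p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3:
 * as the running exponents in the comments show, ftmp ends up holding
 * in^(2^256 - 2^224 + 2^192), ftmp2 holds in^(2^96 - 3), and the final
 * multiplication combines the two.
 */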
993static void felem_inv(felem out, const felem in)
994{
995    felem ftmp, ftmp2;
996    /* each e_I will hold |in|^{2^I - 1} */
997    felem e2, e4, e8, e16, e32, e64;
998    longfelem tmp;
999    unsigned i;
1000
1001    felem_square(tmp, in);
1002    felem_reduce(ftmp, tmp);    /* 2^1 */
1003    felem_mul(tmp, in, ftmp);
1004    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1005    felem_assign(e2, ftmp);
1006    felem_square(tmp, ftmp);
1007    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1008    felem_square(tmp, ftmp);
1009    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1010    felem_mul(tmp, ftmp, e2);
1011    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1012    felem_assign(e4, ftmp);
1013    felem_square(tmp, ftmp);
1014    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1015    felem_square(tmp, ftmp);
1016    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1017    felem_square(tmp, ftmp);
1018    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1019    felem_square(tmp, ftmp);
1020    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1021    felem_mul(tmp, ftmp, e4);
1022    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1023    felem_assign(e8, ftmp);
1024    for (i = 0; i < 8; i++) {
1025        felem_square(tmp, ftmp);
1026        felem_reduce(ftmp, tmp);
1027    }                           /* 2^16 - 2^8 */
1028    felem_mul(tmp, ftmp, e8);
1029    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1030    felem_assign(e16, ftmp);
1031    for (i = 0; i < 16; i++) {
1032        felem_square(tmp, ftmp);
1033        felem_reduce(ftmp, tmp);
1034    }                           /* 2^32 - 2^16 */
1035    felem_mul(tmp, ftmp, e16);
1036    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1037    felem_assign(e32, ftmp);
1038    for (i = 0; i < 32; i++) {
1039        felem_square(tmp, ftmp);
1040        felem_reduce(ftmp, tmp);
1041    }                           /* 2^64 - 2^32 */
1042    felem_assign(e64, ftmp);
1043    felem_mul(tmp, ftmp, in);
1044    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1045    for (i = 0; i < 192; i++) {
1046        felem_square(tmp, ftmp);
1047        felem_reduce(ftmp, tmp);
1048    }                           /* 2^256 - 2^224 + 2^192 */
1049
1050    felem_mul(tmp, e64, e32);
1051    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1052    for (i = 0; i < 16; i++) {
1053        felem_square(tmp, ftmp2);
1054        felem_reduce(ftmp2, tmp);
1055    }                           /* 2^80 - 2^16 */
1056    felem_mul(tmp, ftmp2, e16);
1057    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1058    for (i = 0; i < 8; i++) {
1059        felem_square(tmp, ftmp2);
1060        felem_reduce(ftmp2, tmp);
1061    }                           /* 2^88 - 2^8 */
1062    felem_mul(tmp, ftmp2, e8);
1063    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1064    for (i = 0; i < 4; i++) {
1065        felem_square(tmp, ftmp2);
1066        felem_reduce(ftmp2, tmp);
1067    }                           /* 2^92 - 2^4 */
1068    felem_mul(tmp, ftmp2, e4);
1069    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1070    felem_square(tmp, ftmp2);
1071    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1072    felem_square(tmp, ftmp2);
1073    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1074    felem_mul(tmp, ftmp2, e2);
1075    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1076    felem_square(tmp, ftmp2);
1077    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1078    felem_square(tmp, ftmp2);
1079    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1080    felem_mul(tmp, ftmp2, in);
1081    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1082
1083    felem_mul(tmp, ftmp2, ftmp);
1084    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1085}
1086
1087static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1088{
1089    felem tmp;
1090
1091    smallfelem_expand(tmp, in);
1092    felem_inv(tmp, tmp);
1093    felem_contract(out, tmp);
1094}
1095
1096/*-
1097 * Group operations
1098 * ----------------
1099 *
1100 * Building on top of the field operations we have the operations on the
1101 * elliptic curve group itself. Points on the curve are represented in Jacobian
1102 * coordinates
1103 */
1104
1105/*-
1106 * point_double calculates 2*(x_in, y_in, z_in)
1107 *
1108 * The method is taken from:
1109 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1110 *
1111 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1112 * while x_out == y_in is not (maybe this works, but it's not tested).
1113 */
1114static void
1115point_double(felem x_out, felem y_out, felem z_out,
1116             const felem x_in, const felem y_in, const felem z_in)
1117{
1118    longfelem tmp, tmp2;
1119    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1120    smallfelem small1, small2;
1121
1122    felem_assign(ftmp, x_in);
1123    /* ftmp[i] < 2^106 */
1124    felem_assign(ftmp2, x_in);
1125    /* ftmp2[i] < 2^106 */
1126
1127    /* delta = z^2 */
1128    felem_square(tmp, z_in);
1129    felem_reduce(delta, tmp);
1130    /* delta[i] < 2^101 */
1131
1132    /* gamma = y^2 */
1133    felem_square(tmp, y_in);
1134    felem_reduce(gamma, tmp);
1135    /* gamma[i] < 2^101 */
1136    felem_shrink(small1, gamma);
1137
1138    /* beta = x*gamma */
1139    felem_small_mul(tmp, small1, x_in);
1140    felem_reduce(beta, tmp);
1141    /* beta[i] < 2^101 */
1142
1143    /* alpha = 3*(x-delta)*(x+delta) */
1144    felem_diff(ftmp, delta);
1145    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1146    felem_sum(ftmp2, delta);
1147    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1148    felem_scalar(ftmp2, 3);
1149    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1150    felem_mul(tmp, ftmp, ftmp2);
1151    felem_reduce(alpha, tmp);
1152    /* alpha[i] < 2^101 */
1153    felem_shrink(small2, alpha);
1154
1155    /* x' = alpha^2 - 8*beta */
1156    smallfelem_square(tmp, small2);
1157    felem_reduce(x_out, tmp);
1158    felem_assign(ftmp, beta);
1159    felem_scalar(ftmp, 8);
1160    /* ftmp[i] < 8 * 2^101 = 2^104 */
1161    felem_diff(x_out, ftmp);
1162    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1163
1164    /* z' = (y + z)^2 - gamma - delta */
1165    felem_sum(delta, gamma);
1166    /* delta[i] < 2^101 + 2^101 = 2^102 */
1167    felem_assign(ftmp, y_in);
1168    felem_sum(ftmp, z_in);
1169    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1170    felem_square(tmp, ftmp);
1171    felem_reduce(z_out, tmp);
1172    felem_diff(z_out, delta);
1173    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1174
1175    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1176    felem_scalar(beta, 4);
1177    /* beta[i] < 4 * 2^101 = 2^103 */
1178    felem_diff_zero107(beta, x_out);
1179    /* beta[i] < 2^107 + 2^103 < 2^108 */
1180    felem_small_mul(tmp, small2, beta);
1181    /* tmp[i] < 7 * 2^64 < 2^67 */
1182    smallfelem_square(tmp2, small1);
1183    /* tmp2[i] < 7 * 2^64 */
1184    longfelem_scalar(tmp2, 8);
1185    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1186    longfelem_diff(tmp, tmp2);
1187    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1188    felem_reduce_zero105(y_out, tmp);
1189    /* y_out[i] < 2^106 */
1190}
1191
1192/*
1193 * point_double_small is the same as point_double, except that it operates on
1194 * smallfelems
1195 */
1196static void
1197point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1198                   const smallfelem x_in, const smallfelem y_in,
1199                   const smallfelem z_in)
1200{
1201    felem felem_x_out, felem_y_out, felem_z_out;
1202    felem felem_x_in, felem_y_in, felem_z_in;
1203
1204    smallfelem_expand(felem_x_in, x_in);
1205    smallfelem_expand(felem_y_in, y_in);
1206    smallfelem_expand(felem_z_in, z_in);
1207    point_double(felem_x_out, felem_y_out, felem_z_out,
1208                 felem_x_in, felem_y_in, felem_z_in);
1209    felem_shrink(x_out, felem_x_out);
1210    felem_shrink(y_out, felem_y_out);
1211    felem_shrink(z_out, felem_z_out);
1212}
1213
1214/* copy_conditional copies in to out iff mask is all ones. */
1215static void copy_conditional(felem out, const felem in, limb mask)
1216{
1217    unsigned i;
1218    for (i = 0; i < NLIMBS; ++i) {
1219        const limb tmp = mask & (in[i] ^ out[i]);
1220        out[i] ^= tmp;
1221    }
1222}
1223
1224/* copy_small_conditional copies in to out iff mask is all ones. */
1225static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1226{
1227    unsigned i;
1228    const u64 mask64 = mask;
1229    for (i = 0; i < NLIMBS; ++i) {
1230        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1231    }
1232}
1233
1234/*-
1235 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1236 *
1237 * The method is taken from:
1238 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1239 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1240 *
1241 * This function includes a branch for checking whether the two input points
1242 * are equal (while not equal to the point at infinity). This case never
1243 * happens during single point multiplication, so there is no timing leak for
1244 * ECDH or ECDSA signing.
1245 */
1246static void point_add(felem x3, felem y3, felem z3,
1247                      const felem x1, const felem y1, const felem z1,
1248                      const int mixed, const smallfelem x2,
1249                      const smallfelem y2, const smallfelem z2)
1250{
1251    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1252    longfelem tmp, tmp2;
1253    smallfelem small1, small2, small3, small4, small5;
1254    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1255
1256    felem_shrink(small3, z1);
1257
1258    z1_is_zero = smallfelem_is_zero(small3);
1259    z2_is_zero = smallfelem_is_zero(z2);
1260
1261    /* ftmp = z1z1 = z1**2 */
1262    smallfelem_square(tmp, small3);
1263    felem_reduce(ftmp, tmp);
1264    /* ftmp[i] < 2^101 */
1265    felem_shrink(small1, ftmp);
1266
1267    if (!mixed) {
1268        /* ftmp2 = z2z2 = z2**2 */
1269        smallfelem_square(tmp, z2);
1270        felem_reduce(ftmp2, tmp);
1271        /* ftmp2[i] < 2^101 */
1272        felem_shrink(small2, ftmp2);
1273
1274        felem_shrink(small5, x1);
1275
1276        /* u1 = ftmp3 = x1*z2z2 */
1277        smallfelem_mul(tmp, small5, small2);
1278        felem_reduce(ftmp3, tmp);
1279        /* ftmp3[i] < 2^101 */
1280
1281        /* ftmp5 = z1 + z2 */
1282        felem_assign(ftmp5, z1);
1283        felem_small_sum(ftmp5, z2);
1284        /* ftmp5[i] < 2^107 */
1285
1286        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1287        felem_square(tmp, ftmp5);
1288        felem_reduce(ftmp5, tmp);
1289        /* ftmp2 = z2z2 + z1z1 */
1290        felem_sum(ftmp2, ftmp);
1291        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1292        felem_diff(ftmp5, ftmp2);
1293        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1294
1295        /* ftmp2 = z2 * z2z2 */
1296        smallfelem_mul(tmp, small2, z2);
1297        felem_reduce(ftmp2, tmp);
1298
1299        /* s1 = ftmp6 = y1 * z2**3 */
1300        felem_mul(tmp, y1, ftmp2);
1301        felem_reduce(ftmp6, tmp);
1302        /* ftmp6[i] < 2^101 */
1303    } else {
1304        /*
1305         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1306         */
1307
1308        /* u1 = ftmp3 = x1*z2z2 */
1309        felem_assign(ftmp3, x1);
1310        /* ftmp3[i] < 2^106 */
1311
1312        /* ftmp5 = 2z1z2 */
1313        felem_assign(ftmp5, z1);
1314        felem_scalar(ftmp5, 2);
1315        /* ftmp5[i] < 2*2^106 = 2^107 */
1316
1317        /* s1 = ftmp6 = y1 * z2**3 */
1318        felem_assign(ftmp6, y1);
1319        /* ftmp6[i] < 2^106 */
1320    }
1321
1322    /* u2 = x2*z1z1 */
1323    smallfelem_mul(tmp, x2, small1);
1324    felem_reduce(ftmp4, tmp);
1325
1326    /* h = ftmp4 = u2 - u1 */
1327    felem_diff_zero107(ftmp4, ftmp3);
1328    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1329    felem_shrink(small4, ftmp4);
1330
1331    x_equal = smallfelem_is_zero(small4);
1332
1333    /* z_out = ftmp5 * h */
1334    felem_small_mul(tmp, small4, ftmp5);
1335    felem_reduce(z_out, tmp);
1336    /* z_out[i] < 2^101 */
1337
1338    /* ftmp = z1 * z1z1 */
1339    smallfelem_mul(tmp, small1, small3);
1340    felem_reduce(ftmp, tmp);
1341
1342    /* s2 = tmp = y2 * z1**3 */
1343    felem_small_mul(tmp, y2, ftmp);
1344    felem_reduce(ftmp5, tmp);
1345
1346    /* r = ftmp5 = (s2 - s1)*2 */
1347    felem_diff_zero107(ftmp5, ftmp6);
1348    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1349    felem_scalar(ftmp5, 2);
1350    /* ftmp5[i] < 2^109 */
1351    felem_shrink(small1, ftmp5);
1352    y_equal = smallfelem_is_zero(small1);
1353
1354    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1355        point_double(x3, y3, z3, x1, y1, z1);
1356        return;
1357    }
1358
1359    /* I = ftmp = (2h)**2 */
1360    felem_assign(ftmp, ftmp4);
1361    felem_scalar(ftmp, 2);
1362    /* ftmp[i] < 2*2^108 = 2^109 */
1363    felem_square(tmp, ftmp);
1364    felem_reduce(ftmp, tmp);
1365
1366    /* J = ftmp2 = h * I */
1367    felem_mul(tmp, ftmp4, ftmp);
1368    felem_reduce(ftmp2, tmp);
1369
1370    /* V = ftmp4 = U1 * I */
1371    felem_mul(tmp, ftmp3, ftmp);
1372    felem_reduce(ftmp4, tmp);
1373
1374    /* x_out = r**2 - J - 2V */
1375    smallfelem_square(tmp, small1);
1376    felem_reduce(x_out, tmp);
1377    felem_assign(ftmp3, ftmp4);
1378    felem_scalar(ftmp4, 2);
1379    felem_sum(ftmp4, ftmp2);
1380    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1381    felem_diff(x_out, ftmp4);
1382    /* x_out[i] < 2^105 + 2^101 */
1383
1384    /* y_out = r(V-x_out) - 2 * s1 * J */
1385    felem_diff_zero107(ftmp3, x_out);
1386    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1387    felem_small_mul(tmp, small1, ftmp3);
1388    felem_mul(tmp2, ftmp6, ftmp2);
1389    longfelem_scalar(tmp2, 2);
1390    /* tmp2[i] < 2*2^67 = 2^68 */
1391    longfelem_diff(tmp, tmp2);
1392    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1393    felem_reduce_zero105(y_out, tmp);
1394    /* y_out[i] < 2^106 */
1395
1396    copy_small_conditional(x_out, x2, z1_is_zero);
1397    copy_conditional(x_out, x1, z2_is_zero);
1398    copy_small_conditional(y_out, y2, z1_is_zero);
1399    copy_conditional(y_out, y1, z2_is_zero);
1400    copy_small_conditional(z_out, z2, z1_is_zero);
1401    copy_conditional(z_out, z1, z2_is_zero);
1402    felem_assign(x3, x_out);
1403    felem_assign(y3, y_out);
1404    felem_assign(z3, z_out);
1405}
1406
1407/*
1408 * point_add_small is the same as point_add, except that it operates on
1409 * smallfelems
1410 */
1411static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1412                            smallfelem x1, smallfelem y1, smallfelem z1,
1413                            smallfelem x2, smallfelem y2, smallfelem z2)
1414{
1415    felem felem_x3, felem_y3, felem_z3;
1416    felem felem_x1, felem_y1, felem_z1;
1417    smallfelem_expand(felem_x1, x1);
1418    smallfelem_expand(felem_y1, y1);
1419    smallfelem_expand(felem_z1, z1);
1420    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1421              x2, y2, z2);
1422    felem_shrink(x3, felem_x3);
1423    felem_shrink(y3, felem_y3);
1424    felem_shrink(z3, felem_z3);
1425}
1426
1427/*-
1428 * Base point pre computation
1429 * --------------------------
1430 *
1431 * Two different sorts of precomputed tables are used in the following code.
1432 * Each contains various points on the curve, where each point is three field
1433 * elements (x, y, z).
1434 *
1435 * For the base point table, z is usually 1 (0 for the point at infinity).
1436 * This table has 2 * 16 elements, starting with the following:
1437 * index | bits    | point
1438 * ------+---------+------------------------------
1439 *     0 | 0 0 0 0 | 0G
1440 *     1 | 0 0 0 1 | 1G
1441 *     2 | 0 0 1 0 | 2^64G
1442 *     3 | 0 0 1 1 | (2^64 + 1)G
1443 *     4 | 0 1 0 0 | 2^128G
1444 *     5 | 0 1 0 1 | (2^128 + 1)G
1445 *     6 | 0 1 1 0 | (2^128 + 2^64)G
1446 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1447 *     8 | 1 0 0 0 | 2^192G
1448 *     9 | 1 0 0 1 | (2^192 + 1)G
1449 *    10 | 1 0 1 0 | (2^192 + 2^64)G
1450 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1451 *    12 | 1 1 0 0 | (2^192 + 2^128)G
1452 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1453 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1454 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1455 * followed by a copy of this with each element multiplied by 2^32.
1456 *
1457 * The reason for this is so that we can clock bits into four different
1458 * locations when doing simple scalar multiplies against the base point,
1459 * and then another four locations using the second 16 elements.
1460 *
1461 * Tables for other points have table[i] = iG for i in 0 .. 16. */
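/*-
 * For example, in the generator half of batch_mul below, at bit position i
 * (0 <= i <= 31) the window
 *   bits = bit(i+224)<<3 | bit(i+160)<<2 | bit(i+96)<<1 | bit(i+32)
 * indexes the second half of this table (the multiples scaled by 2^32), and
 *   bits = bit(i+192)<<3 | bit(i+128)<<2 | bit(i+64)<<1 | bit(i)
 * indexes the first half, so the 32 generator rounds consume all 256 scalar
 * bits.
 */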
1462
1463/* gmul is the table of precomputed base points */
1464static const smallfelem gmul[2][16][3] = {
1465    {{{0, 0, 0, 0},
1466      {0, 0, 0, 0},
1467      {0, 0, 0, 0}},
1468     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1469       0x6b17d1f2e12c4247},
1470      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1471       0x4fe342e2fe1a7f9b},
1472      {1, 0, 0, 0}},
1473     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1474       0x0fa822bc2811aaa5},
1475      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1476       0xbff44ae8f5dba80d},
1477      {1, 0, 0, 0}},
1478     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1479       0x300a4bbc89d6726f},
1480      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1481       0x72aac7e0d09b4644},
1482      {1, 0, 0, 0}},
1483     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1484       0x447d739beedb5e67},
1485      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1486       0x2d4825ab834131ee},
1487      {1, 0, 0, 0}},
1488     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1489       0xef9519328a9c72ff},
1490      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1491       0x611e9fc37dbb2c9b},
1492      {1, 0, 0, 0}},
1493     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1494       0x550663797b51f5d8},
1495      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1496       0x157164848aecb851},
1497      {1, 0, 0, 0}},
1498     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1499       0xeb5d7745b21141ea},
1500      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1501       0xeafd72ebdbecc17b},
1502      {1, 0, 0, 0}},
1503     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1504       0xa6d39677a7849276},
1505      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1506       0x674f84749b0b8816},
1507      {1, 0, 0, 0}},
1508     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1509       0x4e769e7672c9ddad},
1510      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1511       0x42b99082de830663},
1512      {1, 0, 0, 0}},
1513     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1514       0x78878ef61c6ce04d},
1515      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1516       0xb6cb3f5d7b72c321},
1517      {1, 0, 0, 0}},
1518     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1519       0x0c88bc4d716b1287},
1520      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1521       0xdd5ddea3f3901dc6},
1522      {1, 0, 0, 0}},
1523     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1524       0x68f344af6b317466},
1525      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1526       0x31b9c405f8540a20},
1527      {1, 0, 0, 0}},
1528     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1529       0x4052bf4b6f461db9},
1530      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1531       0xfecf4d5190b0fc61},
1532      {1, 0, 0, 0}},
1533     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1534       0x1eddbae2c802e41a},
1535      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1536       0x43104d86560ebcfc},
1537      {1, 0, 0, 0}},
1538     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1539       0xb48e26b484f7a21c},
1540      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1541       0xfac015404d4d3dab},
1542      {1, 0, 0, 0}}},
1543    {{{0, 0, 0, 0},
1544      {0, 0, 0, 0},
1545      {0, 0, 0, 0}},
1546     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1547       0x7fe36b40af22af89},
1548      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1549       0xe697d45825b63624},
1550      {1, 0, 0, 0}},
1551     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1552       0x4a5b506612a677a6},
1553      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1554       0xeb13461ceac089f1},
1555      {1, 0, 0, 0}},
1556     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1557       0x0781b8291c6a220a},
1558      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1559       0x690cde8df0151593},
1560      {1, 0, 0, 0}},
1561     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1562       0x8a535f566ec73617},
1563      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1564       0x0455c08468b08bd7},
1565      {1, 0, 0, 0}},
1566     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1567       0x06bada7ab77f8276},
1568      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1569       0x5b476dfd0e6cb18a},
1570      {1, 0, 0, 0}},
1571     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1572       0x3e29864e8a2ec908},
1573      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1574       0x239b90ea3dc31e7e},
1575      {1, 0, 0, 0}},
1576     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1577       0x820f4dd949f72ff7},
1578      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1579       0x140406ec783a05ec},
1580      {1, 0, 0, 0}},
1581     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1582       0x68f6b8542783dfee},
1583      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1584       0xcbe1feba92e40ce6},
1585      {1, 0, 0, 0}},
1586     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1587       0xd0b2f94d2f420109},
1588      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1589       0x971459828b0719e5},
1590      {1, 0, 0, 0}},
1591     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1592       0x961610004a866aba},
1593      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1594       0x7acb9fadcee75e44},
1595      {1, 0, 0, 0}},
1596     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1597       0x24eb9acca333bf5b},
1598      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1599       0x69f891c5acd079cc},
1600      {1, 0, 0, 0}},
1601     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1602       0xe51f547c5972a107},
1603      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1604       0x1c309a2b25bb1387},
1605      {1, 0, 0, 0}},
1606     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1607       0x20b87b8aa2c4e503},
1608      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1609       0xf5c6fa49919776be},
1610      {1, 0, 0, 0}},
1611     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1612       0x1ed7d1b9332010b9},
1613      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1614       0x3a2b03f03217257a},
1615      {1, 0, 0, 0}},
1616     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1617       0x15fee545c78dd9f6},
1618      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1619       0x4ab5b6b2b8753f81},
1620      {1, 0, 0, 0}}}
1621};
1622
1623/*
1624 * select_point selects the |idx|th point from a precomputation table and
1625 * copies it to out.
1626 */
1627static void select_point(const u64 idx, unsigned int size,
1628                         const smallfelem pre_comp[16][3], smallfelem out[3])
1629{
1630    unsigned i, j;
1631    u64 *outlimbs = &out[0][0];
1632    memset(outlimbs, 0, 3 * sizeof(smallfelem));
1633
1634    for (i = 0; i < size; i++) {
1635        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1636        u64 mask = i ^ idx;
1637        mask |= mask >> 4;
1638        mask |= mask >> 2;
1639        mask |= mask >> 1;
1640        mask &= 1;
1641        mask--;
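        /* mask is now all ones if i == idx and all zeros otherwise */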
1642        for (j = 0; j < NLIMBS * 3; j++)
1643            outlimbs[j] |= inlimbs[j] & mask;
1644    }
1645}
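/*-
 * Ignoring the constant-time requirement, select_point is just a table
 * lookup; a variable-time sketch of the same operation would be:
 *
 *     if (idx < size)
 *         memcpy(out, pre_comp[idx], 3 * sizeof(smallfelem));
 *     else
 *         memset(out, 0, 3 * sizeof(smallfelem));
 *
 * The masked loop above computes the same result while touching every table
 * entry, so the memory access pattern does not depend on |idx|.
 */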
1646
1647/* get_bit returns the |i|th bit in |in| */
1648static char get_bit(const felem_bytearray in, int i)
1649{
1650    if ((i < 0) || (i >= 256))
1651        return 0;
1652    return (in[i >> 3] >> (i & 7)) & 1;
1653}
1654
1655/*
1656 * Interleaved point multiplication using precomputed point multiples: The
1657 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1658 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1659 * generator, using certain (large) precomputed multiples in g_pre_comp.
1660 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1661 */
1662static void batch_mul(felem x_out, felem y_out, felem z_out,
1663                      const felem_bytearray scalars[],
1664                      const unsigned num_points, const u8 *g_scalar,
1665                      const int mixed, const smallfelem pre_comp[][17][3],
1666                      const smallfelem g_pre_comp[2][16][3])
1667{
1668    int i, skip;
1669    unsigned num, gen_mul = (g_scalar != NULL);
1670    felem nq[3], ftmp;
1671    smallfelem tmp[3];
1672    u64 bits;
1673    u8 sign, digit;
1674
1675    /* set nq to the point at infinity */
1676    memset(nq, 0, 3 * sizeof(felem));
1677
1678    /*
1679     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1680     * of the generator (two in each of the last 32 rounds) and additions of
1681     * other points multiples (every 5th round).
1682     */
1683    skip = 1;                   /* save two point operations in the first
1684                                 * round */
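    /*
     * When only the generator multiple is requested (num_points == 0), 32
     * iterations suffice: the two 4-bit generator windows in the loop body
     * cover bit positions i, i+32, ..., i+224 for i = 0..31, i.e. all 256
     * bits of g_scalar.
     */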
1685    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1686        /* double */
1687        if (!skip)
1688            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1689
1690        /* add multiples of the generator */
1691        if (gen_mul && (i <= 31)) {
1692            /* first, look 32 bits upwards */
1693            bits = get_bit(g_scalar, i + 224) << 3;
1694            bits |= get_bit(g_scalar, i + 160) << 2;
1695            bits |= get_bit(g_scalar, i + 96) << 1;
1696            bits |= get_bit(g_scalar, i + 32);
1697            /* select the point to add, in constant time */
1698            select_point(bits, 16, g_pre_comp[1], tmp);
1699
1700            if (!skip) {
1701                /* Arg 1 below is for "mixed" */
1702                point_add(nq[0], nq[1], nq[2],
1703                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1704            } else {
1705                smallfelem_expand(nq[0], tmp[0]);
1706                smallfelem_expand(nq[1], tmp[1]);
1707                smallfelem_expand(nq[2], tmp[2]);
1708                skip = 0;
1709            }
1710
1711            /* second, look at the current position */
1712            bits = get_bit(g_scalar, i + 192) << 3;
1713            bits |= get_bit(g_scalar, i + 128) << 2;
1714            bits |= get_bit(g_scalar, i + 64) << 1;
1715            bits |= get_bit(g_scalar, i);
1716            /* select the point to add, in constant time */
1717            select_point(bits, 16, g_pre_comp[0], tmp);
1718            /* Arg 1 below is for "mixed" */
1719            point_add(nq[0], nq[1], nq[2],
1720                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1721        }
1722
1723        /* do other additions every 5 doublings */
1724        if (num_points && (i % 5 == 0)) {
1725            /* loop over all scalars */
1726            for (num = 0; num < num_points; ++num) {
1727                bits = get_bit(scalars[num], i + 4) << 5;
1728                bits |= get_bit(scalars[num], i + 3) << 4;
1729                bits |= get_bit(scalars[num], i + 2) << 3;
1730                bits |= get_bit(scalars[num], i + 1) << 2;
1731                bits |= get_bit(scalars[num], i) << 1;
1732                bits |= get_bit(scalars[num], i - 1);
1733                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
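                /*
                 * ec_GFp_nistp_recode_scalar_bits() (in ecp_nistputil.c)
                 * turns the six bits into a sign and a digit in [0, 16],
                 * so only the 17 small multiples in pre_comp are needed.
                 */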
1734
1735                /*
1736                 * select the point to add or subtract, in constant time
1737                 */
1738                select_point(digit, 17, pre_comp[num], tmp);
1739                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1740                                               * point */
1741                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1742                felem_contract(tmp[1], ftmp);
1743
1744                if (!skip) {
1745                    point_add(nq[0], nq[1], nq[2],
1746                              nq[0], nq[1], nq[2],
1747                              mixed, tmp[0], tmp[1], tmp[2]);
1748                } else {
1749                    smallfelem_expand(nq[0], tmp[0]);
1750                    smallfelem_expand(nq[1], tmp[1]);
1751                    smallfelem_expand(nq[2], tmp[2]);
1752                    skip = 0;
1753                }
1754            }
1755        }
1756    }
1757    felem_assign(x_out, nq[0]);
1758    felem_assign(y_out, nq[1]);
1759    felem_assign(z_out, nq[2]);
1760}
1761
1762/* Precomputation for the group generator. */
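/*-
 * Layout, as built by ec_GFp_nistp256_precompute_mult() below: writing the
 * table index b in binary as b3 b2 b1 b0,
 *     g_pre_comp[0][b] = b0*G + b1*2^64*G + b2*2^128*G + b3*2^192*G
 *     g_pre_comp[1][b] = 2^32 * g_pre_comp[0][b]
 * and index 0 of either table is the point at infinity.
 */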
1763typedef struct {
1764    smallfelem g_pre_comp[2][16][3];
1765    int references;
1766} NISTP256_PRE_COMP;
1767
1768const EC_METHOD *EC_GFp_nistp256_method(void)
1769{
1770    static const EC_METHOD ret = {
1771        EC_FLAGS_DEFAULT_OCT,
1772        NID_X9_62_prime_field,
1773        ec_GFp_nistp256_group_init,
1774        ec_GFp_simple_group_finish,
1775        ec_GFp_simple_group_clear_finish,
1776        ec_GFp_nist_group_copy,
1777        ec_GFp_nistp256_group_set_curve,
1778        ec_GFp_simple_group_get_curve,
1779        ec_GFp_simple_group_get_degree,
1780        ec_GFp_simple_group_check_discriminant,
1781        ec_GFp_simple_point_init,
1782        ec_GFp_simple_point_finish,
1783        ec_GFp_simple_point_clear_finish,
1784        ec_GFp_simple_point_copy,
1785        ec_GFp_simple_point_set_to_infinity,
1786        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1787        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1788        ec_GFp_simple_point_set_affine_coordinates,
1789        ec_GFp_nistp256_point_get_affine_coordinates,
1790        0 /* point_set_compressed_coordinates */ ,
1791        0 /* point2oct */ ,
1792        0 /* oct2point */ ,
1793        ec_GFp_simple_add,
1794        ec_GFp_simple_dbl,
1795        ec_GFp_simple_invert,
1796        ec_GFp_simple_is_at_infinity,
1797        ec_GFp_simple_is_on_curve,
1798        ec_GFp_simple_cmp,
1799        ec_GFp_simple_make_affine,
1800        ec_GFp_simple_points_make_affine,
1801        ec_GFp_nistp256_points_mul,
1802        ec_GFp_nistp256_precompute_mult,
1803        ec_GFp_nistp256_have_precompute_mult,
1804        ec_GFp_nist_field_mul,
1805        ec_GFp_nist_field_sqr,
1806        0 /* field_div */ ,
1807        0 /* field_encode */ ,
1808        0 /* field_decode */ ,
1809        0                       /* field_set_to_one */
1810    };
1811
1812    return &ret;
1813}
1814
1815/******************************************************************************/
1816/*
1817 * FUNCTIONS TO MANAGE PRECOMPUTATION
1818 */
1819
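/*
 * A NISTP256_PRE_COMP is attached to an EC_GROUP via EC_EX_DATA_set_data()
 * with the dup/free/clear_free callbacks below and is shared by reference
 * counting (CRYPTO_add under CRYPTO_LOCK_EC_PRE_COMP): duplicating only
 * bumps the count, and the object is released once the count reaches zero.
 */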
1820static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1821{
1822    NISTP256_PRE_COMP *ret = NULL;
1823    ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
1824    if (!ret) {
1825        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1826        return ret;
1827    }
1828    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1829    ret->references = 1;
1830    return ret;
1831}
1832
1833static void *nistp256_pre_comp_dup(void *src_)
1834{
1835    NISTP256_PRE_COMP *src = src_;
1836
1837    /* no need to actually copy, these objects never change! */
1838    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1839
1840    return src_;
1841}
1842
1843static void nistp256_pre_comp_free(void *pre_)
1844{
1845    int i;
1846    NISTP256_PRE_COMP *pre = pre_;
1847
1848    if (!pre)
1849        return;
1850
1851    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1852    if (i > 0)
1853        return;
1854
1855    OPENSSL_free(pre);
1856}
1857
1858static void nistp256_pre_comp_clear_free(void *pre_)
1859{
1860    int i;
1861    NISTP256_PRE_COMP *pre = pre_;
1862
1863    if (!pre)
1864        return;
1865
1866    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1867    if (i > 0)
1868        return;
1869
1870    OPENSSL_cleanse(pre, sizeof *pre);
1871    OPENSSL_free(pre);
1872}
1873
1874/******************************************************************************/
1875/*
1876 * OPENSSL EC_METHOD FUNCTIONS
1877 */
1878
1879int ec_GFp_nistp256_group_init(EC_GROUP *group)
1880{
1881    int ret;
1882    ret = ec_GFp_simple_group_init(group);
1883    group->a_is_minus3 = 1;
1884    return ret;
1885}
1886
1887int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1888                                    const BIGNUM *a, const BIGNUM *b,
1889                                    BN_CTX *ctx)
1890{
1891    int ret = 0;
1892    BN_CTX *new_ctx = NULL;
1893    BIGNUM *curve_p, *curve_a, *curve_b;
1894
1895    if (ctx == NULL)
1896        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1897            return 0;
1898    BN_CTX_start(ctx);
1899    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1900        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1901        ((curve_b = BN_CTX_get(ctx)) == NULL))
1902        goto err;
1903    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1904    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1905    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1906    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1907        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1908              EC_R_WRONG_CURVE_PARAMETERS);
1909        goto err;
1910    }
1911    group->field_mod_func = BN_nist_mod_256;
1912    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1913 err:
1914    BN_CTX_end(ctx);
1915    if (new_ctx != NULL)
1916        BN_CTX_free(new_ctx);
1917    return ret;
1918}
1919
1920/*
1921 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1922 * (X/Z^2, Y/Z^3)
1923 */
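/*
 * Only one field inversion is needed: with zinv = Z^-1 computed below, the
 * code forms X' = X * zinv^2 and then Y' = Y * zinv^2 * zinv = Y * zinv^3.
 */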
1924int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1925                                                 const EC_POINT *point,
1926                                                 BIGNUM *x, BIGNUM *y,
1927                                                 BN_CTX *ctx)
1928{
1929    felem z1, z2, x_in, y_in;
1930    smallfelem x_out, y_out;
1931    longfelem tmp;
1932
1933    if (EC_POINT_is_at_infinity(group, point)) {
1934        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1935              EC_R_POINT_AT_INFINITY);
1936        return 0;
1937    }
1938    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1939        (!BN_to_felem(z1, &point->Z)))
1940        return 0;
1941    felem_inv(z2, z1);
1942    felem_square(tmp, z2);
1943    felem_reduce(z1, tmp);
1944    felem_mul(tmp, x_in, z1);
1945    felem_reduce(x_in, tmp);
1946    felem_contract(x_out, x_in);
1947    if (x != NULL) {
1948        if (!smallfelem_to_BN(x, x_out)) {
1949            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1950                  ERR_R_BN_LIB);
1951            return 0;
1952        }
1953    }
1954    felem_mul(tmp, z1, z2);
1955    felem_reduce(z1, tmp);
1956    felem_mul(tmp, y_in, z1);
1957    felem_reduce(y_in, tmp);
1958    felem_contract(y_out, y_in);
1959    if (y != NULL) {
1960        if (!smallfelem_to_BN(y, y_out)) {
1961            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1962                  ERR_R_BN_LIB);
1963            return 0;
1964        }
1965    }
1966    return 1;
1967}
1968
1969/* |points| below is of size |num|, and |tmp_smallfelems| is of size |num+1| */
1970static void make_points_affine(size_t num, smallfelem points[][3],
1971                               smallfelem tmp_smallfelems[])
1972{
1973    /*
1974     * Runs in constant time, unless an input is the point at infinity (which
1975     * normally shouldn't happen).
1976     */
1977    ec_GFp_nistp_points_make_affine_internal(num,
1978                                             points,
1979                                             sizeof(smallfelem),
1980                                             tmp_smallfelems,
1981                                             (void (*)(void *))smallfelem_one,
1982                                             (int (*)(const void *))
1983                                             smallfelem_is_zero_int,
1984                                             (void (*)(void *, const void *))
1985                                             smallfelem_assign,
1986                                             (void (*)(void *, const void *))
1987                                             smallfelem_square_contract,
1988                                             (void (*)
1989                                              (void *, const void *,
1990                                               const void *))
1991                                             smallfelem_mul_contract,
1992                                             (void (*)(void *, const void *))
1993                                             smallfelem_inv_contract,
1994                                             /* nothing to contract */
1995                                             (void (*)(void *, const void *))
1996                                             smallfelem_assign);
1997}
1998
1999/*
2000 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2001 * values. The result is stored in r (r can equal one of the inputs).
2002 */
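/*-
 * Illustrative use (a sketch only: error handling is omitted and it assumes
 * a build in which P-256 is served by this EC_METHOD).  The function is
 * normally reached through the generic EC interface, e.g. to compute
 * r = k*G + l*P:
 *
 *     EC_GROUP *grp = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
 *     EC_POINT *r = EC_POINT_new(grp);
 *     const EC_POINT *pts[1] = { P };
 *     const BIGNUM *scs[1] = { l };
 *     EC_POINTs_mul(grp, r, k, 1, pts, scs, ctx);
 */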
2003int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2004                               const BIGNUM *scalar, size_t num,
2005                               const EC_POINT *points[],
2006                               const BIGNUM *scalars[], BN_CTX *ctx)
2007{
2008    int ret = 0;
2009    int j;
2010    int mixed = 0;
2011    BN_CTX *new_ctx = NULL;
2012    BIGNUM *x, *y, *z, *tmp_scalar;
2013    felem_bytearray g_secret;
2014    felem_bytearray *secrets = NULL;
2015    smallfelem(*pre_comp)[17][3] = NULL;
2016    smallfelem *tmp_smallfelems = NULL;
2017    felem_bytearray tmp;
2018    unsigned i, num_bytes;
2019    int have_pre_comp = 0;
2020    size_t num_points = num;
2021    smallfelem x_in, y_in, z_in;
2022    felem x_out, y_out, z_out;
2023    NISTP256_PRE_COMP *pre = NULL;
2024    const smallfelem(*g_pre_comp)[16][3] = NULL;
2025    EC_POINT *generator = NULL;
2026    const EC_POINT *p = NULL;
2027    const BIGNUM *p_scalar = NULL;
2028
2029    if (ctx == NULL)
2030        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2031            return 0;
2032    BN_CTX_start(ctx);
2033    if (((x = BN_CTX_get(ctx)) == NULL) ||
2034        ((y = BN_CTX_get(ctx)) == NULL) ||
2035        ((z = BN_CTX_get(ctx)) == NULL) ||
2036        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2037        goto err;
2038
2039    if (scalar != NULL) {
2040        pre = EC_EX_DATA_get_data(group->extra_data,
2041                                  nistp256_pre_comp_dup,
2042                                  nistp256_pre_comp_free,
2043                                  nistp256_pre_comp_clear_free);
2044        if (pre)
2045            /* we have precomputation, try to use it */
2046            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2047        else
2048            /* try to use the standard precomputation */
2049            g_pre_comp = &gmul[0];
2050        generator = EC_POINT_new(group);
2051        if (generator == NULL)
2052            goto err;
2053        /* get the generator from precomputation */
2054        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2055            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2056            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2057            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2058            goto err;
2059        }
2060        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2061                                                      generator, x, y, z,
2062                                                      ctx))
2063            goto err;
2064        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2065            /* precomputation matches generator */
2066            have_pre_comp = 1;
2067        else
2068            /*
2069             * we don't have valid precomputation: treat the generator as a
2070             * random point
2071             */
2072            num_points++;
2073    }
2074    if (num_points > 0) {
2075        if (num_points >= 3) {
2076            /*
2077             * unless we precompute multiples for just one or two points,
2078             * converting those into affine form is time well spent
2079             */
2080            mixed = 1;
2081        }
2082        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2083        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2084        if (mixed)
2085            tmp_smallfelems =
2086                OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2087        if ((secrets == NULL) || (pre_comp == NULL)
2088            || (mixed && (tmp_smallfelems == NULL))) {
2089            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2090            goto err;
2091        }
2092
2093        /*
2094         * we treat NULL scalars as 0, and NULL points as points at infinity,
2095         * i.e., they contribute nothing to the linear combination
2096         */
2097        memset(secrets, 0, num_points * sizeof(felem_bytearray));
2098        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2099        for (i = 0; i < num_points; ++i) {
2100            if (i == num)
2101                /*
2102                 * we didn't have a valid precomputation, so we pick the
2103                 * generator
2104                 */
2105            {
2106                p = EC_GROUP_get0_generator(group);
2107                p_scalar = scalar;
2108            } else
2109                /* the i^th point */
2110            {
2111                p = points[i];
2112                p_scalar = scalars[i];
2113            }
2114            if ((p_scalar != NULL) && (p != NULL)) {
2115                /* reduce scalar to 0 <= scalar < 2^256 */
2116                if ((BN_num_bits(p_scalar) > 256)
2117                    || (BN_is_negative(p_scalar))) {
2118                    /*
2119                     * this is an unusual input, and we don't guarantee
2120                     * constant-timeness
2121                     */
2122                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2123                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2124                        goto err;
2125                    }
2126                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
2127                } else
2128                    num_bytes = BN_bn2bin(p_scalar, tmp);
2129                flip_endian(secrets[i], tmp, num_bytes);
2130                /* precompute multiples */
2131                if ((!BN_to_felem(x_out, &p->X)) ||
2132                    (!BN_to_felem(y_out, &p->Y)) ||
2133                    (!BN_to_felem(z_out, &p->Z)))
2134                    goto err;
2135                felem_shrink(pre_comp[i][1][0], x_out);
2136                felem_shrink(pre_comp[i][1][1], y_out);
2137                felem_shrink(pre_comp[i][1][2], z_out);
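                /*
                 * The loop below fills in pre_comp[i][j] = j*P for
                 * j = 2..16: even entries by doubling entry j/2, odd
                 * entries by adding entry 1 (P itself) to entry j - 1.
                 */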
2138                for (j = 2; j <= 16; ++j) {
2139                    if (j & 1) {
2140                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2141                                        pre_comp[i][j][2], pre_comp[i][1][0],
2142                                        pre_comp[i][1][1], pre_comp[i][1][2],
2143                                        pre_comp[i][j - 1][0],
2144                                        pre_comp[i][j - 1][1],
2145                                        pre_comp[i][j - 1][2]);
2146                    } else {
2147                        point_double_small(pre_comp[i][j][0],
2148                                           pre_comp[i][j][1],
2149                                           pre_comp[i][j][2],
2150                                           pre_comp[i][j / 2][0],
2151                                           pre_comp[i][j / 2][1],
2152                                           pre_comp[i][j / 2][2]);
2153                    }
2154                }
2155            }
2156        }
2157        if (mixed)
2158            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2159    }
2160
2161    /* the scalar for the generator */
2162    if ((scalar != NULL) && (have_pre_comp)) {
2163        memset(g_secret, 0, sizeof(g_secret));
2164        /* reduce scalar to 0 <= scalar < 2^256 */
2165        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2166            /*
2167             * this is an unusual input, and we don't guarantee
2168             * constant-timeness
2169             */
2170            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2171                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2172                goto err;
2173            }
2174            num_bytes = BN_bn2bin(tmp_scalar, tmp);
2175        } else
2176            num_bytes = BN_bn2bin(scalar, tmp);
2177        flip_endian(g_secret, tmp, num_bytes);
2178        /* do the multiplication with generator precomputation */
2179        batch_mul(x_out, y_out, z_out,
2180                  (const felem_bytearray(*))secrets, num_points,
2181                  g_secret,
2182                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2183    } else
2184        /* do the multiplication without generator precomputation */
2185        batch_mul(x_out, y_out, z_out,
2186                  (const felem_bytearray(*))secrets, num_points,
2187                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2188    /* reduce the output to its unique minimal representation */
2189    felem_contract(x_in, x_out);
2190    felem_contract(y_in, y_out);
2191    felem_contract(z_in, z_out);
2192    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2193        (!smallfelem_to_BN(z, z_in))) {
2194        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2195        goto err;
2196    }
2197    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2198
2199 err:
2200    BN_CTX_end(ctx);
2201    if (generator != NULL)
2202        EC_POINT_free(generator);
2203    if (new_ctx != NULL)
2204        BN_CTX_free(new_ctx);
2205    if (secrets != NULL)
2206        OPENSSL_free(secrets);
2207    if (pre_comp != NULL)
2208        OPENSSL_free(pre_comp);
2209    if (tmp_smallfelems != NULL)
2210        OPENSSL_free(tmp_smallfelems);
2211    return ret;
2212}
2213
2214int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2215{
2216    int ret = 0;
2217    NISTP256_PRE_COMP *pre = NULL;
2218    int i, j;
2219    BN_CTX *new_ctx = NULL;
2220    BIGNUM *x, *y;
2221    EC_POINT *generator = NULL;
2222    smallfelem tmp_smallfelems[32];
2223    felem x_tmp, y_tmp, z_tmp;
2224
2225    /* throw away old precomputation */
2226    EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2227                         nistp256_pre_comp_free,
2228                         nistp256_pre_comp_clear_free);
2229    if (ctx == NULL)
2230        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2231            return 0;
2232    BN_CTX_start(ctx);
2233    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2234        goto err;
2235    /* get the generator */
2236    if (group->generator == NULL)
2237        goto err;
2238    generator = EC_POINT_new(group);
2239    if (generator == NULL)
2240        goto err;
2241    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2242    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2243    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2244        goto err;
2245    if ((pre = nistp256_pre_comp_new()) == NULL)
2246        goto err;
2247    /*
2248     * if the generator is the standard one, use built-in precomputation
2249     */
2250    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2251        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2252        ret = 1;
2253        goto err;
2254    }
2255    if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2256        (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2257        (!BN_to_felem(z_tmp, &group->generator->Z)))
2258        goto err;
2259    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2260    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2261    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2262    /*
2263     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2264     * 2^160*G, 2^224*G for the second one
2265     */
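    /*
     * Starting from g_pre_comp[0][1] = G, each 32-doubling step below
     * yields, in order: g_pre_comp[1][1] = 2^32*G, g_pre_comp[0][2] =
     * 2^64*G, g_pre_comp[1][2] = 2^96*G, g_pre_comp[0][4] = 2^128*G,
     * g_pre_comp[1][4] = 2^160*G, g_pre_comp[0][8] = 2^192*G and
     * g_pre_comp[1][8] = 2^224*G.
     */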
2266    for (i = 1; i <= 8; i <<= 1) {
2267        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2268                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2269                           pre->g_pre_comp[0][i][1],
2270                           pre->g_pre_comp[0][i][2]);
2271        for (j = 0; j < 31; ++j) {
2272            point_double_small(pre->g_pre_comp[1][i][0],
2273                               pre->g_pre_comp[1][i][1],
2274                               pre->g_pre_comp[1][i][2],
2275                               pre->g_pre_comp[1][i][0],
2276                               pre->g_pre_comp[1][i][1],
2277                               pre->g_pre_comp[1][i][2]);
2278        }
2279        if (i == 8)
2280            break;
2281        point_double_small(pre->g_pre_comp[0][2 * i][0],
2282                           pre->g_pre_comp[0][2 * i][1],
2283                           pre->g_pre_comp[0][2 * i][2],
2284                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2285                           pre->g_pre_comp[1][i][2]);
2286        for (j = 0; j < 31; ++j) {
2287            point_double_small(pre->g_pre_comp[0][2 * i][0],
2288                               pre->g_pre_comp[0][2 * i][1],
2289                               pre->g_pre_comp[0][2 * i][2],
2290                               pre->g_pre_comp[0][2 * i][0],
2291                               pre->g_pre_comp[0][2 * i][1],
2292                               pre->g_pre_comp[0][2 * i][2]);
2293        }
2294    }
2295    for (i = 0; i < 2; i++) {
2296        /* g_pre_comp[i][0] is the point at infinity */
2297        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2298        /* the remaining multiples */
2299        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2300        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2301                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2302                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2303                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2304                        pre->g_pre_comp[i][2][2]);
2305        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2306        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2307                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2308                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2309                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2310                        pre->g_pre_comp[i][2][2]);
2311        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2312        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2313                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2314                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2315                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2316                        pre->g_pre_comp[i][4][2]);
2317        /*
2318         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2319         */
2320        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2321                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2322                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2323                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2324                        pre->g_pre_comp[i][2][2]);
2325        for (j = 1; j < 8; ++j) {
2326            /* odd multiples: add G resp. 2^32*G */
2327            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2328                            pre->g_pre_comp[i][2 * j + 1][1],
2329                            pre->g_pre_comp[i][2 * j + 1][2],
2330                            pre->g_pre_comp[i][2 * j][0],
2331                            pre->g_pre_comp[i][2 * j][1],
2332                            pre->g_pre_comp[i][2 * j][2],
2333                            pre->g_pre_comp[i][1][0],
2334                            pre->g_pre_comp[i][1][1],
2335                            pre->g_pre_comp[i][1][2]);
2336        }
2337    }
2338    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2339
2340    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2341                             nistp256_pre_comp_free,
2342                             nistp256_pre_comp_clear_free))
2343        goto err;
2344    ret = 1;
2345    pre = NULL;
2346 err:
2347    BN_CTX_end(ctx);
2348    if (generator != NULL)
2349        EC_POINT_free(generator);
2350    if (new_ctx != NULL)
2351        BN_CTX_free(new_ctx);
2352    if (pre)
2353        nistp256_pre_comp_free(pre);
2354    return ret;
2355}
2356
2357int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2358{
2359    if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2360                            nistp256_pre_comp_free,
2361                            nistp256_pre_comp_clear_free)
2362        != NULL)
2363        return 1;
2364    else
2365        return 0;
2366}
2367#else
2368static void *dummy = &dummy;
2369#endif
2370