1/* crypto/ec/ecp_nistp256.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32# ifndef OPENSSL_SYS_VMS
33#  include <stdint.h>
34# else
35#  include <inttypes.h>
36# endif
37
38# include <string.h>
39# include <openssl/err.h>
40# include "ec_lcl.h"
41# include "bn_int.h" /* bn_bn2lebinpad, bn_lebin2bn */
42
43# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
44  /* even with gcc, the typedef won't work for 32-bit platforms */
45typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46                                 * platforms */
47typedef __int128_t int128_t;
48# else
49#  error "Need GCC 3.1 or later to define type uint128_t"
50# endif
51
52typedef uint8_t u8;
53typedef uint32_t u32;
54typedef uint64_t u64;
55
56/*
57 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
58 * can serialise an element of this field into 32 bytes. We call this an
59 * felem_bytearray.
60 */
61
62typedef u8 felem_bytearray[32];
63
64/*
65 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
66 * values are big-endian.
67 */
68static const felem_bytearray nistp256_curve_params[5] = {
69    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
70     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
71     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
72     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
73    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
74     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
75     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
76     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
77    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
78     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
79     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
80     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
81    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
82     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
83     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
84     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
85    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
86     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
87     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
88     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
89};
90
91/*-
92 * The representation of field elements.
93 * ------------------------------------
94 *
95 * We represent field elements with either four 128-bit values, eight 128-bit
96 * values, or four 64-bit values. The field element represented is:
97 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
98 * or:
99 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
100 *
101 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
102 * apart, but are 128 bits wide, the most significant bits of each limb overlap
103 * with the least significant bits of the next.
104 *
105 * A field element with four limbs is an 'felem'. One with eight limbs is a
106 * 'longfelem'.
107 *
108 * A field element with four 64-bit values is called a 'smallfelem'. Small
109 * values are used as intermediate values before multiplication.
110 */
111
112# define NLIMBS 4
113
114typedef uint128_t limb;
115typedef limb felem[NLIMBS];
116typedef limb longfelem[NLIMBS * 2];
117typedef u64 smallfelem[NLIMBS];
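/*
 * As an illustrative example: the value 2^64 + 5 can be carried as the felem
 * {5, 1, 0, 0}.  Because each 128-bit limb overlaps the next by 64 bits,
 * {2^64 + 5, 0, 0, 0} denotes the same value; felem_shrink below carries such
 * overlap back out when converting to a smallfelem.
 */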
118
119/* This is the value of the prime as four 64-bit words, little-endian. */
120static const u64 kPrime[4] =
121    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
122static const u64 bottom63bits = 0x7ffffffffffffffful;
123
124/*
125 * bin32_to_felem takes a little-endian byte array and converts it into felem
126 * form. This assumes that the CPU is little-endian.
127 */
128static void bin32_to_felem(felem out, const u8 in[32])
129{
130    out[0] = *((u64 *)&in[0]);
131    out[1] = *((u64 *)&in[8]);
132    out[2] = *((u64 *)&in[16]);
133    out[3] = *((u64 *)&in[24]);
134}
135
136/*
137 * smallfelem_to_bin32 takes a smallfelem and serialises it into a
138 * little-endian, 32 byte array. This assumes that the CPU is little-endian.
139 */
140static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
141{
142    *((u64 *)&out[0]) = in[0];
143    *((u64 *)&out[8]) = in[1];
144    *((u64 *)&out[16]) = in[2];
145    *((u64 *)&out[24]) = in[3];
146}
147
148/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
149static int BN_to_felem(felem out, const BIGNUM *bn)
150{
151    felem_bytearray b_out;
152    int num_bytes;
153
154    if (BN_is_negative(bn)) {
155        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
156        return 0;
157    }
158    num_bytes = bn_bn2lebinpad(bn, b_out, sizeof(b_out));
159    if (num_bytes < 0) {
160        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
161        return 0;
162    }
163    bin32_to_felem(out, b_out);
164    return 1;
165}
166
167/* smallfelem_to_BN converts a smallfelem into an OpenSSL BIGNUM */
168static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
169{
170    felem_bytearray b_out;
171    smallfelem_to_bin32(b_out, in);
172    return bn_lebin2bn(b_out, sizeof(b_out), out);
173}
174
175/*-
176 * Field operations
177 * ----------------
178 */
179
180static void smallfelem_one(smallfelem out)
181{
182    out[0] = 1;
183    out[1] = 0;
184    out[2] = 0;
185    out[3] = 0;
186}
187
188static void smallfelem_assign(smallfelem out, const smallfelem in)
189{
190    out[0] = in[0];
191    out[1] = in[1];
192    out[2] = in[2];
193    out[3] = in[3];
194}
195
196static void felem_assign(felem out, const felem in)
197{
198    out[0] = in[0];
199    out[1] = in[1];
200    out[2] = in[2];
201    out[3] = in[3];
202}
203
204/* felem_sum sets out = out + in. */
205static void felem_sum(felem out, const felem in)
206{
207    out[0] += in[0];
208    out[1] += in[1];
209    out[2] += in[2];
210    out[3] += in[3];
211}
212
213/* felem_small_sum sets out = out + in. */
214static void felem_small_sum(felem out, const smallfelem in)
215{
216    out[0] += in[0];
217    out[1] += in[1];
218    out[2] += in[2];
219    out[3] += in[3];
220}
221
222/* felem_scalar sets out = out * scalar */
223static void felem_scalar(felem out, const u64 scalar)
224{
225    out[0] *= scalar;
226    out[1] *= scalar;
227    out[2] *= scalar;
228    out[3] *= scalar;
229}
230
231/* longfelem_scalar sets out = out * scalar */
232static void longfelem_scalar(longfelem out, const u64 scalar)
233{
234    out[0] *= scalar;
235    out[1] *= scalar;
236    out[2] *= scalar;
237    out[3] *= scalar;
238    out[4] *= scalar;
239    out[5] *= scalar;
240    out[6] *= scalar;
241    out[7] *= scalar;
242}
243
244# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
245# define two105 (((limb)1) << 105)
246# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
247
248/* zero105 is 0 mod p */
249static const felem zero105 =
250    { two105m41m9, two105, two105m41p9, two105m41p9 };
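/*-
 * For reference, interpreting zero105 with the limb positions above gives
 *   (2^105 - 2^41 - 2^9) + 2^105*2^64 + (2^105 - 2^41 + 2^9)*(2^128 + 2^192)
 *     = 2^297 + 2^201 + 2^137 + 2^105 - 2^41 - 2^9
 *     = (2^41 + 2^9) * p,
 * so it is indeed 0 mod p.  The other zero* constants below (zero107,
 * zero110, zero100) are small multiples of p in the same way.
 */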
251
252/*-
253 * smallfelem_neg sets |out| to |-small|
254 * On exit:
255 *   out[i] <= 2^105
256 */
257static void smallfelem_neg(felem out, const smallfelem small)
258{
259    /* In order to prevent underflow, we subtract from 0 mod p. */
260    out[0] = zero105[0] - small[0];
261    out[1] = zero105[1] - small[1];
262    out[2] = zero105[2] - small[2];
263    out[3] = zero105[3] - small[3];
264}
265
266/*-
267 * felem_diff subtracts |in| from |out|
268 * On entry:
269 *   in[i] < 2^104
270 * On exit:
271 *   out[i] < out[i] + 2^105
272 */
273static void felem_diff(felem out, const felem in)
274{
275    /*
276     * In order to prevent underflow, we add 0 mod p before subtracting.
277     */
278    out[0] += zero105[0];
279    out[1] += zero105[1];
280    out[2] += zero105[2];
281    out[3] += zero105[3];
282
283    out[0] -= in[0];
284    out[1] -= in[1];
285    out[2] -= in[2];
286    out[3] -= in[3];
287}
288
289# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
290# define two107 (((limb)1) << 107)
291# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
292
293/* zero107 is 0 mod p */
294static const felem zero107 =
295    { two107m43m11, two107, two107m43p11, two107m43p11 };
296
297/*-
298 * An alternative felem_diff for larger inputs |in|
299 * felem_diff_zero107 subtracts |in| from |out|
300 * On entry:
301 *   in[i] < 2^106
302 * On exit:
303 *   out[i] < out[i] + 2^107
304 */
305static void felem_diff_zero107(felem out, const felem in)
306{
307    /*
308     * In order to prevent underflow, we add 0 mod p before subtracting.
309     */
310    out[0] += zero107[0];
311    out[1] += zero107[1];
312    out[2] += zero107[2];
313    out[3] += zero107[3];
314
315    out[0] -= in[0];
316    out[1] -= in[1];
317    out[2] -= in[2];
318    out[3] -= in[3];
319}
320
321/*-
322 * longfelem_diff subtracts |in| from |out|
323 * On entry:
324 *   in[i] < 7*2^67
325 * On exit:
326 *   out[i] < out[i] + 2^70 + 2^40
327 */
328static void longfelem_diff(longfelem out, const longfelem in)
329{
330    static const limb two70m8p6 =
331        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
332    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
333    static const limb two70 = (((limb) 1) << 70);
334    static const limb two70m40m38p6 =
335        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
336        (((limb) 1) << 6);
337    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
338
339    /* add 0 mod p to avoid underflow */
340    out[0] += two70m8p6;
341    out[1] += two70p40;
342    out[2] += two70;
343    out[3] += two70m40m38p6;
344    out[4] += two70m6;
345    out[5] += two70m6;
346    out[6] += two70m6;
347    out[7] += two70m6;
348
349    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
350    out[0] -= in[0];
351    out[1] -= in[1];
352    out[2] -= in[2];
353    out[3] -= in[3];
354    out[4] -= in[4];
355    out[5] -= in[5];
356    out[6] -= in[6];
357    out[7] -= in[7];
358}
359
360# define two64m0 (((limb)1) << 64) - 1
361# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
362# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
363# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
364
365/* zero110 is 0 mod p */
366static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
367
368/*-
369 * felem_shrink converts an felem into a smallfelem. The result isn't quite
370 * minimal as the value may be greater than p.
371 *
372 * On entry:
373 *   in[i] < 2^109
374 * On exit:
375 *   out[i] < 2^64
376 */
377static void felem_shrink(smallfelem out, const felem in)
378{
379    felem tmp;
380    u64 a, b, mask;
381    u64 high, low;
382    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
383
384    /* Carry 2->3 */
385    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
386    /* tmp[3] < 2^110 */
387
388    tmp[2] = zero110[2] + (u64)in[2];
389    tmp[0] = zero110[0] + in[0];
390    tmp[1] = zero110[1] + in[1];
391    /* tmp[0] < 2^110, tmp[1] < 2^111, tmp[2] < 2^65 */
392
393    /*
394     * We perform two partial reductions where we eliminate the high-word of
395     * tmp[3]. We don't update the other words till the end.
396     */
397    a = tmp[3] >> 64;           /* a < 2^46 */
398    tmp[3] = (u64)tmp[3];
399    tmp[3] -= a;
400    tmp[3] += ((limb) a) << 32;
401    /* tmp[3] < 2^79 */
402
403    b = a;
404    a = tmp[3] >> 64;           /* a < 2^15 */
405    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
406    tmp[3] = (u64)tmp[3];
407    tmp[3] -= a;
408    tmp[3] += ((limb) a) << 32;
409    /* tmp[3] < 2^64 + 2^47 */
410
411    /*
412     * This adjusts the other two words to complete the two partial
413     * reductions.
414     */
415    tmp[0] += b;
416    tmp[1] -= (((limb) b) << 32);
417
418    /*
419     * In order to make space in tmp[3] for the carry from 2 -> 3, we
420     * conditionally subtract kPrime if tmp[3] is large enough.
421     */
422    high = (u64)(tmp[3] >> 64);
423    /* As tmp[3] < 2^65, high is either 1 or 0 */
424    high = 0 - high;
425    /*-
426     * high is:
427     *   all ones   if the high word of tmp[3] is 1
428     *   all zeros  if the high word of tmp[3] is 0
429     */
430    low = (u64)tmp[3];
431    mask = 0 - (low >> 63);
432    /*-
433     * mask is:
434     *   all ones   if the MSB of low is 1
435     *   all zeros  if the MSB of low is 0
436     */
437    low &= bottom63bits;
438    low -= kPrime3Test;
439    /* if low was greater than kPrime3Test then the MSB is zero */
440    low = ~low;
441    low = 0 - (low >> 63);
442    /*-
443     * low is:
444     *   all ones   if low was > kPrime3Test
445     *   all zeros  if low was <= kPrime3Test
446     */
447    mask = (mask & low) | high;
448    tmp[0] -= mask & kPrime[0];
449    tmp[1] -= mask & kPrime[1];
450    /* kPrime[2] is zero, so omitted */
451    tmp[3] -= mask & kPrime[3];
452    /* tmp[3] < 2^64 - 2^32 + 1 */
453
454    tmp[1] += ((u64)(tmp[0] >> 64));
455    tmp[0] = (u64)tmp[0];
456    tmp[2] += ((u64)(tmp[1] >> 64));
457    tmp[1] = (u64)tmp[1];
458    tmp[3] += ((u64)(tmp[2] >> 64));
459    tmp[2] = (u64)tmp[2];
460    /* tmp[i] < 2^64 */
461
462    out[0] = tmp[0];
463    out[1] = tmp[1];
464    out[2] = tmp[2];
465    out[3] = tmp[3];
466}
467
468/* smallfelem_expand converts a smallfelem to an felem */
469static void smallfelem_expand(felem out, const smallfelem in)
470{
471    out[0] = in[0];
472    out[1] = in[1];
473    out[2] = in[2];
474    out[3] = in[3];
475}
476
477/*-
478 * smallfelem_square sets |out| = |small|^2
479 * On entry:
480 *   small[i] < 2^64
481 * On exit:
482 *   out[i] < 7 * 2^64 < 2^67
483 */
484static void smallfelem_square(longfelem out, const smallfelem small)
485{
486    limb a;
487    u64 high, low;
488
489    a = ((uint128_t) small[0]) * small[0];
490    low = a;
491    high = a >> 64;
492    out[0] = low;
493    out[1] = high;
494
495    a = ((uint128_t) small[0]) * small[1];
496    low = a;
497    high = a >> 64;
498    out[1] += low;
499    out[1] += low;
500    out[2] = high;
501
502    a = ((uint128_t) small[0]) * small[2];
503    low = a;
504    high = a >> 64;
505    out[2] += low;
506    out[2] *= 2;
507    out[3] = high;
508
509    a = ((uint128_t) small[0]) * small[3];
510    low = a;
511    high = a >> 64;
512    out[3] += low;
513    out[4] = high;
514
515    a = ((uint128_t) small[1]) * small[2];
516    low = a;
517    high = a >> 64;
518    out[3] += low;
519    out[3] *= 2;
520    out[4] += high;
521
522    a = ((uint128_t) small[1]) * small[1];
523    low = a;
524    high = a >> 64;
525    out[2] += low;
526    out[3] += high;
527
528    a = ((uint128_t) small[1]) * small[3];
529    low = a;
530    high = a >> 64;
531    out[4] += low;
532    out[4] *= 2;
533    out[5] = high;
534
535    a = ((uint128_t) small[2]) * small[3];
536    low = a;
537    high = a >> 64;
538    out[5] += low;
539    out[5] *= 2;
540    out[6] = high;
541    out[6] += high;
542
543    a = ((uint128_t) small[2]) * small[2];
544    low = a;
545    high = a >> 64;
546    out[4] += low;
547    out[5] += high;
548
549    a = ((uint128_t) small[3]) * small[3];
550    low = a;
551    high = a >> 64;
552    out[6] += low;
553    out[7] = high;
554}
555
556/*-
557 * felem_square sets |out| = |in|^2
558 * On entry:
559 *   in[i] < 2^109
560 * On exit:
561 *   out[i] < 7 * 2^64 < 2^67
562 */
563static void felem_square(longfelem out, const felem in)
564{
565    u64 small[4];
566    felem_shrink(small, in);
567    smallfelem_square(out, small);
568}
569
570/*-
571 * smallfelem_mul sets |out| = |small1| * |small2|
572 * On entry:
573 *   small1[i] < 2^64
574 *   small2[i] < 2^64
575 * On exit:
576 *   out[i] < 7 * 2^64 < 2^67
577 */
578static void smallfelem_mul(longfelem out, const smallfelem small1,
579                           const smallfelem small2)
580{
581    limb a;
582    u64 high, low;
583
584    a = ((uint128_t) small1[0]) * small2[0];
585    low = a;
586    high = a >> 64;
587    out[0] = low;
588    out[1] = high;
589
590    a = ((uint128_t) small1[0]) * small2[1];
591    low = a;
592    high = a >> 64;
593    out[1] += low;
594    out[2] = high;
595
596    a = ((uint128_t) small1[1]) * small2[0];
597    low = a;
598    high = a >> 64;
599    out[1] += low;
600    out[2] += high;
601
602    a = ((uint128_t) small1[0]) * small2[2];
603    low = a;
604    high = a >> 64;
605    out[2] += low;
606    out[3] = high;
607
608    a = ((uint128_t) small1[1]) * small2[1];
609    low = a;
610    high = a >> 64;
611    out[2] += low;
612    out[3] += high;
613
614    a = ((uint128_t) small1[2]) * small2[0];
615    low = a;
616    high = a >> 64;
617    out[2] += low;
618    out[3] += high;
619
620    a = ((uint128_t) small1[0]) * small2[3];
621    low = a;
622    high = a >> 64;
623    out[3] += low;
624    out[4] = high;
625
626    a = ((uint128_t) small1[1]) * small2[2];
627    low = a;
628    high = a >> 64;
629    out[3] += low;
630    out[4] += high;
631
632    a = ((uint128_t) small1[2]) * small2[1];
633    low = a;
634    high = a >> 64;
635    out[3] += low;
636    out[4] += high;
637
638    a = ((uint128_t) small1[3]) * small2[0];
639    low = a;
640    high = a >> 64;
641    out[3] += low;
642    out[4] += high;
643
644    a = ((uint128_t) small1[1]) * small2[3];
645    low = a;
646    high = a >> 64;
647    out[4] += low;
648    out[5] = high;
649
650    a = ((uint128_t) small1[2]) * small2[2];
651    low = a;
652    high = a >> 64;
653    out[4] += low;
654    out[5] += high;
655
656    a = ((uint128_t) small1[3]) * small2[1];
657    low = a;
658    high = a >> 64;
659    out[4] += low;
660    out[5] += high;
661
662    a = ((uint128_t) small1[2]) * small2[3];
663    low = a;
664    high = a >> 64;
665    out[5] += low;
666    out[6] = high;
667
668    a = ((uint128_t) small1[3]) * small2[2];
669    low = a;
670    high = a >> 64;
671    out[5] += low;
672    out[6] += high;
673
674    a = ((uint128_t) small1[3]) * small2[3];
675    low = a;
676    high = a >> 64;
677    out[6] += low;
678    out[7] = high;
679}
680
681/*-
682 * felem_mul sets |out| = |in1| * |in2|
683 * On entry:
684 *   in1[i] < 2^109
685 *   in2[i] < 2^109
686 * On exit:
687 *   out[i] < 7 * 2^64 < 2^67
688 */
689static void felem_mul(longfelem out, const felem in1, const felem in2)
690{
691    smallfelem small1, small2;
692    felem_shrink(small1, in1);
693    felem_shrink(small2, in2);
694    smallfelem_mul(out, small1, small2);
695}
696
697/*-
698 * felem_small_mul sets |out| = |small1| * |in2|
699 * On entry:
700 *   small1[i] < 2^64
701 *   in2[i] < 2^109
702 * On exit:
703 *   out[i] < 7 * 2^64 < 2^67
704 */
705static void felem_small_mul(longfelem out, const smallfelem small1,
706                            const felem in2)
707{
708    smallfelem small2;
709    felem_shrink(small2, in2);
710    smallfelem_mul(out, small1, small2);
711}
712
713# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
714# define two100 (((limb)1) << 100)
715# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
716/* zero100 is 0 mod p */
717static const felem zero100 =
718    { two100m36m4, two100, two100m36p4, two100m36p4 };
719
720/*-
721 * Internal function for the different flavours of felem_reduce.
722 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
723 * On entry:
724 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
725 *   out[1] >= in[7] + 2^32*in[4]
726 *   out[2] >= in[5] + 2^32*in[5]
727 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
728 * On exit:
729 *   out[0] <= out[0] + in[4] + 2^32*in[5]
730 *   out[1] <= out[1] + in[5] + 2^33*in[6]
731 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
732 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
733 */
734static void felem_reduce_(felem out, const longfelem in)
735{
736    int128_t c;
737    /* combine common terms from below */
738    c = in[4] + (in[5] << 32);
739    out[0] += c;
740    out[3] -= c;
741
742    c = in[5] - in[7];
743    out[1] += c;
744    out[2] -= c;
745
746    /* the remaining terms */
747    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
748    out[1] -= (in[4] << 32);
749    out[3] += (in[4] << 32);
750
751    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
752    out[2] -= (in[5] << 32);
753
754    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
755    out[0] -= in[6];
756    out[0] -= (in[6] << 32);
757    out[1] += (in[6] << 33);
758    out[2] += (in[6] * 2);
759    out[3] -= (in[6] << 32);
760
761    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
762    out[0] -= in[7];
763    out[0] -= (in[7] << 32);
764    out[2] += (in[7] << 33);
765    out[3] += (in[7] * 3);
766}
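/*-
 * The bracketed annotations in felem_reduce_ above list, for each high limb,
 * the (bit position, coefficient) pairs of the congruence used to fold it
 * back into the four low limbs.  For example, since
 *   p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
 * we have
 *   2^256 == 2^0 - 2^96 - 2^192 + 2^224  (mod p),
 * which is the list [(0,1),(96,-1),(192,-1),(224,1)] attached to in[4].
 */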
767
768/*-
769 * felem_reduce converts a longfelem into an felem.
770 * To be called directly after felem_square or felem_mul.
771 * On entry:
772 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
773 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
774 * On exit:
775 *   out[i] < 2^101
776 */
777static void felem_reduce(felem out, const longfelem in)
778{
779    out[0] = zero100[0] + in[0];
780    out[1] = zero100[1] + in[1];
781    out[2] = zero100[2] + in[2];
782    out[3] = zero100[3] + in[3];
783
784    felem_reduce_(out, in);
785
786    /*-
787     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
788     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
789     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
790     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
791     *
792     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
793     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
794     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
795     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
796     */
797}
798
799/*-
800 * felem_reduce_zero105 converts a larger longfelem into an felem.
801 * On entry:
802 *   in[i] < 2^71
803 * On exit:
804 *   out[i] < 2^106
805 */
806static void felem_reduce_zero105(felem out, const longfelem in)
807{
808    out[0] = zero105[0] + in[0];
809    out[1] = zero105[1] + in[1];
810    out[2] = zero105[2] + in[2];
811    out[3] = zero105[3] + in[3];
812
813    felem_reduce_(out, in);
814
815    /*-
816     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
817     * out[1] > 2^105 - 2^71 - 2^103 > 0
818     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
819     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
820     *
821     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
822     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
823     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
824     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
825     */
826}
827
828/*
829 * subtract_u64 sets *result = *result - v and *carry to one if the
830 * subtraction underflowed.
831 */
832static void subtract_u64(u64 *result, u64 *carry, u64 v)
833{
834    uint128_t r = *result;
835    r -= v;
836    *carry = (r >> 64) & 1;
837    *result = (u64)r;
838}
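/*
 * In felem_contract below, subtract_u64 calls are chained so that the borrow
 * out of each 64-bit word feeds into the next, giving a full 256-bit
 * conditional subtraction of kPrime.
 */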
839
840/*
841 * felem_contract converts |in| to its unique, minimal representation. On
842 * entry: in[i] < 2^109
843 */
844static void felem_contract(smallfelem out, const felem in)
845{
846    unsigned i;
847    u64 all_equal_so_far = 0, result = 0, carry;
848
849    felem_shrink(out, in);
850    /* out is now minimal except that the value might be > p */
851
852    all_equal_so_far--;
853    /*
854     * We are doing a constant time test if out >= kPrime. We need to compare
855     * each u64, from most-significant to least significant. For each one, if
856     * all words so far have been equal (all_equal_so_far is all ones) then a
857     * non-equal result is the answer. Otherwise we continue.
858     */
859    for (i = 3; i < 4; i--) {
860        u64 equal;
861        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
862        /*
863         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
864         * will all be set.
865         */
866        result |= all_equal_so_far & ((u64)(a >> 64));
867
868        /*
869         * if kPrime[i] == out[i] then |equal| will be all zeros and the
870         * decrement will make it all ones.
871         */
872        equal = kPrime[i] ^ out[i];
873        equal--;
874        equal &= equal << 32;
875        equal &= equal << 16;
876        equal &= equal << 8;
877        equal &= equal << 4;
878        equal &= equal << 2;
879        equal &= equal << 1;
880        equal = 0 - (equal >> 63);
881
882        all_equal_so_far &= equal;
883    }
884
885    /*
886     * if all_equal_so_far is still all ones then the two values are equal
887     * and so out >= kPrime is true.
888     */
889    result |= all_equal_so_far;
890
891    /* if out >= kPrime then we subtract kPrime. */
892    subtract_u64(&out[0], &carry, result & kPrime[0]);
893    subtract_u64(&out[1], &carry, carry);
894    subtract_u64(&out[2], &carry, carry);
895    subtract_u64(&out[3], &carry, carry);
896
897    subtract_u64(&out[1], &carry, result & kPrime[1]);
898    subtract_u64(&out[2], &carry, carry);
899    subtract_u64(&out[3], &carry, carry);
900
901    subtract_u64(&out[2], &carry, result & kPrime[2]);
902    subtract_u64(&out[3], &carry, carry);
903
904    subtract_u64(&out[3], &carry, result & kPrime[3]);
905}
906
907static void smallfelem_square_contract(smallfelem out, const smallfelem in)
908{
909    longfelem longtmp;
910    felem tmp;
911
912    smallfelem_square(longtmp, in);
913    felem_reduce(tmp, longtmp);
914    felem_contract(out, tmp);
915}
916
917static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
918                                    const smallfelem in2)
919{
920    longfelem longtmp;
921    felem tmp;
922
923    smallfelem_mul(longtmp, in1, in2);
924    felem_reduce(tmp, longtmp);
925    felem_contract(out, tmp);
926}
927
928/*-
929 * smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p)
930 * and 0 otherwise.
931 * On entry:
932 *   small[i] < 2^64
933 */
934static limb smallfelem_is_zero(const smallfelem small)
935{
936    limb result;
937    u64 is_p;
938
939    u64 is_zero = small[0] | small[1] | small[2] | small[3];
940    is_zero--;
941    is_zero &= is_zero << 32;
942    is_zero &= is_zero << 16;
943    is_zero &= is_zero << 8;
944    is_zero &= is_zero << 4;
945    is_zero &= is_zero << 2;
946    is_zero &= is_zero << 1;
947    is_zero = 0 - (is_zero >> 63);
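    /*
     * is_zero is now all ones if the four limbs were all zero and all zeros
     * otherwise; the same bit-folding trick is reused for is_p below.
     */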
948
949    is_p = (small[0] ^ kPrime[0]) |
950        (small[1] ^ kPrime[1]) |
951        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
952    is_p--;
953    is_p &= is_p << 32;
954    is_p &= is_p << 16;
955    is_p &= is_p << 8;
956    is_p &= is_p << 4;
957    is_p &= is_p << 2;
958    is_p &= is_p << 1;
959    is_p = 0 - (is_p >> 63);
960
961    is_zero |= is_p;
962
963    result = is_zero;
964    result |= ((limb) is_zero) << 64;
965    return result;
966}
967
968static int smallfelem_is_zero_int(const void *small)
969{
970    return (int)(smallfelem_is_zero(small) & ((limb) 1));
971}
972
973/*-
974 * felem_inv calculates |out| = |in|^{-1}
975 *
976 * Based on Fermat's Little Theorem:
977 *   a^p = a (mod p)
978 *   a^{p-1} = 1 (mod p)
979 *   a^{p-2} = a^{-1} (mod p)
980 */
981static void felem_inv(felem out, const felem in)
982{
983    felem ftmp, ftmp2;
984    /* each e_I will hold |in|^{2^I - 1} */
985    felem e2, e4, e8, e16, e32, e64;
986    longfelem tmp;
987    unsigned i;
988
989    felem_square(tmp, in);
990    felem_reduce(ftmp, tmp);    /* 2^1 */
991    felem_mul(tmp, in, ftmp);
992    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
993    felem_assign(e2, ftmp);
994    felem_square(tmp, ftmp);
995    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
996    felem_square(tmp, ftmp);
997    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
998    felem_mul(tmp, ftmp, e2);
999    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1000    felem_assign(e4, ftmp);
1001    felem_square(tmp, ftmp);
1002    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1003    felem_square(tmp, ftmp);
1004    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1005    felem_square(tmp, ftmp);
1006    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1007    felem_square(tmp, ftmp);
1008    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1009    felem_mul(tmp, ftmp, e4);
1010    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1011    felem_assign(e8, ftmp);
1012    for (i = 0; i < 8; i++) {
1013        felem_square(tmp, ftmp);
1014        felem_reduce(ftmp, tmp);
1015    }                           /* 2^16 - 2^8 */
1016    felem_mul(tmp, ftmp, e8);
1017    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1018    felem_assign(e16, ftmp);
1019    for (i = 0; i < 16; i++) {
1020        felem_square(tmp, ftmp);
1021        felem_reduce(ftmp, tmp);
1022    }                           /* 2^32 - 2^16 */
1023    felem_mul(tmp, ftmp, e16);
1024    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1025    felem_assign(e32, ftmp);
1026    for (i = 0; i < 32; i++) {
1027        felem_square(tmp, ftmp);
1028        felem_reduce(ftmp, tmp);
1029    }                           /* 2^64 - 2^32 */
1030    felem_assign(e64, ftmp);
1031    felem_mul(tmp, ftmp, in);
1032    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1033    for (i = 0; i < 192; i++) {
1034        felem_square(tmp, ftmp);
1035        felem_reduce(ftmp, tmp);
1036    }                           /* 2^256 - 2^224 + 2^192 */
1037
1038    felem_mul(tmp, e64, e32);
1039    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1040    for (i = 0; i < 16; i++) {
1041        felem_square(tmp, ftmp2);
1042        felem_reduce(ftmp2, tmp);
1043    }                           /* 2^80 - 2^16 */
1044    felem_mul(tmp, ftmp2, e16);
1045    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1046    for (i = 0; i < 8; i++) {
1047        felem_square(tmp, ftmp2);
1048        felem_reduce(ftmp2, tmp);
1049    }                           /* 2^88 - 2^8 */
1050    felem_mul(tmp, ftmp2, e8);
1051    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1052    for (i = 0; i < 4; i++) {
1053        felem_square(tmp, ftmp2);
1054        felem_reduce(ftmp2, tmp);
1055    }                           /* 2^92 - 2^4 */
1056    felem_mul(tmp, ftmp2, e4);
1057    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1058    felem_square(tmp, ftmp2);
1059    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1060    felem_square(tmp, ftmp2);
1061    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1062    felem_mul(tmp, ftmp2, e2);
1063    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1064    felem_square(tmp, ftmp2);
1065    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1066    felem_square(tmp, ftmp2);
1067    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1068    felem_mul(tmp, ftmp2, in);
1069    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1070
1071    felem_mul(tmp, ftmp2, ftmp);
1072    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1073}
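/*
 * The running-exponent comments above end at
 *   2^256 - 2^224 + 2^192 + 2^96 - 3 = p - 2,
 * so felem_inv indeed computes in^(p-2) = in^-1 (mod p).
 */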
1074
1075static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1076{
1077    felem tmp;
1078
1079    smallfelem_expand(tmp, in);
1080    felem_inv(tmp, tmp);
1081    felem_contract(out, tmp);
1082}
1083
1084/*-
1085 * Group operations
1086 * ----------------
1087 *
1088 * Building on top of the field operations we have the operations on the
1089 * elliptic curve group itself. Points on the curve are represented in Jacobian
1090 * coordinates
1091 */
1092
1093/*-
1094 * point_double calculates 2*(x_in, y_in, z_in)
1095 *
1096 * The method is taken from:
1097 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1098 *
1099 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1100 * while x_out == y_in is not (maybe this works, but it's not tested).
1101 */
1102static void
1103point_double(felem x_out, felem y_out, felem z_out,
1104             const felem x_in, const felem y_in, const felem z_in)
1105{
1106    longfelem tmp, tmp2;
1107    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1108    smallfelem small1, small2;
1109
1110    felem_assign(ftmp, x_in);
1111    /* ftmp[i] < 2^106 */
1112    felem_assign(ftmp2, x_in);
1113    /* ftmp2[i] < 2^106 */
1114
1115    /* delta = z^2 */
1116    felem_square(tmp, z_in);
1117    felem_reduce(delta, tmp);
1118    /* delta[i] < 2^101 */
1119
1120    /* gamma = y^2 */
1121    felem_square(tmp, y_in);
1122    felem_reduce(gamma, tmp);
1123    /* gamma[i] < 2^101 */
1124    felem_shrink(small1, gamma);
1125
1126    /* beta = x*gamma */
1127    felem_small_mul(tmp, small1, x_in);
1128    felem_reduce(beta, tmp);
1129    /* beta[i] < 2^101 */
1130
1131    /* alpha = 3*(x-delta)*(x+delta) */
1132    felem_diff(ftmp, delta);
1133    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1134    felem_sum(ftmp2, delta);
1135    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1136    felem_scalar(ftmp2, 3);
1137    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1138    felem_mul(tmp, ftmp, ftmp2);
1139    felem_reduce(alpha, tmp);
1140    /* alpha[i] < 2^101 */
1141    felem_shrink(small2, alpha);
1142
1143    /* x' = alpha^2 - 8*beta */
1144    smallfelem_square(tmp, small2);
1145    felem_reduce(x_out, tmp);
1146    felem_assign(ftmp, beta);
1147    felem_scalar(ftmp, 8);
1148    /* ftmp[i] < 8 * 2^101 = 2^104 */
1149    felem_diff(x_out, ftmp);
1150    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1151
1152    /* z' = (y + z)^2 - gamma - delta */
1153    felem_sum(delta, gamma);
1154    /* delta[i] < 2^101 + 2^101 = 2^102 */
1155    felem_assign(ftmp, y_in);
1156    felem_sum(ftmp, z_in);
1157    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1158    felem_square(tmp, ftmp);
1159    felem_reduce(z_out, tmp);
1160    felem_diff(z_out, delta);
1161    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1162
1163    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1164    felem_scalar(beta, 4);
1165    /* beta[i] < 4 * 2^101 = 2^103 */
1166    felem_diff_zero107(beta, x_out);
1167    /* beta[i] < 2^107 + 2^103 < 2^108 */
1168    felem_small_mul(tmp, small2, beta);
1169    /* tmp[i] < 7 * 2^64 < 2^67 */
1170    smallfelem_square(tmp2, small1);
1171    /* tmp2[i] < 7 * 2^64 */
1172    longfelem_scalar(tmp2, 8);
1173    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1174    longfelem_diff(tmp, tmp2);
1175    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1176    felem_reduce_zero105(y_out, tmp);
1177    /* y_out[i] < 2^106 */
1178}
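/*
 * Note on the formula above: because the curve parameter is a = -3,
 *   alpha = 3*(x - delta)*(x + delta) = 3*x^2 - 3*z^4 = 3*x^2 + a*z^4,
 * matching the generic Jacobian doubling formula from the EFD reference.
 */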
1179
1180/*
1181 * point_double_small is the same as point_double, except that it operates on
1182 * smallfelems
1183 */
1184static void
1185point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1186                   const smallfelem x_in, const smallfelem y_in,
1187                   const smallfelem z_in)
1188{
1189    felem felem_x_out, felem_y_out, felem_z_out;
1190    felem felem_x_in, felem_y_in, felem_z_in;
1191
1192    smallfelem_expand(felem_x_in, x_in);
1193    smallfelem_expand(felem_y_in, y_in);
1194    smallfelem_expand(felem_z_in, z_in);
1195    point_double(felem_x_out, felem_y_out, felem_z_out,
1196                 felem_x_in, felem_y_in, felem_z_in);
1197    felem_shrink(x_out, felem_x_out);
1198    felem_shrink(y_out, felem_y_out);
1199    felem_shrink(z_out, felem_z_out);
1200}
1201
1202/* copy_conditional copies in to out iff mask is all ones. */
1203static void copy_conditional(felem out, const felem in, limb mask)
1204{
1205    unsigned i;
1206    for (i = 0; i < NLIMBS; ++i) {
1207        const limb tmp = mask & (in[i] ^ out[i]);
1208        out[i] ^= tmp;
1209    }
1210}
1211
1212/* copy_small_conditional copies in to out iff mask is all ones. */
1213static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1214{
1215    unsigned i;
1216    const u64 mask64 = mask;
1217    for (i = 0; i < NLIMBS; ++i) {
1218        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1219    }
1220}
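/*
 * Both helpers above expect |mask| to be either all ones or all zeros, as
 * produced by smallfelem_is_zero() or by the (((limb) sign) - 1) trick in
 * batch_mul.
 */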
1221
1222/*-
1223 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1224 *
1225 * The method is taken from:
1226 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1227 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1228 *
1229 * This function includes a branch for checking whether the two input points
1230 * are equal (while not equal to the point at infinity). This case never
1231 * happens during single point multiplication, so there is no timing leak for
1232 * ECDH or ECDSA signing.
1233 */
1234static void point_add(felem x3, felem y3, felem z3,
1235                      const felem x1, const felem y1, const felem z1,
1236                      const int mixed, const smallfelem x2,
1237                      const smallfelem y2, const smallfelem z2)
1238{
1239    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1240    longfelem tmp, tmp2;
1241    smallfelem small1, small2, small3, small4, small5;
1242    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1243
1244    felem_shrink(small3, z1);
1245
1246    z1_is_zero = smallfelem_is_zero(small3);
1247    z2_is_zero = smallfelem_is_zero(z2);
1248
1249    /* ftmp = z1z1 = z1**2 */
1250    smallfelem_square(tmp, small3);
1251    felem_reduce(ftmp, tmp);
1252    /* ftmp[i] < 2^101 */
1253    felem_shrink(small1, ftmp);
1254
1255    if (!mixed) {
1256        /* ftmp2 = z2z2 = z2**2 */
1257        smallfelem_square(tmp, z2);
1258        felem_reduce(ftmp2, tmp);
1259        /* ftmp2[i] < 2^101 */
1260        felem_shrink(small2, ftmp2);
1261
1262        felem_shrink(small5, x1);
1263
1264        /* u1 = ftmp3 = x1*z2z2 */
1265        smallfelem_mul(tmp, small5, small2);
1266        felem_reduce(ftmp3, tmp);
1267        /* ftmp3[i] < 2^101 */
1268
1269        /* ftmp5 = z1 + z2 */
1270        felem_assign(ftmp5, z1);
1271        felem_small_sum(ftmp5, z2);
1272        /* ftmp5[i] < 2^107 */
1273
1274        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1275        felem_square(tmp, ftmp5);
1276        felem_reduce(ftmp5, tmp);
1277        /* ftmp2 = z2z2 + z1z1 */
1278        felem_sum(ftmp2, ftmp);
1279        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1280        felem_diff(ftmp5, ftmp2);
1281        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1282
1283        /* ftmp2 = z2 * z2z2 */
1284        smallfelem_mul(tmp, small2, z2);
1285        felem_reduce(ftmp2, tmp);
1286
1287        /* s1 = ftmp2 = y1 * z2**3 */
1288        felem_mul(tmp, y1, ftmp2);
1289        felem_reduce(ftmp6, tmp);
1290        /* ftmp6[i] < 2^101 */
1291    } else {
1292        /*
1293         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1294         */
1295
1296        /* u1 = ftmp3 = x1*z2z2 */
1297        felem_assign(ftmp3, x1);
1298        /* ftmp3[i] < 2^106 */
1299
1300        /* ftmp5 = 2z1z2 */
1301        felem_assign(ftmp5, z1);
1302        felem_scalar(ftmp5, 2);
1303        /* ftmp5[i] < 2*2^106 = 2^107 */
1304
1305        /* s1 = ftmp2 = y1 * z2**3 */
1306        felem_assign(ftmp6, y1);
1307        /* ftmp6[i] < 2^106 */
1308    }
1309
1310    /* u2 = x2*z1z1 */
1311    smallfelem_mul(tmp, x2, small1);
1312    felem_reduce(ftmp4, tmp);
1313
1314    /* h = ftmp4 = u2 - u1 */
1315    felem_diff_zero107(ftmp4, ftmp3);
1316    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1317    felem_shrink(small4, ftmp4);
1318
1319    x_equal = smallfelem_is_zero(small4);
1320
1321    /* z_out = ftmp5 * h */
1322    felem_small_mul(tmp, small4, ftmp5);
1323    felem_reduce(z_out, tmp);
1324    /* z_out[i] < 2^101 */
1325
1326    /* ftmp = z1 * z1z1 */
1327    smallfelem_mul(tmp, small1, small3);
1328    felem_reduce(ftmp, tmp);
1329
1330    /* s2 = tmp = y2 * z1**3 */
1331    felem_small_mul(tmp, y2, ftmp);
1332    felem_reduce(ftmp5, tmp);
1333
1334    /* r = ftmp5 = (s2 - s1)*2 */
1335    felem_diff_zero107(ftmp5, ftmp6);
1336    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1337    felem_scalar(ftmp5, 2);
1338    /* ftmp5[i] < 2^109 */
1339    felem_shrink(small1, ftmp5);
1340    y_equal = smallfelem_is_zero(small1);
1341
1342    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1343        point_double(x3, y3, z3, x1, y1, z1);
1344        return;
1345    }
1346
1347    /* I = ftmp = (2h)**2 */
1348    felem_assign(ftmp, ftmp4);
1349    felem_scalar(ftmp, 2);
1350    /* ftmp[i] < 2*2^108 = 2^109 */
1351    felem_square(tmp, ftmp);
1352    felem_reduce(ftmp, tmp);
1353
1354    /* J = ftmp2 = h * I */
1355    felem_mul(tmp, ftmp4, ftmp);
1356    felem_reduce(ftmp2, tmp);
1357
1358    /* V = ftmp4 = U1 * I */
1359    felem_mul(tmp, ftmp3, ftmp);
1360    felem_reduce(ftmp4, tmp);
1361
1362    /* x_out = r**2 - J - 2V */
1363    smallfelem_square(tmp, small1);
1364    felem_reduce(x_out, tmp);
1365    felem_assign(ftmp3, ftmp4);
1366    felem_scalar(ftmp4, 2);
1367    felem_sum(ftmp4, ftmp2);
1368    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1369    felem_diff(x_out, ftmp4);
1370    /* x_out[i] < 2^105 + 2^101 */
1371
1372    /* y_out = r(V-x_out) - 2 * s1 * J */
1373    felem_diff_zero107(ftmp3, x_out);
1374    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1375    felem_small_mul(tmp, small1, ftmp3);
1376    felem_mul(tmp2, ftmp6, ftmp2);
1377    longfelem_scalar(tmp2, 2);
1378    /* tmp2[i] < 2*2^67 = 2^68 */
1379    longfelem_diff(tmp, tmp2);
1380    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1381    felem_reduce_zero105(y_out, tmp);
1382    /* y_out[i] < 2^106 */
1383
1384    copy_small_conditional(x_out, x2, z1_is_zero);
1385    copy_conditional(x_out, x1, z2_is_zero);
1386    copy_small_conditional(y_out, y2, z1_is_zero);
1387    copy_conditional(y_out, y1, z2_is_zero);
1388    copy_small_conditional(z_out, z2, z1_is_zero);
1389    copy_conditional(z_out, z1, z2_is_zero);
1390    felem_assign(x3, x_out);
1391    felem_assign(y3, y_out);
1392    felem_assign(z3, z_out);
1393}
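/*
 * The copy_*conditional calls at the end of point_add handle the cases
 * z1 == 0 and z2 == 0 (an input is the point at infinity) by returning the
 * other input unchanged, without branching on secret data.
 */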
1394
1395/*
1396 * point_add_small is the same as point_add, except that it operates on
1397 * smallfelems
1398 */
1399static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1400                            smallfelem x1, smallfelem y1, smallfelem z1,
1401                            smallfelem x2, smallfelem y2, smallfelem z2)
1402{
1403    felem felem_x3, felem_y3, felem_z3;
1404    felem felem_x1, felem_y1, felem_z1;
1405    smallfelem_expand(felem_x1, x1);
1406    smallfelem_expand(felem_y1, y1);
1407    smallfelem_expand(felem_z1, z1);
1408    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1409              x2, y2, z2);
1410    felem_shrink(x3, felem_x3);
1411    felem_shrink(y3, felem_y3);
1412    felem_shrink(z3, felem_z3);
1413}
1414
1415/*-
1416 * Base point pre computation
1417 * --------------------------
1418 *
1419 * Two different sorts of precomputed tables are used in the following code.
1420 * Each contains various points on the curve, where each point is three field
1421 * elements (x, y, z).
1422 *
1423 * For the base point table, z is usually 1 (0 for the point at infinity).
1424 * This table has 2 * 16 elements, starting with the following:
1425 * index | bits    | point
1426 * ------+---------+------------------------------
1427 *     0 | 0 0 0 0 | 0G
1428 *     1 | 0 0 0 1 | 1G
1429 *     2 | 0 0 1 0 | 2^64G
1430 *     3 | 0 0 1 1 | (2^64 + 1)G
1431 *     4 | 0 1 0 0 | 2^128G
1432 *     5 | 0 1 0 1 | (2^128 + 1)G
1433 *     6 | 0 1 1 0 | (2^128 + 2^64)G
1434 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1435 *     8 | 1 0 0 0 | 2^192G
1436 *     9 | 1 0 0 1 | (2^192 + 1)G
1437 *    10 | 1 0 1 0 | (2^192 + 2^64)G
1438 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1439 *    12 | 1 1 0 0 | (2^192 + 2^128)G
1440 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1441 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1442 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1443 * followed by a copy of this with each element multiplied by 2^32.
1444 *
1445 * The reason for this is so that we can clock bits into four different
1446 * locations when doing simple scalar multiplies against the base point,
1447 * and then another four locations using the second 16 elements.
1448 *
1449 * Tables for other points have table[i] = iG for i in 0 .. 16. */
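/*
 * For example, batch_mul below builds, in round i (0 <= i <= 31), the 4-bit
 * index from bits i+192, i+128, i+64 and i of g_scalar (msb to lsb) to select
 * from the first sub-table, and from bits i+224, i+160, i+96 and i+32 to
 * select from the second, 2^32-multiplied sub-table.
 */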
1450
1451/* gmul is the table of precomputed base points */
1452static const smallfelem gmul[2][16][3] = {
1453    {{{0, 0, 0, 0},
1454      {0, 0, 0, 0},
1455      {0, 0, 0, 0}},
1456     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1457       0x6b17d1f2e12c4247},
1458      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1459       0x4fe342e2fe1a7f9b},
1460      {1, 0, 0, 0}},
1461     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1462       0x0fa822bc2811aaa5},
1463      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1464       0xbff44ae8f5dba80d},
1465      {1, 0, 0, 0}},
1466     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1467       0x300a4bbc89d6726f},
1468      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1469       0x72aac7e0d09b4644},
1470      {1, 0, 0, 0}},
1471     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1472       0x447d739beedb5e67},
1473      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1474       0x2d4825ab834131ee},
1475      {1, 0, 0, 0}},
1476     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1477       0xef9519328a9c72ff},
1478      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1479       0x611e9fc37dbb2c9b},
1480      {1, 0, 0, 0}},
1481     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1482       0x550663797b51f5d8},
1483      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1484       0x157164848aecb851},
1485      {1, 0, 0, 0}},
1486     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1487       0xeb5d7745b21141ea},
1488      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1489       0xeafd72ebdbecc17b},
1490      {1, 0, 0, 0}},
1491     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1492       0xa6d39677a7849276},
1493      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1494       0x674f84749b0b8816},
1495      {1, 0, 0, 0}},
1496     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1497       0x4e769e7672c9ddad},
1498      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1499       0x42b99082de830663},
1500      {1, 0, 0, 0}},
1501     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1502       0x78878ef61c6ce04d},
1503      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1504       0xb6cb3f5d7b72c321},
1505      {1, 0, 0, 0}},
1506     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1507       0x0c88bc4d716b1287},
1508      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1509       0xdd5ddea3f3901dc6},
1510      {1, 0, 0, 0}},
1511     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1512       0x68f344af6b317466},
1513      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1514       0x31b9c405f8540a20},
1515      {1, 0, 0, 0}},
1516     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1517       0x4052bf4b6f461db9},
1518      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1519       0xfecf4d5190b0fc61},
1520      {1, 0, 0, 0}},
1521     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1522       0x1eddbae2c802e41a},
1523      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1524       0x43104d86560ebcfc},
1525      {1, 0, 0, 0}},
1526     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1527       0xb48e26b484f7a21c},
1528      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1529       0xfac015404d4d3dab},
1530      {1, 0, 0, 0}}},
1531    {{{0, 0, 0, 0},
1532      {0, 0, 0, 0},
1533      {0, 0, 0, 0}},
1534     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1535       0x7fe36b40af22af89},
1536      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1537       0xe697d45825b63624},
1538      {1, 0, 0, 0}},
1539     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1540       0x4a5b506612a677a6},
1541      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1542       0xeb13461ceac089f1},
1543      {1, 0, 0, 0}},
1544     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1545       0x0781b8291c6a220a},
1546      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1547       0x690cde8df0151593},
1548      {1, 0, 0, 0}},
1549     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1550       0x8a535f566ec73617},
1551      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1552       0x0455c08468b08bd7},
1553      {1, 0, 0, 0}},
1554     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1555       0x06bada7ab77f8276},
1556      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1557       0x5b476dfd0e6cb18a},
1558      {1, 0, 0, 0}},
1559     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1560       0x3e29864e8a2ec908},
1561      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1562       0x239b90ea3dc31e7e},
1563      {1, 0, 0, 0}},
1564     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1565       0x820f4dd949f72ff7},
1566      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1567       0x140406ec783a05ec},
1568      {1, 0, 0, 0}},
1569     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1570       0x68f6b8542783dfee},
1571      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1572       0xcbe1feba92e40ce6},
1573      {1, 0, 0, 0}},
1574     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1575       0xd0b2f94d2f420109},
1576      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1577       0x971459828b0719e5},
1578      {1, 0, 0, 0}},
1579     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1580       0x961610004a866aba},
1581      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1582       0x7acb9fadcee75e44},
1583      {1, 0, 0, 0}},
1584     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1585       0x24eb9acca333bf5b},
1586      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1587       0x69f891c5acd079cc},
1588      {1, 0, 0, 0}},
1589     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1590       0xe51f547c5972a107},
1591      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1592       0x1c309a2b25bb1387},
1593      {1, 0, 0, 0}},
1594     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1595       0x20b87b8aa2c4e503},
1596      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1597       0xf5c6fa49919776be},
1598      {1, 0, 0, 0}},
1599     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1600       0x1ed7d1b9332010b9},
1601      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1602       0x3a2b03f03217257a},
1603      {1, 0, 0, 0}},
1604     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1605       0x15fee545c78dd9f6},
1606      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1607       0x4ab5b6b2b8753f81},
1608      {1, 0, 0, 0}}}
1609};
1610
1611/*
1612 * select_point selects the |idx|th point from a precomputation table and
1613 * copies it to out.
1614 */
1615static void select_point(const u64 idx, unsigned int size,
1616                         const smallfelem pre_comp[16][3], smallfelem out[3])
1617{
1618    unsigned i, j;
1619    u64 *outlimbs = &out[0][0];
1620    memset(outlimbs, 0, 3 * sizeof(smallfelem));
1621
1622    for (i = 0; i < size; i++) {
1623        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1624        u64 mask = i ^ idx;
1625        mask |= mask >> 4;
1626        mask |= mask >> 2;
1627        mask |= mask >> 1;
1628        mask &= 1;
1629        mask--;
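        /* mask is now all ones if i == idx and all zeros otherwise */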
1630        for (j = 0; j < NLIMBS * 3; j++)
1631            outlimbs[j] |= inlimbs[j] & mask;
1632    }
1633}
1634
1635/* get_bit returns the |i|th bit in |in| */
1636static char get_bit(const felem_bytearray in, int i)
1637{
1638    if ((i < 0) || (i >= 256))
1639        return 0;
1640    return (in[i >> 3] >> (i & 7)) & 1;
1641}
1642
1643/*
1644 * Interleaved point multiplication using precomputed point multiples: The
1645 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1646 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1647 * generator, using certain (large) precomputed multiples in g_pre_comp.
1648 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1649 */
1650static void batch_mul(felem x_out, felem y_out, felem z_out,
1651                      const felem_bytearray scalars[],
1652                      const unsigned num_points, const u8 *g_scalar,
1653                      const int mixed, const smallfelem pre_comp[][17][3],
1654                      const smallfelem g_pre_comp[2][16][3])
1655{
1656    int i, skip;
1657    unsigned num, gen_mul = (g_scalar != NULL);
1658    felem nq[3], ftmp;
1659    smallfelem tmp[3];
1660    u64 bits;
1661    u8 sign, digit;
1662
1663    /* set nq to the point at infinity */
1664    memset(nq, 0, 3 * sizeof(felem));
1665
1666    /*
1667     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1668     * of the generator (two in each of the last 32 rounds) and additions of
1669     * other points multiples (every 5th round).
1670     */
1671    skip = 1;                   /* save two point operations in the first
1672                                 * round */
1673    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1674        /* double */
1675        if (!skip)
1676            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1677
1678        /* add multiples of the generator */
1679        if (gen_mul && (i <= 31)) {
1680            /* first, look 32 bits upwards */
1681            bits = get_bit(g_scalar, i + 224) << 3;
1682            bits |= get_bit(g_scalar, i + 160) << 2;
1683            bits |= get_bit(g_scalar, i + 96) << 1;
1684            bits |= get_bit(g_scalar, i + 32);
1685            /* select the point to add, in constant time */
1686            select_point(bits, 16, g_pre_comp[1], tmp);
1687
1688            if (!skip) {
1689                /* Arg 1 below is for "mixed" */
1690                point_add(nq[0], nq[1], nq[2],
1691                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1692            } else {
1693                smallfelem_expand(nq[0], tmp[0]);
1694                smallfelem_expand(nq[1], tmp[1]);
1695                smallfelem_expand(nq[2], tmp[2]);
1696                skip = 0;
1697            }
1698
1699            /* second, look at the current position */
1700            bits = get_bit(g_scalar, i + 192) << 3;
1701            bits |= get_bit(g_scalar, i + 128) << 2;
1702            bits |= get_bit(g_scalar, i + 64) << 1;
1703            bits |= get_bit(g_scalar, i);
1704            /* select the point to add, in constant time */
1705            select_point(bits, 16, g_pre_comp[0], tmp);
1706            /* Arg 1 below is for "mixed" */
1707            point_add(nq[0], nq[1], nq[2],
1708                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1709        }
1710
1711        /* do other additions every 5 doublings */
1712        if (num_points && (i % 5 == 0)) {
1713            /* loop over all scalars */
1714            for (num = 0; num < num_points; ++num) {
1715                bits = get_bit(scalars[num], i + 4) << 5;
1716                bits |= get_bit(scalars[num], i + 3) << 4;
1717                bits |= get_bit(scalars[num], i + 2) << 3;
1718                bits |= get_bit(scalars[num], i + 1) << 2;
1719                bits |= get_bit(scalars[num], i) << 1;
1720                bits |= get_bit(scalars[num], i - 1);
1721                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
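                /*
                 * The recoding turns the 6-bit window in |bits| into a sign
                 * bit and a digit in {0, ..., 16}; this is why pre_comp[]
                 * needs only 17 entries, with subtraction handled by
                 * conditionally negating Y below.
                 */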
1722
1723                /*
1724                 * select the point to add or subtract, in constant time
1725                 */
1726                select_point(digit, 17, pre_comp[num], tmp);
1727                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1728                                               * point */
1729                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1730                felem_contract(tmp[1], ftmp);
1731
1732                if (!skip) {
1733                    point_add(nq[0], nq[1], nq[2],
1734                              nq[0], nq[1], nq[2],
1735                              mixed, tmp[0], tmp[1], tmp[2]);
1736                } else {
1737                    smallfelem_expand(nq[0], tmp[0]);
1738                    smallfelem_expand(nq[1], tmp[1]);
1739                    smallfelem_expand(nq[2], tmp[2]);
1740                    skip = 0;
1741                }
1742            }
1743        }
1744    }
1745    felem_assign(x_out, nq[0]);
1746    felem_assign(y_out, nq[1]);
1747    felem_assign(z_out, nq[2]);
1748}
1749
1750/* Precomputation for the group generator. */
1751typedef struct {
1752    smallfelem g_pre_comp[2][16][3];
1753    int references;
1754} NISTP256_PRE_COMP;
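/*-
 * Layout of g_pre_comp (as filled in by ec_GFp_nistp256_precompute_mult
 * below, and matching the built-in gmul table used by default): for an
 * index k with bits b3 b2 b1 b0,
 *
 *     g_pre_comp[0][k] = b0*G + b1*2^64*G + b2*2^128*G + b3*2^192*G
 *     g_pre_comp[1][k] = 2^32 * g_pre_comp[0][k]
 *
 * where index 0 holds the point at infinity. The non-infinity entries are
 * stored in affine form (Z = 1), which is what allows the "mixed" additions
 * in batch_mul above.
 */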
1755
1756const EC_METHOD *EC_GFp_nistp256_method(void)
1757{
1758    static const EC_METHOD ret = {
1759        EC_FLAGS_DEFAULT_OCT,
1760        NID_X9_62_prime_field,
1761        ec_GFp_nistp256_group_init,
1762        ec_GFp_simple_group_finish,
1763        ec_GFp_simple_group_clear_finish,
1764        ec_GFp_nist_group_copy,
1765        ec_GFp_nistp256_group_set_curve,
1766        ec_GFp_simple_group_get_curve,
1767        ec_GFp_simple_group_get_degree,
1768        ec_GFp_simple_group_check_discriminant,
1769        ec_GFp_simple_point_init,
1770        ec_GFp_simple_point_finish,
1771        ec_GFp_simple_point_clear_finish,
1772        ec_GFp_simple_point_copy,
1773        ec_GFp_simple_point_set_to_infinity,
1774        ec_GFp_simple_set_Jprojective_coordinates_GFp,
1775        ec_GFp_simple_get_Jprojective_coordinates_GFp,
1776        ec_GFp_simple_point_set_affine_coordinates,
1777        ec_GFp_nistp256_point_get_affine_coordinates,
1778        0 /* point_set_compressed_coordinates */ ,
1779        0 /* point2oct */ ,
1780        0 /* oct2point */ ,
1781        ec_GFp_simple_add,
1782        ec_GFp_simple_dbl,
1783        ec_GFp_simple_invert,
1784        ec_GFp_simple_is_at_infinity,
1785        ec_GFp_simple_is_on_curve,
1786        ec_GFp_simple_cmp,
1787        ec_GFp_simple_make_affine,
1788        ec_GFp_simple_points_make_affine,
1789        ec_GFp_nistp256_points_mul,
1790        ec_GFp_nistp256_precompute_mult,
1791        ec_GFp_nistp256_have_precompute_mult,
1792        ec_GFp_nist_field_mul,
1793        ec_GFp_nist_field_sqr,
1794        0 /* field_div */ ,
1795        0 /* field_encode */ ,
1796        0 /* field_decode */ ,
1797        0                       /* field_set_to_one */
1798    };
1799
1800    return &ret;
1801}
1802
1803/******************************************************************************/
1804/*
1805 * FUNCTIONS TO MANAGE PRECOMPUTATION
1806 */
1807
1808static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1809{
1810    NISTP256_PRE_COMP *ret = NULL;
1811    ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof(*ret));
1812    if (!ret) {
1813        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1814        return ret;
1815    }
1816    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1817    ret->references = 1;
1818    return ret;
1819}
1820
1821static void *nistp256_pre_comp_dup(void *src_)
1822{
1823    NISTP256_PRE_COMP *src = src_;
1824
1825    /* no need to actually copy, these objects never change! */
1826    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1827
1828    return src_;
1829}
1830
1831static void nistp256_pre_comp_free(void *pre_)
1832{
1833    int i;
1834    NISTP256_PRE_COMP *pre = pre_;
1835
1836    if (!pre)
1837        return;
1838
1839    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1840    if (i > 0)
1841        return;
1842
1843    OPENSSL_free(pre);
1844}
1845
1846static void nistp256_pre_comp_clear_free(void *pre_)
1847{
1848    int i;
1849    NISTP256_PRE_COMP *pre = pre_;
1850
1851    if (!pre)
1852        return;
1853
1854    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1855    if (i > 0)
1856        return;
1857
1858    OPENSSL_cleanse(pre, sizeof(*pre));
1859    OPENSSL_free(pre);
1860}
1861
1862/******************************************************************************/
1863/*
1864 * OPENSSL EC_METHOD FUNCTIONS
1865 */
1866
1867int ec_GFp_nistp256_group_init(EC_GROUP *group)
1868{
1869    int ret;
1870    ret = ec_GFp_simple_group_init(group);
1871    group->a_is_minus3 = 1;
1872    return ret;
1873}
1874
1875int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1876                                    const BIGNUM *a, const BIGNUM *b,
1877                                    BN_CTX *ctx)
1878{
1879    int ret = 0;
1880    BN_CTX *new_ctx = NULL;
1881    BIGNUM *curve_p, *curve_a, *curve_b;
1882
1883    if (ctx == NULL)
1884        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1885            return 0;
1886    BN_CTX_start(ctx);
1887    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1888        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1889        ((curve_b = BN_CTX_get(ctx)) == NULL))
1890        goto err;
1891    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1892    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1893    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1894    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1895        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1896              EC_R_WRONG_CURVE_PARAMETERS);
1897        goto err;
1898    }
1899    group->field_mod_func = BN_nist_mod_256;
1900    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1901 err:
1902    BN_CTX_end(ctx);
1903    if (new_ctx != NULL)
1904        BN_CTX_free(new_ctx);
1905    return ret;
1906}
1907
1908/*
1909 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1910 * (X/Z^2, Y/Z^3)
1911 */
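/*
 * Only one field inversion is needed: with z2 = Z^-1, squaring gives Z^-2
 * for X' = X/Z^2, and one further multiplication by z2 gives Z^-3 for
 * Y' = Y/Z^3.
 */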
1912int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1913                                                 const EC_POINT *point,
1914                                                 BIGNUM *x, BIGNUM *y,
1915                                                 BN_CTX *ctx)
1916{
1917    felem z1, z2, x_in, y_in;
1918    smallfelem x_out, y_out;
1919    longfelem tmp;
1920
1921    if (EC_POINT_is_at_infinity(group, point)) {
1922        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1923              EC_R_POINT_AT_INFINITY);
1924        return 0;
1925    }
1926    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1927        (!BN_to_felem(z1, &point->Z)))
1928        return 0;
1929    felem_inv(z2, z1);
1930    felem_square(tmp, z2);
1931    felem_reduce(z1, tmp);
1932    felem_mul(tmp, x_in, z1);
1933    felem_reduce(x_in, tmp);
1934    felem_contract(x_out, x_in);
1935    if (x != NULL) {
1936        if (!smallfelem_to_BN(x, x_out)) {
1937            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1938                  ERR_R_BN_LIB);
1939            return 0;
1940        }
1941    }
1942    felem_mul(tmp, z1, z2);
1943    felem_reduce(z1, tmp);
1944    felem_mul(tmp, y_in, z1);
1945    felem_reduce(y_in, tmp);
1946    felem_contract(y_out, y_in);
1947    if (y != NULL) {
1948        if (!smallfelem_to_BN(y, y_out)) {
1949            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1950                  ERR_R_BN_LIB);
1951            return 0;
1952        }
1953    }
1954    return 1;
1955}
1956
1957/* The |points| array below has size |num|; |tmp_smallfelems| has size |num+1| */
1958static void make_points_affine(size_t num, smallfelem points[][3],
1959                               smallfelem tmp_smallfelems[])
1960{
1961    /*
1962     * Runs in constant time, unless an input is the point at infinity (which
1963     * normally shouldn't happen).
1964     */
1965    ec_GFp_nistp_points_make_affine_internal(num,
1966                                             points,
1967                                             sizeof(smallfelem),
1968                                             tmp_smallfelems,
1969                                             (void (*)(void *))smallfelem_one,
1970                                             smallfelem_is_zero_int,
1971                                             (void (*)(void *, const void *))
1972                                             smallfelem_assign,
1973                                             (void (*)(void *, const void *))
1974                                             smallfelem_square_contract,
1975                                             (void (*)
1976                                              (void *, const void *,
1977                                               const void *))
1978                                             smallfelem_mul_contract,
1979                                             (void (*)(void *, const void *))
1980                                             smallfelem_inv_contract,
1981                                             /* nothing to contract */
1982                                             (void (*)(void *, const void *))
1983                                             smallfelem_assign);
1984}
1985
1986/*
1987 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1988 * values. The result is stored in r (r can equal one of the inputs).
1989 */
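/*-
 * This function is installed as the mul() entry of the EC_METHOD above, so
 * (assuming OpenSSL was built with enable-ec_nistp_64_gcc_128 and the group
 * therefore uses EC_GFp_nistp256_method()) an ordinary application-level
 * sequence such as
 *
 *     EC_GROUP *grp = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
 *     EC_POINT *res = EC_POINT_new(grp);
 *     EC_POINT_mul(grp, res, k, NULL, NULL, NULL);   (computes k*G)
 *
 * where k is an arbitrary caller-supplied BIGNUM, would typically end up
 * here with num == 0 and a non-NULL scalar.
 */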
1990int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1991                               const BIGNUM *scalar, size_t num,
1992                               const EC_POINT *points[],
1993                               const BIGNUM *scalars[], BN_CTX *ctx)
1994{
1995    int ret = 0;
1996    int j;
1997    int mixed = 0;
1998    BN_CTX *new_ctx = NULL;
1999    BIGNUM *x, *y, *z, *tmp_scalar;
2000    felem_bytearray g_secret;
2001    felem_bytearray *secrets = NULL;
2002    smallfelem(*pre_comp)[17][3] = NULL;
2003    smallfelem *tmp_smallfelems = NULL;
2004    unsigned i;
2005    int num_bytes;
2006    int have_pre_comp = 0;
2007    size_t num_points = num;
2008    smallfelem x_in, y_in, z_in;
2009    felem x_out, y_out, z_out;
2010    NISTP256_PRE_COMP *pre = NULL;
2011    const smallfelem(*g_pre_comp)[16][3] = NULL;
2012    EC_POINT *generator = NULL;
2013    const EC_POINT *p = NULL;
2014    const BIGNUM *p_scalar = NULL;
2015
2016    if (ctx == NULL)
2017        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2018            return 0;
2019    BN_CTX_start(ctx);
2020    if (((x = BN_CTX_get(ctx)) == NULL) ||
2021        ((y = BN_CTX_get(ctx)) == NULL) ||
2022        ((z = BN_CTX_get(ctx)) == NULL) ||
2023        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2024        goto err;
2025
2026    if (scalar != NULL) {
2027        pre = EC_EX_DATA_get_data(group->extra_data,
2028                                  nistp256_pre_comp_dup,
2029                                  nistp256_pre_comp_free,
2030                                  nistp256_pre_comp_clear_free);
2031        if (pre)
2032            /* we have precomputation, try to use it */
2033            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2034        else
2035            /* try to use the standard precomputation */
2036            g_pre_comp = &gmul[0];
2037        generator = EC_POINT_new(group);
2038        if (generator == NULL)
2039            goto err;
2040        /* get the generator from precomputation */
2041        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2042            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2043            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2044            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2045            goto err;
2046        }
2047        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2048                                                      generator, x, y, z,
2049                                                      ctx))
2050            goto err;
2051        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2052            /* precomputation matches generator */
2053            have_pre_comp = 1;
2054        else
2055            /*
2056             * we don't have valid precomputation: treat the generator as a
2057             * random point
2058             */
2059            num_points++;
2060    }
2061    if (num_points > 0) {
2062        if (num_points >= 3) {
2063            /*
2064             * unless we precompute multiples for just one or two points,
2065             * converting those into affine form is time well spent
2066             */
2067            mixed = 1;
2068        }
2069        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2070        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2071        if (mixed)
2072            tmp_smallfelems =
2073                OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2074        if ((secrets == NULL) || (pre_comp == NULL)
2075            || (mixed && (tmp_smallfelems == NULL))) {
2076            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2077            goto err;
2078        }
2079
2080        /*
2081         * we treat NULL scalars as 0, and NULL points as points at infinity,
2082         * i.e., they contribute nothing to the linear combination
2083         */
2084        memset(secrets, 0, num_points * sizeof(felem_bytearray));
2085        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2086        for (i = 0; i < num_points; ++i) {
2087            if (i == num) {
2088                /*
2089                 * we didn't have a valid precomputation, so we pick the
2090                 * generator
2091                 */
2092                p = EC_GROUP_get0_generator(group);
2093                p_scalar = scalar;
2094            } else {
2095                /* the i^th point */
2096                p = points[i];
2097                p_scalar = scalars[i];
2098            }
2099            if ((p_scalar != NULL) && (p != NULL)) {
2100                /* reduce scalar to 0 <= scalar < 2^256 */
2101                if ((BN_num_bits(p_scalar) > 256)
2102                    || (BN_is_negative(p_scalar))) {
2103                    /*
2104                     * this is an unusual input, and we don't guarantee
2105                     * constant-timeness
2106                     */
2107                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2108                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2109                        goto err;
2110                    }
2111                    num_bytes = bn_bn2lebinpad(tmp_scalar,
2112                                               secrets[i], sizeof(secrets[i]));
2113                } else {
2114                    num_bytes = bn_bn2lebinpad(p_scalar,
2115                                               secrets[i], sizeof(secrets[i]));
2116                }
2117                if (num_bytes < 0) {
2118                    ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2119                    goto err;
2120                }
2121                /* precompute multiples */
2122                if ((!BN_to_felem(x_out, &p->X)) ||
2123                    (!BN_to_felem(y_out, &p->Y)) ||
2124                    (!BN_to_felem(z_out, &p->Z)))
2125                    goto err;
2126                felem_shrink(pre_comp[i][1][0], x_out);
2127                felem_shrink(pre_comp[i][1][1], y_out);
2128                felem_shrink(pre_comp[i][1][2], z_out);
2129                for (j = 2; j <= 16; ++j) {
2130                    if (j & 1) {
2131                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2132                                        pre_comp[i][j][2], pre_comp[i][1][0],
2133                                        pre_comp[i][1][1], pre_comp[i][1][2],
2134                                        pre_comp[i][j - 1][0],
2135                                        pre_comp[i][j - 1][1],
2136                                        pre_comp[i][j - 1][2]);
2137                    } else {
2138                        point_double_small(pre_comp[i][j][0],
2139                                           pre_comp[i][j][1],
2140                                           pre_comp[i][j][2],
2141                                           pre_comp[i][j / 2][0],
2142                                           pre_comp[i][j / 2][1],
2143                                           pre_comp[i][j / 2][2]);
2144                    }
2145                }
2146            }
2147        }
2148        if (mixed)
2149            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2150    }
2151
2152    /* the scalar for the generator */
2153    if ((scalar != NULL) && (have_pre_comp)) {
2154        memset(g_secret, 0, sizeof(g_secret));
2155        /* reduce scalar to 0 <= scalar < 2^256 */
2156        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2157            /*
2158             * this is an unusual input, and we don't guarantee
2159             * constant-timeness
2160             */
2161            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2162                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2163                goto err;
2164            }
2165            num_bytes = bn_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2166        } else {
2167            num_bytes = bn_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2168        }
2169        /* do the multiplication with generator precomputation */
2170        batch_mul(x_out, y_out, z_out,
2171                  (const felem_bytearray(*))secrets, num_points,
2172                  g_secret,
2173                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2174    } else {
2175        /* do the multiplication without generator precomputation */
2176        batch_mul(x_out, y_out, z_out,
2177                  (const felem_bytearray(*))secrets, num_points,
2178                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2179    }
2180    /* reduce the output to its unique minimal representation */
2181    felem_contract(x_in, x_out);
2182    felem_contract(y_in, y_out);
2183    felem_contract(z_in, z_out);
2184    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2185        (!smallfelem_to_BN(z, z_in))) {
2186        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2187        goto err;
2188    }
2189    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2190
2191 err:
2192    BN_CTX_end(ctx);
2193    if (generator != NULL)
2194        EC_POINT_free(generator);
2195    if (new_ctx != NULL)
2196        BN_CTX_free(new_ctx);
2197    if (secrets != NULL)
2198        OPENSSL_free(secrets);
2199    if (pre_comp != NULL)
2200        OPENSSL_free(pre_comp);
2201    if (tmp_smallfelems != NULL)
2202        OPENSSL_free(tmp_smallfelems);
2203    return ret;
2204}
2205
2206int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2207{
2208    int ret = 0;
2209    NISTP256_PRE_COMP *pre = NULL;
2210    int i, j;
2211    BN_CTX *new_ctx = NULL;
2212    BIGNUM *x, *y;
2213    EC_POINT *generator = NULL;
2214    smallfelem tmp_smallfelems[32];
2215    felem x_tmp, y_tmp, z_tmp;
2216
2217    /* throw away old precomputation */
2218    EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2219                         nistp256_pre_comp_free,
2220                         nistp256_pre_comp_clear_free);
2221    if (ctx == NULL)
2222        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2223            return 0;
2224    BN_CTX_start(ctx);
2225    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2226        goto err;
2227    /* get the generator */
2228    if (group->generator == NULL)
2229        goto err;
2230    generator = EC_POINT_new(group);
2231    if (generator == NULL)
2232        goto err;
2233    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2234    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2235    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2236        goto err;
2237    if ((pre = nistp256_pre_comp_new()) == NULL)
2238        goto err;
2239    /*
2240     * if the generator is the standard one, use built-in precomputation
2241     */
2242    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2243        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2244        goto done;
2245    }
2246    if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2247        (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2248        (!BN_to_felem(z_tmp, &group->generator->Z)))
2249        goto err;
2250    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2251    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2252    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2253    /*
2254     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2255     * 2^160*G, 2^224*G for the second one
2256     */
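    /*
     * Concretely: [0][1] already holds the generator G from above; 32
     * doublings of [0][i] give [1][i] (multiplication by 2^32), and 32
     * doublings of [1][i] give [0][2*i], so the chain runs
     * G -> 2^32*G -> 2^64*G -> ... -> 2^224*G.
     */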
2257    for (i = 1; i <= 8; i <<= 1) {
2258        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2259                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2260                           pre->g_pre_comp[0][i][1],
2261                           pre->g_pre_comp[0][i][2]);
2262        for (j = 0; j < 31; ++j) {
2263            point_double_small(pre->g_pre_comp[1][i][0],
2264                               pre->g_pre_comp[1][i][1],
2265                               pre->g_pre_comp[1][i][2],
2266                               pre->g_pre_comp[1][i][0],
2267                               pre->g_pre_comp[1][i][1],
2268                               pre->g_pre_comp[1][i][2]);
2269        }
2270        if (i == 8)
2271            break;
2272        point_double_small(pre->g_pre_comp[0][2 * i][0],
2273                           pre->g_pre_comp[0][2 * i][1],
2274                           pre->g_pre_comp[0][2 * i][2],
2275                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2276                           pre->g_pre_comp[1][i][2]);
2277        for (j = 0; j < 31; ++j) {
2278            point_double_small(pre->g_pre_comp[0][2 * i][0],
2279                               pre->g_pre_comp[0][2 * i][1],
2280                               pre->g_pre_comp[0][2 * i][2],
2281                               pre->g_pre_comp[0][2 * i][0],
2282                               pre->g_pre_comp[0][2 * i][1],
2283                               pre->g_pre_comp[0][2 * i][2]);
2284        }
2285    }
2286    for (i = 0; i < 2; i++) {
2287        /* g_pre_comp[i][0] is the point at infinity */
2288        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2289        /* the remaining multiples */
2290        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2291        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2292                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2293                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2294                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2295                        pre->g_pre_comp[i][2][2]);
2296        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2297        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2298                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2299                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2300                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2301                        pre->g_pre_comp[i][2][2]);
2302        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2303        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2304                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2305                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2306                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2307                        pre->g_pre_comp[i][4][2]);
2308        /*
2309         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2310         */
2311        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2312                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2313                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2314                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2315                        pre->g_pre_comp[i][2][2]);
2316        for (j = 1; j < 8; ++j) {
2317            /* odd multiples: add G resp. 2^32*G */
2318            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2319                            pre->g_pre_comp[i][2 * j + 1][1],
2320                            pre->g_pre_comp[i][2 * j + 1][2],
2321                            pre->g_pre_comp[i][2 * j][0],
2322                            pre->g_pre_comp[i][2 * j][1],
2323                            pre->g_pre_comp[i][2 * j][2],
2324                            pre->g_pre_comp[i][1][0],
2325                            pre->g_pre_comp[i][1][1],
2326                            pre->g_pre_comp[i][1][2]);
2327        }
2328    }
2329    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2330
2331 done:
2332    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2333                             nistp256_pre_comp_free,
2334                             nistp256_pre_comp_clear_free))
2335        goto err;
2336    ret = 1;
2337    pre = NULL;
2338 err:
2339    BN_CTX_end(ctx);
2340    if (generator != NULL)
2341        EC_POINT_free(generator);
2342    if (new_ctx != NULL)
2343        BN_CTX_free(new_ctx);
2344    if (pre)
2345        nistp256_pre_comp_free(pre);
2346    return ret;
2347}
2348
2349int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2350{
2351    if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2352                            nistp256_pre_comp_free,
2353                            nistp256_pre_comp_clear_free)
2354        != NULL)
2355        return 1;
2356    else
2357        return 0;
2358}
2359#else
2360static void *dummy = &dummy;
2361#endif
2362