1/*
2 * Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License").  You may not use
5 * this file except in compliance with the License.  You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10/* Copyright 2011 Google Inc.
11 *
12 * Licensed under the Apache License, Version 2.0 (the "License");
13 *
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 *     http://www.apache.org/licenses/LICENSE-2.0
18 *
19 *  Unless required by applicable law or agreed to in writing, software
20 *  distributed under the License is distributed on an "AS IS" BASIS,
21 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 *  See the License for the specific language governing permissions and
23 *  limitations under the License.
24 */
25
26/*
27 * ECDSA low level APIs are deprecated for public use, but still ok for
28 * internal use.
29 */
30#include "internal/deprecated.h"
31
32/*
33 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
34 *
35 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
36 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
37 * work which got its smarts from Daniel J. Bernstein's work on the same.
38 */
39
40#include <openssl/opensslconf.h>
41
42#include <stdint.h>
43#include <string.h>
44#include <openssl/err.h>
45#include "ec_local.h"
46
47#include "internal/numbers.h"
48
49#ifndef INT128_MAX
50# error "Your compiler doesn't appear to support 128-bit integer types"
51#endif
52
53typedef uint8_t u8;
54typedef uint32_t u32;
55typedef uint64_t u64;
56
57/*
58 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
59 * can serialize an element of this field into 32 bytes. We call this an
60 * felem_bytearray.
61 */
62
63typedef u8 felem_bytearray[32];
64
65/*
66 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
67 * values are big-endian.
68 */
69static const felem_bytearray nistp256_curve_params[5] = {
70    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
71     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
72     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
73     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
74    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
75     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
77     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
78    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
79     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
80     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
81     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
82    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
83     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
84     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
85     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
86    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
87     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
88     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
89     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
90};
91
92/*-
93 * The representation of field elements.
94 * ------------------------------------
95 *
96 * We represent field elements with either four 128-bit values, eight 128-bit
97 * values, or four 64-bit values. The field element represented is:
98 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
99 * or:
100 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
101 *
102 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
103 * apart, but are 128-bits wide, the most significant bits of each limb overlap
104 * with the least significant bits of the next.
105 *
106 * A field element with four limbs is an 'felem'. One with eight limbs is a
107 * 'longfelem'
108 *
109 * A field element with four, 64-bit values is called a 'smallfelem'. Small
110 * values are used as intermediate values before multiplication.
111 */
112
113#define NLIMBS 4
114
115typedef uint128_t limb;
116typedef limb felem[NLIMBS];
117typedef limb longfelem[NLIMBS * 2];
118typedef u64 smallfelem[NLIMBS];
119
120/* This is the value of the prime as four 64-bit words, little-endian. */
121static const u64 kPrime[4] =
122    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
123static const u64 bottom63bits = 0x7ffffffffffffffful;
124
125/*
126 * bin32_to_felem takes a little-endian byte array and converts it into felem
127 * form. This assumes that the CPU is little-endian.
128 */
129static void bin32_to_felem(felem out, const u8 in[32])
130{
131    out[0] = *((u64 *)&in[0]);
132    out[1] = *((u64 *)&in[8]);
133    out[2] = *((u64 *)&in[16]);
134    out[3] = *((u64 *)&in[24]);
135}
136
137/*
138 * smallfelem_to_bin32 takes a smallfelem and serializes into a little
139 * endian, 32 byte array. This assumes that the CPU is little-endian.
140 */
141static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
142{
143    *((u64 *)&out[0]) = in[0];
144    *((u64 *)&out[8]) = in[1];
145    *((u64 *)&out[16]) = in[2];
146    *((u64 *)&out[24]) = in[3];
147}
148
149/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
150static int BN_to_felem(felem out, const BIGNUM *bn)
151{
152    felem_bytearray b_out;
153    int num_bytes;
154
155    if (BN_is_negative(bn)) {
156        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
157        return 0;
158    }
159    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
160    if (num_bytes < 0) {
161        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
162        return 0;
163    }
164    bin32_to_felem(out, b_out);
165    return 1;
166}
167
168/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
169static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
170{
171    felem_bytearray b_out;
172    smallfelem_to_bin32(b_out, in);
173    return BN_lebin2bn(b_out, sizeof(b_out), out);
174}
175
176/*-
177 * Field operations
178 * ----------------
179 */
180
181static void smallfelem_one(smallfelem out)
182{
183    out[0] = 1;
184    out[1] = 0;
185    out[2] = 0;
186    out[3] = 0;
187}
188
189static void smallfelem_assign(smallfelem out, const smallfelem in)
190{
191    out[0] = in[0];
192    out[1] = in[1];
193    out[2] = in[2];
194    out[3] = in[3];
195}
196
197static void felem_assign(felem out, const felem in)
198{
199    out[0] = in[0];
200    out[1] = in[1];
201    out[2] = in[2];
202    out[3] = in[3];
203}
204
205/* felem_sum sets out = out + in. */
206static void felem_sum(felem out, const felem in)
207{
208    out[0] += in[0];
209    out[1] += in[1];
210    out[2] += in[2];
211    out[3] += in[3];
212}
213
214/* felem_small_sum sets out = out + in. */
215static void felem_small_sum(felem out, const smallfelem in)
216{
217    out[0] += in[0];
218    out[1] += in[1];
219    out[2] += in[2];
220    out[3] += in[3];
221}
222
223/* felem_scalar sets out = out * scalar */
224static void felem_scalar(felem out, const u64 scalar)
225{
226    out[0] *= scalar;
227    out[1] *= scalar;
228    out[2] *= scalar;
229    out[3] *= scalar;
230}
231
232/* longfelem_scalar sets out = out * scalar */
233static void longfelem_scalar(longfelem out, const u64 scalar)
234{
235    out[0] *= scalar;
236    out[1] *= scalar;
237    out[2] *= scalar;
238    out[3] *= scalar;
239    out[4] *= scalar;
240    out[5] *= scalar;
241    out[6] *= scalar;
242    out[7] *= scalar;
243}
244
245#define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
246#define two105 (((limb)1) << 105)
247#define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
248
249/* zero105 is 0 mod p */
250static const felem zero105 =
251    { two105m41m9, two105, two105m41p9, two105m41p9 };
252
253/*-
254 * smallfelem_neg sets |out| to |-small|
255 * On exit:
256 *   out[i] < out[i] + 2^105
257 */
258static void smallfelem_neg(felem out, const smallfelem small)
259{
260    /* In order to prevent underflow, we subtract from 0 mod p. */
261    out[0] = zero105[0] - small[0];
262    out[1] = zero105[1] - small[1];
263    out[2] = zero105[2] - small[2];
264    out[3] = zero105[3] - small[3];
265}
266
267/*-
268 * felem_diff subtracts |in| from |out|
269 * On entry:
270 *   in[i] < 2^104
271 * On exit:
272 *   out[i] < out[i] + 2^105
273 */
274static void felem_diff(felem out, const felem in)
275{
276    /*
277     * In order to prevent underflow, we add 0 mod p before subtracting.
278     */
279    out[0] += zero105[0];
280    out[1] += zero105[1];
281    out[2] += zero105[2];
282    out[3] += zero105[3];
283
284    out[0] -= in[0];
285    out[1] -= in[1];
286    out[2] -= in[2];
287    out[3] -= in[3];
288}
289
290#define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
291#define two107 (((limb)1) << 107)
292#define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
293
294/* zero107 is 0 mod p */
295static const felem zero107 =
296    { two107m43m11, two107, two107m43p11, two107m43p11 };
297
298/*-
299 * An alternative felem_diff for larger inputs |in|
300 * felem_diff_zero107 subtracts |in| from |out|
301 * On entry:
302 *   in[i] < 2^106
303 * On exit:
304 *   out[i] < out[i] + 2^107
305 */
306static void felem_diff_zero107(felem out, const felem in)
307{
308    /*
309     * In order to prevent underflow, we add 0 mod p before subtracting.
310     */
311    out[0] += zero107[0];
312    out[1] += zero107[1];
313    out[2] += zero107[2];
314    out[3] += zero107[3];
315
316    out[0] -= in[0];
317    out[1] -= in[1];
318    out[2] -= in[2];
319    out[3] -= in[3];
320}
321
322/*-
323 * longfelem_diff subtracts |in| from |out|
324 * On entry:
325 *   in[i] < 7*2^67
326 * On exit:
327 *   out[i] < out[i] + 2^70 + 2^40
328 */
329static void longfelem_diff(longfelem out, const longfelem in)
330{
331    static const limb two70m8p6 =
332        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
333    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
334    static const limb two70 = (((limb) 1) << 70);
335    static const limb two70m40m38p6 =
336        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
337        (((limb) 1) << 6);
338    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
339
340    /* add 0 mod p to avoid underflow */
341    out[0] += two70m8p6;
342    out[1] += two70p40;
343    out[2] += two70;
344    out[3] += two70m40m38p6;
345    out[4] += two70m6;
346    out[5] += two70m6;
347    out[6] += two70m6;
348    out[7] += two70m6;
349
350    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
351    out[0] -= in[0];
352    out[1] -= in[1];
353    out[2] -= in[2];
354    out[3] -= in[3];
355    out[4] -= in[4];
356    out[5] -= in[5];
357    out[6] -= in[6];
358    out[7] -= in[7];
359}
360
361#define two64m0 (((limb)1) << 64) - 1
362#define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
363#define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
364#define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
365
366/* zero110 is 0 mod p */
367static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
368
369/*-
370 * felem_shrink converts an felem into a smallfelem. The result isn't quite
371 * minimal as the value may be greater than p.
372 *
373 * On entry:
374 *   in[i] < 2^109
375 * On exit:
376 *   out[i] < 2^64
377 */
378static void felem_shrink(smallfelem out, const felem in)
379{
380    felem tmp;
381    u64 a, b, mask;
382    u64 high, low;
383    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
384
385    /* Carry 2->3 */
386    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
387    /* tmp[3] < 2^110 */
388
389    tmp[2] = zero110[2] + (u64)in[2];
390    tmp[0] = zero110[0] + in[0];
391    tmp[1] = zero110[1] + in[1];
392    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
393
394    /*
395     * We perform two partial reductions where we eliminate the high-word of
396     * tmp[3]. We don't update the other words till the end.
397     */
398    a = tmp[3] >> 64;           /* a < 2^46 */
399    tmp[3] = (u64)tmp[3];
400    tmp[3] -= a;
401    tmp[3] += ((limb) a) << 32;
402    /* tmp[3] < 2^79 */
403
404    b = a;
405    a = tmp[3] >> 64;           /* a < 2^15 */
406    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
407    tmp[3] = (u64)tmp[3];
408    tmp[3] -= a;
409    tmp[3] += ((limb) a) << 32;
410    /* tmp[3] < 2^64 + 2^47 */
411
412    /*
413     * This adjusts the other two words to complete the two partial
414     * reductions.
415     */
416    tmp[0] += b;
417    tmp[1] -= (((limb) b) << 32);
418
419    /*
420     * In order to make space in tmp[3] for the carry from 2 -> 3, we
421     * conditionally subtract kPrime if tmp[3] is large enough.
422     */
423    high = (u64)(tmp[3] >> 64);
424    /* As tmp[3] < 2^65, high is either 1 or 0 */
425    high = 0 - high;
426    /*-
427     * high is:
428     *   all ones   if the high word of tmp[3] is 1
429     *   all zeros  if the high word of tmp[3] if 0
430     */
431    low = (u64)tmp[3];
432    mask = 0 - (low >> 63);
433    /*-
434     * mask is:
435     *   all ones   if the MSB of low is 1
436     *   all zeros  if the MSB of low if 0
437     */
438    low &= bottom63bits;
439    low -= kPrime3Test;
440    /* if low was greater than kPrime3Test then the MSB is zero */
441    low = ~low;
442    low = 0 - (low >> 63);
443    /*-
444     * low is:
445     *   all ones   if low was > kPrime3Test
446     *   all zeros  if low was <= kPrime3Test
447     */
448    mask = (mask & low) | high;
449    tmp[0] -= mask & kPrime[0];
450    tmp[1] -= mask & kPrime[1];
451    /* kPrime[2] is zero, so omitted */
452    tmp[3] -= mask & kPrime[3];
453    /* tmp[3] < 2**64 - 2**32 + 1 */
454
455    tmp[1] += ((u64)(tmp[0] >> 64));
456    tmp[0] = (u64)tmp[0];
457    tmp[2] += ((u64)(tmp[1] >> 64));
458    tmp[1] = (u64)tmp[1];
459    tmp[3] += ((u64)(tmp[2] >> 64));
460    tmp[2] = (u64)tmp[2];
461    /* tmp[i] < 2^64 */
462
463    out[0] = tmp[0];
464    out[1] = tmp[1];
465    out[2] = tmp[2];
466    out[3] = tmp[3];
467}
468
469/* smallfelem_expand converts a smallfelem to an felem */
470static void smallfelem_expand(felem out, const smallfelem in)
471{
472    out[0] = in[0];
473    out[1] = in[1];
474    out[2] = in[2];
475    out[3] = in[3];
476}
477
478/*-
479 * smallfelem_square sets |out| = |small|^2
480 * On entry:
481 *   small[i] < 2^64
482 * On exit:
483 *   out[i] < 7 * 2^64 < 2^67
484 */
485static void smallfelem_square(longfelem out, const smallfelem small)
486{
487    limb a;
488    u64 high, low;
489
490    a = ((uint128_t) small[0]) * small[0];
491    low = a;
492    high = a >> 64;
493    out[0] = low;
494    out[1] = high;
495
496    a = ((uint128_t) small[0]) * small[1];
497    low = a;
498    high = a >> 64;
499    out[1] += low;
500    out[1] += low;
501    out[2] = high;
502
503    a = ((uint128_t) small[0]) * small[2];
504    low = a;
505    high = a >> 64;
506    out[2] += low;
507    out[2] *= 2;
508    out[3] = high;
509
510    a = ((uint128_t) small[0]) * small[3];
511    low = a;
512    high = a >> 64;
513    out[3] += low;
514    out[4] = high;
515
516    a = ((uint128_t) small[1]) * small[2];
517    low = a;
518    high = a >> 64;
519    out[3] += low;
520    out[3] *= 2;
521    out[4] += high;
522
523    a = ((uint128_t) small[1]) * small[1];
524    low = a;
525    high = a >> 64;
526    out[2] += low;
527    out[3] += high;
528
529    a = ((uint128_t) small[1]) * small[3];
530    low = a;
531    high = a >> 64;
532    out[4] += low;
533    out[4] *= 2;
534    out[5] = high;
535
536    a = ((uint128_t) small[2]) * small[3];
537    low = a;
538    high = a >> 64;
539    out[5] += low;
540    out[5] *= 2;
541    out[6] = high;
542    out[6] += high;
543
544    a = ((uint128_t) small[2]) * small[2];
545    low = a;
546    high = a >> 64;
547    out[4] += low;
548    out[5] += high;
549
550    a = ((uint128_t) small[3]) * small[3];
551    low = a;
552    high = a >> 64;
553    out[6] += low;
554    out[7] = high;
555}
556
557/*-
558 * felem_square sets |out| = |in|^2
559 * On entry:
560 *   in[i] < 2^109
561 * On exit:
562 *   out[i] < 7 * 2^64 < 2^67
563 */
564static void felem_square(longfelem out, const felem in)
565{
566    u64 small[4];
567    felem_shrink(small, in);
568    smallfelem_square(out, small);
569}
570
571/*-
572 * smallfelem_mul sets |out| = |small1| * |small2|
573 * On entry:
574 *   small1[i] < 2^64
575 *   small2[i] < 2^64
576 * On exit:
577 *   out[i] < 7 * 2^64 < 2^67
578 */
579static void smallfelem_mul(longfelem out, const smallfelem small1,
580                           const smallfelem small2)
581{
582    limb a;
583    u64 high, low;
584
585    a = ((uint128_t) small1[0]) * small2[0];
586    low = a;
587    high = a >> 64;
588    out[0] = low;
589    out[1] = high;
590
591    a = ((uint128_t) small1[0]) * small2[1];
592    low = a;
593    high = a >> 64;
594    out[1] += low;
595    out[2] = high;
596
597    a = ((uint128_t) small1[1]) * small2[0];
598    low = a;
599    high = a >> 64;
600    out[1] += low;
601    out[2] += high;
602
603    a = ((uint128_t) small1[0]) * small2[2];
604    low = a;
605    high = a >> 64;
606    out[2] += low;
607    out[3] = high;
608
609    a = ((uint128_t) small1[1]) * small2[1];
610    low = a;
611    high = a >> 64;
612    out[2] += low;
613    out[3] += high;
614
615    a = ((uint128_t) small1[2]) * small2[0];
616    low = a;
617    high = a >> 64;
618    out[2] += low;
619    out[3] += high;
620
621    a = ((uint128_t) small1[0]) * small2[3];
622    low = a;
623    high = a >> 64;
624    out[3] += low;
625    out[4] = high;
626
627    a = ((uint128_t) small1[1]) * small2[2];
628    low = a;
629    high = a >> 64;
630    out[3] += low;
631    out[4] += high;
632
633    a = ((uint128_t) small1[2]) * small2[1];
634    low = a;
635    high = a >> 64;
636    out[3] += low;
637    out[4] += high;
638
639    a = ((uint128_t) small1[3]) * small2[0];
640    low = a;
641    high = a >> 64;
642    out[3] += low;
643    out[4] += high;
644
645    a = ((uint128_t) small1[1]) * small2[3];
646    low = a;
647    high = a >> 64;
648    out[4] += low;
649    out[5] = high;
650
651    a = ((uint128_t) small1[2]) * small2[2];
652    low = a;
653    high = a >> 64;
654    out[4] += low;
655    out[5] += high;
656
657    a = ((uint128_t) small1[3]) * small2[1];
658    low = a;
659    high = a >> 64;
660    out[4] += low;
661    out[5] += high;
662
663    a = ((uint128_t) small1[2]) * small2[3];
664    low = a;
665    high = a >> 64;
666    out[5] += low;
667    out[6] = high;
668
669    a = ((uint128_t) small1[3]) * small2[2];
670    low = a;
671    high = a >> 64;
672    out[5] += low;
673    out[6] += high;
674
675    a = ((uint128_t) small1[3]) * small2[3];
676    low = a;
677    high = a >> 64;
678    out[6] += low;
679    out[7] = high;
680}
681
682/*-
683 * felem_mul sets |out| = |in1| * |in2|
684 * On entry:
685 *   in1[i] < 2^109
686 *   in2[i] < 2^109
687 * On exit:
688 *   out[i] < 7 * 2^64 < 2^67
689 */
690static void felem_mul(longfelem out, const felem in1, const felem in2)
691{
692    smallfelem small1, small2;
693    felem_shrink(small1, in1);
694    felem_shrink(small2, in2);
695    smallfelem_mul(out, small1, small2);
696}
697
698/*-
699 * felem_small_mul sets |out| = |small1| * |in2|
700 * On entry:
701 *   small1[i] < 2^64
702 *   in2[i] < 2^109
703 * On exit:
704 *   out[i] < 7 * 2^64 < 2^67
705 */
706static void felem_small_mul(longfelem out, const smallfelem small1,
707                            const felem in2)
708{
709    smallfelem small2;
710    felem_shrink(small2, in2);
711    smallfelem_mul(out, small1, small2);
712}
713
714#define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
715#define two100 (((limb)1) << 100)
716#define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
717/* zero100 is 0 mod p */
718static const felem zero100 =
719    { two100m36m4, two100, two100m36p4, two100m36p4 };
720
721/*-
722 * Internal function for the different flavours of felem_reduce.
723 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
724 * On entry:
725 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
726 *   out[1] >= in[7] + 2^32*in[4]
727 *   out[2] >= in[5] + 2^32*in[5]
728 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
729 * On exit:
730 *   out[0] <= out[0] + in[4] + 2^32*in[5]
731 *   out[1] <= out[1] + in[5] + 2^33*in[6]
732 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
733 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
734 */
735static void felem_reduce_(felem out, const longfelem in)
736{
737    int128_t c;
738    /* combine common terms from below */
739    c = in[4] + (in[5] << 32);
740    out[0] += c;
741    out[3] -= c;
742
743    c = in[5] - in[7];
744    out[1] += c;
745    out[2] -= c;
746
747    /* the remaining terms */
748    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
749    out[1] -= (in[4] << 32);
750    out[3] += (in[4] << 32);
751
752    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
753    out[2] -= (in[5] << 32);
754
755    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
756    out[0] -= in[6];
757    out[0] -= (in[6] << 32);
758    out[1] += (in[6] << 33);
759    out[2] += (in[6] * 2);
760    out[3] -= (in[6] << 32);
761
762    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
763    out[0] -= in[7];
764    out[0] -= (in[7] << 32);
765    out[2] += (in[7] << 33);
766    out[3] += (in[7] * 3);
767}
768
769/*-
770 * felem_reduce converts a longfelem into an felem.
771 * To be called directly after felem_square or felem_mul.
772 * On entry:
773 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
774 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
775 * On exit:
776 *   out[i] < 2^101
777 */
778static void felem_reduce(felem out, const longfelem in)
779{
780    out[0] = zero100[0] + in[0];
781    out[1] = zero100[1] + in[1];
782    out[2] = zero100[2] + in[2];
783    out[3] = zero100[3] + in[3];
784
785    felem_reduce_(out, in);
786
787    /*-
788     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
789     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
790     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
791     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
792     *
793     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
794     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
795     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
796     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
797     */
798}
799
800/*-
801 * felem_reduce_zero105 converts a larger longfelem into an felem.
802 * On entry:
803 *   in[0] < 2^71
804 * On exit:
805 *   out[i] < 2^106
806 */
807static void felem_reduce_zero105(felem out, const longfelem in)
808{
809    out[0] = zero105[0] + in[0];
810    out[1] = zero105[1] + in[1];
811    out[2] = zero105[2] + in[2];
812    out[3] = zero105[3] + in[3];
813
814    felem_reduce_(out, in);
815
816    /*-
817     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
818     * out[1] > 2^105 - 2^71 - 2^103 > 0
819     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
820     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
821     *
822     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
823     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
824     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
825     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
826     */
827}
828
829/*
830 * subtract_u64 sets *result = *result - v and *carry to one if the
831 * subtraction underflowed.
832 */
833static void subtract_u64(u64 *result, u64 *carry, u64 v)
834{
835    uint128_t r = *result;
836    r -= v;
837    *carry = (r >> 64) & 1;
838    *result = (u64)r;
839}
840
841/*
842 * felem_contract converts |in| to its unique, minimal representation. On
843 * entry: in[i] < 2^109
844 */
845static void felem_contract(smallfelem out, const felem in)
846{
847    unsigned i;
848    u64 all_equal_so_far = 0, result = 0, carry;
849
850    felem_shrink(out, in);
851    /* small is minimal except that the value might be > p */
852
853    all_equal_so_far--;
854    /*
855     * We are doing a constant time test if out >= kPrime. We need to compare
856     * each u64, from most-significant to least significant. For each one, if
857     * all words so far have been equal (m is all ones) then a non-equal
858     * result is the answer. Otherwise we continue.
859     */
860    for (i = 3; i < 4; i--) {
861        u64 equal;
862        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
863        /*
864         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
865         * will all be set.
866         */
867        result |= all_equal_so_far & ((u64)(a >> 64));
868
869        /*
870         * if kPrime[i] == out[i] then |equal| will be all zeros and the
871         * decrement will make it all ones.
872         */
873        equal = kPrime[i] ^ out[i];
874        equal--;
875        equal &= equal << 32;
876        equal &= equal << 16;
877        equal &= equal << 8;
878        equal &= equal << 4;
879        equal &= equal << 2;
880        equal &= equal << 1;
881        equal = 0 - (equal >> 63);
882
883        all_equal_so_far &= equal;
884    }
885
886    /*
887     * if all_equal_so_far is still all ones then the two values are equal
888     * and so out >= kPrime is true.
889     */
890    result |= all_equal_so_far;
891
892    /* if out >= kPrime then we subtract kPrime. */
893    subtract_u64(&out[0], &carry, result & kPrime[0]);
894    subtract_u64(&out[1], &carry, carry);
895    subtract_u64(&out[2], &carry, carry);
896    subtract_u64(&out[3], &carry, carry);
897
898    subtract_u64(&out[1], &carry, result & kPrime[1]);
899    subtract_u64(&out[2], &carry, carry);
900    subtract_u64(&out[3], &carry, carry);
901
902    subtract_u64(&out[2], &carry, result & kPrime[2]);
903    subtract_u64(&out[3], &carry, carry);
904
905    subtract_u64(&out[3], &carry, result & kPrime[3]);
906}
907
908static void smallfelem_square_contract(smallfelem out, const smallfelem in)
909{
910    longfelem longtmp;
911    felem tmp;
912
913    smallfelem_square(longtmp, in);
914    felem_reduce(tmp, longtmp);
915    felem_contract(out, tmp);
916}
917
918static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
919                                    const smallfelem in2)
920{
921    longfelem longtmp;
922    felem tmp;
923
924    smallfelem_mul(longtmp, in1, in2);
925    felem_reduce(tmp, longtmp);
926    felem_contract(out, tmp);
927}
928
929/*-
930 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
931 * otherwise.
932 * On entry:
933 *   small[i] < 2^64
934 */
935static limb smallfelem_is_zero(const smallfelem small)
936{
937    limb result;
938    u64 is_p;
939
940    u64 is_zero = small[0] | small[1] | small[2] | small[3];
941    is_zero--;
942    is_zero &= is_zero << 32;
943    is_zero &= is_zero << 16;
944    is_zero &= is_zero << 8;
945    is_zero &= is_zero << 4;
946    is_zero &= is_zero << 2;
947    is_zero &= is_zero << 1;
948    is_zero = 0 - (is_zero >> 63);
949
950    is_p = (small[0] ^ kPrime[0]) |
951        (small[1] ^ kPrime[1]) |
952        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
953    is_p--;
954    is_p &= is_p << 32;
955    is_p &= is_p << 16;
956    is_p &= is_p << 8;
957    is_p &= is_p << 4;
958    is_p &= is_p << 2;
959    is_p &= is_p << 1;
960    is_p = 0 - (is_p >> 63);
961
962    is_zero |= is_p;
963
964    result = is_zero;
965    result |= ((limb) is_zero) << 64;
966    return result;
967}
968
969static int smallfelem_is_zero_int(const void *small)
970{
971    return (int)(smallfelem_is_zero(small) & ((limb) 1));
972}
973
974/*-
975 * felem_inv calculates |out| = |in|^{-1}
976 *
977 * Based on Fermat's Little Theorem:
978 *   a^p = a (mod p)
979 *   a^{p-1} = 1 (mod p)
980 *   a^{p-2} = a^{-1} (mod p)
981 */
982static void felem_inv(felem out, const felem in)
983{
984    felem ftmp, ftmp2;
985    /* each e_I will hold |in|^{2^I - 1} */
986    felem e2, e4, e8, e16, e32, e64;
987    longfelem tmp;
988    unsigned i;
989
990    felem_square(tmp, in);
991    felem_reduce(ftmp, tmp);    /* 2^1 */
992    felem_mul(tmp, in, ftmp);
993    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
994    felem_assign(e2, ftmp);
995    felem_square(tmp, ftmp);
996    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
997    felem_square(tmp, ftmp);
998    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
999    felem_mul(tmp, ftmp, e2);
1000    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1001    felem_assign(e4, ftmp);
1002    felem_square(tmp, ftmp);
1003    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1004    felem_square(tmp, ftmp);
1005    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1006    felem_square(tmp, ftmp);
1007    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1008    felem_square(tmp, ftmp);
1009    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1010    felem_mul(tmp, ftmp, e4);
1011    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1012    felem_assign(e8, ftmp);
1013    for (i = 0; i < 8; i++) {
1014        felem_square(tmp, ftmp);
1015        felem_reduce(ftmp, tmp);
1016    }                           /* 2^16 - 2^8 */
1017    felem_mul(tmp, ftmp, e8);
1018    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1019    felem_assign(e16, ftmp);
1020    for (i = 0; i < 16; i++) {
1021        felem_square(tmp, ftmp);
1022        felem_reduce(ftmp, tmp);
1023    }                           /* 2^32 - 2^16 */
1024    felem_mul(tmp, ftmp, e16);
1025    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1026    felem_assign(e32, ftmp);
1027    for (i = 0; i < 32; i++) {
1028        felem_square(tmp, ftmp);
1029        felem_reduce(ftmp, tmp);
1030    }                           /* 2^64 - 2^32 */
1031    felem_assign(e64, ftmp);
1032    felem_mul(tmp, ftmp, in);
1033    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1034    for (i = 0; i < 192; i++) {
1035        felem_square(tmp, ftmp);
1036        felem_reduce(ftmp, tmp);
1037    }                           /* 2^256 - 2^224 + 2^192 */
1038
1039    felem_mul(tmp, e64, e32);
1040    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1041    for (i = 0; i < 16; i++) {
1042        felem_square(tmp, ftmp2);
1043        felem_reduce(ftmp2, tmp);
1044    }                           /* 2^80 - 2^16 */
1045    felem_mul(tmp, ftmp2, e16);
1046    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1047    for (i = 0; i < 8; i++) {
1048        felem_square(tmp, ftmp2);
1049        felem_reduce(ftmp2, tmp);
1050    }                           /* 2^88 - 2^8 */
1051    felem_mul(tmp, ftmp2, e8);
1052    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1053    for (i = 0; i < 4; i++) {
1054        felem_square(tmp, ftmp2);
1055        felem_reduce(ftmp2, tmp);
1056    }                           /* 2^92 - 2^4 */
1057    felem_mul(tmp, ftmp2, e4);
1058    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1059    felem_square(tmp, ftmp2);
1060    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1061    felem_square(tmp, ftmp2);
1062    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1063    felem_mul(tmp, ftmp2, e2);
1064    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1065    felem_square(tmp, ftmp2);
1066    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1067    felem_square(tmp, ftmp2);
1068    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1069    felem_mul(tmp, ftmp2, in);
1070    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1071
1072    felem_mul(tmp, ftmp2, ftmp);
1073    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1074}
1075
1076static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1077{
1078    felem tmp;
1079
1080    smallfelem_expand(tmp, in);
1081    felem_inv(tmp, tmp);
1082    felem_contract(out, tmp);
1083}
1084
1085/*-
1086 * Group operations
1087 * ----------------
1088 *
1089 * Building on top of the field operations we have the operations on the
1090 * elliptic curve group itself. Points on the curve are represented in Jacobian
1091 * coordinates
1092 */
1093
1094/*-
1095 * point_double calculates 2*(x_in, y_in, z_in)
1096 *
1097 * The method is taken from:
1098 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1099 *
1100 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1101 * while x_out == y_in is not (maybe this works, but it's not tested).
1102 */
1103static void
1104point_double(felem x_out, felem y_out, felem z_out,
1105             const felem x_in, const felem y_in, const felem z_in)
1106{
1107    longfelem tmp, tmp2;
1108    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1109    smallfelem small1, small2;
1110
1111    felem_assign(ftmp, x_in);
1112    /* ftmp[i] < 2^106 */
1113    felem_assign(ftmp2, x_in);
1114    /* ftmp2[i] < 2^106 */
1115
1116    /* delta = z^2 */
1117    felem_square(tmp, z_in);
1118    felem_reduce(delta, tmp);
1119    /* delta[i] < 2^101 */
1120
1121    /* gamma = y^2 */
1122    felem_square(tmp, y_in);
1123    felem_reduce(gamma, tmp);
1124    /* gamma[i] < 2^101 */
1125    felem_shrink(small1, gamma);
1126
1127    /* beta = x*gamma */
1128    felem_small_mul(tmp, small1, x_in);
1129    felem_reduce(beta, tmp);
1130    /* beta[i] < 2^101 */
1131
1132    /* alpha = 3*(x-delta)*(x+delta) */
1133    felem_diff(ftmp, delta);
1134    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1135    felem_sum(ftmp2, delta);
1136    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1137    felem_scalar(ftmp2, 3);
1138    /* ftmp2[i] < 3 * 2^107 < 2^109 */
1139    felem_mul(tmp, ftmp, ftmp2);
1140    felem_reduce(alpha, tmp);
1141    /* alpha[i] < 2^101 */
1142    felem_shrink(small2, alpha);
1143
1144    /* x' = alpha^2 - 8*beta */
1145    smallfelem_square(tmp, small2);
1146    felem_reduce(x_out, tmp);
1147    felem_assign(ftmp, beta);
1148    felem_scalar(ftmp, 8);
1149    /* ftmp[i] < 8 * 2^101 = 2^104 */
1150    felem_diff(x_out, ftmp);
1151    /* x_out[i] < 2^105 + 2^101 < 2^106 */
1152
1153    /* z' = (y + z)^2 - gamma - delta */
1154    felem_sum(delta, gamma);
1155    /* delta[i] < 2^101 + 2^101 = 2^102 */
1156    felem_assign(ftmp, y_in);
1157    felem_sum(ftmp, z_in);
1158    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1159    felem_square(tmp, ftmp);
1160    felem_reduce(z_out, tmp);
1161    felem_diff(z_out, delta);
1162    /* z_out[i] < 2^105 + 2^101 < 2^106 */
1163
1164    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1165    felem_scalar(beta, 4);
1166    /* beta[i] < 4 * 2^101 = 2^103 */
1167    felem_diff_zero107(beta, x_out);
1168    /* beta[i] < 2^107 + 2^103 < 2^108 */
1169    felem_small_mul(tmp, small2, beta);
1170    /* tmp[i] < 7 * 2^64 < 2^67 */
1171    smallfelem_square(tmp2, small1);
1172    /* tmp2[i] < 7 * 2^64 */
1173    longfelem_scalar(tmp2, 8);
1174    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1175    longfelem_diff(tmp, tmp2);
1176    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1177    felem_reduce_zero105(y_out, tmp);
1178    /* y_out[i] < 2^106 */
1179}
1180
1181/*
1182 * point_double_small is the same as point_double, except that it operates on
1183 * smallfelems
1184 */
1185static void
1186point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1187                   const smallfelem x_in, const smallfelem y_in,
1188                   const smallfelem z_in)
1189{
1190    felem felem_x_out, felem_y_out, felem_z_out;
1191    felem felem_x_in, felem_y_in, felem_z_in;
1192
1193    smallfelem_expand(felem_x_in, x_in);
1194    smallfelem_expand(felem_y_in, y_in);
1195    smallfelem_expand(felem_z_in, z_in);
1196    point_double(felem_x_out, felem_y_out, felem_z_out,
1197                 felem_x_in, felem_y_in, felem_z_in);
1198    felem_shrink(x_out, felem_x_out);
1199    felem_shrink(y_out, felem_y_out);
1200    felem_shrink(z_out, felem_z_out);
1201}
1202
1203/* copy_conditional copies in to out iff mask is all ones. */
1204static void copy_conditional(felem out, const felem in, limb mask)
1205{
1206    unsigned i;
1207    for (i = 0; i < NLIMBS; ++i) {
1208        const limb tmp = mask & (in[i] ^ out[i]);
1209        out[i] ^= tmp;
1210    }
1211}
1212
1213/* copy_small_conditional copies in to out iff mask is all ones. */
1214static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1215{
1216    unsigned i;
1217    const u64 mask64 = mask;
1218    for (i = 0; i < NLIMBS; ++i) {
1219        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1220    }
1221}
1222
1223/*-
1224 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1225 *
1226 * The method is taken from:
1227 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1228 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1229 *
1230 * This function includes a branch for checking whether the two input points
1231 * are equal, (while not equal to the point at infinity). This case never
1232 * happens during single point multiplication, so there is no timing leak for
1233 * ECDH or ECDSA signing.
1234 */
1235static void point_add(felem x3, felem y3, felem z3,
1236                      const felem x1, const felem y1, const felem z1,
1237                      const int mixed, const smallfelem x2,
1238                      const smallfelem y2, const smallfelem z2)
1239{
1240    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1241    longfelem tmp, tmp2;
1242    smallfelem small1, small2, small3, small4, small5;
1243    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1244    limb points_equal;
1245
1246    felem_shrink(small3, z1);
1247
1248    z1_is_zero = smallfelem_is_zero(small3);
1249    z2_is_zero = smallfelem_is_zero(z2);
1250
1251    /* ftmp = z1z1 = z1**2 */
1252    smallfelem_square(tmp, small3);
1253    felem_reduce(ftmp, tmp);
1254    /* ftmp[i] < 2^101 */
1255    felem_shrink(small1, ftmp);
1256
1257    if (!mixed) {
1258        /* ftmp2 = z2z2 = z2**2 */
1259        smallfelem_square(tmp, z2);
1260        felem_reduce(ftmp2, tmp);
1261        /* ftmp2[i] < 2^101 */
1262        felem_shrink(small2, ftmp2);
1263
1264        felem_shrink(small5, x1);
1265
1266        /* u1 = ftmp3 = x1*z2z2 */
1267        smallfelem_mul(tmp, small5, small2);
1268        felem_reduce(ftmp3, tmp);
1269        /* ftmp3[i] < 2^101 */
1270
1271        /* ftmp5 = z1 + z2 */
1272        felem_assign(ftmp5, z1);
1273        felem_small_sum(ftmp5, z2);
1274        /* ftmp5[i] < 2^107 */
1275
1276        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1277        felem_square(tmp, ftmp5);
1278        felem_reduce(ftmp5, tmp);
1279        /* ftmp2 = z2z2 + z1z1 */
1280        felem_sum(ftmp2, ftmp);
1281        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1282        felem_diff(ftmp5, ftmp2);
1283        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1284
1285        /* ftmp2 = z2 * z2z2 */
1286        smallfelem_mul(tmp, small2, z2);
1287        felem_reduce(ftmp2, tmp);
1288
1289        /* s1 = ftmp2 = y1 * z2**3 */
1290        felem_mul(tmp, y1, ftmp2);
1291        felem_reduce(ftmp6, tmp);
1292        /* ftmp6[i] < 2^101 */
1293    } else {
1294        /*
1295         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1296         */
1297
1298        /* u1 = ftmp3 = x1*z2z2 */
1299        felem_assign(ftmp3, x1);
1300        /* ftmp3[i] < 2^106 */
1301
1302        /* ftmp5 = 2z1z2 */
1303        felem_assign(ftmp5, z1);
1304        felem_scalar(ftmp5, 2);
1305        /* ftmp5[i] < 2*2^106 = 2^107 */
1306
1307        /* s1 = ftmp2 = y1 * z2**3 */
1308        felem_assign(ftmp6, y1);
1309        /* ftmp6[i] < 2^106 */
1310    }
1311
1312    /* u2 = x2*z1z1 */
1313    smallfelem_mul(tmp, x2, small1);
1314    felem_reduce(ftmp4, tmp);
1315
1316    /* h = ftmp4 = u2 - u1 */
1317    felem_diff_zero107(ftmp4, ftmp3);
1318    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1319    felem_shrink(small4, ftmp4);
1320
1321    x_equal = smallfelem_is_zero(small4);
1322
1323    /* z_out = ftmp5 * h */
1324    felem_small_mul(tmp, small4, ftmp5);
1325    felem_reduce(z_out, tmp);
1326    /* z_out[i] < 2^101 */
1327
1328    /* ftmp = z1 * z1z1 */
1329    smallfelem_mul(tmp, small1, small3);
1330    felem_reduce(ftmp, tmp);
1331
1332    /* s2 = tmp = y2 * z1**3 */
1333    felem_small_mul(tmp, y2, ftmp);
1334    felem_reduce(ftmp5, tmp);
1335
1336    /* r = ftmp5 = (s2 - s1)*2 */
1337    felem_diff_zero107(ftmp5, ftmp6);
1338    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1339    felem_scalar(ftmp5, 2);
1340    /* ftmp5[i] < 2^109 */
1341    felem_shrink(small1, ftmp5);
1342    y_equal = smallfelem_is_zero(small1);
1343
1344    /*
1345     * The formulae are incorrect if the points are equal, in affine coordinates
1346     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1347     * happens.
1348     *
1349     * We use bitwise operations to avoid potential side-channels introduced by
1350     * the short-circuiting behaviour of boolean operators.
1351     *
1352     * The special case of either point being the point at infinity (z1 and/or
1353     * z2 are zero), is handled separately later on in this function, so we
1354     * avoid jumping to point_double here in those special cases.
1355     */
1356    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1357
1358    if (points_equal) {
1359        /*
1360         * This is obviously not constant-time but, as mentioned before, this
1361         * case never happens during single point multiplication, so there is no
1362         * timing leak for ECDH or ECDSA signing.
1363         */
1364        point_double(x3, y3, z3, x1, y1, z1);
1365        return;
1366    }
1367
1368    /* I = ftmp = (2h)**2 */
1369    felem_assign(ftmp, ftmp4);
1370    felem_scalar(ftmp, 2);
1371    /* ftmp[i] < 2*2^108 = 2^109 */
1372    felem_square(tmp, ftmp);
1373    felem_reduce(ftmp, tmp);
1374
1375    /* J = ftmp2 = h * I */
1376    felem_mul(tmp, ftmp4, ftmp);
1377    felem_reduce(ftmp2, tmp);
1378
1379    /* V = ftmp4 = U1 * I */
1380    felem_mul(tmp, ftmp3, ftmp);
1381    felem_reduce(ftmp4, tmp);
1382
1383    /* x_out = r**2 - J - 2V */
1384    smallfelem_square(tmp, small1);
1385    felem_reduce(x_out, tmp);
1386    felem_assign(ftmp3, ftmp4);
1387    felem_scalar(ftmp4, 2);
1388    felem_sum(ftmp4, ftmp2);
1389    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1390    felem_diff(x_out, ftmp4);
1391    /* x_out[i] < 2^105 + 2^101 */
1392
1393    /* y_out = r(V-x_out) - 2 * s1 * J */
1394    felem_diff_zero107(ftmp3, x_out);
1395    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1396    felem_small_mul(tmp, small1, ftmp3);
1397    felem_mul(tmp2, ftmp6, ftmp2);
1398    longfelem_scalar(tmp2, 2);
1399    /* tmp2[i] < 2*2^67 = 2^68 */
1400    longfelem_diff(tmp, tmp2);
1401    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1402    felem_reduce_zero105(y_out, tmp);
1403    /* y_out[i] < 2^106 */
1404
1405    copy_small_conditional(x_out, x2, z1_is_zero);
1406    copy_conditional(x_out, x1, z2_is_zero);
1407    copy_small_conditional(y_out, y2, z1_is_zero);
1408    copy_conditional(y_out, y1, z2_is_zero);
1409    copy_small_conditional(z_out, z2, z1_is_zero);
1410    copy_conditional(z_out, z1, z2_is_zero);
1411    felem_assign(x3, x_out);
1412    felem_assign(y3, y_out);
1413    felem_assign(z3, z_out);
1414}
1415
1416/*
1417 * point_add_small is the same as point_add, except that it operates on
1418 * smallfelems
1419 */
1420static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1421                            smallfelem x1, smallfelem y1, smallfelem z1,
1422                            smallfelem x2, smallfelem y2, smallfelem z2)
1423{
1424    felem felem_x3, felem_y3, felem_z3;
1425    felem felem_x1, felem_y1, felem_z1;
1426    smallfelem_expand(felem_x1, x1);
1427    smallfelem_expand(felem_y1, y1);
1428    smallfelem_expand(felem_z1, z1);
1429    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1430              x2, y2, z2);
1431    felem_shrink(x3, felem_x3);
1432    felem_shrink(y3, felem_y3);
1433    felem_shrink(z3, felem_z3);
1434}
1435
1436/*-
1437 * Base point pre computation
1438 * --------------------------
1439 *
1440 * Two different sorts of precomputed tables are used in the following code.
1441 * Each contain various points on the curve, where each point is three field
1442 * elements (x, y, z).
1443 *
1444 * For the base point table, z is usually 1 (0 for the point at infinity).
1445 * This table has 2 * 16 elements, starting with the following:
1446 * index | bits    | point
1447 * ------+---------+------------------------------
1448 *     0 | 0 0 0 0 | 0G
1449 *     1 | 0 0 0 1 | 1G
1450 *     2 | 0 0 1 0 | 2^64G
1451 *     3 | 0 0 1 1 | (2^64 + 1)G
1452 *     4 | 0 1 0 0 | 2^128G
1453 *     5 | 0 1 0 1 | (2^128 + 1)G
1454 *     6 | 0 1 1 0 | (2^128 + 2^64)G
1455 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1456 *     8 | 1 0 0 0 | 2^192G
1457 *     9 | 1 0 0 1 | (2^192 + 1)G
1458 *    10 | 1 0 1 0 | (2^192 + 2^64)G
1459 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1460 *    12 | 1 1 0 0 | (2^192 + 2^128)G
1461 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1462 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1463 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1464 * followed by a copy of this with each element multiplied by 2^32.
1465 *
1466 * The reason for this is so that we can clock bits into four different
1467 * locations when doing simple scalar multiplies against the base point,
1468 * and then another four locations using the second 16 elements.
1469 *
1470 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1471
1472/* gmul is the table of precomputed base points */
1473static const smallfelem gmul[2][16][3] = {
1474    {{{0, 0, 0, 0},
1475      {0, 0, 0, 0},
1476      {0, 0, 0, 0}},
1477     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1478       0x6b17d1f2e12c4247},
1479      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1480       0x4fe342e2fe1a7f9b},
1481      {1, 0, 0, 0}},
1482     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1483       0x0fa822bc2811aaa5},
1484      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1485       0xbff44ae8f5dba80d},
1486      {1, 0, 0, 0}},
1487     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1488       0x300a4bbc89d6726f},
1489      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1490       0x72aac7e0d09b4644},
1491      {1, 0, 0, 0}},
1492     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1493       0x447d739beedb5e67},
1494      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1495       0x2d4825ab834131ee},
1496      {1, 0, 0, 0}},
1497     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1498       0xef9519328a9c72ff},
1499      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1500       0x611e9fc37dbb2c9b},
1501      {1, 0, 0, 0}},
1502     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1503       0x550663797b51f5d8},
1504      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1505       0x157164848aecb851},
1506      {1, 0, 0, 0}},
1507     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1508       0xeb5d7745b21141ea},
1509      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1510       0xeafd72ebdbecc17b},
1511      {1, 0, 0, 0}},
1512     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1513       0xa6d39677a7849276},
1514      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1515       0x674f84749b0b8816},
1516      {1, 0, 0, 0}},
1517     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1518       0x4e769e7672c9ddad},
1519      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1520       0x42b99082de830663},
1521      {1, 0, 0, 0}},
1522     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1523       0x78878ef61c6ce04d},
1524      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1525       0xb6cb3f5d7b72c321},
1526      {1, 0, 0, 0}},
1527     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1528       0x0c88bc4d716b1287},
1529      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1530       0xdd5ddea3f3901dc6},
1531      {1, 0, 0, 0}},
1532     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1533       0x68f344af6b317466},
1534      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1535       0x31b9c405f8540a20},
1536      {1, 0, 0, 0}},
1537     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1538       0x4052bf4b6f461db9},
1539      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1540       0xfecf4d5190b0fc61},
1541      {1, 0, 0, 0}},
1542     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1543       0x1eddbae2c802e41a},
1544      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1545       0x43104d86560ebcfc},
1546      {1, 0, 0, 0}},
1547     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1548       0xb48e26b484f7a21c},
1549      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1550       0xfac015404d4d3dab},
1551      {1, 0, 0, 0}}},
1552    {{{0, 0, 0, 0},
1553      {0, 0, 0, 0},
1554      {0, 0, 0, 0}},
1555     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1556       0x7fe36b40af22af89},
1557      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1558       0xe697d45825b63624},
1559      {1, 0, 0, 0}},
1560     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1561       0x4a5b506612a677a6},
1562      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1563       0xeb13461ceac089f1},
1564      {1, 0, 0, 0}},
1565     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1566       0x0781b8291c6a220a},
1567      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1568       0x690cde8df0151593},
1569      {1, 0, 0, 0}},
1570     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1571       0x8a535f566ec73617},
1572      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1573       0x0455c08468b08bd7},
1574      {1, 0, 0, 0}},
1575     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1576       0x06bada7ab77f8276},
1577      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1578       0x5b476dfd0e6cb18a},
1579      {1, 0, 0, 0}},
1580     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1581       0x3e29864e8a2ec908},
1582      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1583       0x239b90ea3dc31e7e},
1584      {1, 0, 0, 0}},
1585     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1586       0x820f4dd949f72ff7},
1587      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1588       0x140406ec783a05ec},
1589      {1, 0, 0, 0}},
1590     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1591       0x68f6b8542783dfee},
1592      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1593       0xcbe1feba92e40ce6},
1594      {1, 0, 0, 0}},
1595     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1596       0xd0b2f94d2f420109},
1597      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1598       0x971459828b0719e5},
1599      {1, 0, 0, 0}},
1600     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1601       0x961610004a866aba},
1602      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1603       0x7acb9fadcee75e44},
1604      {1, 0, 0, 0}},
1605     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1606       0x24eb9acca333bf5b},
1607      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1608       0x69f891c5acd079cc},
1609      {1, 0, 0, 0}},
1610     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1611       0xe51f547c5972a107},
1612      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1613       0x1c309a2b25bb1387},
1614      {1, 0, 0, 0}},
1615     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1616       0x20b87b8aa2c4e503},
1617      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1618       0xf5c6fa49919776be},
1619      {1, 0, 0, 0}},
1620     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1621       0x1ed7d1b9332010b9},
1622      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1623       0x3a2b03f03217257a},
1624      {1, 0, 0, 0}},
1625     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1626       0x15fee545c78dd9f6},
1627      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1628       0x4ab5b6b2b8753f81},
1629      {1, 0, 0, 0}}}
1630};
1631
1632/*
1633 * select_point selects the |idx|th point from a precomputation table and
1634 * copies it to out.
1635 */
1636static void select_point(const u64 idx, unsigned int size,
1637                         const smallfelem pre_comp[16][3], smallfelem out[3])
1638{
1639    unsigned i, j;
1640    u64 *outlimbs = &out[0][0];
1641
1642    memset(out, 0, sizeof(*out) * 3);
1643
1644    for (i = 0; i < size; i++) {
1645        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1646        u64 mask = i ^ idx;
1647        mask |= mask >> 4;
1648        mask |= mask >> 2;
1649        mask |= mask >> 1;
1650        mask &= 1;
1651        mask--;
1652        for (j = 0; j < NLIMBS * 3; j++)
1653            outlimbs[j] |= inlimbs[j] & mask;
1654    }
1655}
1656
1657/* get_bit returns the |i|th bit in |in| */
1658static char get_bit(const felem_bytearray in, int i)
1659{
1660    if ((i < 0) || (i >= 256))
1661        return 0;
1662    return (in[i >> 3] >> (i & 7)) & 1;
1663}
1664
1665/*
1666 * Interleaved point multiplication using precomputed point multiples: The
1667 * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1668 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1669 * generator, using certain (large) precomputed multiples in g_pre_comp.
1670 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1671 */
1672static void batch_mul(felem x_out, felem y_out, felem z_out,
1673                      const felem_bytearray scalars[],
1674                      const unsigned num_points, const u8 *g_scalar,
1675                      const int mixed, const smallfelem pre_comp[][17][3],
1676                      const smallfelem g_pre_comp[2][16][3])
1677{
1678    int i, skip;
1679    unsigned num, gen_mul = (g_scalar != NULL);
1680    felem nq[3], ftmp;
1681    smallfelem tmp[3];
1682    u64 bits;
1683    u8 sign, digit;
1684
1685    /* set nq to the point at infinity */
1686    memset(nq, 0, sizeof(nq));
1687
1688    /*
1689     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1690     * of the generator (two in each of the last 32 rounds) and additions of
1691     * other points multiples (every 5th round).
1692     */
1693    skip = 1;                   /* save two point operations in the first
1694                                 * round */
1695    for (i = (num_points ? 255 : 31); i >= 0; --i) {
1696        /* double */
1697        if (!skip)
1698            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1699
1700        /* add multiples of the generator */
1701        if (gen_mul && (i <= 31)) {
1702            /* first, look 32 bits upwards */
1703            bits = get_bit(g_scalar, i + 224) << 3;
1704            bits |= get_bit(g_scalar, i + 160) << 2;
1705            bits |= get_bit(g_scalar, i + 96) << 1;
1706            bits |= get_bit(g_scalar, i + 32);
1707            /* select the point to add, in constant time */
1708            select_point(bits, 16, g_pre_comp[1], tmp);
1709
1710            if (!skip) {
1711                /* Arg 1 below is for "mixed" */
1712                point_add(nq[0], nq[1], nq[2],
1713                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1714            } else {
1715                smallfelem_expand(nq[0], tmp[0]);
1716                smallfelem_expand(nq[1], tmp[1]);
1717                smallfelem_expand(nq[2], tmp[2]);
1718                skip = 0;
1719            }
1720
1721            /* second, look at the current position */
1722            bits = get_bit(g_scalar, i + 192) << 3;
1723            bits |= get_bit(g_scalar, i + 128) << 2;
1724            bits |= get_bit(g_scalar, i + 64) << 1;
1725            bits |= get_bit(g_scalar, i);
1726            /* select the point to add, in constant time */
1727            select_point(bits, 16, g_pre_comp[0], tmp);
1728            /* Arg 1 below is for "mixed" */
1729            point_add(nq[0], nq[1], nq[2],
1730                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1731        }
1732
1733        /* do other additions every 5 doublings */
1734        if (num_points && (i % 5 == 0)) {
1735            /* loop over all scalars */
1736            for (num = 0; num < num_points; ++num) {
1737                bits = get_bit(scalars[num], i + 4) << 5;
1738                bits |= get_bit(scalars[num], i + 3) << 4;
1739                bits |= get_bit(scalars[num], i + 2) << 3;
1740                bits |= get_bit(scalars[num], i + 1) << 2;
1741                bits |= get_bit(scalars[num], i) << 1;
1742                bits |= get_bit(scalars[num], i - 1);
1743                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1744
1745                /*
1746                 * select the point to add or subtract, in constant time
1747                 */
1748                select_point(digit, 17, pre_comp[num], tmp);
1749                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1750                                               * point */
1751                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1752                felem_contract(tmp[1], ftmp);
1753
1754                if (!skip) {
1755                    point_add(nq[0], nq[1], nq[2],
1756                              nq[0], nq[1], nq[2],
1757                              mixed, tmp[0], tmp[1], tmp[2]);
1758                } else {
1759                    smallfelem_expand(nq[0], tmp[0]);
1760                    smallfelem_expand(nq[1], tmp[1]);
1761                    smallfelem_expand(nq[2], tmp[2]);
1762                    skip = 0;
1763                }
1764            }
1765        }
1766    }
1767    felem_assign(x_out, nq[0]);
1768    felem_assign(y_out, nq[1]);
1769    felem_assign(z_out, nq[2]);
1770}
1771
1772/* Precomputation for the group generator. */
1773struct nistp256_pre_comp_st {
1774    smallfelem g_pre_comp[2][16][3];
1775    CRYPTO_REF_COUNT references;
1776    CRYPTO_RWLOCK *lock;
1777};
1778
1779const EC_METHOD *EC_GFp_nistp256_method(void)
1780{
1781    static const EC_METHOD ret = {
1782        EC_FLAGS_DEFAULT_OCT,
1783        NID_X9_62_prime_field,
1784        ossl_ec_GFp_nistp256_group_init,
1785        ossl_ec_GFp_simple_group_finish,
1786        ossl_ec_GFp_simple_group_clear_finish,
1787        ossl_ec_GFp_nist_group_copy,
1788        ossl_ec_GFp_nistp256_group_set_curve,
1789        ossl_ec_GFp_simple_group_get_curve,
1790        ossl_ec_GFp_simple_group_get_degree,
1791        ossl_ec_group_simple_order_bits,
1792        ossl_ec_GFp_simple_group_check_discriminant,
1793        ossl_ec_GFp_simple_point_init,
1794        ossl_ec_GFp_simple_point_finish,
1795        ossl_ec_GFp_simple_point_clear_finish,
1796        ossl_ec_GFp_simple_point_copy,
1797        ossl_ec_GFp_simple_point_set_to_infinity,
1798        ossl_ec_GFp_simple_point_set_affine_coordinates,
1799        ossl_ec_GFp_nistp256_point_get_affine_coordinates,
1800        0 /* point_set_compressed_coordinates */ ,
1801        0 /* point2oct */ ,
1802        0 /* oct2point */ ,
1803        ossl_ec_GFp_simple_add,
1804        ossl_ec_GFp_simple_dbl,
1805        ossl_ec_GFp_simple_invert,
1806        ossl_ec_GFp_simple_is_at_infinity,
1807        ossl_ec_GFp_simple_is_on_curve,
1808        ossl_ec_GFp_simple_cmp,
1809        ossl_ec_GFp_simple_make_affine,
1810        ossl_ec_GFp_simple_points_make_affine,
1811        ossl_ec_GFp_nistp256_points_mul,
1812        ossl_ec_GFp_nistp256_precompute_mult,
1813        ossl_ec_GFp_nistp256_have_precompute_mult,
1814        ossl_ec_GFp_nist_field_mul,
1815        ossl_ec_GFp_nist_field_sqr,
1816        0 /* field_div */ ,
1817        ossl_ec_GFp_simple_field_inv,
1818        0 /* field_encode */ ,
1819        0 /* field_decode */ ,
1820        0,                      /* field_set_to_one */
1821        ossl_ec_key_simple_priv2oct,
1822        ossl_ec_key_simple_oct2priv,
1823        0, /* set private */
1824        ossl_ec_key_simple_generate_key,
1825        ossl_ec_key_simple_check_key,
1826        ossl_ec_key_simple_generate_public_key,
1827        0, /* keycopy */
1828        0, /* keyfinish */
1829        ossl_ecdh_simple_compute_key,
1830        ossl_ecdsa_simple_sign_setup,
1831        ossl_ecdsa_simple_sign_sig,
1832        ossl_ecdsa_simple_verify_sig,
1833        0, /* field_inverse_mod_ord */
1834        0, /* blind_coordinates */
1835        0, /* ladder_pre */
1836        0, /* ladder_step */
1837        0  /* ladder_post */
1838    };
1839
1840    return &ret;
1841}
1842
1843/******************************************************************************/
1844/*
1845 * FUNCTIONS TO MANAGE PRECOMPUTATION
1846 */
1847
1848static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1849{
1850    NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1851
1852    if (ret == NULL) {
1853        ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1854        return ret;
1855    }
1856
1857    ret->references = 1;
1858
1859    ret->lock = CRYPTO_THREAD_lock_new();
1860    if (ret->lock == NULL) {
1861        ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1862        OPENSSL_free(ret);
1863        return NULL;
1864    }
1865    return ret;
1866}
1867
1868NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1869{
1870    int i;
1871    if (p != NULL)
1872        CRYPTO_UP_REF(&p->references, &i, p->lock);
1873    return p;
1874}
1875
1876void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1877{
1878    int i;
1879
1880    if (pre == NULL)
1881        return;
1882
1883    CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
1884    REF_PRINT_COUNT("EC_nistp256", pre);
1885    if (i > 0)
1886        return;
1887    REF_ASSERT_ISNT(i < 0);
1888
1889    CRYPTO_THREAD_lock_free(pre->lock);
1890    OPENSSL_free(pre);
1891}
1892
1893/******************************************************************************/
1894/*
1895 * OPENSSL EC_METHOD FUNCTIONS
1896 */
1897
1898int ossl_ec_GFp_nistp256_group_init(EC_GROUP *group)
1899{
1900    int ret;
1901    ret = ossl_ec_GFp_simple_group_init(group);
1902    group->a_is_minus3 = 1;
1903    return ret;
1904}
1905
1906int ossl_ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1907                                         const BIGNUM *a, const BIGNUM *b,
1908                                         BN_CTX *ctx)
1909{
1910    int ret = 0;
1911    BIGNUM *curve_p, *curve_a, *curve_b;
1912#ifndef FIPS_MODULE
1913    BN_CTX *new_ctx = NULL;
1914
1915    if (ctx == NULL)
1916        ctx = new_ctx = BN_CTX_new();
1917#endif
1918    if (ctx == NULL)
1919        return 0;
1920
1921    BN_CTX_start(ctx);
1922    curve_p = BN_CTX_get(ctx);
1923    curve_a = BN_CTX_get(ctx);
1924    curve_b = BN_CTX_get(ctx);
1925    if (curve_b == NULL)
1926        goto err;
1927    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1928    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1929    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1930    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1931        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1932        goto err;
1933    }
1934    group->field_mod_func = BN_nist_mod_256;
1935    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1936 err:
1937    BN_CTX_end(ctx);
1938#ifndef FIPS_MODULE
1939    BN_CTX_free(new_ctx);
1940#endif
1941    return ret;
1942}
1943
1944/*
1945 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1946 * (X/Z^2, Y/Z^3)
1947 */
1948int ossl_ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1949                                                      const EC_POINT *point,
1950                                                      BIGNUM *x, BIGNUM *y,
1951                                                      BN_CTX *ctx)
1952{
1953    felem z1, z2, x_in, y_in;
1954    smallfelem x_out, y_out;
1955    longfelem tmp;
1956
1957    if (EC_POINT_is_at_infinity(group, point)) {
1958        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1959        return 0;
1960    }
1961    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1962        (!BN_to_felem(z1, point->Z)))
1963        return 0;
1964    felem_inv(z2, z1);
1965    felem_square(tmp, z2);
1966    felem_reduce(z1, tmp);
1967    felem_mul(tmp, x_in, z1);
1968    felem_reduce(x_in, tmp);
1969    felem_contract(x_out, x_in);
1970    if (x != NULL) {
1971        if (!smallfelem_to_BN(x, x_out)) {
1972            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1973            return 0;
1974        }
1975    }
1976    felem_mul(tmp, z1, z2);
1977    felem_reduce(z1, tmp);
1978    felem_mul(tmp, y_in, z1);
1979    felem_reduce(y_in, tmp);
1980    felem_contract(y_out, y_in);
1981    if (y != NULL) {
1982        if (!smallfelem_to_BN(y, y_out)) {
1983            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1984            return 0;
1985        }
1986    }
1987    return 1;
1988}
1989
1990/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1991static void make_points_affine(size_t num, smallfelem points[][3],
1992                               smallfelem tmp_smallfelems[])
1993{
1994    /*
1995     * Runs in constant time, unless an input is the point at infinity (which
1996     * normally shouldn't happen).
1997     */
1998    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1999                                                  points,
2000                                                  sizeof(smallfelem),
2001                                                  tmp_smallfelems,
2002                                                  (void (*)(void *))smallfelem_one,
2003                                                  smallfelem_is_zero_int,
2004                                                  (void (*)(void *, const void *))
2005                                                  smallfelem_assign,
2006                                                  (void (*)(void *, const void *))
2007                                                  smallfelem_square_contract,
2008                                                  (void (*)
2009                                                   (void *, const void *,
2010                                                    const void *))
2011                                                  smallfelem_mul_contract,
2012                                                  (void (*)(void *, const void *))
2013                                                  smallfelem_inv_contract,
2014                                                  /* nothing to contract */
2015                                                  (void (*)(void *, const void *))
2016                                                  smallfelem_assign);
2017}
2018
2019/*
2020 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2021 * values Result is stored in r (r can equal one of the inputs).
2022 */
2023int ossl_ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2024                                    const BIGNUM *scalar, size_t num,
2025                                    const EC_POINT *points[],
2026                                    const BIGNUM *scalars[], BN_CTX *ctx)
2027{
2028    int ret = 0;
2029    int j;
2030    int mixed = 0;
2031    BIGNUM *x, *y, *z, *tmp_scalar;
2032    felem_bytearray g_secret;
2033    felem_bytearray *secrets = NULL;
2034    smallfelem (*pre_comp)[17][3] = NULL;
2035    smallfelem *tmp_smallfelems = NULL;
2036    unsigned i;
2037    int num_bytes;
2038    int have_pre_comp = 0;
2039    size_t num_points = num;
2040    smallfelem x_in, y_in, z_in;
2041    felem x_out, y_out, z_out;
2042    NISTP256_PRE_COMP *pre = NULL;
2043    const smallfelem(*g_pre_comp)[16][3] = NULL;
2044    EC_POINT *generator = NULL;
2045    const EC_POINT *p = NULL;
2046    const BIGNUM *p_scalar = NULL;
2047
2048    BN_CTX_start(ctx);
2049    x = BN_CTX_get(ctx);
2050    y = BN_CTX_get(ctx);
2051    z = BN_CTX_get(ctx);
2052    tmp_scalar = BN_CTX_get(ctx);
2053    if (tmp_scalar == NULL)
2054        goto err;
2055
2056    if (scalar != NULL) {
2057        pre = group->pre_comp.nistp256;
2058        if (pre)
2059            /* we have precomputation, try to use it */
2060            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2061        else
2062            /* try to use the standard precomputation */
2063            g_pre_comp = &gmul[0];
2064        generator = EC_POINT_new(group);
2065        if (generator == NULL)
2066            goto err;
2067        /* get the generator from precomputation */
2068        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2069            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2070            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2071            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2072            goto err;
2073        }
2074        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
2075                                                                generator,
2076                                                                x, y, z, ctx))
2077            goto err;
2078        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2079            /* precomputation matches generator */
2080            have_pre_comp = 1;
2081        else
2082            /*
2083             * we don't have valid precomputation: treat the generator as a
2084             * random point
2085             */
2086            num_points++;
2087    }
2088    if (num_points > 0) {
2089        if (num_points >= 3) {
2090            /*
2091             * unless we precompute multiples for just one or two points,
2092             * converting those into affine form is time well spent
2093             */
2094            mixed = 1;
2095        }
2096        secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2097        pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2098        if (mixed)
2099            tmp_smallfelems =
2100              OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2101        if ((secrets == NULL) || (pre_comp == NULL)
2102            || (mixed && (tmp_smallfelems == NULL))) {
2103            ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
2104            goto err;
2105        }
2106
2107        /*
2108         * we treat NULL scalars as 0, and NULL points as points at infinity,
2109         * i.e., they contribute nothing to the linear combination
2110         */
2111        memset(secrets, 0, sizeof(*secrets) * num_points);
2112        memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2113        for (i = 0; i < num_points; ++i) {
2114            if (i == num) {
2115                /*
2116                 * we didn't have a valid precomputation, so we pick the
2117                 * generator
2118                 */
2119                p = EC_GROUP_get0_generator(group);
2120                p_scalar = scalar;
2121            } else {
2122                /* the i^th point */
2123                p = points[i];
2124                p_scalar = scalars[i];
2125            }
2126            if ((p_scalar != NULL) && (p != NULL)) {
2127                /* reduce scalar to 0 <= scalar < 2^256 */
2128                if ((BN_num_bits(p_scalar) > 256)
2129                    || (BN_is_negative(p_scalar))) {
2130                    /*
2131                     * this is an unusual input, and we don't guarantee
2132                     * constant-timeness
2133                     */
2134                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2135                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2136                        goto err;
2137                    }
2138                    num_bytes = BN_bn2lebinpad(tmp_scalar,
2139                                               secrets[i], sizeof(secrets[i]));
2140                } else {
2141                    num_bytes = BN_bn2lebinpad(p_scalar,
2142                                               secrets[i], sizeof(secrets[i]));
2143                }
2144                if (num_bytes < 0) {
2145                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2146                    goto err;
2147                }
2148                /* precompute multiples */
2149                if ((!BN_to_felem(x_out, p->X)) ||
2150                    (!BN_to_felem(y_out, p->Y)) ||
2151                    (!BN_to_felem(z_out, p->Z)))
2152                    goto err;
2153                felem_shrink(pre_comp[i][1][0], x_out);
2154                felem_shrink(pre_comp[i][1][1], y_out);
2155                felem_shrink(pre_comp[i][1][2], z_out);
2156                for (j = 2; j <= 16; ++j) {
2157                    if (j & 1) {
2158                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2159                                        pre_comp[i][j][2], pre_comp[i][1][0],
2160                                        pre_comp[i][1][1], pre_comp[i][1][2],
2161                                        pre_comp[i][j - 1][0],
2162                                        pre_comp[i][j - 1][1],
2163                                        pre_comp[i][j - 1][2]);
2164                    } else {
2165                        point_double_small(pre_comp[i][j][0],
2166                                           pre_comp[i][j][1],
2167                                           pre_comp[i][j][2],
2168                                           pre_comp[i][j / 2][0],
2169                                           pre_comp[i][j / 2][1],
2170                                           pre_comp[i][j / 2][2]);
2171                    }
2172                }
2173            }
2174        }
2175        if (mixed)
2176            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2177    }
2178
2179    /* the scalar for the generator */
2180    if ((scalar != NULL) && (have_pre_comp)) {
2181        memset(g_secret, 0, sizeof(g_secret));
2182        /* reduce scalar to 0 <= scalar < 2^256 */
2183        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2184            /*
2185             * this is an unusual input, and we don't guarantee
2186             * constant-timeness
2187             */
2188            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2189                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2190                goto err;
2191            }
2192            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2193        } else {
2194            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2195        }
2196        /* do the multiplication with generator precomputation */
2197        batch_mul(x_out, y_out, z_out,
2198                  (const felem_bytearray(*))secrets, num_points,
2199                  g_secret,
2200                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2201    } else {
2202        /* do the multiplication without generator precomputation */
2203        batch_mul(x_out, y_out, z_out,
2204                  (const felem_bytearray(*))secrets, num_points,
2205                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2206    }
2207    /* reduce the output to its unique minimal representation */
2208    felem_contract(x_in, x_out);
2209    felem_contract(y_in, y_out);
2210    felem_contract(z_in, z_out);
2211    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2212        (!smallfelem_to_BN(z, z_in))) {
2213        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2214        goto err;
2215    }
2216    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
2217                                                             ctx);
2218
2219 err:
2220    BN_CTX_end(ctx);
2221    EC_POINT_free(generator);
2222    OPENSSL_free(secrets);
2223    OPENSSL_free(pre_comp);
2224    OPENSSL_free(tmp_smallfelems);
2225    return ret;
2226}
2227
2228int ossl_ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2229{
2230    int ret = 0;
2231    NISTP256_PRE_COMP *pre = NULL;
2232    int i, j;
2233    BIGNUM *x, *y;
2234    EC_POINT *generator = NULL;
2235    smallfelem tmp_smallfelems[32];
2236    felem x_tmp, y_tmp, z_tmp;
2237#ifndef FIPS_MODULE
2238    BN_CTX *new_ctx = NULL;
2239#endif
2240
2241    /* throw away old precomputation */
2242    EC_pre_comp_free(group);
2243
2244#ifndef FIPS_MODULE
2245    if (ctx == NULL)
2246        ctx = new_ctx = BN_CTX_new();
2247#endif
2248    if (ctx == NULL)
2249        return 0;
2250
2251    BN_CTX_start(ctx);
2252    x = BN_CTX_get(ctx);
2253    y = BN_CTX_get(ctx);
2254    if (y == NULL)
2255        goto err;
2256    /* get the generator */
2257    if (group->generator == NULL)
2258        goto err;
2259    generator = EC_POINT_new(group);
2260    if (generator == NULL)
2261        goto err;
2262    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2263    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2264    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2265        goto err;
2266    if ((pre = nistp256_pre_comp_new()) == NULL)
2267        goto err;
2268    /*
2269     * if the generator is the standard one, use built-in precomputation
2270     */
2271    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2272        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2273        goto done;
2274    }
2275    if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2276        (!BN_to_felem(y_tmp, group->generator->Y)) ||
2277        (!BN_to_felem(z_tmp, group->generator->Z)))
2278        goto err;
2279    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2280    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2281    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2282    /*
2283     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2284     * 2^160*G, 2^224*G for the second one
2285     */
2286    for (i = 1; i <= 8; i <<= 1) {
2287        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2288                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2289                           pre->g_pre_comp[0][i][1],
2290                           pre->g_pre_comp[0][i][2]);
2291        for (j = 0; j < 31; ++j) {
2292            point_double_small(pre->g_pre_comp[1][i][0],
2293                               pre->g_pre_comp[1][i][1],
2294                               pre->g_pre_comp[1][i][2],
2295                               pre->g_pre_comp[1][i][0],
2296                               pre->g_pre_comp[1][i][1],
2297                               pre->g_pre_comp[1][i][2]);
2298        }
2299        if (i == 8)
2300            break;
2301        point_double_small(pre->g_pre_comp[0][2 * i][0],
2302                           pre->g_pre_comp[0][2 * i][1],
2303                           pre->g_pre_comp[0][2 * i][2],
2304                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2305                           pre->g_pre_comp[1][i][2]);
2306        for (j = 0; j < 31; ++j) {
2307            point_double_small(pre->g_pre_comp[0][2 * i][0],
2308                               pre->g_pre_comp[0][2 * i][1],
2309                               pre->g_pre_comp[0][2 * i][2],
2310                               pre->g_pre_comp[0][2 * i][0],
2311                               pre->g_pre_comp[0][2 * i][1],
2312                               pre->g_pre_comp[0][2 * i][2]);
2313        }
2314    }
2315    for (i = 0; i < 2; i++) {
2316        /* g_pre_comp[i][0] is the point at infinity */
2317        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2318        /* the remaining multiples */
2319        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2320        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2321                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2322                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2323                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2324                        pre->g_pre_comp[i][2][2]);
2325        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2326        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2327                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2328                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2329                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2330                        pre->g_pre_comp[i][2][2]);
2331        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2332        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2333                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2334                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2335                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2336                        pre->g_pre_comp[i][4][2]);
2337        /*
2338         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2339         */
2340        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2341                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2342                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2343                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2344                        pre->g_pre_comp[i][2][2]);
2345        for (j = 1; j < 8; ++j) {
2346            /* odd multiples: add G resp. 2^32*G */
2347            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2348                            pre->g_pre_comp[i][2 * j + 1][1],
2349                            pre->g_pre_comp[i][2 * j + 1][2],
2350                            pre->g_pre_comp[i][2 * j][0],
2351                            pre->g_pre_comp[i][2 * j][1],
2352                            pre->g_pre_comp[i][2 * j][2],
2353                            pre->g_pre_comp[i][1][0],
2354                            pre->g_pre_comp[i][1][1],
2355                            pre->g_pre_comp[i][1][2]);
2356        }
2357    }
2358    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2359
2360 done:
2361    SETPRECOMP(group, nistp256, pre);
2362    pre = NULL;
2363    ret = 1;
2364
2365 err:
2366    BN_CTX_end(ctx);
2367    EC_POINT_free(generator);
2368#ifndef FIPS_MODULE
2369    BN_CTX_free(new_ctx);
2370#endif
2371    EC_nistp256_pre_comp_free(pre);
2372    return ret;
2373}
2374
2375int ossl_ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2376{
2377    return HAVEPRECOMP(group, nistp256);
2378}
2379