crypto/ec/ecp_nistp256.c

238384Sjkim/* crypto/ec/ecp_nistp256.c */
238384Sjkim/*
238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project
238384Sjkim */
238384Sjkim/* Copyright 2011 Google Inc.
238384Sjkim *
238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License");
238384Sjkim *
238384Sjkim * you may not use this file except in compliance with the License.
238384Sjkim * You may obtain a copy of the License at
238384Sjkim *
238384Sjkim *     http://www.apache.org/licenses/LICENSE-2.0
238384Sjkim *
238384Sjkim *  Unless required by applicable law or agreed to in writing, software
238384Sjkim *  distributed under the License is distributed on an "AS IS" BASIS,
238384Sjkim *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
238384Sjkim *  See the License for the specific language governing permissions and
238384Sjkim *  limitations under the License.
238384Sjkim */
238384Sjkim
238384Sjkim/*
238384Sjkim * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
238384Sjkim *
238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same.
238384Sjkim */
238384Sjkim
238384Sjkim#include <openssl/opensslconf.h>
238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
238384Sjkim
280304Sjkim# ifndef OPENSSL_SYS_VMS
280304Sjkim#  include <stdint.h>
280304Sjkim# else
280304Sjkim#  include <inttypes.h>
280304Sjkim# endif
238384Sjkim
280304Sjkim# include <string.h>
280304Sjkim# include <openssl/err.h>
280304Sjkim# include "ec_lcl.h"
238384Sjkim
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16
/*
 * 128-bit integers are a compiler extension rather than standard C. Test
 * for the feature itself instead of for a compiler version: a version check
 * also succeeds on 32-bit targets, where the types do not exist, and the
 * build then fails later with an unrelated-looking "unknown type" error.
 */
typedef __uint128_t uint128_t;
typedef __int128_t int128_t;
# else
#  error "This file needs a compiler that provides 128-bit integer types"
# endif

/* Short aliases for the fixed-width integer types used throughout the file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int64_t s64;
238384Sjkim
280304Sjkim/*
280304Sjkim * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
280304Sjkim * can serialise an element of this field into 32 bytes. We call this an
280304Sjkim * felem_bytearray.
280304Sjkim */
238384Sjkim
238384Sjkimtypedef u8 felem_bytearray[32];
238384Sjkim
280304Sjkim/*
280304Sjkim * These are the parameters of P256, taken from FIPS 186-3, page 86. These
280304Sjkim * values are big-endian.
280304Sjkim */
238384Sjkimstatic const felem_bytearray nistp256_curve_params[5] = {
280304Sjkim    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
280304Sjkim     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
280304Sjkim     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
280304Sjkim    {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
280304Sjkim     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
280304Sjkim     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
280304Sjkim     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
280304Sjkim    {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
280304Sjkim     0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
280304Sjkim     0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
280304Sjkim     0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
280304Sjkim    {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
280304Sjkim     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
280304Sjkim     0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
280304Sjkim     0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
280304Sjkim    {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
280304Sjkim     0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
280304Sjkim     0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
280304Sjkim     0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
238384Sjkim};
238384Sjkim
280304Sjkim/*-
280304Sjkim * The representation of field elements.
238384Sjkim * ------------------------------------
238384Sjkim *
238384Sjkim * We represent field elements with either four 128-bit values, eight 128-bit
238384Sjkim * values, or four 64-bit values. The field element represented is:
238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
238384Sjkim * or:
238384Sjkim *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
238384Sjkim *
238384Sjkim * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
238384Sjkim * apart, but are 128-bits wide, the most significant bits of each limb overlap
238384Sjkim * with the least significant bits of the next.
238384Sjkim *
238384Sjkim * A field element with four limbs is an 'felem'. One with eight limbs is a
238384Sjkim * 'longfelem'
238384Sjkim *
238384Sjkim * A field element with four, 64-bit values is called a 'smallfelem'. Small
238384Sjkim * values are used as intermediate values before multiplication.
238384Sjkim */
238384Sjkim
280304Sjkim# define NLIMBS 4
238384Sjkim
238384Sjkimtypedef uint128_t limb;
238384Sjkimtypedef limb felem[NLIMBS];
238384Sjkimtypedef limb longfelem[NLIMBS * 2];
238384Sjkimtypedef u64 smallfelem[NLIMBS];
238384Sjkim
238384Sjkim/* This is the value of the prime as four 64-bit words, little-endian. */
280304Sjkimstatic const u64 kPrime[4] =
280304Sjkim    { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
238384Sjkimstatic const u64 bottom63bits = 0x7ffffffffffffffful;
238384Sjkim
280304Sjkim/*
280304Sjkim * bin32_to_felem takes a little-endian byte array and converts it into felem
280304Sjkim * form. This assumes that the CPU is little-endian.
280304Sjkim */
238384Sjkimstatic void bin32_to_felem(felem out, const u8 in[32])
280304Sjkim{
280304Sjkim    out[0] = *((u64 *)&in[0]);
280304Sjkim    out[1] = *((u64 *)&in[8]);
280304Sjkim    out[2] = *((u64 *)&in[16]);
280304Sjkim    out[3] = *((u64 *)&in[24]);
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * smallfelem_to_bin32 takes a smallfelem and serialises into a little
280304Sjkim * endian, 32 byte array. This assumes that the CPU is little-endian.
280304Sjkim */
238384Sjkimstatic void smallfelem_to_bin32(u8 out[32], const smallfelem in)
280304Sjkim{
280304Sjkim    *((u64 *)&out[0]) = in[0];
280304Sjkim    *((u64 *)&out[8]) = in[1];
280304Sjkim    *((u64 *)&out[16]) = in[2];
280304Sjkim    *((u64 *)&out[24]) = in[3];
280304Sjkim}
238384Sjkim
238384Sjkim/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
238384Sjkimstatic void flip_endian(u8 *out, const u8 *in, unsigned len)
280304Sjkim{
280304Sjkim    unsigned i;
280304Sjkim    for (i = 0; i < len; ++i)
280304Sjkim        out[i] = in[len - 1 - i];
280304Sjkim}
238384Sjkim
238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn)
280304Sjkim{
280304Sjkim    felem_bytearray b_in;
280304Sjkim    felem_bytearray b_out;
280304Sjkim    unsigned num_bytes;
238384Sjkim
280304Sjkim    /* BN_bn2bin eats leading zeroes */
280304Sjkim    memset(b_out, 0, sizeof b_out);
280304Sjkim    num_bytes = BN_num_bytes(bn);
280304Sjkim    if (num_bytes > sizeof b_out) {
280304Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
280304Sjkim        return 0;
280304Sjkim    }
280304Sjkim    if (BN_is_negative(bn)) {
280304Sjkim        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
280304Sjkim        return 0;
280304Sjkim    }
280304Sjkim    num_bytes = BN_bn2bin(bn, b_in);
280304Sjkim    flip_endian(b_out, b_in, num_bytes);
280304Sjkim    bin32_to_felem(out, b_out);
280304Sjkim    return 1;
280304Sjkim}
238384Sjkim
238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
238384Sjkimstatic BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
280304Sjkim{
280304Sjkim    felem_bytearray b_in, b_out;
280304Sjkim    smallfelem_to_bin32(b_in, in);
280304Sjkim    flip_endian(b_out, b_in, sizeof b_out);
280304Sjkim    return BN_bin2bn(b_out, sizeof b_out, out);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * Field operations
280304Sjkim * ----------------
280304Sjkim */
238384Sjkim
238384Sjkimstatic void smallfelem_one(smallfelem out)
280304Sjkim{
280304Sjkim    out[0] = 1;
280304Sjkim    out[1] = 0;
280304Sjkim    out[2] = 0;
280304Sjkim    out[3] = 0;
280304Sjkim}
238384Sjkim
238384Sjkimstatic void smallfelem_assign(smallfelem out, const smallfelem in)
280304Sjkim{
280304Sjkim    out[0] = in[0];
280304Sjkim    out[1] = in[1];
280304Sjkim    out[2] = in[2];
280304Sjkim    out[3] = in[3];
280304Sjkim}
238384Sjkim
238384Sjkimstatic void felem_assign(felem out, const felem in)
280304Sjkim{
280304Sjkim    out[0] = in[0];
280304Sjkim    out[1] = in[1];
280304Sjkim    out[2] = in[2];
280304Sjkim    out[3] = in[3];
280304Sjkim}
238384Sjkim
238384Sjkim/* felem_sum sets out = out + in. */
238384Sjkimstatic void felem_sum(felem out, const felem in)
280304Sjkim{
280304Sjkim    out[0] += in[0];
280304Sjkim    out[1] += in[1];
280304Sjkim    out[2] += in[2];
280304Sjkim    out[3] += in[3];
280304Sjkim}
238384Sjkim
238384Sjkim/* felem_small_sum sets out = out + in. */
238384Sjkimstatic void felem_small_sum(felem out, const smallfelem in)
280304Sjkim{
280304Sjkim    out[0] += in[0];
280304Sjkim    out[1] += in[1];
280304Sjkim    out[2] += in[2];
280304Sjkim    out[3] += in[3];
280304Sjkim}
238384Sjkim
238384Sjkim/* felem_scalar sets out = out * scalar */
238384Sjkimstatic void felem_scalar(felem out, const u64 scalar)
280304Sjkim{
280304Sjkim    out[0] *= scalar;
280304Sjkim    out[1] *= scalar;
280304Sjkim    out[2] *= scalar;
280304Sjkim    out[3] *= scalar;
280304Sjkim}
238384Sjkim
238384Sjkim/* longfelem_scalar sets out = out * scalar */
238384Sjkimstatic void longfelem_scalar(longfelem out, const u64 scalar)
280304Sjkim{
280304Sjkim    out[0] *= scalar;
280304Sjkim    out[1] *= scalar;
280304Sjkim    out[2] *= scalar;
280304Sjkim    out[3] *= scalar;
280304Sjkim    out[4] *= scalar;
280304Sjkim    out[5] *= scalar;
280304Sjkim    out[6] *= scalar;
280304Sjkim    out[7] *= scalar;
280304Sjkim}
238384Sjkim
280304Sjkim# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
280304Sjkim# define two105 (((limb)1) << 105)
280304Sjkim# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
238384Sjkim
238384Sjkim/* zero105 is 0 mod p */
280304Sjkimstatic const felem zero105 =
280304Sjkim    { two105m41m9, two105, two105m41p9, two105m41p9 };
238384Sjkim
280304Sjkim/*-
280304Sjkim * smallfelem_neg sets |out| to |-small|
238384Sjkim * On exit:
238384Sjkim *   out[i] < out[i] + 2^105
238384Sjkim */
238384Sjkimstatic void smallfelem_neg(felem out, const smallfelem small)
280304Sjkim{
280304Sjkim    /* In order to prevent underflow, we subtract from 0 mod p. */
280304Sjkim    out[0] = zero105[0] - small[0];
280304Sjkim    out[1] = zero105[1] - small[1];
280304Sjkim    out[2] = zero105[2] - small[2];
280304Sjkim    out[3] = zero105[3] - small[3];
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_diff subtracts |in| from |out|
238384Sjkim * On entry:
238384Sjkim *   in[i] < 2^104
238384Sjkim * On exit:
238384Sjkim *   out[i] < out[i] + 2^105
238384Sjkim */
238384Sjkimstatic void felem_diff(felem out, const felem in)
280304Sjkim{
280304Sjkim    /*
280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
280304Sjkim     */
280304Sjkim    out[0] += zero105[0];
280304Sjkim    out[1] += zero105[1];
280304Sjkim    out[2] += zero105[2];
280304Sjkim    out[3] += zero105[3];
238384Sjkim
280304Sjkim    out[0] -= in[0];
280304Sjkim    out[1] -= in[1];
280304Sjkim    out[2] -= in[2];
280304Sjkim    out[3] -= in[3];
280304Sjkim}
238384Sjkim
280304Sjkim# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
280304Sjkim# define two107 (((limb)1) << 107)
280304Sjkim# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
238384Sjkim
238384Sjkim/* zero107 is 0 mod p */
280304Sjkimstatic const felem zero107 =
280304Sjkim    { two107m43m11, two107, two107m43p11, two107m43p11 };
238384Sjkim
280304Sjkim/*-
280304Sjkim * An alternative felem_diff for larger inputs |in|
238384Sjkim * felem_diff_zero107 subtracts |in| from |out|
238384Sjkim * On entry:
238384Sjkim *   in[i] < 2^106
238384Sjkim * On exit:
238384Sjkim *   out[i] < out[i] + 2^107
238384Sjkim */
238384Sjkimstatic void felem_diff_zero107(felem out, const felem in)
280304Sjkim{
280304Sjkim    /*
280304Sjkim     * In order to prevent underflow, we add 0 mod p before subtracting.
280304Sjkim     */
280304Sjkim    out[0] += zero107[0];
280304Sjkim    out[1] += zero107[1];
280304Sjkim    out[2] += zero107[2];
280304Sjkim    out[3] += zero107[3];
238384Sjkim
280304Sjkim    out[0] -= in[0];
280304Sjkim    out[1] -= in[1];
280304Sjkim    out[2] -= in[2];
280304Sjkim    out[3] -= in[3];
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * longfelem_diff subtracts |in| from |out|
238384Sjkim * On entry:
238384Sjkim *   in[i] < 7*2^67
238384Sjkim * On exit:
238384Sjkim *   out[i] < out[i] + 2^70 + 2^40
238384Sjkim */
238384Sjkimstatic void longfelem_diff(longfelem out, const longfelem in)
280304Sjkim{
280304Sjkim    static const limb two70m8p6 =
280304Sjkim        (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
280304Sjkim    static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
280304Sjkim    static const limb two70 = (((limb) 1) << 70);
280304Sjkim    static const limb two70m40m38p6 =
280304Sjkim        (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
280304Sjkim        (((limb) 1) << 6);
280304Sjkim    static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
238384Sjkim
280304Sjkim    /* add 0 mod p to avoid underflow */
280304Sjkim    out[0] += two70m8p6;
280304Sjkim    out[1] += two70p40;
280304Sjkim    out[2] += two70;
280304Sjkim    out[3] += two70m40m38p6;
280304Sjkim    out[4] += two70m6;
280304Sjkim    out[5] += two70m6;
280304Sjkim    out[6] += two70m6;
280304Sjkim    out[7] += two70m6;
238384Sjkim
280304Sjkim    /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
280304Sjkim    out[0] -= in[0];
280304Sjkim    out[1] -= in[1];
280304Sjkim    out[2] -= in[2];
280304Sjkim    out[3] -= in[3];
280304Sjkim    out[4] -= in[4];
280304Sjkim    out[5] -= in[5];
280304Sjkim    out[6] -= in[6];
280304Sjkim    out[7] -= in[7];
280304Sjkim}
238384Sjkim
280304Sjkim# define two64m0 (((limb)1) << 64) - 1
280304Sjkim# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
280304Sjkim# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
280304Sjkim# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
238384Sjkim
238384Sjkim/* zero110 is 0 mod p */
238384Sjkimstatic const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_shrink converts an felem into a smallfelem. The result isn't quite
238384Sjkim * minimal as the value may be greater than p.
238384Sjkim *
238384Sjkim * On entry:
238384Sjkim *   in[i] < 2^109
238384Sjkim * On exit:
238384Sjkim *   out[i] < 2^64
238384Sjkim */
238384Sjkimstatic void felem_shrink(smallfelem out, const felem in)
280304Sjkim{
280304Sjkim    felem tmp;
280304Sjkim    u64 a, b, mask;
280304Sjkim    s64 high, low;
280304Sjkim    static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
238384Sjkim
280304Sjkim    /* Carry 2->3 */
280304Sjkim    tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
280304Sjkim    /* tmp[3] < 2^110 */
238384Sjkim
280304Sjkim    tmp[2] = zero110[2] + (u64)in[2];
280304Sjkim    tmp[0] = zero110[0] + in[0];
280304Sjkim    tmp[1] = zero110[1] + in[1];
280304Sjkim    /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
238384Sjkim
280304Sjkim    /*
280304Sjkim     * We perform two partial reductions where we eliminate the high-word of
280304Sjkim     * tmp[3]. We don't update the other words till the end.
280304Sjkim     */
280304Sjkim    a = tmp[3] >> 64;           /* a < 2^46 */
280304Sjkim    tmp[3] = (u64)tmp[3];
280304Sjkim    tmp[3] -= a;
280304Sjkim    tmp[3] += ((limb) a) << 32;
280304Sjkim    /* tmp[3] < 2^79 */
238384Sjkim
280304Sjkim    b = a;
280304Sjkim    a = tmp[3] >> 64;           /* a < 2^15 */
280304Sjkim    b += a;                     /* b < 2^46 + 2^15 < 2^47 */
280304Sjkim    tmp[3] = (u64)tmp[3];
280304Sjkim    tmp[3] -= a;
280304Sjkim    tmp[3] += ((limb) a) << 32;
280304Sjkim    /* tmp[3] < 2^64 + 2^47 */
238384Sjkim
280304Sjkim    /*
280304Sjkim     * This adjusts the other two words to complete the two partial
280304Sjkim     * reductions.
280304Sjkim     */
280304Sjkim    tmp[0] += b;
280304Sjkim    tmp[1] -= (((limb) b) << 32);
238384Sjkim
280304Sjkim    /*
280304Sjkim     * In order to make space in tmp[3] for the carry from 2 -> 3, we
280304Sjkim     * conditionally subtract kPrime if tmp[3] is large enough.
280304Sjkim     */
280304Sjkim    high = tmp[3] >> 64;
280304Sjkim    /* As tmp[3] < 2^65, high is either 1 or 0 */
280304Sjkim    high <<= 63;
280304Sjkim    high >>= 63;
280304Sjkim    /*-
280304Sjkim     * high is:
280304Sjkim     *   all ones   if the high word of tmp[3] is 1
280304Sjkim     *   all zeros  if the high word of tmp[3] if 0 */
280304Sjkim    low = tmp[3];
280304Sjkim    mask = low >> 63;
280304Sjkim    /*-
280304Sjkim     * mask is:
280304Sjkim     *   all ones   if the MSB of low is 1
280304Sjkim     *   all zeros  if the MSB of low if 0 */
280304Sjkim    low &= bottom63bits;
280304Sjkim    low -= kPrime3Test;
280304Sjkim    /* if low was greater than kPrime3Test then the MSB is zero */
280304Sjkim    low = ~low;
280304Sjkim    low >>= 63;
280304Sjkim    /*-
280304Sjkim     * low is:
280304Sjkim     *   all ones   if low was > kPrime3Test
280304Sjkim     *   all zeros  if low was <= kPrime3Test */
280304Sjkim    mask = (mask & low) | high;
280304Sjkim    tmp[0] -= mask & kPrime[0];
280304Sjkim    tmp[1] -= mask & kPrime[1];
280304Sjkim    /* kPrime[2] is zero, so omitted */
280304Sjkim    tmp[3] -= mask & kPrime[3];
280304Sjkim    /* tmp[3] < 2**64 - 2**32 + 1 */
238384Sjkim
280304Sjkim    tmp[1] += ((u64)(tmp[0] >> 64));
280304Sjkim    tmp[0] = (u64)tmp[0];
280304Sjkim    tmp[2] += ((u64)(tmp[1] >> 64));
280304Sjkim    tmp[1] = (u64)tmp[1];
280304Sjkim    tmp[3] += ((u64)(tmp[2] >> 64));
280304Sjkim    tmp[2] = (u64)tmp[2];
280304Sjkim    /* tmp[i] < 2^64 */
238384Sjkim
280304Sjkim    out[0] = tmp[0];
280304Sjkim    out[1] = tmp[1];
280304Sjkim    out[2] = tmp[2];
280304Sjkim    out[3] = tmp[3];
280304Sjkim}
238384Sjkim
238384Sjkim/* smallfelem_expand converts a smallfelem to an felem */
238384Sjkimstatic void smallfelem_expand(felem out, const smallfelem in)
280304Sjkim{
280304Sjkim    out[0] = in[0];
280304Sjkim    out[1] = in[1];
280304Sjkim    out[2] = in[2];
280304Sjkim    out[3] = in[3];
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * smallfelem_square sets |out| = |small|^2
238384Sjkim * On entry:
238384Sjkim *   small[i] < 2^64
238384Sjkim * On exit:
238384Sjkim *   out[i] < 7 * 2^64 < 2^67
238384Sjkim */
238384Sjkimstatic void smallfelem_square(longfelem out, const smallfelem small)
280304Sjkim{
280304Sjkim    limb a;
280304Sjkim    u64 high, low;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[0]) * small[0];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[0] = low;
280304Sjkim    out[1] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[0]) * small[1];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[1] += low;
280304Sjkim    out[1] += low;
280304Sjkim    out[2] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[0]) * small[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[2] += low;
280304Sjkim    out[2] *= 2;
280304Sjkim    out[3] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[0]) * small[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[3] += low;
280304Sjkim    out[4] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[1]) * small[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[3] += low;
280304Sjkim    out[3] *= 2;
280304Sjkim    out[4] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[1]) * small[1];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[2] += low;
280304Sjkim    out[3] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[1]) * small[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[4] += low;
280304Sjkim    out[4] *= 2;
280304Sjkim    out[5] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[2]) * small[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[5] += low;
280304Sjkim    out[5] *= 2;
280304Sjkim    out[6] = high;
280304Sjkim    out[6] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[2]) * small[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[4] += low;
280304Sjkim    out[5] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small[3]) * small[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[6] += low;
280304Sjkim    out[7] = high;
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_square sets |out| = |in|^2
238384Sjkim * On entry:
238384Sjkim *   in[i] < 2^109
238384Sjkim * On exit:
238384Sjkim *   out[i] < 7 * 2^64 < 2^67
238384Sjkim */
238384Sjkimstatic void felem_square(longfelem out, const felem in)
280304Sjkim{
280304Sjkim    u64 small[4];
280304Sjkim    felem_shrink(small, in);
280304Sjkim    smallfelem_square(out, small);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * smallfelem_mul sets |out| = |small1| * |small2|
238384Sjkim * On entry:
238384Sjkim *   small1[i] < 2^64
238384Sjkim *   small2[i] < 2^64
238384Sjkim * On exit:
238384Sjkim *   out[i] < 7 * 2^64 < 2^67
238384Sjkim */
280304Sjkimstatic void smallfelem_mul(longfelem out, const smallfelem small1,
280304Sjkim                           const smallfelem small2)
280304Sjkim{
280304Sjkim    limb a;
280304Sjkim    u64 high, low;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[0]) * small2[0];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[0] = low;
280304Sjkim    out[1] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[0]) * small2[1];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[1] += low;
280304Sjkim    out[2] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[1]) * small2[0];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[1] += low;
280304Sjkim    out[2] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[0]) * small2[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[2] += low;
280304Sjkim    out[3] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[1]) * small2[1];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[2] += low;
280304Sjkim    out[3] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[2]) * small2[0];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[2] += low;
280304Sjkim    out[3] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[0]) * small2[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[3] += low;
280304Sjkim    out[4] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[1]) * small2[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[3] += low;
280304Sjkim    out[4] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[2]) * small2[1];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[3] += low;
280304Sjkim    out[4] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[3]) * small2[0];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[3] += low;
280304Sjkim    out[4] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[1]) * small2[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[4] += low;
280304Sjkim    out[5] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[2]) * small2[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[4] += low;
280304Sjkim    out[5] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[3]) * small2[1];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[4] += low;
280304Sjkim    out[5] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[2]) * small2[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[5] += low;
280304Sjkim    out[6] = high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[3]) * small2[2];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[5] += low;
280304Sjkim    out[6] += high;
238384Sjkim
280304Sjkim    a = ((uint128_t) small1[3]) * small2[3];
280304Sjkim    low = a;
280304Sjkim    high = a >> 64;
280304Sjkim    out[6] += low;
280304Sjkim    out[7] = high;
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_mul sets |out| = |in1| * |in2|
238384Sjkim * On entry:
238384Sjkim *   in1[i] < 2^109
238384Sjkim *   in2[i] < 2^109
238384Sjkim * On exit:
238384Sjkim *   out[i] < 7 * 2^64 < 2^67
238384Sjkim */
238384Sjkimstatic void felem_mul(longfelem out, const felem in1, const felem in2)
280304Sjkim{
280304Sjkim    smallfelem small1, small2;
280304Sjkim    felem_shrink(small1, in1);
280304Sjkim    felem_shrink(small2, in2);
280304Sjkim    smallfelem_mul(out, small1, small2);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_small_mul sets |out| = |small1| * |in2|
238384Sjkim * On entry:
238384Sjkim *   small1[i] < 2^64
238384Sjkim *   in2[i] < 2^109
238384Sjkim * On exit:
238384Sjkim *   out[i] < 7 * 2^64 < 2^67
238384Sjkim */
280304Sjkimstatic void felem_small_mul(longfelem out, const smallfelem small1,
280304Sjkim                            const felem in2)
280304Sjkim{
280304Sjkim    smallfelem small2;
280304Sjkim    felem_shrink(small2, in2);
280304Sjkim    smallfelem_mul(out, small1, small2);
280304Sjkim}
238384Sjkim
280304Sjkim# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
280304Sjkim# define two100 (((limb)1) << 100)
280304Sjkim# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
238384Sjkim/* zero100 is 0 mod p */
280304Sjkimstatic const felem zero100 =
280304Sjkim    { two100m36m4, two100, two100m36p4, two100m36p4 };
238384Sjkim
280304Sjkim/*-
280304Sjkim * Internal function for the different flavours of felem_reduce.
238384Sjkim * felem_reduce_ reduces the higher coefficients in[4]-in[7].
238384Sjkim * On entry:
280304Sjkim *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
238384Sjkim *   out[1] >= in[7] + 2^32*in[4]
238384Sjkim *   out[2] >= in[5] + 2^32*in[5]
238384Sjkim *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
238384Sjkim * On exit:
238384Sjkim *   out[0] <= out[0] + in[4] + 2^32*in[5]
238384Sjkim *   out[1] <= out[1] + in[5] + 2^33*in[6]
238384Sjkim *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
238384Sjkim *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
238384Sjkim */
238384Sjkimstatic void felem_reduce_(felem out, const longfelem in)
280304Sjkim{
280304Sjkim    int128_t c;
280304Sjkim    /* combine common terms from below */
280304Sjkim    c = in[4] + (in[5] << 32);
280304Sjkim    out[0] += c;
280304Sjkim    out[3] -= c;
238384Sjkim
280304Sjkim    c = in[5] - in[7];
280304Sjkim    out[1] += c;
280304Sjkim    out[2] -= c;
238384Sjkim
280304Sjkim    /* the remaining terms */
280304Sjkim    /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
280304Sjkim    out[1] -= (in[4] << 32);
280304Sjkim    out[3] += (in[4] << 32);
238384Sjkim
280304Sjkim    /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
280304Sjkim    out[2] -= (in[5] << 32);
238384Sjkim
280304Sjkim    /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
280304Sjkim    out[0] -= in[6];
280304Sjkim    out[0] -= (in[6] << 32);
280304Sjkim    out[1] += (in[6] << 33);
280304Sjkim    out[2] += (in[6] * 2);
280304Sjkim    out[3] -= (in[6] << 32);
238384Sjkim
280304Sjkim    /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
280304Sjkim    out[0] -= in[7];
280304Sjkim    out[0] -= (in[7] << 32);
280304Sjkim    out[2] += (in[7] << 33);
280304Sjkim    out[3] += (in[7] * 3);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_reduce converts a longfelem into an felem.
238384Sjkim * To be called directly after felem_square or felem_mul.
238384Sjkim * On entry:
238384Sjkim *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
238384Sjkim *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
238384Sjkim * On exit:
238384Sjkim *   out[i] < 2^101
238384Sjkim */
238384Sjkimstatic void felem_reduce(felem out, const longfelem in)
280304Sjkim{
280304Sjkim    out[0] = zero100[0] + in[0];
280304Sjkim    out[1] = zero100[1] + in[1];
280304Sjkim    out[2] = zero100[2] + in[2];
280304Sjkim    out[3] = zero100[3] + in[3];
238384Sjkim
280304Sjkim    felem_reduce_(out, in);
238384Sjkim
280304Sjkim    /*-
280304Sjkim     * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
280304Sjkim     * out[1] > 2^100 - 2^64 - 7*2^96 > 0
280304Sjkim     * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
280304Sjkim     * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
280304Sjkim     *
280304Sjkim     * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
280304Sjkim     * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
280304Sjkim     * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
280304Sjkim     * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
280304Sjkim     */
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_reduce_zero105 converts a larger longfelem into an felem.
238384Sjkim * On entry:
238384Sjkim *   in[0] < 2^71
238384Sjkim * On exit:
238384Sjkim *   out[i] < 2^106
238384Sjkim */
238384Sjkimstatic void felem_reduce_zero105(felem out, const longfelem in)
280304Sjkim{
280304Sjkim    out[0] = zero105[0] + in[0];
280304Sjkim    out[1] = zero105[1] + in[1];
280304Sjkim    out[2] = zero105[2] + in[2];
280304Sjkim    out[3] = zero105[3] + in[3];
238384Sjkim
280304Sjkim    felem_reduce_(out, in);
238384Sjkim
280304Sjkim    /*-
280304Sjkim     * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
280304Sjkim     * out[1] > 2^105 - 2^71 - 2^103 > 0
280304Sjkim     * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
280304Sjkim     * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
280304Sjkim     *
280304Sjkim     * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
280304Sjkim     * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
280304Sjkim     * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
280304Sjkim     * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
280304Sjkim     */
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * subtract_u64 sets *result = *result - v and *carry to one if the
280304Sjkim * subtraction underflowed.
280304Sjkim */
280304Sjkimstatic void subtract_u64(u64 *result, u64 *carry, u64 v)
280304Sjkim{
280304Sjkim    uint128_t r = *result;
280304Sjkim    r -= v;
280304Sjkim    *carry = (r >> 64) & 1;
280304Sjkim    *result = (u64)r;
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * felem_contract converts |in| to its unique, minimal representation. On
280304Sjkim * entry: in[i] < 2^109
238384Sjkim */
238384Sjkimstatic void felem_contract(smallfelem out, const felem in)
280304Sjkim{
280304Sjkim    unsigned i;
280304Sjkim    u64 all_equal_so_far = 0, result = 0, carry;
238384Sjkim
280304Sjkim    felem_shrink(out, in);
280304Sjkim    /* out is minimal except that the value might still be >= p */
238384Sjkim
280304Sjkim    all_equal_so_far--;         /* 0 - 1 wraps to all ones */
280304Sjkim    /*
280304Sjkim     * We are doing a constant time test if out >= kPrime. We need to compare
280304Sjkim     * each u64, from most-significant to least significant. For each one, if
280304Sjkim     * all words so far have been equal (all_equal_so_far is all ones) then a
280304Sjkim     * non-equal result is the answer. Otherwise we continue.
280304Sjkim     */
    /*
     * i is unsigned: decrementing it past 0 wraps to a value >= 4, which
     * ends the loop. The body therefore runs for i = 3, 2, 1, 0.
     */
280304Sjkim    for (i = 3; i < 4; i--) {
280304Sjkim        u64 equal;
280304Sjkim        uint128_t a = ((uint128_t) kPrime[i]) - out[i];
280304Sjkim        /*
280304Sjkim         * if out[i] > kPrime[i] then a will underflow and the high 64-bits
280304Sjkim         * will all be set.
280304Sjkim         */
280304Sjkim        result |= all_equal_so_far & ((u64)(a >> 64));
238384Sjkim
280304Sjkim        /*
280304Sjkim         * if kPrime[i] == out[i] then |equal| will be all zeros and the
280304Sjkim         * decrement will make it all ones.
280304Sjkim         */
280304Sjkim        equal = kPrime[i] ^ out[i];
280304Sjkim        equal--;
        /*
         * AND-fold all 64 bits into the top bit: after these shifts bit 63
         * is set only if every bit of |equal| was set.
         */
280304Sjkim        equal &= equal << 32;
280304Sjkim        equal &= equal << 16;
280304Sjkim        equal &= equal << 8;
280304Sjkim        equal &= equal << 4;
280304Sjkim        equal &= equal << 2;
280304Sjkim        equal &= equal << 1;
        /*
         * Broadcast bit 63 to the whole word (relies on the signed right
         * shift being arithmetic).
         */
280304Sjkim        equal = ((s64) equal) >> 63;
238384Sjkim
280304Sjkim        all_equal_so_far &= equal;
280304Sjkim    }
238384Sjkim
280304Sjkim    /*
280304Sjkim     * if all_equal_so_far is still all ones then the two values are equal
280304Sjkim     * and so out >= kPrime is true.
280304Sjkim     */
280304Sjkim    result |= all_equal_so_far;
238384Sjkim
280304Sjkim    /* if out >= kPrime then we subtract kPrime. */
    /*
     * |result| is used as a mask, so each step subtracts either kPrime[i] or
     * 0. The subtraction is done one kPrime limb at a time: subtract the
     * masked limb, then ripple the borrow through every more-significant
     * limb of |out|. The borrow out of out[3] is discarded.
     */
280304Sjkim    subtract_u64(&out[0], &carry, result & kPrime[0]);
280304Sjkim    subtract_u64(&out[1], &carry, carry);
280304Sjkim    subtract_u64(&out[2], &carry, carry);
280304Sjkim    subtract_u64(&out[3], &carry, carry);
238384Sjkim
280304Sjkim    subtract_u64(&out[1], &carry, result & kPrime[1]);
280304Sjkim    subtract_u64(&out[2], &carry, carry);
280304Sjkim    subtract_u64(&out[3], &carry, carry);
238384Sjkim
280304Sjkim    subtract_u64(&out[2], &carry, result & kPrime[2]);
280304Sjkim    subtract_u64(&out[3], &carry, carry);
238384Sjkim
280304Sjkim    subtract_u64(&out[3], &carry, result & kPrime[3]);
280304Sjkim}
238384Sjkim
/*
 * smallfelem_square_contract squares |in| with smallfelem_square, passes the
 * long result through felem_reduce, and stores the felem_contract'ed value
 * (the unique, minimal representation) in |out|.
 */
238384Sjkimstatic void smallfelem_square_contract(smallfelem out, const smallfelem in)
280304Sjkim{
280304Sjkim    longfelem longtmp;          /* unreduced square */
280304Sjkim    felem tmp;                  /* reduced, but not yet contracted */
238384Sjkim
280304Sjkim    smallfelem_square(longtmp, in);
280304Sjkim    felem_reduce(tmp, longtmp);
280304Sjkim    felem_contract(out, tmp);
280304Sjkim}
238384Sjkim
/*
 * smallfelem_mul_contract multiplies |in1| by |in2| with smallfelem_mul,
 * passes the long result through felem_reduce, and stores the
 * felem_contract'ed value (the unique, minimal representation) in |out|.
 */
280304Sjkimstatic void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
280304Sjkim                                    const smallfelem in2)
280304Sjkim{
280304Sjkim    longfelem longtmp;          /* unreduced product */
280304Sjkim    felem tmp;                  /* reduced, but not yet contracted */
238384Sjkim
280304Sjkim    smallfelem_mul(longtmp, in1, in2);
280304Sjkim    felem_reduce(tmp, longtmp);
280304Sjkim    felem_contract(out, tmp);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p)
238384Sjkim * (that is, if |small| is exactly 0 or exactly kPrime) and 0 otherwise.
238384Sjkim * On entry:
238384Sjkim *   small[i] < 2^64
238384Sjkim */
238384Sjkimstatic limb smallfelem_is_zero(const smallfelem small)
280304Sjkim{
280304Sjkim    limb result;
280304Sjkim    u64 is_p;
238384Sjkim
    /*
     * OR of all limbs is 0 only if every limb is 0. Decrementing turns that
     * 0 into all ones; the shift/AND cascade then leaves bit 63 set only if
     * all 64 bits are set, and the signed shift broadcasts bit 63 to the
     * whole word (relies on the right shift being arithmetic).
     */
280304Sjkim    u64 is_zero = small[0] | small[1] | small[2] | small[3];
280304Sjkim    is_zero--;
280304Sjkim    is_zero &= is_zero << 32;
280304Sjkim    is_zero &= is_zero << 16;
280304Sjkim    is_zero &= is_zero << 8;
280304Sjkim    is_zero &= is_zero << 4;
280304Sjkim    is_zero &= is_zero << 2;
280304Sjkim    is_zero &= is_zero << 1;
280304Sjkim    is_zero = ((s64) is_zero) >> 63;
238384Sjkim
    /* Same test applied to (small XOR kPrime): all ones iff small == kPrime. */
280304Sjkim    is_p = (small[0] ^ kPrime[0]) |
280304Sjkim        (small[1] ^ kPrime[1]) |
280304Sjkim        (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
280304Sjkim    is_p--;
280304Sjkim    is_p &= is_p << 32;
280304Sjkim    is_p &= is_p << 16;
280304Sjkim    is_p &= is_p << 8;
280304Sjkim    is_p &= is_p << 4;
280304Sjkim    is_p &= is_p << 2;
280304Sjkim    is_p &= is_p << 1;
280304Sjkim    is_p = ((s64) is_p) >> 63;
238384Sjkim
280304Sjkim    is_zero |= is_p;
238384Sjkim
    /* Copy the 64-bit mask into both the low and the high half of the limb. */
280304Sjkim    result = is_zero;
280304Sjkim    result |= ((limb) is_zero) << 64;
280304Sjkim    return result;
280304Sjkim}
238384Sjkim
/*
 * smallfelem_is_zero_int returns 1 if smallfelem_is_zero(small) reports a
 * match (all-ones mask) and 0 otherwise, by keeping only the mask's low bit.
 */
238384Sjkimstatic int smallfelem_is_zero_int(const smallfelem small)
280304Sjkim{
280304Sjkim    return (int)(smallfelem_is_zero(small) & ((limb) 1));
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * felem_inv calculates |out| = |in|^{-1}
238384Sjkim *
238384Sjkim * Based on Fermat's Little Theorem:
238384Sjkim *   a^p = a (mod p)
238384Sjkim *   a^{p-1} = 1 (mod p)
238384Sjkim *   a^{p-2} = a^{-1} (mod p)
238384Sjkim */
/*
 * The trailing comments below give the exponent e for which the variable just
 * written holds |in|^e. |out| is written only by the final felem_reduce,
 * after the last read of |in|, so |out| may alias |in|.
 */
238384Sjkimstatic void felem_inv(felem out, const felem in)
280304Sjkim{
280304Sjkim    felem ftmp, ftmp2;
280304Sjkim    /* each e_I will hold |in|^{2^I - 1} (except e64, see below) */
280304Sjkim    felem e2, e4, e8, e16, e32, e64;
280304Sjkim    longfelem tmp;
280304Sjkim    unsigned i;
238384Sjkim
280304Sjkim    felem_square(tmp, in);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^1 */
280304Sjkim    felem_mul(tmp, in, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
280304Sjkim    felem_assign(e2, ftmp);
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
280304Sjkim    felem_mul(tmp, ftmp, e2);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
280304Sjkim    felem_assign(e4, ftmp);
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
280304Sjkim    felem_mul(tmp, ftmp, e4);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
280304Sjkim    felem_assign(e8, ftmp);
280304Sjkim    for (i = 0; i < 8; i++) {
280304Sjkim        felem_square(tmp, ftmp);
280304Sjkim        felem_reduce(ftmp, tmp);
280304Sjkim    }                           /* 2^16 - 2^8 */
280304Sjkim    felem_mul(tmp, ftmp, e8);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
280304Sjkim    felem_assign(e16, ftmp);
280304Sjkim    for (i = 0; i < 16; i++) {
280304Sjkim        felem_square(tmp, ftmp);
280304Sjkim        felem_reduce(ftmp, tmp);
280304Sjkim    }                           /* 2^32 - 2^16 */
280304Sjkim    felem_mul(tmp, ftmp, e16);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
280304Sjkim    felem_assign(e32, ftmp);
280304Sjkim    for (i = 0; i < 32; i++) {
280304Sjkim        felem_square(tmp, ftmp);
280304Sjkim        felem_reduce(ftmp, tmp);
280304Sjkim    }                           /* 2^64 - 2^32 */
    /*
     * Note: e64 is saved here as |in|^(2^64 - 2^32), not |in|^(2^64 - 1);
     * the missing 2^32 - 1 is supplied later by multiplying with e32.
     */
280304Sjkim    felem_assign(e64, ftmp);
280304Sjkim    felem_mul(tmp, ftmp, in);
280304Sjkim    felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
280304Sjkim    for (i = 0; i < 192; i++) {
280304Sjkim        felem_square(tmp, ftmp);
280304Sjkim        felem_reduce(ftmp, tmp);
280304Sjkim    }                           /* 2^256 - 2^224 + 2^192 */
238384Sjkim
    /* ftmp now holds the high part of the exponent; build the low part in ftmp2 */
280304Sjkim    felem_mul(tmp, e64, e32);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
280304Sjkim    for (i = 0; i < 16; i++) {
280304Sjkim        felem_square(tmp, ftmp2);
280304Sjkim        felem_reduce(ftmp2, tmp);
280304Sjkim    }                           /* 2^80 - 2^16 */
280304Sjkim    felem_mul(tmp, ftmp2, e16);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
280304Sjkim    for (i = 0; i < 8; i++) {
280304Sjkim        felem_square(tmp, ftmp2);
280304Sjkim        felem_reduce(ftmp2, tmp);
280304Sjkim    }                           /* 2^88 - 2^8 */
280304Sjkim    felem_mul(tmp, ftmp2, e8);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
280304Sjkim    for (i = 0; i < 4; i++) {
280304Sjkim        felem_square(tmp, ftmp2);
280304Sjkim        felem_reduce(ftmp2, tmp);
280304Sjkim    }                           /* 2^92 - 2^4 */
280304Sjkim    felem_mul(tmp, ftmp2, e4);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
280304Sjkim    felem_square(tmp, ftmp2);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
280304Sjkim    felem_square(tmp, ftmp2);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
280304Sjkim    felem_mul(tmp, ftmp2, e2);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
280304Sjkim    felem_square(tmp, ftmp2);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
280304Sjkim    felem_square(tmp, ftmp2);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
280304Sjkim    felem_mul(tmp, ftmp2, in);
280304Sjkim    felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
238384Sjkim
    /* combine high and low parts: the exponent below equals p - 2 */
280304Sjkim    felem_mul(tmp, ftmp2, ftmp);
280304Sjkim    felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
280304Sjkim}
238384Sjkim
/*
 * smallfelem_inv_contract expands |in| to a felem, inverts it with felem_inv
 * (in place) and stores the felem_contract'ed result in |out|.
 */
238384Sjkimstatic void smallfelem_inv_contract(smallfelem out, const smallfelem in)
280304Sjkim{
280304Sjkim    felem tmp;
238384Sjkim
280304Sjkim    smallfelem_expand(tmp, in);
280304Sjkim    felem_inv(tmp, tmp);        /* input and output are the same buffer */
280304Sjkim    felem_contract(out, tmp);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * Group operations
238384Sjkim * ----------------
238384Sjkim *
238384Sjkim * Building on top of the field operations we have the operations on the
238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian
280304Sjkim * coordinates
280304Sjkim */
238384Sjkim
280304Sjkim/*-
280304Sjkim * point_double calculates 2*(x_in, y_in, z_in)
238384Sjkim *
238384Sjkim * The method is taken from:
238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
238384Sjkim *
238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
280304Sjkim * while x_out == y_in is not (maybe this works, but it's not tested).
280304Sjkim */
238384Sjkimstatic void
238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out,
280304Sjkim             const felem x_in, const felem y_in, const felem z_in)
280304Sjkim{
280304Sjkim    longfelem tmp, tmp2;
280304Sjkim    felem delta, gamma, beta, alpha, ftmp, ftmp2;
280304Sjkim    smallfelem small1, small2;
238384Sjkim
    /* two working copies of x: one becomes x - delta, the other 3*(x + delta) */
280304Sjkim    felem_assign(ftmp, x_in);
280304Sjkim    /* ftmp[i] < 2^106 */
280304Sjkim    felem_assign(ftmp2, x_in);
280304Sjkim    /* ftmp2[i] < 2^106 */
238384Sjkim
280304Sjkim    /* delta = z^2 */
280304Sjkim    felem_square(tmp, z_in);
280304Sjkim    felem_reduce(delta, tmp);
280304Sjkim    /* delta[i] < 2^101 */
238384Sjkim
280304Sjkim    /* gamma = y^2 */
280304Sjkim    felem_square(tmp, y_in);
280304Sjkim    felem_reduce(gamma, tmp);
280304Sjkim    /* gamma[i] < 2^101 */
280304Sjkim    felem_shrink(small1, gamma);        /* small1 = gamma, reused for gamma^2 below */
238384Sjkim
280304Sjkim    /* beta = x*gamma */
280304Sjkim    felem_small_mul(tmp, small1, x_in);
280304Sjkim    felem_reduce(beta, tmp);
280304Sjkim    /* beta[i] < 2^101 */
238384Sjkim
280304Sjkim    /* alpha = 3*(x-delta)*(x+delta) */
280304Sjkim    felem_diff(ftmp, delta);
280304Sjkim    /* ftmp[i] < 2^105 + 2^106 < 2^107 */
280304Sjkim    felem_sum(ftmp2, delta);
280304Sjkim    /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
280304Sjkim    felem_scalar(ftmp2, 3);
280304Sjkim    /* ftmp2[i] < 3 * 2^107 < 2^109 */
280304Sjkim    felem_mul(tmp, ftmp, ftmp2);
280304Sjkim    felem_reduce(alpha, tmp);
280304Sjkim    /* alpha[i] < 2^101 */
280304Sjkim    felem_shrink(small2, alpha);        /* small2 = alpha */
238384Sjkim
280304Sjkim    /* x' = alpha^2 - 8*beta */
280304Sjkim    smallfelem_square(tmp, small2);
280304Sjkim    felem_reduce(x_out, tmp);
280304Sjkim    felem_assign(ftmp, beta);
280304Sjkim    felem_scalar(ftmp, 8);
280304Sjkim    /* ftmp[i] < 8 * 2^101 = 2^104 */
280304Sjkim    felem_diff(x_out, ftmp);
280304Sjkim    /* x_out[i] < 2^105 + 2^101 < 2^106 */
238384Sjkim
280304Sjkim    /* z' = (y + z)^2 - gamma - delta */
280304Sjkim    felem_sum(delta, gamma);
280304Sjkim    /* delta[i] < 2^101 + 2^101 = 2^102 */
280304Sjkim    felem_assign(ftmp, y_in);
280304Sjkim    felem_sum(ftmp, z_in);
280304Sjkim    /* ftmp[i] < 2^106 + 2^106 = 2^107 */
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(z_out, tmp);
280304Sjkim    felem_diff(z_out, delta);
280304Sjkim    /* z_out[i] < 2^105 + 2^101 < 2^106 */
238384Sjkim
280304Sjkim    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
280304Sjkim    felem_scalar(beta, 4);
280304Sjkim    /* beta[i] < 4 * 2^101 = 2^103 */
280304Sjkim    felem_diff_zero107(beta, x_out);
280304Sjkim    /* beta[i] < 2^107 + 2^103 < 2^108 */
280304Sjkim    felem_small_mul(tmp, small2, beta);
280304Sjkim    /* tmp[i] < 7 * 2^64 < 2^67 */
280304Sjkim    smallfelem_square(tmp2, small1);
280304Sjkim    /* tmp2[i] < 7 * 2^64 */
280304Sjkim    longfelem_scalar(tmp2, 8);
280304Sjkim    /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
280304Sjkim    longfelem_diff(tmp, tmp2);
280304Sjkim    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
280304Sjkim    felem_reduce_zero105(y_out, tmp);
280304Sjkim    /* y_out[i] < 2^106 */
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * point_double_small is the same as point_double, except that it operates on
280304Sjkim * smallfelems: the inputs are expanded to felems, point_double is called, and
280304Sjkim * the three results are shrunk back with felem_shrink.
280304Sjkim */
238384Sjkimstatic void
238384Sjkimpoint_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
280304Sjkim                   const smallfelem x_in, const smallfelem y_in,
280304Sjkim                   const smallfelem z_in)
280304Sjkim{
280304Sjkim    felem felem_x_out, felem_y_out, felem_z_out;
280304Sjkim    felem felem_x_in, felem_y_in, felem_z_in;
238384Sjkim
280304Sjkim    smallfelem_expand(felem_x_in, x_in);
280304Sjkim    smallfelem_expand(felem_y_in, y_in);
280304Sjkim    smallfelem_expand(felem_z_in, z_in);
280304Sjkim    point_double(felem_x_out, felem_y_out, felem_z_out,
280304Sjkim                 felem_x_in, felem_y_in, felem_z_in);
280304Sjkim    felem_shrink(x_out, felem_x_out);
280304Sjkim    felem_shrink(y_out, felem_y_out);
280304Sjkim    felem_shrink(z_out, felem_z_out);
280304Sjkim}
238384Sjkim
238384Sjkim/*
 * copy_conditional copies in to out iff mask is all ones; with mask == 0 out
 * is left unchanged. The same instructions run in both cases (no branch on
 * mask).
 */
280304Sjkimstatic void copy_conditional(felem out, const felem in, limb mask)
280304Sjkim{
280304Sjkim    unsigned i;
280304Sjkim    for (i = 0; i < NLIMBS; ++i) {
        /* tmp is (in ^ out) when mask is all ones, 0 when mask is 0 */
280304Sjkim        const limb tmp = mask & (in[i] ^ out[i]);
280304Sjkim        out[i] ^= tmp;          /* out ^ in ^ out == in, or out ^ 0 == out */
280304Sjkim    }
280304Sjkim}
238384Sjkim
238384Sjkim/*
 * copy_small_conditional copies in to out iff mask is all ones; with
 * mask == 0 out is left unchanged. Each 64-bit limb of |in| is widened to a
 * full limb of |out|.
 */
280304Sjkimstatic void copy_small_conditional(felem out, const smallfelem in, limb mask)
280304Sjkim{
280304Sjkim    unsigned i;
280304Sjkim    const u64 mask64 = mask;    /* low 64 bits of the mask, for the u64 input */
280304Sjkim    for (i = 0; i < NLIMBS; ++i) {
        /* select in[i] where the mask is set, keep out[i] where it is clear */
280304Sjkim        out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
280304Sjkim    }
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * point_add calculates (x1, y1, z1) + (x2, y2, z2)
238384Sjkim *
238384Sjkim * The method is taken from:
238384Sjkim *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
238384Sjkim *
238384Sjkim * This function includes a branch for checking whether the two input points
238384Sjkim * are equal, (while not equal to the point at infinity). This case never
238384Sjkim * happens during single point multiplication, so there is no timing leak for
280304Sjkim * ECDH or ECDSA signing.
280304Sjkim */
/*
 * If |mixed| is non-zero the z2 = 1 shortcuts are taken and z2 is only
 * inspected to detect the point at infinity. The results are built in local
 * buffers and copied to x3/y3/z3 at the very end (the doubling branch instead
 * hands x3/y3/z3 straight to point_double).
 */
238384Sjkimstatic void point_add(felem x3, felem y3, felem z3,
280304Sjkim                      const felem x1, const felem y1, const felem z1,
280304Sjkim                      const int mixed, const smallfelem x2,
280304Sjkim                      const smallfelem y2, const smallfelem z2)
280304Sjkim{
280304Sjkim    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
280304Sjkim    longfelem tmp, tmp2;
280304Sjkim    smallfelem small1, small2, small3, small4, small5;
280304Sjkim    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
238384Sjkim
280304Sjkim    felem_shrink(small3, z1);   /* small3 = z1 */
238384Sjkim
    /* all-ones masks when the corresponding input is the point at infinity */
280304Sjkim    z1_is_zero = smallfelem_is_zero(small3);
280304Sjkim    z2_is_zero = smallfelem_is_zero(z2);
238384Sjkim
280304Sjkim    /* ftmp = z1z1 = z1**2 */
280304Sjkim    smallfelem_square(tmp, small3);
280304Sjkim    felem_reduce(ftmp, tmp);
280304Sjkim    /* ftmp[i] < 2^101 */
280304Sjkim    felem_shrink(small1, ftmp); /* small1 = z1z1 */
238384Sjkim
280304Sjkim    if (!mixed) {
280304Sjkim        /* ftmp2 = z2z2 = z2**2 */
280304Sjkim        smallfelem_square(tmp, z2);
280304Sjkim        felem_reduce(ftmp2, tmp);
280304Sjkim        /* ftmp2[i] < 2^101 */
280304Sjkim        felem_shrink(small2, ftmp2);
238384Sjkim
280304Sjkim        felem_shrink(small5, x1);
238384Sjkim
280304Sjkim        /* u1 = ftmp3 = x1*z2z2 */
280304Sjkim        smallfelem_mul(tmp, small5, small2);
280304Sjkim        felem_reduce(ftmp3, tmp);
280304Sjkim        /* ftmp3[i] < 2^101 */
238384Sjkim
280304Sjkim        /* ftmp5 = z1 + z2 */
280304Sjkim        felem_assign(ftmp5, z1);
280304Sjkim        felem_small_sum(ftmp5, z2);
280304Sjkim        /* ftmp5[i] < 2^107 */
238384Sjkim
280304Sjkim        /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
280304Sjkim        felem_square(tmp, ftmp5);
280304Sjkim        felem_reduce(ftmp5, tmp);
280304Sjkim        /* ftmp2 = z2z2 + z1z1 */
280304Sjkim        felem_sum(ftmp2, ftmp);
280304Sjkim        /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
280304Sjkim        felem_diff(ftmp5, ftmp2);
280304Sjkim        /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
238384Sjkim
280304Sjkim        /* ftmp2 = z2 * z2z2 */
280304Sjkim        smallfelem_mul(tmp, small2, z2);
280304Sjkim        felem_reduce(ftmp2, tmp);
238384Sjkim
280304Sjkim        /* s1 = ftmp6 = y1 * z2**3 */
280304Sjkim        felem_mul(tmp, y1, ftmp2);
280304Sjkim        felem_reduce(ftmp6, tmp);
280304Sjkim        /* ftmp6[i] < 2^101 */
280304Sjkim    } else {
280304Sjkim        /*
280304Sjkim         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
280304Sjkim         */
238384Sjkim
280304Sjkim        /* u1 = ftmp3 = x1*z2z2 */
280304Sjkim        felem_assign(ftmp3, x1);
280304Sjkim        /* ftmp3[i] < 2^106 */
238384Sjkim
280304Sjkim        /* ftmp5 = 2z1z2 */
280304Sjkim        felem_assign(ftmp5, z1);
280304Sjkim        felem_scalar(ftmp5, 2);
280304Sjkim        /* ftmp5[i] < 2*2^106 = 2^107 */
238384Sjkim
280304Sjkim        /* s1 = ftmp6 = y1 * z2**3 */
280304Sjkim        felem_assign(ftmp6, y1);
280304Sjkim        /* ftmp6[i] < 2^106 */
280304Sjkim    }
238384Sjkim
280304Sjkim    /* u2 = x2*z1z1 */
280304Sjkim    smallfelem_mul(tmp, x2, small1);
280304Sjkim    felem_reduce(ftmp4, tmp);
238384Sjkim
280304Sjkim    /* h = ftmp4 = u2 - u1 */
280304Sjkim    felem_diff_zero107(ftmp4, ftmp3);
280304Sjkim    /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
280304Sjkim    felem_shrink(small4, ftmp4);
238384Sjkim
    /* h == 0 (mod p) means the two points have the same affine x */
280304Sjkim    x_equal = smallfelem_is_zero(small4);
238384Sjkim
280304Sjkim    /* z_out = ftmp5 * h */
280304Sjkim    felem_small_mul(tmp, small4, ftmp5);
280304Sjkim    felem_reduce(z_out, tmp);
280304Sjkim    /* z_out[i] < 2^101 */
238384Sjkim
280304Sjkim    /* ftmp = z1 * z1z1 */
280304Sjkim    smallfelem_mul(tmp, small1, small3);
280304Sjkim    felem_reduce(ftmp, tmp);
238384Sjkim
280304Sjkim    /* s2 = ftmp5 = y2 * z1**3 */
280304Sjkim    felem_small_mul(tmp, y2, ftmp);
280304Sjkim    felem_reduce(ftmp5, tmp);
238384Sjkim
280304Sjkim    /* r = ftmp5 = (s2 - s1)*2 */
280304Sjkim    felem_diff_zero107(ftmp5, ftmp6);
280304Sjkim    /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
280304Sjkim    felem_scalar(ftmp5, 2);
280304Sjkim    /* ftmp5[i] < 2^109 */
280304Sjkim    felem_shrink(small1, ftmp5);        /* small1 now holds r (z1z1 is no longer needed) */
280304Sjkim    y_equal = smallfelem_is_zero(small1);
238384Sjkim
    /*
     * Same point on both sides and neither is infinity: the addition formula
     * does not apply, so fall back to doubling. This is the data-dependent
     * branch mentioned in the function header.
     */
280304Sjkim    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
280304Sjkim        point_double(x3, y3, z3, x1, y1, z1);
280304Sjkim        return;
280304Sjkim    }
238384Sjkim
280304Sjkim    /* I = ftmp = (2h)**2 */
280304Sjkim    felem_assign(ftmp, ftmp4);
280304Sjkim    felem_scalar(ftmp, 2);
280304Sjkim    /* ftmp[i] < 2*2^108 = 2^109 */
280304Sjkim    felem_square(tmp, ftmp);
280304Sjkim    felem_reduce(ftmp, tmp);
238384Sjkim
280304Sjkim    /* J = ftmp2 = h * I */
280304Sjkim    felem_mul(tmp, ftmp4, ftmp);
280304Sjkim    felem_reduce(ftmp2, tmp);
238384Sjkim
280304Sjkim    /* V = ftmp4 = U1 * I */
280304Sjkim    felem_mul(tmp, ftmp3, ftmp);
280304Sjkim    felem_reduce(ftmp4, tmp);
238384Sjkim
280304Sjkim    /* x_out = r**2 - J - 2V */
280304Sjkim    smallfelem_square(tmp, small1);
280304Sjkim    felem_reduce(x_out, tmp);
280304Sjkim    felem_assign(ftmp3, ftmp4);         /* keep V in ftmp3 for y_out */
280304Sjkim    felem_scalar(ftmp4, 2);
280304Sjkim    felem_sum(ftmp4, ftmp2);
280304Sjkim    /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
280304Sjkim    felem_diff(x_out, ftmp4);
280304Sjkim    /* x_out[i] < 2^105 + 2^101 */
238384Sjkim
280304Sjkim    /* y_out = r(V-x_out) - 2 * s1 * J */
280304Sjkim    felem_diff_zero107(ftmp3, x_out);
280304Sjkim    /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
280304Sjkim    felem_small_mul(tmp, small1, ftmp3);
280304Sjkim    felem_mul(tmp2, ftmp6, ftmp2);
280304Sjkim    longfelem_scalar(tmp2, 2);
280304Sjkim    /* tmp2[i] < 2*2^67 = 2^68 */
280304Sjkim    longfelem_diff(tmp, tmp2);
280304Sjkim    /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
280304Sjkim    felem_reduce_zero105(y_out, tmp);
280304Sjkim    /* y_out[i] < 2^106 */
238384Sjkim
    /*
     * Point-at-infinity handling, done with masks rather than branches: if
     * point 1 is infinity the result is point 2, and if point 2 is infinity
     * the result is point 1 (the latter is applied last, so it wins when
     * both masks are set).
     */
280304Sjkim    copy_small_conditional(x_out, x2, z1_is_zero);
280304Sjkim    copy_conditional(x_out, x1, z2_is_zero);
280304Sjkim    copy_small_conditional(y_out, y2, z1_is_zero);
280304Sjkim    copy_conditional(y_out, y1, z2_is_zero);
280304Sjkim    copy_small_conditional(z_out, z2, z1_is_zero);
280304Sjkim    copy_conditional(z_out, z1, z2_is_zero);
280304Sjkim    felem_assign(x3, x_out);
280304Sjkim    felem_assign(y3, y_out);
280304Sjkim    felem_assign(z3, z_out);
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * point_add_small is the same as point_add, except that it operates on
280304Sjkim * smallfelems. Point 1 is expanded to felems, point 2 is passed through
280304Sjkim * as-is, and point_add is always called with mixed == 0 (general addition).
280304Sjkim */
238384Sjkimstatic void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
280304Sjkim                            smallfelem x1, smallfelem y1, smallfelem z1,
280304Sjkim                            smallfelem x2, smallfelem y2, smallfelem z2)
280304Sjkim{
280304Sjkim    felem felem_x3, felem_y3, felem_z3;
280304Sjkim    felem felem_x1, felem_y1, felem_z1;
280304Sjkim    smallfelem_expand(felem_x1, x1);
280304Sjkim    smallfelem_expand(felem_y1, y1);
280304Sjkim    smallfelem_expand(felem_z1, z1);
280304Sjkim    point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
280304Sjkim              x2, y2, z2);
    /* bring the felem results back down to smallfelems */
280304Sjkim    felem_shrink(x3, felem_x3);
280304Sjkim    felem_shrink(y3, felem_y3);
280304Sjkim    felem_shrink(z3, felem_z3);
280304Sjkim}
238384Sjkim
280304Sjkim/*-
280304Sjkim * Base point pre computation
238384Sjkim * --------------------------
238384Sjkim *
238384Sjkim * Two different sorts of precomputed tables are used in the following code.
238384Sjkim * Each contains various points on the curve, where each point is three field
238384Sjkim * elements (x, y, z).
238384Sjkim *
238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity).
238384Sjkim * This table has 2 * 16 elements, starting with the following:
238384Sjkim * index | bits    | point
238384Sjkim * ------+---------+------------------------------
238384Sjkim *     0 | 0 0 0 0 | 0G
238384Sjkim *     1 | 0 0 0 1 | 1G
238384Sjkim *     2 | 0 0 1 0 | 2^64G
238384Sjkim *     3 | 0 0 1 1 | (2^64 + 1)G
238384Sjkim *     4 | 0 1 0 0 | 2^128G
238384Sjkim *     5 | 0 1 0 1 | (2^128 + 1)G
238384Sjkim *     6 | 0 1 1 0 | (2^128 + 2^64)G
238384Sjkim *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
238384Sjkim *     8 | 1 0 0 0 | 2^192G
238384Sjkim *     9 | 1 0 0 1 | (2^192 + 1)G
238384Sjkim *    10 | 1 0 1 0 | (2^192 + 2^64)G
238384Sjkim *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
238384Sjkim *    12 | 1 1 0 0 | (2^192 + 2^128)G
238384Sjkim *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
238384Sjkim *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
238384Sjkim *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
238384Sjkim * followed by a copy of this with each element multiplied by 2^32.
238384Sjkim *
238384Sjkim * The reason for this is so that we can clock bits into four different
238384Sjkim * locations when doing simple scalar multiplies against the base point,
238384Sjkim * and then another four locations using the second 16 elements.
238384Sjkim *
238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */
238384Sjkim
238384Sjkim/*
 * gmul is the table of precomputed base points.
 *
 * It is indexed as gmul[table][index][coordinate]: two tables of 16 points,
 * each point stored as three smallfelems in the order x, y, z. The four u64
 * limbs of each coordinate are listed least-significant first. Entry 0 of
 * each table is all zeros (z = 0, the point at infinity); every other entry
 * has z = 1.
 */
280304Sjkimstatic const smallfelem gmul[2][16][3] = {
280304Sjkim    {{{0, 0, 0, 0},
280304Sjkim      {0, 0, 0, 0},
280304Sjkim      {0, 0, 0, 0}},
280304Sjkim     {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
280304Sjkim       0x6b17d1f2e12c4247},
280304Sjkim      {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
280304Sjkim       0x4fe342e2fe1a7f9b},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
280304Sjkim       0x0fa822bc2811aaa5},
280304Sjkim      {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
280304Sjkim       0xbff44ae8f5dba80d},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
280304Sjkim       0x300a4bbc89d6726f},
280304Sjkim      {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
280304Sjkim       0x72aac7e0d09b4644},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
280304Sjkim       0x447d739beedb5e67},
280304Sjkim      {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
280304Sjkim       0x2d4825ab834131ee},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
280304Sjkim       0xef9519328a9c72ff},
280304Sjkim      {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
280304Sjkim       0x611e9fc37dbb2c9b},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
280304Sjkim       0x550663797b51f5d8},
280304Sjkim      {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
280304Sjkim       0x157164848aecb851},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
280304Sjkim       0xeb5d7745b21141ea},
280304Sjkim      {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
280304Sjkim       0xeafd72ebdbecc17b},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
280304Sjkim       0xa6d39677a7849276},
280304Sjkim      {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
280304Sjkim       0x674f84749b0b8816},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
280304Sjkim       0x4e769e7672c9ddad},
280304Sjkim      {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
280304Sjkim       0x42b99082de830663},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
280304Sjkim       0x78878ef61c6ce04d},
280304Sjkim      {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
280304Sjkim       0xb6cb3f5d7b72c321},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
280304Sjkim       0x0c88bc4d716b1287},
280304Sjkim      {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
280304Sjkim       0xdd5ddea3f3901dc6},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
280304Sjkim       0x68f344af6b317466},
280304Sjkim      {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
280304Sjkim       0x31b9c405f8540a20},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
280304Sjkim       0x4052bf4b6f461db9},
280304Sjkim      {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
280304Sjkim       0xfecf4d5190b0fc61},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
280304Sjkim       0x1eddbae2c802e41a},
280304Sjkim      {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
280304Sjkim       0x43104d86560ebcfc},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
280304Sjkim       0xb48e26b484f7a21c},
280304Sjkim      {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
280304Sjkim       0xfac015404d4d3dab},
280304Sjkim      {1, 0, 0, 0}}},
280304Sjkim    {{{0, 0, 0, 0},
280304Sjkim      {0, 0, 0, 0},
280304Sjkim      {0, 0, 0, 0}},
280304Sjkim     {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
280304Sjkim       0x7fe36b40af22af89},
280304Sjkim      {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
280304Sjkim       0xe697d45825b63624},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
280304Sjkim       0x4a5b506612a677a6},
280304Sjkim      {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
280304Sjkim       0xeb13461ceac089f1},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
280304Sjkim       0x0781b8291c6a220a},
280304Sjkim      {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
280304Sjkim       0x690cde8df0151593},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
280304Sjkim       0x8a535f566ec73617},
280304Sjkim      {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
280304Sjkim       0x0455c08468b08bd7},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
280304Sjkim       0x06bada7ab77f8276},
280304Sjkim      {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
280304Sjkim       0x5b476dfd0e6cb18a},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
280304Sjkim       0x3e29864e8a2ec908},
280304Sjkim      {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
280304Sjkim       0x239b90ea3dc31e7e},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
280304Sjkim       0x820f4dd949f72ff7},
280304Sjkim      {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
280304Sjkim       0x140406ec783a05ec},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
280304Sjkim       0x68f6b8542783dfee},
280304Sjkim      {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
280304Sjkim       0xcbe1feba92e40ce6},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
280304Sjkim       0xd0b2f94d2f420109},
280304Sjkim      {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
280304Sjkim       0x971459828b0719e5},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
280304Sjkim       0x961610004a866aba},
280304Sjkim      {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
280304Sjkim       0x7acb9fadcee75e44},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
280304Sjkim       0x24eb9acca333bf5b},
280304Sjkim      {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
280304Sjkim       0x69f891c5acd079cc},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
280304Sjkim       0xe51f547c5972a107},
280304Sjkim      {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
280304Sjkim       0x1c309a2b25bb1387},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
280304Sjkim       0x20b87b8aa2c4e503},
280304Sjkim      {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
280304Sjkim       0xf5c6fa49919776be},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
280304Sjkim       0x1ed7d1b9332010b9},
280304Sjkim      {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
280304Sjkim       0x3a2b03f03217257a},
280304Sjkim      {1, 0, 0, 0}},
280304Sjkim     {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
280304Sjkim       0x15fee545c78dd9f6},
280304Sjkim      {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
280304Sjkim       0x4ab5b6b2b8753f81},
280304Sjkim      {1, 0, 0, 0}}}
280304Sjkim};
238384Sjkim
280304Sjkim/*
280304Sjkim * select_point selects the |idx|th point from a precomputation table and
280304Sjkim * copies it to out.
280304Sjkim */
280304Sjkimstatic void select_point(const u64 idx, unsigned int size,
280304Sjkim                         const smallfelem pre_comp[16][3], smallfelem out[3])
280304Sjkim{
280304Sjkim    unsigned i, j;
280304Sjkim    u64 *outlimbs = &out[0][0];
280304Sjkim    memset(outlimbs, 0, 3 * sizeof(smallfelem));
238384Sjkim
280304Sjkim    for (i = 0; i < size; i++) {
280304Sjkim        const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
280304Sjkim        u64 mask = i ^ idx;
280304Sjkim        mask |= mask >> 4;
280304Sjkim        mask |= mask >> 2;
280304Sjkim        mask |= mask >> 1;
280304Sjkim        mask &= 1;
280304Sjkim        mask--;
280304Sjkim        for (j = 0; j < NLIMBS * 3; j++)
280304Sjkim            outlimbs[j] |= inlimbs[j] & mask;
280304Sjkim    }
280304Sjkim}
238384Sjkim
238384Sjkim/* get_bit returns the |i|th bit in |in| */
238384Sjkimstatic char get_bit(const felem_bytearray in, int i)
280304Sjkim{
280304Sjkim    if ((i < 0) || (i >= 256))
280304Sjkim        return 0;
280304Sjkim    return (in[i >> 3] >> (i & 7)) & 1;
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * Interleaved point multiplication using precomputed point multiples: The
280304Sjkim * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
280304Sjkim * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
280304Sjkim * generator, using certain (large) precomputed multiples in g_pre_comp.
280304Sjkim * Output point (X, Y, Z) is stored in x_out, y_out, z_out
280304Sjkim *
280304Sjkim * Scalars are read bit by bit through get_bit(), i.e. bit i lives in byte
280304Sjkim * i / 8, and bit positions outside 0..255 read as zero. |mixed| is handed
280304Sjkim * unchanged to point_add() for the pre_comp[] points; for the generator
280304Sjkim * tables the value 1 is always passed. With num_points == 0 only the last
280304Sjkim * 32 rounds are executed.
280304Sjkim */
238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out,
280304Sjkim                      const felem_bytearray scalars[],
280304Sjkim                      const unsigned num_points, const u8 *g_scalar,
280304Sjkim                      const int mixed, const smallfelem pre_comp[][17][3],
280304Sjkim                      const smallfelem g_pre_comp[2][16][3])
280304Sjkim{
280304Sjkim    int i, skip;
280304Sjkim    unsigned num, gen_mul = (g_scalar != NULL);
280304Sjkim    felem nq[3], ftmp;
280304Sjkim    smallfelem tmp[3];
280304Sjkim    u64 bits;
280304Sjkim    u8 sign, digit;
238384Sjkim
280304Sjkim    /* set nq to the point at infinity (all three coordinates zero) */
280304Sjkim    memset(nq, 0, 3 * sizeof(felem));
238384Sjkim
280304Sjkim    /*
280304Sjkim     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
280304Sjkim     * of the generator (two in each of the last 32 rounds) and additions of
280304Sjkim     * other points' multiples (every 5th round).
280304Sjkim     */
280304Sjkim    skip = 1;                   /* save two point operations in the first
280304Sjkim                                 * round: while set, doubling is skipped
280304Sjkim                                 * and the first selected point simply
280304Sjkim                                 * overwrites nq */
280304Sjkim    for (i = (num_points ? 255 : 31); i >= 0; --i) {
280304Sjkim        /* double */
280304Sjkim        if (!skip)
280304Sjkim            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
238384Sjkim
280304Sjkim        /* add multiples of the generator */
280304Sjkim        if (gen_mul && (i <= 31)) {
280304Sjkim            /*
280304Sjkim             * first, look 32 bits upwards: bits i+32, i+96, i+160 and i+224
280304Sjkim             * of g_scalar form a 4-bit index into g_pre_comp[1]
280304Sjkim             */
280304Sjkim            bits = get_bit(g_scalar, i + 224) << 3;
280304Sjkim            bits |= get_bit(g_scalar, i + 160) << 2;
280304Sjkim            bits |= get_bit(g_scalar, i + 96) << 1;
280304Sjkim            bits |= get_bit(g_scalar, i + 32);
280304Sjkim            /* select the point to add, in constant time */
280304Sjkim            select_point(bits, 16, g_pre_comp[1], tmp);
238384Sjkim
280304Sjkim            if (!skip) {
280304Sjkim                /* Arg 1 below is for "mixed" */
280304Sjkim                point_add(nq[0], nq[1], nq[2],
280304Sjkim                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
280304Sjkim            } else {
280304Sjkim                smallfelem_expand(nq[0], tmp[0]);
280304Sjkim                smallfelem_expand(nq[1], tmp[1]);
280304Sjkim                smallfelem_expand(nq[2], tmp[2]);
280304Sjkim                skip = 0;
280304Sjkim            }
238384Sjkim
280304Sjkim            /*
280304Sjkim             * second, look at the current position: bits i, i+64, i+128 and
280304Sjkim             * i+192 of g_scalar form a 4-bit index into g_pre_comp[0]
280304Sjkim             */
280304Sjkim            bits = get_bit(g_scalar, i + 192) << 3;
280304Sjkim            bits |= get_bit(g_scalar, i + 128) << 2;
280304Sjkim            bits |= get_bit(g_scalar, i + 64) << 1;
280304Sjkim            bits |= get_bit(g_scalar, i);
280304Sjkim            /* select the point to add, in constant time */
280304Sjkim            select_point(bits, 16, g_pre_comp[0], tmp);
280304Sjkim            /* Arg 1 below is for "mixed" */
280304Sjkim            point_add(nq[0], nq[1], nq[2],
280304Sjkim                      nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
280304Sjkim        }
238384Sjkim
280304Sjkim        /* do other additions every 5 doublings */
280304Sjkim        if (num_points && (i % 5 == 0)) {
280304Sjkim            /*
280304Sjkim             * loop over all scalars: gather the 6-bit window i+4 .. i-1 and
280304Sjkim             * have it recoded into a sign and a digit
280304Sjkim             */
280304Sjkim            for (num = 0; num < num_points; ++num) {
280304Sjkim                bits = get_bit(scalars[num], i + 4) << 5;
280304Sjkim                bits |= get_bit(scalars[num], i + 3) << 4;
280304Sjkim                bits |= get_bit(scalars[num], i + 2) << 3;
280304Sjkim                bits |= get_bit(scalars[num], i + 1) << 2;
280304Sjkim                bits |= get_bit(scalars[num], i) << 1;
280304Sjkim                bits |= get_bit(scalars[num], i - 1);
280304Sjkim                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
238384Sjkim
280304Sjkim                /*
280304Sjkim                 * select the point to add or subtract, in constant time;
280304Sjkim                 * digit indexes the 17-entry table pre_comp[num].
280304Sjkim                 * NOTE(review): the mask (sign - 1) is all ones when sign is
280304Sjkim                 * 0, which presumably makes copy_small_conditional() put
280304Sjkim                 * the original +Y back - confirm against its definition.
280304Sjkim                 */
280304Sjkim                select_point(digit, 17, pre_comp[num], tmp);
280304Sjkim                smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
280304Sjkim                                               * point */
280304Sjkim                copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
280304Sjkim                felem_contract(tmp[1], ftmp);
238384Sjkim
280304Sjkim                if (!skip) {
280304Sjkim                    point_add(nq[0], nq[1], nq[2],
280304Sjkim                              nq[0], nq[1], nq[2],
280304Sjkim                              mixed, tmp[0], tmp[1], tmp[2]);
280304Sjkim                } else {
280304Sjkim                    smallfelem_expand(nq[0], tmp[0]);
280304Sjkim                    smallfelem_expand(nq[1], tmp[1]);
280304Sjkim                    smallfelem_expand(nq[2], tmp[2]);
280304Sjkim                    skip = 0;
280304Sjkim                }
280304Sjkim            }
280304Sjkim        }
280304Sjkim    }
280304Sjkim    felem_assign(x_out, nq[0]);
280304Sjkim    felem_assign(y_out, nq[1]);
280304Sjkim    felem_assign(z_out, nq[2]);
280304Sjkim}
238384Sjkim
238384Sjkim/*
238384Sjkim * Precomputation for the group generator.
238384Sjkim *
238384Sjkim * Objects are shared rather than copied: nistp256_pre_comp_dup() only bumps
238384Sjkim * |references| and the free functions release the memory once the count is
238384Sjkim * no longer positive.
238384Sjkim */
238384Sjkimtypedef struct {
280304Sjkim    smallfelem g_pre_comp[2][16][3];    /* two tables of 16 (X, Y, Z) points;
280304Sjkim                                         * entry [0][1] is read back as the
280304Sjkim                                         * generator by points_mul */
280304Sjkim    int references;                     /* reference count, updated with
280304Sjkim                                         * CRYPTO_add() under
280304Sjkim                                         * CRYPTO_LOCK_EC_PRE_COMP */
238384Sjkim} NISTP256_PRE_COMP;
238384Sjkim
238384Sjkimconst EC_METHOD *EC_GFp_nistp256_method(void)
280304Sjkim{
280304Sjkim    static const EC_METHOD ret = {
280304Sjkim        EC_FLAGS_DEFAULT_OCT,
280304Sjkim        NID_X9_62_prime_field,
280304Sjkim        ec_GFp_nistp256_group_init,
280304Sjkim        ec_GFp_simple_group_finish,
280304Sjkim        ec_GFp_simple_group_clear_finish,
280304Sjkim        ec_GFp_nist_group_copy,
280304Sjkim        ec_GFp_nistp256_group_set_curve,
280304Sjkim        ec_GFp_simple_group_get_curve,
280304Sjkim        ec_GFp_simple_group_get_degree,
280304Sjkim        ec_GFp_simple_group_check_discriminant,
280304Sjkim        ec_GFp_simple_point_init,
280304Sjkim        ec_GFp_simple_point_finish,
280304Sjkim        ec_GFp_simple_point_clear_finish,
280304Sjkim        ec_GFp_simple_point_copy,
280304Sjkim        ec_GFp_simple_point_set_to_infinity,
280304Sjkim        ec_GFp_simple_set_Jprojective_coordinates_GFp,
280304Sjkim        ec_GFp_simple_get_Jprojective_coordinates_GFp,
280304Sjkim        ec_GFp_simple_point_set_affine_coordinates,
280304Sjkim        ec_GFp_nistp256_point_get_affine_coordinates,
280304Sjkim        0 /* point_set_compressed_coordinates */ ,
280304Sjkim        0 /* point2oct */ ,
280304Sjkim        0 /* oct2point */ ,
280304Sjkim        ec_GFp_simple_add,
280304Sjkim        ec_GFp_simple_dbl,
280304Sjkim        ec_GFp_simple_invert,
280304Sjkim        ec_GFp_simple_is_at_infinity,
280304Sjkim        ec_GFp_simple_is_on_curve,
280304Sjkim        ec_GFp_simple_cmp,
280304Sjkim        ec_GFp_simple_make_affine,
280304Sjkim        ec_GFp_simple_points_make_affine,
280304Sjkim        ec_GFp_nistp256_points_mul,
280304Sjkim        ec_GFp_nistp256_precompute_mult,
280304Sjkim        ec_GFp_nistp256_have_precompute_mult,
280304Sjkim        ec_GFp_nist_field_mul,
280304Sjkim        ec_GFp_nist_field_sqr,
280304Sjkim        0 /* field_div */ ,
280304Sjkim        0 /* field_encode */ ,
280304Sjkim        0 /* field_decode */ ,
280304Sjkim        0                       /* field_set_to_one */
280304Sjkim    };
238384Sjkim
280304Sjkim    return &ret;
280304Sjkim}
238384Sjkim
238384Sjkim/******************************************************************************/
280304Sjkim/*
280304Sjkim * FUNCTIONS TO MANAGE PRECOMPUTATION
238384Sjkim */
238384Sjkim
238384Sjkimstatic NISTP256_PRE_COMP *nistp256_pre_comp_new()
280304Sjkim{
280304Sjkim    NISTP256_PRE_COMP *ret = NULL;
280304Sjkim    ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
280304Sjkim    if (!ret) {
280304Sjkim        ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
280304Sjkim        return ret;
280304Sjkim    }
280304Sjkim    memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
280304Sjkim    ret->references = 1;
280304Sjkim    return ret;
280304Sjkim}
238384Sjkim
238384Sjkimstatic void *nistp256_pre_comp_dup(void *src_)
280304Sjkim{
280304Sjkim    NISTP256_PRE_COMP *src = src_;
238384Sjkim
280304Sjkim    /* no need to actually copy, these objects never change! */
280304Sjkim    CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
238384Sjkim
280304Sjkim    return src_;
280304Sjkim}
238384Sjkim
238384Sjkimstatic void nistp256_pre_comp_free(void *pre_)
280304Sjkim{
280304Sjkim    int i;
280304Sjkim    NISTP256_PRE_COMP *pre = pre_;
238384Sjkim
280304Sjkim    if (!pre)
280304Sjkim        return;
238384Sjkim
280304Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
280304Sjkim    if (i > 0)
280304Sjkim        return;
238384Sjkim
280304Sjkim    OPENSSL_free(pre);
280304Sjkim}
238384Sjkim
238384Sjkimstatic void nistp256_pre_comp_clear_free(void *pre_)
280304Sjkim{
280304Sjkim    int i;
280304Sjkim    NISTP256_PRE_COMP *pre = pre_;
238384Sjkim
280304Sjkim    if (!pre)
280304Sjkim        return;
238384Sjkim
280304Sjkim    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
280304Sjkim    if (i > 0)
280304Sjkim        return;
238384Sjkim
280304Sjkim    OPENSSL_cleanse(pre, sizeof *pre);
280304Sjkim    OPENSSL_free(pre);
280304Sjkim}
238384Sjkim
238384Sjkim/******************************************************************************/
280304Sjkim/*
280304Sjkim * OPENSSL EC_METHOD FUNCTIONS
238384Sjkim */
238384Sjkim
238384Sjkimint ec_GFp_nistp256_group_init(EC_GROUP *group)
280304Sjkim{
280304Sjkim    int ret;
280304Sjkim    ret = ec_GFp_simple_group_init(group);
280304Sjkim    group->a_is_minus3 = 1;
280304Sjkim    return ret;
280304Sjkim}
238384Sjkim
238384Sjkimint ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
280304Sjkim                                    const BIGNUM *a, const BIGNUM *b,
280304Sjkim                                    BN_CTX *ctx)
280304Sjkim{
280304Sjkim    int ret = 0;
280304Sjkim    BN_CTX *new_ctx = NULL;
280304Sjkim    BIGNUM *curve_p, *curve_a, *curve_b;
238384Sjkim
280304Sjkim    if (ctx == NULL)
280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
280304Sjkim            return 0;
280304Sjkim    BN_CTX_start(ctx);
280304Sjkim    if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
280304Sjkim        ((curve_a = BN_CTX_get(ctx)) == NULL) ||
280304Sjkim        ((curve_b = BN_CTX_get(ctx)) == NULL))
280304Sjkim        goto err;
280304Sjkim    BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
280304Sjkim    BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
280304Sjkim    BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
280304Sjkim    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
280304Sjkim        ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
280304Sjkim              EC_R_WRONG_CURVE_PARAMETERS);
280304Sjkim        goto err;
280304Sjkim    }
280304Sjkim    group->field_mod_func = BN_nist_mod_256;
280304Sjkim    ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
280304Sjkim err:
280304Sjkim    BN_CTX_end(ctx);
280304Sjkim    if (new_ctx != NULL)
280304Sjkim        BN_CTX_free(new_ctx);
280304Sjkim    return ret;
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
280304Sjkim * (X/Z^2, Y/Z^3)
280304Sjkim */
238384Sjkimint ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
280304Sjkim                                                 const EC_POINT *point,
280304Sjkim                                                 BIGNUM *x, BIGNUM *y,
280304Sjkim                                                 BN_CTX *ctx)
280304Sjkim{
280304Sjkim    felem z1, z2, x_in, y_in;
280304Sjkim    smallfelem x_out, y_out;
280304Sjkim    longfelem tmp;
238384Sjkim
280304Sjkim    if (EC_POINT_is_at_infinity(group, point)) {
280304Sjkim        ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
280304Sjkim              EC_R_POINT_AT_INFINITY);
280304Sjkim        return 0;
280304Sjkim    }
280304Sjkim    if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
280304Sjkim        (!BN_to_felem(z1, &point->Z)))
280304Sjkim        return 0;
280304Sjkim    felem_inv(z2, z1);
280304Sjkim    felem_square(tmp, z2);
280304Sjkim    felem_reduce(z1, tmp);
280304Sjkim    felem_mul(tmp, x_in, z1);
280304Sjkim    felem_reduce(x_in, tmp);
280304Sjkim    felem_contract(x_out, x_in);
280304Sjkim    if (x != NULL) {
280304Sjkim        if (!smallfelem_to_BN(x, x_out)) {
280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
280304Sjkim                  ERR_R_BN_LIB);
280304Sjkim            return 0;
280304Sjkim        }
280304Sjkim    }
280304Sjkim    felem_mul(tmp, z1, z2);
280304Sjkim    felem_reduce(z1, tmp);
280304Sjkim    felem_mul(tmp, y_in, z1);
280304Sjkim    felem_reduce(y_in, tmp);
280304Sjkim    felem_contract(y_out, y_in);
280304Sjkim    if (y != NULL) {
280304Sjkim        if (!smallfelem_to_BN(y, y_out)) {
280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
280304Sjkim                  ERR_R_BN_LIB);
280304Sjkim            return 0;
280304Sjkim        }
280304Sjkim    }
280304Sjkim    return 1;
280304Sjkim}
238384Sjkim
280304Sjkim/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
280304Sjkimstatic void make_points_affine(size_t num, smallfelem points[][3],
280304Sjkim                               smallfelem tmp_smallfelems[])
280304Sjkim{
280304Sjkim    /*
280304Sjkim     * Runs in constant time, unless an input is the point at infinity (which
280304Sjkim     * normally shouldn't happen).
280304Sjkim     */
280304Sjkim    ec_GFp_nistp_points_make_affine_internal(num,
280304Sjkim                                             points,
280304Sjkim                                             sizeof(smallfelem),
280304Sjkim                                             tmp_smallfelems,
280304Sjkim                                             (void (*)(void *))smallfelem_one,
280304Sjkim                                             (int (*)(const void *))
280304Sjkim                                             smallfelem_is_zero_int,
280304Sjkim                                             (void (*)(void *, const void *))
280304Sjkim                                             smallfelem_assign,
280304Sjkim                                             (void (*)(void *, const void *))
280304Sjkim                                             smallfelem_square_contract,
280304Sjkim                                             (void (*)
280304Sjkim                                              (void *, const void *,
280304Sjkim                                               const void *))
280304Sjkim                                             smallfelem_mul_contract,
280304Sjkim                                             (void (*)(void *, const void *))
280304Sjkim                                             smallfelem_inv_contract,
280304Sjkim                                             /* nothing to contract */
280304Sjkim                                             (void (*)(void *, const void *))
280304Sjkim                                             smallfelem_assign);
280304Sjkim}
238384Sjkim
280304Sjkim/*
280304Sjkim * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
280304Sjkim * values Result is stored in r (r can equal one of the inputs).
280304Sjkim */
238384Sjkimint ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
280304Sjkim                               const BIGNUM *scalar, size_t num,
280304Sjkim                               const EC_POINT *points[],
280304Sjkim                               const BIGNUM *scalars[], BN_CTX *ctx)
280304Sjkim{
280304Sjkim    int ret = 0;
280304Sjkim    int j;
280304Sjkim    int mixed = 0;
280304Sjkim    BN_CTX *new_ctx = NULL;
280304Sjkim    BIGNUM *x, *y, *z, *tmp_scalar;
280304Sjkim    felem_bytearray g_secret;
280304Sjkim    felem_bytearray *secrets = NULL;
280304Sjkim    smallfelem(*pre_comp)[17][3] = NULL;
280304Sjkim    smallfelem *tmp_smallfelems = NULL;
280304Sjkim    felem_bytearray tmp;
280304Sjkim    unsigned i, num_bytes;
280304Sjkim    int have_pre_comp = 0;
280304Sjkim    size_t num_points = num;
280304Sjkim    smallfelem x_in, y_in, z_in;
280304Sjkim    felem x_out, y_out, z_out;
280304Sjkim    NISTP256_PRE_COMP *pre = NULL;
280304Sjkim    const smallfelem(*g_pre_comp)[16][3] = NULL;
280304Sjkim    EC_POINT *generator = NULL;
280304Sjkim    const EC_POINT *p = NULL;
280304Sjkim    const BIGNUM *p_scalar = NULL;
238384Sjkim
280304Sjkim    if (ctx == NULL)
280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
280304Sjkim            return 0;
280304Sjkim    BN_CTX_start(ctx);
280304Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) ||
280304Sjkim        ((y = BN_CTX_get(ctx)) == NULL) ||
280304Sjkim        ((z = BN_CTX_get(ctx)) == NULL) ||
280304Sjkim        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
280304Sjkim        goto err;
238384Sjkim
280304Sjkim    if (scalar != NULL) {
280304Sjkim        pre = EC_EX_DATA_get_data(group->extra_data,
280304Sjkim                                  nistp256_pre_comp_dup,
280304Sjkim                                  nistp256_pre_comp_free,
280304Sjkim                                  nistp256_pre_comp_clear_free);
280304Sjkim        if (pre)
280304Sjkim            /* we have precomputation, try to use it */
280304Sjkim            g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
280304Sjkim        else
280304Sjkim            /* try to use the standard precomputation */
280304Sjkim            g_pre_comp = &gmul[0];
280304Sjkim        generator = EC_POINT_new(group);
280304Sjkim        if (generator == NULL)
280304Sjkim            goto err;
280304Sjkim        /* get the generator from precomputation */
280304Sjkim        if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
280304Sjkim            !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
280304Sjkim            !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
280304Sjkim            goto err;
280304Sjkim        }
280304Sjkim        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
280304Sjkim                                                      generator, x, y, z,
280304Sjkim                                                      ctx))
280304Sjkim            goto err;
280304Sjkim        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
280304Sjkim            /* precomputation matches generator */
280304Sjkim            have_pre_comp = 1;
280304Sjkim        else
280304Sjkim            /*
280304Sjkim             * we don't have valid precomputation: treat the generator as a
280304Sjkim             * random point
280304Sjkim             */
280304Sjkim            num_points++;
280304Sjkim    }
280304Sjkim    if (num_points > 0) {
280304Sjkim        if (num_points >= 3) {
280304Sjkim            /*
280304Sjkim             * unless we precompute multiples for just one or two points,
280304Sjkim             * converting those into affine form is time well spent
280304Sjkim             */
280304Sjkim            mixed = 1;
280304Sjkim        }
280304Sjkim        secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
280304Sjkim        pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
280304Sjkim        if (mixed)
280304Sjkim            tmp_smallfelems =
280304Sjkim                OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
280304Sjkim        if ((secrets == NULL) || (pre_comp == NULL)
280304Sjkim            || (mixed && (tmp_smallfelems == NULL))) {
280304Sjkim            ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
280304Sjkim            goto err;
280304Sjkim        }
238384Sjkim
280304Sjkim        /*
280304Sjkim         * we treat NULL scalars as 0, and NULL points as points at infinity,
280304Sjkim         * i.e., they contribute nothing to the linear combination
280304Sjkim         */
280304Sjkim        memset(secrets, 0, num_points * sizeof(felem_bytearray));
280304Sjkim        memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
280304Sjkim        for (i = 0; i < num_points; ++i) {
280304Sjkim            if (i == num)
280304Sjkim                /*
280304Sjkim                 * we didn't have a valid precomputation, so we pick the
280304Sjkim                 * generator
280304Sjkim                 */
280304Sjkim            {
280304Sjkim                p = EC_GROUP_get0_generator(group);
280304Sjkim                p_scalar = scalar;
280304Sjkim            } else
280304Sjkim                /* the i^th point */
280304Sjkim            {
280304Sjkim                p = points[i];
280304Sjkim                p_scalar = scalars[i];
280304Sjkim            }
280304Sjkim            if ((p_scalar != NULL) && (p != NULL)) {
280304Sjkim                /* reduce scalar to 0 <= scalar < 2^256 */
280304Sjkim                if ((BN_num_bits(p_scalar) > 256)
280304Sjkim                    || (BN_is_negative(p_scalar))) {
280304Sjkim                    /*
280304Sjkim                     * this is an unusual input, and we don't guarantee
280304Sjkim                     * constant-timeness
280304Sjkim                     */
280304Sjkim                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
280304Sjkim                        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
280304Sjkim                        goto err;
280304Sjkim                    }
280304Sjkim                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
280304Sjkim                } else
280304Sjkim                    num_bytes = BN_bn2bin(p_scalar, tmp);
280304Sjkim                flip_endian(secrets[i], tmp, num_bytes);
280304Sjkim                /* precompute multiples */
280304Sjkim                if ((!BN_to_felem(x_out, &p->X)) ||
280304Sjkim                    (!BN_to_felem(y_out, &p->Y)) ||
280304Sjkim                    (!BN_to_felem(z_out, &p->Z)))
280304Sjkim                    goto err;
280304Sjkim                felem_shrink(pre_comp[i][1][0], x_out);
280304Sjkim                felem_shrink(pre_comp[i][1][1], y_out);
280304Sjkim                felem_shrink(pre_comp[i][1][2], z_out);
280304Sjkim                for (j = 2; j <= 16; ++j) {
280304Sjkim                    if (j & 1) {
280304Sjkim                        point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
280304Sjkim                                        pre_comp[i][j][2], pre_comp[i][1][0],
280304Sjkim                                        pre_comp[i][1][1], pre_comp[i][1][2],
280304Sjkim                                        pre_comp[i][j - 1][0],
280304Sjkim                                        pre_comp[i][j - 1][1],
280304Sjkim                                        pre_comp[i][j - 1][2]);
280304Sjkim                    } else {
280304Sjkim                        point_double_small(pre_comp[i][j][0],
280304Sjkim                                           pre_comp[i][j][1],
280304Sjkim                                           pre_comp[i][j][2],
280304Sjkim                                           pre_comp[i][j / 2][0],
280304Sjkim                                           pre_comp[i][j / 2][1],
280304Sjkim                                           pre_comp[i][j / 2][2]);
280304Sjkim                    }
280304Sjkim                }
280304Sjkim            }
280304Sjkim        }
280304Sjkim        if (mixed)
280304Sjkim            make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
280304Sjkim    }
238384Sjkim
280304Sjkim    /* the scalar for the generator */
280304Sjkim    if ((scalar != NULL) && (have_pre_comp)) {
280304Sjkim        memset(g_secret, 0, sizeof(g_secret));
280304Sjkim        /* reduce scalar to 0 <= scalar < 2^256 */
280304Sjkim        if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
280304Sjkim            /*
280304Sjkim             * this is an unusual input, and we don't guarantee
280304Sjkim             * constant-timeness
280304Sjkim             */
280304Sjkim            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
280304Sjkim                ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
280304Sjkim                goto err;
280304Sjkim            }
280304Sjkim            num_bytes = BN_bn2bin(tmp_scalar, tmp);
280304Sjkim        } else
280304Sjkim            num_bytes = BN_bn2bin(scalar, tmp);
280304Sjkim        flip_endian(g_secret, tmp, num_bytes);
280304Sjkim        /* do the multiplication with generator precomputation */
280304Sjkim        batch_mul(x_out, y_out, z_out,
280304Sjkim                  (const felem_bytearray(*))secrets, num_points,
280304Sjkim                  g_secret,
280304Sjkim                  mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
280304Sjkim    } else
280304Sjkim        /* do the multiplication without generator precomputation */
280304Sjkim        batch_mul(x_out, y_out, z_out,
280304Sjkim                  (const felem_bytearray(*))secrets, num_points,
280304Sjkim                  NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
280304Sjkim    /* reduce the output to its unique minimal representation */
280304Sjkim    felem_contract(x_in, x_out);
280304Sjkim    felem_contract(y_in, y_out);
280304Sjkim    felem_contract(z_in, z_out);
280304Sjkim    if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
280304Sjkim        (!smallfelem_to_BN(z, z_in))) {
280304Sjkim        ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
280304Sjkim        goto err;
280304Sjkim    }
280304Sjkim    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
238384Sjkim
280304Sjkim err:
280304Sjkim    BN_CTX_end(ctx);
280304Sjkim    if (generator != NULL)
280304Sjkim        EC_POINT_free(generator);
280304Sjkim    if (new_ctx != NULL)
280304Sjkim        BN_CTX_free(new_ctx);
280304Sjkim    if (secrets != NULL)
280304Sjkim        OPENSSL_free(secrets);
280304Sjkim    if (pre_comp != NULL)
280304Sjkim        OPENSSL_free(pre_comp);
280304Sjkim    if (tmp_smallfelems != NULL)
280304Sjkim        OPENSSL_free(tmp_smallfelems);
280304Sjkim    return ret;
280304Sjkim}
238384Sjkim
238384Sjkimint ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
280304Sjkim{
280304Sjkim    int ret = 0;
280304Sjkim    NISTP256_PRE_COMP *pre = NULL;
280304Sjkim    int i, j;
280304Sjkim    BN_CTX *new_ctx = NULL;
280304Sjkim    BIGNUM *x, *y;
280304Sjkim    EC_POINT *generator = NULL;
280304Sjkim    smallfelem tmp_smallfelems[32];
280304Sjkim    felem x_tmp, y_tmp, z_tmp;
238384Sjkim
280304Sjkim    /* throw away old precomputation */
280304Sjkim    EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
280304Sjkim                         nistp256_pre_comp_free,
280304Sjkim                         nistp256_pre_comp_clear_free);
280304Sjkim    if (ctx == NULL)
280304Sjkim        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
280304Sjkim            return 0;
280304Sjkim    BN_CTX_start(ctx);
280304Sjkim    if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
280304Sjkim        goto err;
280304Sjkim    /* get the generator */
280304Sjkim    if (group->generator == NULL)
280304Sjkim        goto err;
280304Sjkim    generator = EC_POINT_new(group);
280304Sjkim    if (generator == NULL)
280304Sjkim        goto err;
280304Sjkim    BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
280304Sjkim    BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
280304Sjkim    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
280304Sjkim        goto err;
280304Sjkim    if ((pre = nistp256_pre_comp_new()) == NULL)
280304Sjkim        goto err;
280304Sjkim    /*
280304Sjkim     * if the generator is the standard one, use built-in precomputation
280304Sjkim     */
280304Sjkim    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
280304Sjkim        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
280304Sjkim        ret = 1;
280304Sjkim        goto err;
280304Sjkim    }
280304Sjkim    if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
280304Sjkim        (!BN_to_felem(y_tmp, &group->generator->Y)) ||
280304Sjkim        (!BN_to_felem(z_tmp, &group->generator->Z)))
280304Sjkim        goto err;
280304Sjkim    felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
280304Sjkim    felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
280304Sjkim    felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
280304Sjkim    /*
280304Sjkim     * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
280304Sjkim     * 2^160*G, 2^224*G for the second one
280304Sjkim     */
280304Sjkim    for (i = 1; i <= 8; i <<= 1) {
280304Sjkim        point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
280304Sjkim                           pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
280304Sjkim                           pre->g_pre_comp[0][i][1],
280304Sjkim                           pre->g_pre_comp[0][i][2]);
280304Sjkim        for (j = 0; j < 31; ++j) {
280304Sjkim            point_double_small(pre->g_pre_comp[1][i][0],
280304Sjkim                               pre->g_pre_comp[1][i][1],
280304Sjkim                               pre->g_pre_comp[1][i][2],
280304Sjkim                               pre->g_pre_comp[1][i][0],
280304Sjkim                               pre->g_pre_comp[1][i][1],
280304Sjkim                               pre->g_pre_comp[1][i][2]);
280304Sjkim        }
280304Sjkim        if (i == 8)
280304Sjkim            break;
280304Sjkim        point_double_small(pre->g_pre_comp[0][2 * i][0],
280304Sjkim                           pre->g_pre_comp[0][2 * i][1],
280304Sjkim                           pre->g_pre_comp[0][2 * i][2],
280304Sjkim                           pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
280304Sjkim                           pre->g_pre_comp[1][i][2]);
280304Sjkim        for (j = 0; j < 31; ++j) {
280304Sjkim            point_double_small(pre->g_pre_comp[0][2 * i][0],
280304Sjkim                               pre->g_pre_comp[0][2 * i][1],
280304Sjkim                               pre->g_pre_comp[0][2 * i][2],
280304Sjkim                               pre->g_pre_comp[0][2 * i][0],
280304Sjkim                               pre->g_pre_comp[0][2 * i][1],
280304Sjkim                               pre->g_pre_comp[0][2 * i][2]);
280304Sjkim        }
280304Sjkim    }
280304Sjkim    for (i = 0; i < 2; i++) {
280304Sjkim        /* g_pre_comp[i][0] is the point at infinity */
280304Sjkim        memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
280304Sjkim        /* the remaining multiples */
280304Sjkim        /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
280304Sjkim        point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
280304Sjkim                        pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
280304Sjkim                        pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
280304Sjkim                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
280304Sjkim                        pre->g_pre_comp[i][2][2]);
280304Sjkim        /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
280304Sjkim        point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
280304Sjkim                        pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
280304Sjkim                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
280304Sjkim                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
280304Sjkim                        pre->g_pre_comp[i][2][2]);
280304Sjkim        /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
280304Sjkim        point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
280304Sjkim                        pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
280304Sjkim                        pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
280304Sjkim                        pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
280304Sjkim                        pre->g_pre_comp[i][4][2]);
280304Sjkim        /*
280304Sjkim         * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
280304Sjkim         */
280304Sjkim        point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
280304Sjkim                        pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
280304Sjkim                        pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
280304Sjkim                        pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
280304Sjkim                        pre->g_pre_comp[i][2][2]);
280304Sjkim        for (j = 1; j < 8; ++j) {
280304Sjkim            /* odd multiples: add G resp. 2^32*G */
280304Sjkim            point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
280304Sjkim                            pre->g_pre_comp[i][2 * j + 1][1],
280304Sjkim                            pre->g_pre_comp[i][2 * j + 1][2],
280304Sjkim                            pre->g_pre_comp[i][2 * j][0],
280304Sjkim                            pre->g_pre_comp[i][2 * j][1],
280304Sjkim                            pre->g_pre_comp[i][2 * j][2],
280304Sjkim                            pre->g_pre_comp[i][1][0],
280304Sjkim                            pre->g_pre_comp[i][1][1],
280304Sjkim                            pre->g_pre_comp[i][1][2]);
280304Sjkim        }
280304Sjkim    }
280304Sjkim    make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
238384Sjkim
280304Sjkim    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
280304Sjkim                             nistp256_pre_comp_free,
280304Sjkim                             nistp256_pre_comp_clear_free))
280304Sjkim        goto err;
280304Sjkim    ret = 1;
280304Sjkim    pre = NULL;
238384Sjkim err:
280304Sjkim    BN_CTX_end(ctx);
280304Sjkim    if (generator != NULL)
280304Sjkim        EC_POINT_free(generator);
280304Sjkim    if (new_ctx != NULL)
280304Sjkim        BN_CTX_free(new_ctx);
280304Sjkim    if (pre)
280304Sjkim        nistp256_pre_comp_free(pre);
280304Sjkim    return ret;
280304Sjkim}
238384Sjkim
238384Sjkimint ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
280304Sjkim{
280304Sjkim    if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
280304Sjkim                            nistp256_pre_comp_free,
280304Sjkim                            nistp256_pre_comp_clear_free)
280304Sjkim        != NULL)
280304Sjkim        return 1;
280304Sjkim    else
280304Sjkim        return 0;
280304Sjkim}
238384Sjkim#else
/*
 * Placeholder definition for the #else branch. NOTE(review): this #else
 * presumably pairs with the "#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128" near
 * the top of the file, so this is all that remains when the P-256 code is
 * compiled out -- likely there to keep the translation unit non-empty;
 * confirm.
 */
280304Sjkimstatic void *dummy = &dummy;
238384Sjkim#endif