1238384Sjkim/* crypto/ec/ecp_nistp256.c */ 2238384Sjkim/* 3238384Sjkim * Written by Adam Langley (Google) for the OpenSSL project 4238384Sjkim */ 5238384Sjkim/* Copyright 2011 Google Inc. 6238384Sjkim * 7238384Sjkim * Licensed under the Apache License, Version 2.0 (the "License"); 8238384Sjkim * 9238384Sjkim * you may not use this file except in compliance with the License. 10238384Sjkim * You may obtain a copy of the License at 11238384Sjkim * 12238384Sjkim * http://www.apache.org/licenses/LICENSE-2.0 13238384Sjkim * 14238384Sjkim * Unless required by applicable law or agreed to in writing, software 15238384Sjkim * distributed under the License is distributed on an "AS IS" BASIS, 16238384Sjkim * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17238384Sjkim * See the License for the specific language governing permissions and 18238384Sjkim * limitations under the License. 19238384Sjkim */ 20238384Sjkim 21238384Sjkim/* 22238384Sjkim * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication 23238384Sjkim * 24238384Sjkim * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. 25238384Sjkim * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 26238384Sjkim * work which got its smarts from Daniel J. Bernstein's work on the same. 27238384Sjkim */ 28238384Sjkim 29238384Sjkim#include <openssl/opensslconf.h> 30238384Sjkim#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 31238384Sjkim 32280304Sjkim# ifndef OPENSSL_SYS_VMS 33280304Sjkim# include <stdint.h> 34280304Sjkim# else 35280304Sjkim# include <inttypes.h> 36280304Sjkim# endif 37238384Sjkim 38280304Sjkim# include <string.h> 39280304Sjkim# include <openssl/err.h> 40280304Sjkim# include "ec_lcl.h" 41238384Sjkim 42280304Sjkim# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) 43238384Sjkim /* even with gcc, the typedef won't work for 32-bit platforms */ 44280304Sjkimtypedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit 45280304Sjkim * platforms */ 46280304Sjkimtypedef __int128_t int128_t; 47280304Sjkim# else 48280304Sjkim# error "Need GCC 3.1 or later to define type uint128_t" 49280304Sjkim# endif 50238384Sjkim 51238384Sjkimtypedef uint8_t u8; 52238384Sjkimtypedef uint32_t u32; 53238384Sjkimtypedef uint64_t u64; 54238384Sjkimtypedef int64_t s64; 55238384Sjkim 56280304Sjkim/* 57280304Sjkim * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We 58280304Sjkim * can serialise an element of this field into 32 bytes. We call this an 59280304Sjkim * felem_bytearray. 60280304Sjkim */ 61238384Sjkim 62238384Sjkimtypedef u8 felem_bytearray[32]; 63238384Sjkim 64280304Sjkim/* 65280304Sjkim * These are the parameters of P256, taken from FIPS 186-3, page 86. These 66280304Sjkim * values are big-endian. 67280304Sjkim */ 68238384Sjkimstatic const felem_bytearray nistp256_curve_params[5] = { 69280304Sjkim {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */ 70280304Sjkim 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 71280304Sjkim 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 72280304Sjkim 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, 73280304Sjkim {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */ 74280304Sjkim 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 75280304Sjkim 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 76280304Sjkim 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */ 77280304Sjkim {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, 78280304Sjkim 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, 79280304Sjkim 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, 80280304Sjkim 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b}, 81280304Sjkim {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */ 82280304Sjkim 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 83280304Sjkim 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, 84280304Sjkim 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96}, 85280304Sjkim {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */ 86280304Sjkim 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16, 87280304Sjkim 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 88280304Sjkim 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5} 89238384Sjkim}; 90238384Sjkim 91280304Sjkim/*- 92280304Sjkim * The representation of field elements. 93238384Sjkim * ------------------------------------ 94238384Sjkim * 95238384Sjkim * We represent field elements with either four 128-bit values, eight 128-bit 96238384Sjkim * values, or four 64-bit values. The field element represented is: 97238384Sjkim * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p) 98238384Sjkim * or: 99238384Sjkim * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512 (mod p) 100238384Sjkim * 101238384Sjkim * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits 102238384Sjkim * apart, but are 128-bits wide, the most significant bits of each limb overlap 103238384Sjkim * with the least significant bits of the next. 104238384Sjkim * 105238384Sjkim * A field element with four limbs is an 'felem'. One with eight limbs is a 106238384Sjkim * 'longfelem' 107238384Sjkim * 108238384Sjkim * A field element with four, 64-bit values is called a 'smallfelem'. Small 109238384Sjkim * values are used as intermediate values before multiplication. 110238384Sjkim */ 111238384Sjkim 112280304Sjkim# define NLIMBS 4 113238384Sjkim 114238384Sjkimtypedef uint128_t limb; 115238384Sjkimtypedef limb felem[NLIMBS]; 116238384Sjkimtypedef limb longfelem[NLIMBS * 2]; 117238384Sjkimtypedef u64 smallfelem[NLIMBS]; 118238384Sjkim 119238384Sjkim/* This is the value of the prime as four 64-bit words, little-endian. */ 120280304Sjkimstatic const u64 kPrime[4] = 121280304Sjkim { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul }; 122238384Sjkimstatic const u64 bottom63bits = 0x7ffffffffffffffful; 123238384Sjkim 124280304Sjkim/* 125280304Sjkim * bin32_to_felem takes a little-endian byte array and converts it into felem 126280304Sjkim * form. This assumes that the CPU is little-endian. 127280304Sjkim */ 128238384Sjkimstatic void bin32_to_felem(felem out, const u8 in[32]) 129280304Sjkim{ 130280304Sjkim out[0] = *((u64 *)&in[0]); 131280304Sjkim out[1] = *((u64 *)&in[8]); 132280304Sjkim out[2] = *((u64 *)&in[16]); 133280304Sjkim out[3] = *((u64 *)&in[24]); 134280304Sjkim} 135238384Sjkim 136280304Sjkim/* 137280304Sjkim * smallfelem_to_bin32 takes a smallfelem and serialises into a little 138280304Sjkim * endian, 32 byte array. This assumes that the CPU is little-endian. 139280304Sjkim */ 140238384Sjkimstatic void smallfelem_to_bin32(u8 out[32], const smallfelem in) 141280304Sjkim{ 142280304Sjkim *((u64 *)&out[0]) = in[0]; 143280304Sjkim *((u64 *)&out[8]) = in[1]; 144280304Sjkim *((u64 *)&out[16]) = in[2]; 145280304Sjkim *((u64 *)&out[24]) = in[3]; 146280304Sjkim} 147238384Sjkim 148238384Sjkim/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ 149238384Sjkimstatic void flip_endian(u8 *out, const u8 *in, unsigned len) 150280304Sjkim{ 151280304Sjkim unsigned i; 152280304Sjkim for (i = 0; i < len; ++i) 153280304Sjkim out[i] = in[len - 1 - i]; 154280304Sjkim} 155238384Sjkim 156238384Sjkim/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ 157238384Sjkimstatic int BN_to_felem(felem out, const BIGNUM *bn) 158280304Sjkim{ 159280304Sjkim felem_bytearray b_in; 160280304Sjkim felem_bytearray b_out; 161280304Sjkim unsigned num_bytes; 162238384Sjkim 163280304Sjkim /* BN_bn2bin eats leading zeroes */ 164280304Sjkim memset(b_out, 0, sizeof b_out); 165280304Sjkim num_bytes = BN_num_bytes(bn); 166280304Sjkim if (num_bytes > sizeof b_out) { 167280304Sjkim ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); 168280304Sjkim return 0; 169280304Sjkim } 170280304Sjkim if (BN_is_negative(bn)) { 171280304Sjkim ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); 172280304Sjkim return 0; 173280304Sjkim } 174280304Sjkim num_bytes = BN_bn2bin(bn, b_in); 175280304Sjkim flip_endian(b_out, b_in, num_bytes); 176280304Sjkim bin32_to_felem(out, b_out); 177280304Sjkim return 1; 178280304Sjkim} 179238384Sjkim 180238384Sjkim/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ 181238384Sjkimstatic BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) 182280304Sjkim{ 183280304Sjkim felem_bytearray b_in, b_out; 184280304Sjkim smallfelem_to_bin32(b_in, in); 185280304Sjkim flip_endian(b_out, b_in, sizeof b_out); 186280304Sjkim return BN_bin2bn(b_out, sizeof b_out, out); 187280304Sjkim} 188238384Sjkim 189280304Sjkim/*- 190280304Sjkim * Field operations 191280304Sjkim * ---------------- 192280304Sjkim */ 193238384Sjkim 194238384Sjkimstatic void smallfelem_one(smallfelem out) 195280304Sjkim{ 196280304Sjkim out[0] = 1; 197280304Sjkim out[1] = 0; 198280304Sjkim out[2] = 0; 199280304Sjkim out[3] = 0; 200280304Sjkim} 201238384Sjkim 202238384Sjkimstatic void smallfelem_assign(smallfelem out, const smallfelem in) 203280304Sjkim{ 204280304Sjkim out[0] = in[0]; 205280304Sjkim out[1] = in[1]; 206280304Sjkim out[2] = in[2]; 207280304Sjkim out[3] = in[3]; 208280304Sjkim} 209238384Sjkim 210238384Sjkimstatic void felem_assign(felem out, const felem in) 211280304Sjkim{ 212280304Sjkim out[0] = in[0]; 213280304Sjkim out[1] = in[1]; 214280304Sjkim out[2] = in[2]; 215280304Sjkim out[3] = in[3]; 216280304Sjkim} 217238384Sjkim 218238384Sjkim/* felem_sum sets out = out + in. */ 219238384Sjkimstatic void felem_sum(felem out, const felem in) 220280304Sjkim{ 221280304Sjkim out[0] += in[0]; 222280304Sjkim out[1] += in[1]; 223280304Sjkim out[2] += in[2]; 224280304Sjkim out[3] += in[3]; 225280304Sjkim} 226238384Sjkim 227238384Sjkim/* felem_small_sum sets out = out + in. */ 228238384Sjkimstatic void felem_small_sum(felem out, const smallfelem in) 229280304Sjkim{ 230280304Sjkim out[0] += in[0]; 231280304Sjkim out[1] += in[1]; 232280304Sjkim out[2] += in[2]; 233280304Sjkim out[3] += in[3]; 234280304Sjkim} 235238384Sjkim 236238384Sjkim/* felem_scalar sets out = out * scalar */ 237238384Sjkimstatic void felem_scalar(felem out, const u64 scalar) 238280304Sjkim{ 239280304Sjkim out[0] *= scalar; 240280304Sjkim out[1] *= scalar; 241280304Sjkim out[2] *= scalar; 242280304Sjkim out[3] *= scalar; 243280304Sjkim} 244238384Sjkim 245238384Sjkim/* longfelem_scalar sets out = out * scalar */ 246238384Sjkimstatic void longfelem_scalar(longfelem out, const u64 scalar) 247280304Sjkim{ 248280304Sjkim out[0] *= scalar; 249280304Sjkim out[1] *= scalar; 250280304Sjkim out[2] *= scalar; 251280304Sjkim out[3] *= scalar; 252280304Sjkim out[4] *= scalar; 253280304Sjkim out[5] *= scalar; 254280304Sjkim out[6] *= scalar; 255280304Sjkim out[7] *= scalar; 256280304Sjkim} 257238384Sjkim 258280304Sjkim# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9) 259280304Sjkim# define two105 (((limb)1) << 105) 260280304Sjkim# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9) 261238384Sjkim 262238384Sjkim/* zero105 is 0 mod p */ 263280304Sjkimstatic const felem zero105 = 264280304Sjkim { two105m41m9, two105, two105m41p9, two105m41p9 }; 265238384Sjkim 266280304Sjkim/*- 267280304Sjkim * smallfelem_neg sets |out| to |-small| 268238384Sjkim * On exit: 269238384Sjkim * out[i] < out[i] + 2^105 270238384Sjkim */ 271238384Sjkimstatic void smallfelem_neg(felem out, const smallfelem small) 272280304Sjkim{ 273280304Sjkim /* In order to prevent underflow, we subtract from 0 mod p. */ 274280304Sjkim out[0] = zero105[0] - small[0]; 275280304Sjkim out[1] = zero105[1] - small[1]; 276280304Sjkim out[2] = zero105[2] - small[2]; 277280304Sjkim out[3] = zero105[3] - small[3]; 278280304Sjkim} 279238384Sjkim 280280304Sjkim/*- 281280304Sjkim * felem_diff subtracts |in| from |out| 282238384Sjkim * On entry: 283238384Sjkim * in[i] < 2^104 284238384Sjkim * On exit: 285238384Sjkim * out[i] < out[i] + 2^105 286238384Sjkim */ 287238384Sjkimstatic void felem_diff(felem out, const felem in) 288280304Sjkim{ 289280304Sjkim /* 290280304Sjkim * In order to prevent underflow, we add 0 mod p before subtracting. 291280304Sjkim */ 292280304Sjkim out[0] += zero105[0]; 293280304Sjkim out[1] += zero105[1]; 294280304Sjkim out[2] += zero105[2]; 295280304Sjkim out[3] += zero105[3]; 296238384Sjkim 297280304Sjkim out[0] -= in[0]; 298280304Sjkim out[1] -= in[1]; 299280304Sjkim out[2] -= in[2]; 300280304Sjkim out[3] -= in[3]; 301280304Sjkim} 302238384Sjkim 303280304Sjkim# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11) 304280304Sjkim# define two107 (((limb)1) << 107) 305280304Sjkim# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11) 306238384Sjkim 307238384Sjkim/* zero107 is 0 mod p */ 308280304Sjkimstatic const felem zero107 = 309280304Sjkim { two107m43m11, two107, two107m43p11, two107m43p11 }; 310238384Sjkim 311280304Sjkim/*- 312280304Sjkim * An alternative felem_diff for larger inputs |in| 313238384Sjkim * felem_diff_zero107 subtracts |in| from |out| 314238384Sjkim * On entry: 315238384Sjkim * in[i] < 2^106 316238384Sjkim * On exit: 317238384Sjkim * out[i] < out[i] + 2^107 318238384Sjkim */ 319238384Sjkimstatic void felem_diff_zero107(felem out, const felem in) 320280304Sjkim{ 321280304Sjkim /* 322280304Sjkim * In order to prevent underflow, we add 0 mod p before subtracting. 323280304Sjkim */ 324280304Sjkim out[0] += zero107[0]; 325280304Sjkim out[1] += zero107[1]; 326280304Sjkim out[2] += zero107[2]; 327280304Sjkim out[3] += zero107[3]; 328238384Sjkim 329280304Sjkim out[0] -= in[0]; 330280304Sjkim out[1] -= in[1]; 331280304Sjkim out[2] -= in[2]; 332280304Sjkim out[3] -= in[3]; 333280304Sjkim} 334238384Sjkim 335280304Sjkim/*- 336280304Sjkim * longfelem_diff subtracts |in| from |out| 337238384Sjkim * On entry: 338238384Sjkim * in[i] < 7*2^67 339238384Sjkim * On exit: 340238384Sjkim * out[i] < out[i] + 2^70 + 2^40 341238384Sjkim */ 342238384Sjkimstatic void longfelem_diff(longfelem out, const longfelem in) 343280304Sjkim{ 344280304Sjkim static const limb two70m8p6 = 345280304Sjkim (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6); 346280304Sjkim static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40); 347280304Sjkim static const limb two70 = (((limb) 1) << 70); 348280304Sjkim static const limb two70m40m38p6 = 349280304Sjkim (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) + 350280304Sjkim (((limb) 1) << 6); 351280304Sjkim static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6); 352238384Sjkim 353280304Sjkim /* add 0 mod p to avoid underflow */ 354280304Sjkim out[0] += two70m8p6; 355280304Sjkim out[1] += two70p40; 356280304Sjkim out[2] += two70; 357280304Sjkim out[3] += two70m40m38p6; 358280304Sjkim out[4] += two70m6; 359280304Sjkim out[5] += two70m6; 360280304Sjkim out[6] += two70m6; 361280304Sjkim out[7] += two70m6; 362238384Sjkim 363280304Sjkim /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */ 364280304Sjkim out[0] -= in[0]; 365280304Sjkim out[1] -= in[1]; 366280304Sjkim out[2] -= in[2]; 367280304Sjkim out[3] -= in[3]; 368280304Sjkim out[4] -= in[4]; 369280304Sjkim out[5] -= in[5]; 370280304Sjkim out[6] -= in[6]; 371280304Sjkim out[7] -= in[7]; 372280304Sjkim} 373238384Sjkim 374280304Sjkim# define two64m0 (((limb)1) << 64) - 1 375280304Sjkim# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1 376280304Sjkim# define two64m46 (((limb)1) << 64) - (((limb)1) << 46) 377280304Sjkim# define two64m32 (((limb)1) << 64) - (((limb)1) << 32) 378238384Sjkim 379238384Sjkim/* zero110 is 0 mod p */ 380238384Sjkimstatic const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 }; 381238384Sjkim 382280304Sjkim/*- 383280304Sjkim * felem_shrink converts an felem into a smallfelem. The result isn't quite 384238384Sjkim * minimal as the value may be greater than p. 385238384Sjkim * 386238384Sjkim * On entry: 387238384Sjkim * in[i] < 2^109 388238384Sjkim * On exit: 389238384Sjkim * out[i] < 2^64 390238384Sjkim */ 391238384Sjkimstatic void felem_shrink(smallfelem out, const felem in) 392280304Sjkim{ 393280304Sjkim felem tmp; 394280304Sjkim u64 a, b, mask; 395280304Sjkim s64 high, low; 396280304Sjkim static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */ 397238384Sjkim 398280304Sjkim /* Carry 2->3 */ 399280304Sjkim tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64)); 400280304Sjkim /* tmp[3] < 2^110 */ 401238384Sjkim 402280304Sjkim tmp[2] = zero110[2] + (u64)in[2]; 403280304Sjkim tmp[0] = zero110[0] + in[0]; 404280304Sjkim tmp[1] = zero110[1] + in[1]; 405280304Sjkim /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */ 406238384Sjkim 407280304Sjkim /* 408280304Sjkim * We perform two partial reductions where we eliminate the high-word of 409280304Sjkim * tmp[3]. We don't update the other words till the end. 410280304Sjkim */ 411280304Sjkim a = tmp[3] >> 64; /* a < 2^46 */ 412280304Sjkim tmp[3] = (u64)tmp[3]; 413280304Sjkim tmp[3] -= a; 414280304Sjkim tmp[3] += ((limb) a) << 32; 415280304Sjkim /* tmp[3] < 2^79 */ 416238384Sjkim 417280304Sjkim b = a; 418280304Sjkim a = tmp[3] >> 64; /* a < 2^15 */ 419280304Sjkim b += a; /* b < 2^46 + 2^15 < 2^47 */ 420280304Sjkim tmp[3] = (u64)tmp[3]; 421280304Sjkim tmp[3] -= a; 422280304Sjkim tmp[3] += ((limb) a) << 32; 423280304Sjkim /* tmp[3] < 2^64 + 2^47 */ 424238384Sjkim 425280304Sjkim /* 426280304Sjkim * This adjusts the other two words to complete the two partial 427280304Sjkim * reductions. 428280304Sjkim */ 429280304Sjkim tmp[0] += b; 430280304Sjkim tmp[1] -= (((limb) b) << 32); 431238384Sjkim 432280304Sjkim /* 433280304Sjkim * In order to make space in tmp[3] for the carry from 2 -> 3, we 434280304Sjkim * conditionally subtract kPrime if tmp[3] is large enough. 435280304Sjkim */ 436280304Sjkim high = tmp[3] >> 64; 437280304Sjkim /* As tmp[3] < 2^65, high is either 1 or 0 */ 438280304Sjkim high <<= 63; 439280304Sjkim high >>= 63; 440280304Sjkim /*- 441280304Sjkim * high is: 442280304Sjkim * all ones if the high word of tmp[3] is 1 443280304Sjkim * all zeros if the high word of tmp[3] if 0 */ 444280304Sjkim low = tmp[3]; 445280304Sjkim mask = low >> 63; 446280304Sjkim /*- 447280304Sjkim * mask is: 448280304Sjkim * all ones if the MSB of low is 1 449280304Sjkim * all zeros if the MSB of low if 0 */ 450280304Sjkim low &= bottom63bits; 451280304Sjkim low -= kPrime3Test; 452280304Sjkim /* if low was greater than kPrime3Test then the MSB is zero */ 453280304Sjkim low = ~low; 454280304Sjkim low >>= 63; 455280304Sjkim /*- 456280304Sjkim * low is: 457280304Sjkim * all ones if low was > kPrime3Test 458280304Sjkim * all zeros if low was <= kPrime3Test */ 459280304Sjkim mask = (mask & low) | high; 460280304Sjkim tmp[0] -= mask & kPrime[0]; 461280304Sjkim tmp[1] -= mask & kPrime[1]; 462280304Sjkim /* kPrime[2] is zero, so omitted */ 463280304Sjkim tmp[3] -= mask & kPrime[3]; 464280304Sjkim /* tmp[3] < 2**64 - 2**32 + 1 */ 465238384Sjkim 466280304Sjkim tmp[1] += ((u64)(tmp[0] >> 64)); 467280304Sjkim tmp[0] = (u64)tmp[0]; 468280304Sjkim tmp[2] += ((u64)(tmp[1] >> 64)); 469280304Sjkim tmp[1] = (u64)tmp[1]; 470280304Sjkim tmp[3] += ((u64)(tmp[2] >> 64)); 471280304Sjkim tmp[2] = (u64)tmp[2]; 472280304Sjkim /* tmp[i] < 2^64 */ 473238384Sjkim 474280304Sjkim out[0] = tmp[0]; 475280304Sjkim out[1] = tmp[1]; 476280304Sjkim out[2] = tmp[2]; 477280304Sjkim out[3] = tmp[3]; 478280304Sjkim} 479238384Sjkim 480238384Sjkim/* smallfelem_expand converts a smallfelem to an felem */ 481238384Sjkimstatic void smallfelem_expand(felem out, const smallfelem in) 482280304Sjkim{ 483280304Sjkim out[0] = in[0]; 484280304Sjkim out[1] = in[1]; 485280304Sjkim out[2] = in[2]; 486280304Sjkim out[3] = in[3]; 487280304Sjkim} 488238384Sjkim 489280304Sjkim/*- 490280304Sjkim * smallfelem_square sets |out| = |small|^2 491238384Sjkim * On entry: 492238384Sjkim * small[i] < 2^64 493238384Sjkim * On exit: 494238384Sjkim * out[i] < 7 * 2^64 < 2^67 495238384Sjkim */ 496238384Sjkimstatic void smallfelem_square(longfelem out, const smallfelem small) 497280304Sjkim{ 498280304Sjkim limb a; 499280304Sjkim u64 high, low; 500238384Sjkim 501280304Sjkim a = ((uint128_t) small[0]) * small[0]; 502280304Sjkim low = a; 503280304Sjkim high = a >> 64; 504280304Sjkim out[0] = low; 505280304Sjkim out[1] = high; 506238384Sjkim 507280304Sjkim a = ((uint128_t) small[0]) * small[1]; 508280304Sjkim low = a; 509280304Sjkim high = a >> 64; 510280304Sjkim out[1] += low; 511280304Sjkim out[1] += low; 512280304Sjkim out[2] = high; 513238384Sjkim 514280304Sjkim a = ((uint128_t) small[0]) * small[2]; 515280304Sjkim low = a; 516280304Sjkim high = a >> 64; 517280304Sjkim out[2] += low; 518280304Sjkim out[2] *= 2; 519280304Sjkim out[3] = high; 520238384Sjkim 521280304Sjkim a = ((uint128_t) small[0]) * small[3]; 522280304Sjkim low = a; 523280304Sjkim high = a >> 64; 524280304Sjkim out[3] += low; 525280304Sjkim out[4] = high; 526238384Sjkim 527280304Sjkim a = ((uint128_t) small[1]) * small[2]; 528280304Sjkim low = a; 529280304Sjkim high = a >> 64; 530280304Sjkim out[3] += low; 531280304Sjkim out[3] *= 2; 532280304Sjkim out[4] += high; 533238384Sjkim 534280304Sjkim a = ((uint128_t) small[1]) * small[1]; 535280304Sjkim low = a; 536280304Sjkim high = a >> 64; 537280304Sjkim out[2] += low; 538280304Sjkim out[3] += high; 539238384Sjkim 540280304Sjkim a = ((uint128_t) small[1]) * small[3]; 541280304Sjkim low = a; 542280304Sjkim high = a >> 64; 543280304Sjkim out[4] += low; 544280304Sjkim out[4] *= 2; 545280304Sjkim out[5] = high; 546238384Sjkim 547280304Sjkim a = ((uint128_t) small[2]) * small[3]; 548280304Sjkim low = a; 549280304Sjkim high = a >> 64; 550280304Sjkim out[5] += low; 551280304Sjkim out[5] *= 2; 552280304Sjkim out[6] = high; 553280304Sjkim out[6] += high; 554238384Sjkim 555280304Sjkim a = ((uint128_t) small[2]) * small[2]; 556280304Sjkim low = a; 557280304Sjkim high = a >> 64; 558280304Sjkim out[4] += low; 559280304Sjkim out[5] += high; 560238384Sjkim 561280304Sjkim a = ((uint128_t) small[3]) * small[3]; 562280304Sjkim low = a; 563280304Sjkim high = a >> 64; 564280304Sjkim out[6] += low; 565280304Sjkim out[7] = high; 566280304Sjkim} 567238384Sjkim 568280304Sjkim/*- 569280304Sjkim * felem_square sets |out| = |in|^2 570238384Sjkim * On entry: 571238384Sjkim * in[i] < 2^109 572238384Sjkim * On exit: 573238384Sjkim * out[i] < 7 * 2^64 < 2^67 574238384Sjkim */ 575238384Sjkimstatic void felem_square(longfelem out, const felem in) 576280304Sjkim{ 577280304Sjkim u64 small[4]; 578280304Sjkim felem_shrink(small, in); 579280304Sjkim smallfelem_square(out, small); 580280304Sjkim} 581238384Sjkim 582280304Sjkim/*- 583280304Sjkim * smallfelem_mul sets |out| = |small1| * |small2| 584238384Sjkim * On entry: 585238384Sjkim * small1[i] < 2^64 586238384Sjkim * small2[i] < 2^64 587238384Sjkim * On exit: 588238384Sjkim * out[i] < 7 * 2^64 < 2^67 589238384Sjkim */ 590280304Sjkimstatic void smallfelem_mul(longfelem out, const smallfelem small1, 591280304Sjkim const smallfelem small2) 592280304Sjkim{ 593280304Sjkim limb a; 594280304Sjkim u64 high, low; 595238384Sjkim 596280304Sjkim a = ((uint128_t) small1[0]) * small2[0]; 597280304Sjkim low = a; 598280304Sjkim high = a >> 64; 599280304Sjkim out[0] = low; 600280304Sjkim out[1] = high; 601238384Sjkim 602280304Sjkim a = ((uint128_t) small1[0]) * small2[1]; 603280304Sjkim low = a; 604280304Sjkim high = a >> 64; 605280304Sjkim out[1] += low; 606280304Sjkim out[2] = high; 607238384Sjkim 608280304Sjkim a = ((uint128_t) small1[1]) * small2[0]; 609280304Sjkim low = a; 610280304Sjkim high = a >> 64; 611280304Sjkim out[1] += low; 612280304Sjkim out[2] += high; 613238384Sjkim 614280304Sjkim a = ((uint128_t) small1[0]) * small2[2]; 615280304Sjkim low = a; 616280304Sjkim high = a >> 64; 617280304Sjkim out[2] += low; 618280304Sjkim out[3] = high; 619238384Sjkim 620280304Sjkim a = ((uint128_t) small1[1]) * small2[1]; 621280304Sjkim low = a; 622280304Sjkim high = a >> 64; 623280304Sjkim out[2] += low; 624280304Sjkim out[3] += high; 625238384Sjkim 626280304Sjkim a = ((uint128_t) small1[2]) * small2[0]; 627280304Sjkim low = a; 628280304Sjkim high = a >> 64; 629280304Sjkim out[2] += low; 630280304Sjkim out[3] += high; 631238384Sjkim 632280304Sjkim a = ((uint128_t) small1[0]) * small2[3]; 633280304Sjkim low = a; 634280304Sjkim high = a >> 64; 635280304Sjkim out[3] += low; 636280304Sjkim out[4] = high; 637238384Sjkim 638280304Sjkim a = ((uint128_t) small1[1]) * small2[2]; 639280304Sjkim low = a; 640280304Sjkim high = a >> 64; 641280304Sjkim out[3] += low; 642280304Sjkim out[4] += high; 643238384Sjkim 644280304Sjkim a = ((uint128_t) small1[2]) * small2[1]; 645280304Sjkim low = a; 646280304Sjkim high = a >> 64; 647280304Sjkim out[3] += low; 648280304Sjkim out[4] += high; 649238384Sjkim 650280304Sjkim a = ((uint128_t) small1[3]) * small2[0]; 651280304Sjkim low = a; 652280304Sjkim high = a >> 64; 653280304Sjkim out[3] += low; 654280304Sjkim out[4] += high; 655238384Sjkim 656280304Sjkim a = ((uint128_t) small1[1]) * small2[3]; 657280304Sjkim low = a; 658280304Sjkim high = a >> 64; 659280304Sjkim out[4] += low; 660280304Sjkim out[5] = high; 661238384Sjkim 662280304Sjkim a = ((uint128_t) small1[2]) * small2[2]; 663280304Sjkim low = a; 664280304Sjkim high = a >> 64; 665280304Sjkim out[4] += low; 666280304Sjkim out[5] += high; 667238384Sjkim 668280304Sjkim a = ((uint128_t) small1[3]) * small2[1]; 669280304Sjkim low = a; 670280304Sjkim high = a >> 64; 671280304Sjkim out[4] += low; 672280304Sjkim out[5] += high; 673238384Sjkim 674280304Sjkim a = ((uint128_t) small1[2]) * small2[3]; 675280304Sjkim low = a; 676280304Sjkim high = a >> 64; 677280304Sjkim out[5] += low; 678280304Sjkim out[6] = high; 679238384Sjkim 680280304Sjkim a = ((uint128_t) small1[3]) * small2[2]; 681280304Sjkim low = a; 682280304Sjkim high = a >> 64; 683280304Sjkim out[5] += low; 684280304Sjkim out[6] += high; 685238384Sjkim 686280304Sjkim a = ((uint128_t) small1[3]) * small2[3]; 687280304Sjkim low = a; 688280304Sjkim high = a >> 64; 689280304Sjkim out[6] += low; 690280304Sjkim out[7] = high; 691280304Sjkim} 692238384Sjkim 693280304Sjkim/*- 694280304Sjkim * felem_mul sets |out| = |in1| * |in2| 695238384Sjkim * On entry: 696238384Sjkim * in1[i] < 2^109 697238384Sjkim * in2[i] < 2^109 698238384Sjkim * On exit: 699238384Sjkim * out[i] < 7 * 2^64 < 2^67 700238384Sjkim */ 701238384Sjkimstatic void felem_mul(longfelem out, const felem in1, const felem in2) 702280304Sjkim{ 703280304Sjkim smallfelem small1, small2; 704280304Sjkim felem_shrink(small1, in1); 705280304Sjkim felem_shrink(small2, in2); 706280304Sjkim smallfelem_mul(out, small1, small2); 707280304Sjkim} 708238384Sjkim 709280304Sjkim/*- 710280304Sjkim * felem_small_mul sets |out| = |small1| * |in2| 711238384Sjkim * On entry: 712238384Sjkim * small1[i] < 2^64 713238384Sjkim * in2[i] < 2^109 714238384Sjkim * On exit: 715238384Sjkim * out[i] < 7 * 2^64 < 2^67 716238384Sjkim */ 717280304Sjkimstatic void felem_small_mul(longfelem out, const smallfelem small1, 718280304Sjkim const felem in2) 719280304Sjkim{ 720280304Sjkim smallfelem small2; 721280304Sjkim felem_shrink(small2, in2); 722280304Sjkim smallfelem_mul(out, small1, small2); 723280304Sjkim} 724238384Sjkim 725280304Sjkim# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4) 726280304Sjkim# define two100 (((limb)1) << 100) 727280304Sjkim# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4) 728238384Sjkim/* zero100 is 0 mod p */ 729280304Sjkimstatic const felem zero100 = 730280304Sjkim { two100m36m4, two100, two100m36p4, two100m36p4 }; 731238384Sjkim 732280304Sjkim/*- 733280304Sjkim * Internal function for the different flavours of felem_reduce. 734238384Sjkim * felem_reduce_ reduces the higher coefficients in[4]-in[7]. 735238384Sjkim * On entry: 736280304Sjkim * out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7] 737238384Sjkim * out[1] >= in[7] + 2^32*in[4] 738238384Sjkim * out[2] >= in[5] + 2^32*in[5] 739238384Sjkim * out[3] >= in[4] + 2^32*in[5] + 2^32*in[6] 740238384Sjkim * On exit: 741238384Sjkim * out[0] <= out[0] + in[4] + 2^32*in[5] 742238384Sjkim * out[1] <= out[1] + in[5] + 2^33*in[6] 743238384Sjkim * out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7] 744238384Sjkim * out[3] <= out[3] + 2^32*in[4] + 3*in[7] 745238384Sjkim */ 746238384Sjkimstatic void felem_reduce_(felem out, const longfelem in) 747280304Sjkim{ 748280304Sjkim int128_t c; 749280304Sjkim /* combine common terms from below */ 750280304Sjkim c = in[4] + (in[5] << 32); 751280304Sjkim out[0] += c; 752280304Sjkim out[3] -= c; 753238384Sjkim 754280304Sjkim c = in[5] - in[7]; 755280304Sjkim out[1] += c; 756280304Sjkim out[2] -= c; 757238384Sjkim 758280304Sjkim /* the remaining terms */ 759280304Sjkim /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */ 760280304Sjkim out[1] -= (in[4] << 32); 761280304Sjkim out[3] += (in[4] << 32); 762238384Sjkim 763280304Sjkim /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */ 764280304Sjkim out[2] -= (in[5] << 32); 765238384Sjkim 766280304Sjkim /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */ 767280304Sjkim out[0] -= in[6]; 768280304Sjkim out[0] -= (in[6] << 32); 769280304Sjkim out[1] += (in[6] << 33); 770280304Sjkim out[2] += (in[6] * 2); 771280304Sjkim out[3] -= (in[6] << 32); 772238384Sjkim 773280304Sjkim /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */ 774280304Sjkim out[0] -= in[7]; 775280304Sjkim out[0] -= (in[7] << 32); 776280304Sjkim out[2] += (in[7] << 33); 777280304Sjkim out[3] += (in[7] * 3); 778280304Sjkim} 779238384Sjkim 780280304Sjkim/*- 781280304Sjkim * felem_reduce converts a longfelem into an felem. 782238384Sjkim * To be called directly after felem_square or felem_mul. 783238384Sjkim * On entry: 784238384Sjkim * in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64 785238384Sjkim * in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64 786238384Sjkim * On exit: 787238384Sjkim * out[i] < 2^101 788238384Sjkim */ 789238384Sjkimstatic void felem_reduce(felem out, const longfelem in) 790280304Sjkim{ 791280304Sjkim out[0] = zero100[0] + in[0]; 792280304Sjkim out[1] = zero100[1] + in[1]; 793280304Sjkim out[2] = zero100[2] + in[2]; 794280304Sjkim out[3] = zero100[3] + in[3]; 795238384Sjkim 796280304Sjkim felem_reduce_(out, in); 797238384Sjkim 798280304Sjkim /*- 799280304Sjkim * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0 800280304Sjkim * out[1] > 2^100 - 2^64 - 7*2^96 > 0 801280304Sjkim * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0 802280304Sjkim * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0 803280304Sjkim * 804280304Sjkim * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101 805280304Sjkim * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101 806280304Sjkim * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101 807280304Sjkim * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101 808280304Sjkim */ 809280304Sjkim} 810238384Sjkim 811280304Sjkim/*- 812280304Sjkim * felem_reduce_zero105 converts a larger longfelem into an felem. 813238384Sjkim * On entry: 814238384Sjkim * in[0] < 2^71 815238384Sjkim * On exit: 816238384Sjkim * out[i] < 2^106 817238384Sjkim */ 818238384Sjkimstatic void felem_reduce_zero105(felem out, const longfelem in) 819280304Sjkim{ 820280304Sjkim out[0] = zero105[0] + in[0]; 821280304Sjkim out[1] = zero105[1] + in[1]; 822280304Sjkim out[2] = zero105[2] + in[2]; 823280304Sjkim out[3] = zero105[3] + in[3]; 824238384Sjkim 825280304Sjkim felem_reduce_(out, in); 826238384Sjkim 827280304Sjkim /*- 828280304Sjkim * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0 829280304Sjkim * out[1] > 2^105 - 2^71 - 2^103 > 0 830280304Sjkim * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0 831280304Sjkim * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0 832280304Sjkim * 833280304Sjkim * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 834280304Sjkim * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 835280304Sjkim * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106 836280304Sjkim * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106 837280304Sjkim */ 838280304Sjkim} 839238384Sjkim 840280304Sjkim/* 841280304Sjkim * subtract_u64 sets *result = *result - v and *carry to one if the 842280304Sjkim * subtraction underflowed. 843280304Sjkim */ 844280304Sjkimstatic void subtract_u64(u64 *result, u64 *carry, u64 v) 845280304Sjkim{ 846280304Sjkim uint128_t r = *result; 847280304Sjkim r -= v; 848280304Sjkim *carry = (r >> 64) & 1; 849280304Sjkim *result = (u64)r; 850280304Sjkim} 851238384Sjkim 852280304Sjkim/* 853280304Sjkim * felem_contract converts |in| to its unique, minimal representation. On 854280304Sjkim * entry: in[i] < 2^109 855238384Sjkim */ 856238384Sjkimstatic void felem_contract(smallfelem out, const felem in) 857280304Sjkim{ 858280304Sjkim unsigned i; 859280304Sjkim u64 all_equal_so_far = 0, result = 0, carry; 860238384Sjkim 861280304Sjkim felem_shrink(out, in); 862280304Sjkim /* small is minimal except that the value might be > p */ 863238384Sjkim 864280304Sjkim all_equal_so_far--; 865280304Sjkim /* 866280304Sjkim * We are doing a constant time test if out >= kPrime. We need to compare 867280304Sjkim * each u64, from most-significant to least significant. For each one, if 868280304Sjkim * all words so far have been equal (m is all ones) then a non-equal 869280304Sjkim * result is the answer. Otherwise we continue. 870280304Sjkim */ 871280304Sjkim for (i = 3; i < 4; i--) { 872280304Sjkim u64 equal; 873280304Sjkim uint128_t a = ((uint128_t) kPrime[i]) - out[i]; 874280304Sjkim /* 875280304Sjkim * if out[i] > kPrime[i] then a will underflow and the high 64-bits 876280304Sjkim * will all be set. 877280304Sjkim */ 878280304Sjkim result |= all_equal_so_far & ((u64)(a >> 64)); 879238384Sjkim 880280304Sjkim /* 881280304Sjkim * if kPrime[i] == out[i] then |equal| will be all zeros and the 882280304Sjkim * decrement will make it all ones. 883280304Sjkim */ 884280304Sjkim equal = kPrime[i] ^ out[i]; 885280304Sjkim equal--; 886280304Sjkim equal &= equal << 32; 887280304Sjkim equal &= equal << 16; 888280304Sjkim equal &= equal << 8; 889280304Sjkim equal &= equal << 4; 890280304Sjkim equal &= equal << 2; 891280304Sjkim equal &= equal << 1; 892280304Sjkim equal = ((s64) equal) >> 63; 893238384Sjkim 894280304Sjkim all_equal_so_far &= equal; 895280304Sjkim } 896238384Sjkim 897280304Sjkim /* 898280304Sjkim * if all_equal_so_far is still all ones then the two values are equal 899280304Sjkim * and so out >= kPrime is true. 900280304Sjkim */ 901280304Sjkim result |= all_equal_so_far; 902238384Sjkim 903280304Sjkim /* if out >= kPrime then we subtract kPrime. */ 904280304Sjkim subtract_u64(&out[0], &carry, result & kPrime[0]); 905280304Sjkim subtract_u64(&out[1], &carry, carry); 906280304Sjkim subtract_u64(&out[2], &carry, carry); 907280304Sjkim subtract_u64(&out[3], &carry, carry); 908238384Sjkim 909280304Sjkim subtract_u64(&out[1], &carry, result & kPrime[1]); 910280304Sjkim subtract_u64(&out[2], &carry, carry); 911280304Sjkim subtract_u64(&out[3], &carry, carry); 912238384Sjkim 913280304Sjkim subtract_u64(&out[2], &carry, result & kPrime[2]); 914280304Sjkim subtract_u64(&out[3], &carry, carry); 915238384Sjkim 916280304Sjkim subtract_u64(&out[3], &carry, result & kPrime[3]); 917280304Sjkim} 918238384Sjkim 919238384Sjkimstatic void smallfelem_square_contract(smallfelem out, const smallfelem in) 920280304Sjkim{ 921280304Sjkim longfelem longtmp; 922280304Sjkim felem tmp; 923238384Sjkim 924280304Sjkim smallfelem_square(longtmp, in); 925280304Sjkim felem_reduce(tmp, longtmp); 926280304Sjkim felem_contract(out, tmp); 927280304Sjkim} 928238384Sjkim 929280304Sjkimstatic void smallfelem_mul_contract(smallfelem out, const smallfelem in1, 930280304Sjkim const smallfelem in2) 931280304Sjkim{ 932280304Sjkim longfelem longtmp; 933280304Sjkim felem tmp; 934238384Sjkim 935280304Sjkim smallfelem_mul(longtmp, in1, in2); 936280304Sjkim felem_reduce(tmp, longtmp); 937280304Sjkim felem_contract(out, tmp); 938280304Sjkim} 939238384Sjkim 940280304Sjkim/*- 941280304Sjkim * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0 942238384Sjkim * otherwise. 943238384Sjkim * On entry: 944238384Sjkim * small[i] < 2^64 945238384Sjkim */ 946238384Sjkimstatic limb smallfelem_is_zero(const smallfelem small) 947280304Sjkim{ 948280304Sjkim limb result; 949280304Sjkim u64 is_p; 950238384Sjkim 951280304Sjkim u64 is_zero = small[0] | small[1] | small[2] | small[3]; 952280304Sjkim is_zero--; 953280304Sjkim is_zero &= is_zero << 32; 954280304Sjkim is_zero &= is_zero << 16; 955280304Sjkim is_zero &= is_zero << 8; 956280304Sjkim is_zero &= is_zero << 4; 957280304Sjkim is_zero &= is_zero << 2; 958280304Sjkim is_zero &= is_zero << 1; 959280304Sjkim is_zero = ((s64) is_zero) >> 63; 960238384Sjkim 961280304Sjkim is_p = (small[0] ^ kPrime[0]) | 962280304Sjkim (small[1] ^ kPrime[1]) | 963280304Sjkim (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]); 964280304Sjkim is_p--; 965280304Sjkim is_p &= is_p << 32; 966280304Sjkim is_p &= is_p << 16; 967280304Sjkim is_p &= is_p << 8; 968280304Sjkim is_p &= is_p << 4; 969280304Sjkim is_p &= is_p << 2; 970280304Sjkim is_p &= is_p << 1; 971280304Sjkim is_p = ((s64) is_p) >> 63; 972238384Sjkim 973280304Sjkim is_zero |= is_p; 974238384Sjkim 975280304Sjkim result = is_zero; 976280304Sjkim result |= ((limb) is_zero) << 64; 977280304Sjkim return result; 978280304Sjkim} 979238384Sjkim 980238384Sjkimstatic int smallfelem_is_zero_int(const smallfelem small) 981280304Sjkim{ 982280304Sjkim return (int)(smallfelem_is_zero(small) & ((limb) 1)); 983280304Sjkim} 984238384Sjkim 985280304Sjkim/*- 986280304Sjkim * felem_inv calculates |out| = |in|^{-1} 987238384Sjkim * 988238384Sjkim * Based on Fermat's Little Theorem: 989238384Sjkim * a^p = a (mod p) 990238384Sjkim * a^{p-1} = 1 (mod p) 991238384Sjkim * a^{p-2} = a^{-1} (mod p) 992238384Sjkim */ 993238384Sjkimstatic void felem_inv(felem out, const felem in) 994280304Sjkim{ 995280304Sjkim felem ftmp, ftmp2; 996280304Sjkim /* each e_I will hold |in|^{2^I - 1} */ 997280304Sjkim felem e2, e4, e8, e16, e32, e64; 998280304Sjkim longfelem tmp; 999280304Sjkim unsigned i; 1000238384Sjkim 1001280304Sjkim felem_square(tmp, in); 1002280304Sjkim felem_reduce(ftmp, tmp); /* 2^1 */ 1003280304Sjkim felem_mul(tmp, in, ftmp); 1004280304Sjkim felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */ 1005280304Sjkim felem_assign(e2, ftmp); 1006280304Sjkim felem_square(tmp, ftmp); 1007280304Sjkim felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */ 1008280304Sjkim felem_square(tmp, ftmp); 1009280304Sjkim felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */ 1010280304Sjkim felem_mul(tmp, ftmp, e2); 1011280304Sjkim felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */ 1012280304Sjkim felem_assign(e4, ftmp); 1013280304Sjkim felem_square(tmp, ftmp); 1014280304Sjkim felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */ 1015280304Sjkim felem_square(tmp, ftmp); 1016280304Sjkim felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */ 1017280304Sjkim felem_square(tmp, ftmp); 1018280304Sjkim felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */ 1019280304Sjkim felem_square(tmp, ftmp); 1020280304Sjkim felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */ 1021280304Sjkim felem_mul(tmp, ftmp, e4); 1022280304Sjkim felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */ 1023280304Sjkim felem_assign(e8, ftmp); 1024280304Sjkim for (i = 0; i < 8; i++) { 1025280304Sjkim felem_square(tmp, ftmp); 1026280304Sjkim felem_reduce(ftmp, tmp); 1027280304Sjkim } /* 2^16 - 2^8 */ 1028280304Sjkim felem_mul(tmp, ftmp, e8); 1029280304Sjkim felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */ 1030280304Sjkim felem_assign(e16, ftmp); 1031280304Sjkim for (i = 0; i < 16; i++) { 1032280304Sjkim felem_square(tmp, ftmp); 1033280304Sjkim felem_reduce(ftmp, tmp); 1034280304Sjkim } /* 2^32 - 2^16 */ 1035280304Sjkim felem_mul(tmp, ftmp, e16); 1036280304Sjkim felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */ 1037280304Sjkim felem_assign(e32, ftmp); 1038280304Sjkim for (i = 0; i < 32; i++) { 1039280304Sjkim felem_square(tmp, ftmp); 1040280304Sjkim felem_reduce(ftmp, tmp); 1041280304Sjkim } /* 2^64 - 2^32 */ 1042280304Sjkim felem_assign(e64, ftmp); 1043280304Sjkim felem_mul(tmp, ftmp, in); 1044280304Sjkim felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */ 1045280304Sjkim for (i = 0; i < 192; i++) { 1046280304Sjkim felem_square(tmp, ftmp); 1047280304Sjkim felem_reduce(ftmp, tmp); 1048280304Sjkim } /* 2^256 - 2^224 + 2^192 */ 1049238384Sjkim 1050280304Sjkim felem_mul(tmp, e64, e32); 1051280304Sjkim felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */ 1052280304Sjkim for (i = 0; i < 16; i++) { 1053280304Sjkim felem_square(tmp, ftmp2); 1054280304Sjkim felem_reduce(ftmp2, tmp); 1055280304Sjkim } /* 2^80 - 2^16 */ 1056280304Sjkim felem_mul(tmp, ftmp2, e16); 1057280304Sjkim felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */ 1058280304Sjkim for (i = 0; i < 8; i++) { 1059280304Sjkim felem_square(tmp, ftmp2); 1060280304Sjkim felem_reduce(ftmp2, tmp); 1061280304Sjkim } /* 2^88 - 2^8 */ 1062280304Sjkim felem_mul(tmp, ftmp2, e8); 1063280304Sjkim felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */ 1064280304Sjkim for (i = 0; i < 4; i++) { 1065280304Sjkim felem_square(tmp, ftmp2); 1066280304Sjkim felem_reduce(ftmp2, tmp); 1067280304Sjkim } /* 2^92 - 2^4 */ 1068280304Sjkim felem_mul(tmp, ftmp2, e4); 1069280304Sjkim felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */ 1070280304Sjkim felem_square(tmp, ftmp2); 1071280304Sjkim felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */ 1072280304Sjkim felem_square(tmp, ftmp2); 1073280304Sjkim felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */ 1074280304Sjkim felem_mul(tmp, ftmp2, e2); 1075280304Sjkim felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */ 1076280304Sjkim felem_square(tmp, ftmp2); 1077280304Sjkim felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */ 1078280304Sjkim felem_square(tmp, ftmp2); 1079280304Sjkim felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */ 1080280304Sjkim felem_mul(tmp, ftmp2, in); 1081280304Sjkim felem_reduce(ftmp2, tmp); /* 2^96 - 3 */ 1082238384Sjkim 1083280304Sjkim felem_mul(tmp, ftmp2, ftmp); 1084280304Sjkim felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */ 1085280304Sjkim} 1086238384Sjkim 1087238384Sjkimstatic void smallfelem_inv_contract(smallfelem out, const smallfelem in) 1088280304Sjkim{ 1089280304Sjkim felem tmp; 1090238384Sjkim 1091280304Sjkim smallfelem_expand(tmp, in); 1092280304Sjkim felem_inv(tmp, tmp); 1093280304Sjkim felem_contract(out, tmp); 1094280304Sjkim} 1095238384Sjkim 1096280304Sjkim/*- 1097280304Sjkim * Group operations 1098238384Sjkim * ---------------- 1099238384Sjkim * 1100238384Sjkim * Building on top of the field operations we have the operations on the 1101238384Sjkim * elliptic curve group itself. Points on the curve are represented in Jacobian 1102280304Sjkim * coordinates 1103280304Sjkim */ 1104238384Sjkim 1105280304Sjkim/*- 1106280304Sjkim * point_double calculates 2*(x_in, y_in, z_in) 1107238384Sjkim * 1108238384Sjkim * The method is taken from: 1109238384Sjkim * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b 1110238384Sjkim * 1111238384Sjkim * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. 1112280304Sjkim * while x_out == y_in is not (maybe this works, but it's not tested). 1113280304Sjkim */ 1114238384Sjkimstatic void 1115238384Sjkimpoint_double(felem x_out, felem y_out, felem z_out, 1116280304Sjkim const felem x_in, const felem y_in, const felem z_in) 1117280304Sjkim{ 1118280304Sjkim longfelem tmp, tmp2; 1119280304Sjkim felem delta, gamma, beta, alpha, ftmp, ftmp2; 1120280304Sjkim smallfelem small1, small2; 1121238384Sjkim 1122280304Sjkim felem_assign(ftmp, x_in); 1123280304Sjkim /* ftmp[i] < 2^106 */ 1124280304Sjkim felem_assign(ftmp2, x_in); 1125280304Sjkim /* ftmp2[i] < 2^106 */ 1126238384Sjkim 1127280304Sjkim /* delta = z^2 */ 1128280304Sjkim felem_square(tmp, z_in); 1129280304Sjkim felem_reduce(delta, tmp); 1130280304Sjkim /* delta[i] < 2^101 */ 1131238384Sjkim 1132280304Sjkim /* gamma = y^2 */ 1133280304Sjkim felem_square(tmp, y_in); 1134280304Sjkim felem_reduce(gamma, tmp); 1135280304Sjkim /* gamma[i] < 2^101 */ 1136280304Sjkim felem_shrink(small1, gamma); 1137238384Sjkim 1138280304Sjkim /* beta = x*gamma */ 1139280304Sjkim felem_small_mul(tmp, small1, x_in); 1140280304Sjkim felem_reduce(beta, tmp); 1141280304Sjkim /* beta[i] < 2^101 */ 1142238384Sjkim 1143280304Sjkim /* alpha = 3*(x-delta)*(x+delta) */ 1144280304Sjkim felem_diff(ftmp, delta); 1145280304Sjkim /* ftmp[i] < 2^105 + 2^106 < 2^107 */ 1146280304Sjkim felem_sum(ftmp2, delta); 1147280304Sjkim /* ftmp2[i] < 2^105 + 2^106 < 2^107 */ 1148280304Sjkim felem_scalar(ftmp2, 3); 1149280304Sjkim /* ftmp2[i] < 3 * 2^107 < 2^109 */ 1150280304Sjkim felem_mul(tmp, ftmp, ftmp2); 1151280304Sjkim felem_reduce(alpha, tmp); 1152280304Sjkim /* alpha[i] < 2^101 */ 1153280304Sjkim felem_shrink(small2, alpha); 1154238384Sjkim 1155280304Sjkim /* x' = alpha^2 - 8*beta */ 1156280304Sjkim smallfelem_square(tmp, small2); 1157280304Sjkim felem_reduce(x_out, tmp); 1158280304Sjkim felem_assign(ftmp, beta); 1159280304Sjkim felem_scalar(ftmp, 8); 1160280304Sjkim /* ftmp[i] < 8 * 2^101 = 2^104 */ 1161280304Sjkim felem_diff(x_out, ftmp); 1162280304Sjkim /* x_out[i] < 2^105 + 2^101 < 2^106 */ 1163238384Sjkim 1164280304Sjkim /* z' = (y + z)^2 - gamma - delta */ 1165280304Sjkim felem_sum(delta, gamma); 1166280304Sjkim /* delta[i] < 2^101 + 2^101 = 2^102 */ 1167280304Sjkim felem_assign(ftmp, y_in); 1168280304Sjkim felem_sum(ftmp, z_in); 1169280304Sjkim /* ftmp[i] < 2^106 + 2^106 = 2^107 */ 1170280304Sjkim felem_square(tmp, ftmp); 1171280304Sjkim felem_reduce(z_out, tmp); 1172280304Sjkim felem_diff(z_out, delta); 1173280304Sjkim /* z_out[i] < 2^105 + 2^101 < 2^106 */ 1174238384Sjkim 1175280304Sjkim /* y' = alpha*(4*beta - x') - 8*gamma^2 */ 1176280304Sjkim felem_scalar(beta, 4); 1177280304Sjkim /* beta[i] < 4 * 2^101 = 2^103 */ 1178280304Sjkim felem_diff_zero107(beta, x_out); 1179280304Sjkim /* beta[i] < 2^107 + 2^103 < 2^108 */ 1180280304Sjkim felem_small_mul(tmp, small2, beta); 1181280304Sjkim /* tmp[i] < 7 * 2^64 < 2^67 */ 1182280304Sjkim smallfelem_square(tmp2, small1); 1183280304Sjkim /* tmp2[i] < 7 * 2^64 */ 1184280304Sjkim longfelem_scalar(tmp2, 8); 1185280304Sjkim /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */ 1186280304Sjkim longfelem_diff(tmp, tmp2); 1187280304Sjkim /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */ 1188280304Sjkim felem_reduce_zero105(y_out, tmp); 1189280304Sjkim /* y_out[i] < 2^106 */ 1190280304Sjkim} 1191238384Sjkim 1192280304Sjkim/* 1193280304Sjkim * point_double_small is the same as point_double, except that it operates on 1194280304Sjkim * smallfelems 1195280304Sjkim */ 1196238384Sjkimstatic void 1197238384Sjkimpoint_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out, 1198280304Sjkim const smallfelem x_in, const smallfelem y_in, 1199280304Sjkim const smallfelem z_in) 1200280304Sjkim{ 1201280304Sjkim felem felem_x_out, felem_y_out, felem_z_out; 1202280304Sjkim felem felem_x_in, felem_y_in, felem_z_in; 1203238384Sjkim 1204280304Sjkim smallfelem_expand(felem_x_in, x_in); 1205280304Sjkim smallfelem_expand(felem_y_in, y_in); 1206280304Sjkim smallfelem_expand(felem_z_in, z_in); 1207280304Sjkim point_double(felem_x_out, felem_y_out, felem_z_out, 1208280304Sjkim felem_x_in, felem_y_in, felem_z_in); 1209280304Sjkim felem_shrink(x_out, felem_x_out); 1210280304Sjkim felem_shrink(y_out, felem_y_out); 1211280304Sjkim felem_shrink(z_out, felem_z_out); 1212280304Sjkim} 1213238384Sjkim 1214238384Sjkim/* copy_conditional copies in to out iff mask is all ones. */ 1215280304Sjkimstatic void copy_conditional(felem out, const felem in, limb mask) 1216280304Sjkim{ 1217280304Sjkim unsigned i; 1218280304Sjkim for (i = 0; i < NLIMBS; ++i) { 1219280304Sjkim const limb tmp = mask & (in[i] ^ out[i]); 1220280304Sjkim out[i] ^= tmp; 1221280304Sjkim } 1222280304Sjkim} 1223238384Sjkim 1224238384Sjkim/* copy_small_conditional copies in to out iff mask is all ones. */ 1225280304Sjkimstatic void copy_small_conditional(felem out, const smallfelem in, limb mask) 1226280304Sjkim{ 1227280304Sjkim unsigned i; 1228280304Sjkim const u64 mask64 = mask; 1229280304Sjkim for (i = 0; i < NLIMBS; ++i) { 1230280304Sjkim out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask); 1231280304Sjkim } 1232280304Sjkim} 1233238384Sjkim 1234280304Sjkim/*- 1235280304Sjkim * point_add calcuates (x1, y1, z1) + (x2, y2, z2) 1236238384Sjkim * 1237238384Sjkim * The method is taken from: 1238238384Sjkim * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, 1239238384Sjkim * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). 1240238384Sjkim * 1241238384Sjkim * This function includes a branch for checking whether the two input points 1242238384Sjkim * are equal, (while not equal to the point at infinity). This case never 1243238384Sjkim * happens during single point multiplication, so there is no timing leak for 1244280304Sjkim * ECDH or ECDSA signing. 1245280304Sjkim */ 1246238384Sjkimstatic void point_add(felem x3, felem y3, felem z3, 1247280304Sjkim const felem x1, const felem y1, const felem z1, 1248280304Sjkim const int mixed, const smallfelem x2, 1249280304Sjkim const smallfelem y2, const smallfelem z2) 1250280304Sjkim{ 1251280304Sjkim felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; 1252280304Sjkim longfelem tmp, tmp2; 1253280304Sjkim smallfelem small1, small2, small3, small4, small5; 1254280304Sjkim limb x_equal, y_equal, z1_is_zero, z2_is_zero; 1255238384Sjkim 1256280304Sjkim felem_shrink(small3, z1); 1257238384Sjkim 1258280304Sjkim z1_is_zero = smallfelem_is_zero(small3); 1259280304Sjkim z2_is_zero = smallfelem_is_zero(z2); 1260238384Sjkim 1261280304Sjkim /* ftmp = z1z1 = z1**2 */ 1262280304Sjkim smallfelem_square(tmp, small3); 1263280304Sjkim felem_reduce(ftmp, tmp); 1264280304Sjkim /* ftmp[i] < 2^101 */ 1265280304Sjkim felem_shrink(small1, ftmp); 1266238384Sjkim 1267280304Sjkim if (!mixed) { 1268280304Sjkim /* ftmp2 = z2z2 = z2**2 */ 1269280304Sjkim smallfelem_square(tmp, z2); 1270280304Sjkim felem_reduce(ftmp2, tmp); 1271280304Sjkim /* ftmp2[i] < 2^101 */ 1272280304Sjkim felem_shrink(small2, ftmp2); 1273238384Sjkim 1274280304Sjkim felem_shrink(small5, x1); 1275238384Sjkim 1276280304Sjkim /* u1 = ftmp3 = x1*z2z2 */ 1277280304Sjkim smallfelem_mul(tmp, small5, small2); 1278280304Sjkim felem_reduce(ftmp3, tmp); 1279280304Sjkim /* ftmp3[i] < 2^101 */ 1280238384Sjkim 1281280304Sjkim /* ftmp5 = z1 + z2 */ 1282280304Sjkim felem_assign(ftmp5, z1); 1283280304Sjkim felem_small_sum(ftmp5, z2); 1284280304Sjkim /* ftmp5[i] < 2^107 */ 1285238384Sjkim 1286280304Sjkim /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */ 1287280304Sjkim felem_square(tmp, ftmp5); 1288280304Sjkim felem_reduce(ftmp5, tmp); 1289280304Sjkim /* ftmp2 = z2z2 + z1z1 */ 1290280304Sjkim felem_sum(ftmp2, ftmp); 1291280304Sjkim /* ftmp2[i] < 2^101 + 2^101 = 2^102 */ 1292280304Sjkim felem_diff(ftmp5, ftmp2); 1293280304Sjkim /* ftmp5[i] < 2^105 + 2^101 < 2^106 */ 1294238384Sjkim 1295280304Sjkim /* ftmp2 = z2 * z2z2 */ 1296280304Sjkim smallfelem_mul(tmp, small2, z2); 1297280304Sjkim felem_reduce(ftmp2, tmp); 1298238384Sjkim 1299280304Sjkim /* s1 = ftmp2 = y1 * z2**3 */ 1300280304Sjkim felem_mul(tmp, y1, ftmp2); 1301280304Sjkim felem_reduce(ftmp6, tmp); 1302280304Sjkim /* ftmp6[i] < 2^101 */ 1303280304Sjkim } else { 1304280304Sjkim /* 1305280304Sjkim * We'll assume z2 = 1 (special case z2 = 0 is handled later) 1306280304Sjkim */ 1307238384Sjkim 1308280304Sjkim /* u1 = ftmp3 = x1*z2z2 */ 1309280304Sjkim felem_assign(ftmp3, x1); 1310280304Sjkim /* ftmp3[i] < 2^106 */ 1311238384Sjkim 1312280304Sjkim /* ftmp5 = 2z1z2 */ 1313280304Sjkim felem_assign(ftmp5, z1); 1314280304Sjkim felem_scalar(ftmp5, 2); 1315280304Sjkim /* ftmp5[i] < 2*2^106 = 2^107 */ 1316238384Sjkim 1317280304Sjkim /* s1 = ftmp2 = y1 * z2**3 */ 1318280304Sjkim felem_assign(ftmp6, y1); 1319280304Sjkim /* ftmp6[i] < 2^106 */ 1320280304Sjkim } 1321238384Sjkim 1322280304Sjkim /* u2 = x2*z1z1 */ 1323280304Sjkim smallfelem_mul(tmp, x2, small1); 1324280304Sjkim felem_reduce(ftmp4, tmp); 1325238384Sjkim 1326280304Sjkim /* h = ftmp4 = u2 - u1 */ 1327280304Sjkim felem_diff_zero107(ftmp4, ftmp3); 1328280304Sjkim /* ftmp4[i] < 2^107 + 2^101 < 2^108 */ 1329280304Sjkim felem_shrink(small4, ftmp4); 1330238384Sjkim 1331280304Sjkim x_equal = smallfelem_is_zero(small4); 1332238384Sjkim 1333280304Sjkim /* z_out = ftmp5 * h */ 1334280304Sjkim felem_small_mul(tmp, small4, ftmp5); 1335280304Sjkim felem_reduce(z_out, tmp); 1336280304Sjkim /* z_out[i] < 2^101 */ 1337238384Sjkim 1338280304Sjkim /* ftmp = z1 * z1z1 */ 1339280304Sjkim smallfelem_mul(tmp, small1, small3); 1340280304Sjkim felem_reduce(ftmp, tmp); 1341238384Sjkim 1342280304Sjkim /* s2 = tmp = y2 * z1**3 */ 1343280304Sjkim felem_small_mul(tmp, y2, ftmp); 1344280304Sjkim felem_reduce(ftmp5, tmp); 1345238384Sjkim 1346280304Sjkim /* r = ftmp5 = (s2 - s1)*2 */ 1347280304Sjkim felem_diff_zero107(ftmp5, ftmp6); 1348280304Sjkim /* ftmp5[i] < 2^107 + 2^107 = 2^108 */ 1349280304Sjkim felem_scalar(ftmp5, 2); 1350280304Sjkim /* ftmp5[i] < 2^109 */ 1351280304Sjkim felem_shrink(small1, ftmp5); 1352280304Sjkim y_equal = smallfelem_is_zero(small1); 1353238384Sjkim 1354280304Sjkim if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) { 1355280304Sjkim point_double(x3, y3, z3, x1, y1, z1); 1356280304Sjkim return; 1357280304Sjkim } 1358238384Sjkim 1359280304Sjkim /* I = ftmp = (2h)**2 */ 1360280304Sjkim felem_assign(ftmp, ftmp4); 1361280304Sjkim felem_scalar(ftmp, 2); 1362280304Sjkim /* ftmp[i] < 2*2^108 = 2^109 */ 1363280304Sjkim felem_square(tmp, ftmp); 1364280304Sjkim felem_reduce(ftmp, tmp); 1365238384Sjkim 1366280304Sjkim /* J = ftmp2 = h * I */ 1367280304Sjkim felem_mul(tmp, ftmp4, ftmp); 1368280304Sjkim felem_reduce(ftmp2, tmp); 1369238384Sjkim 1370280304Sjkim /* V = ftmp4 = U1 * I */ 1371280304Sjkim felem_mul(tmp, ftmp3, ftmp); 1372280304Sjkim felem_reduce(ftmp4, tmp); 1373238384Sjkim 1374280304Sjkim /* x_out = r**2 - J - 2V */ 1375280304Sjkim smallfelem_square(tmp, small1); 1376280304Sjkim felem_reduce(x_out, tmp); 1377280304Sjkim felem_assign(ftmp3, ftmp4); 1378280304Sjkim felem_scalar(ftmp4, 2); 1379280304Sjkim felem_sum(ftmp4, ftmp2); 1380280304Sjkim /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */ 1381280304Sjkim felem_diff(x_out, ftmp4); 1382280304Sjkim /* x_out[i] < 2^105 + 2^101 */ 1383238384Sjkim 1384280304Sjkim /* y_out = r(V-x_out) - 2 * s1 * J */ 1385280304Sjkim felem_diff_zero107(ftmp3, x_out); 1386280304Sjkim /* ftmp3[i] < 2^107 + 2^101 < 2^108 */ 1387280304Sjkim felem_small_mul(tmp, small1, ftmp3); 1388280304Sjkim felem_mul(tmp2, ftmp6, ftmp2); 1389280304Sjkim longfelem_scalar(tmp2, 2); 1390280304Sjkim /* tmp2[i] < 2*2^67 = 2^68 */ 1391280304Sjkim longfelem_diff(tmp, tmp2); 1392280304Sjkim /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */ 1393280304Sjkim felem_reduce_zero105(y_out, tmp); 1394280304Sjkim /* y_out[i] < 2^106 */ 1395238384Sjkim 1396280304Sjkim copy_small_conditional(x_out, x2, z1_is_zero); 1397280304Sjkim copy_conditional(x_out, x1, z2_is_zero); 1398280304Sjkim copy_small_conditional(y_out, y2, z1_is_zero); 1399280304Sjkim copy_conditional(y_out, y1, z2_is_zero); 1400280304Sjkim copy_small_conditional(z_out, z2, z1_is_zero); 1401280304Sjkim copy_conditional(z_out, z1, z2_is_zero); 1402280304Sjkim felem_assign(x3, x_out); 1403280304Sjkim felem_assign(y3, y_out); 1404280304Sjkim felem_assign(z3, z_out); 1405280304Sjkim} 1406238384Sjkim 1407280304Sjkim/* 1408280304Sjkim * point_add_small is the same as point_add, except that it operates on 1409280304Sjkim * smallfelems 1410280304Sjkim */ 1411238384Sjkimstatic void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3, 1412280304Sjkim smallfelem x1, smallfelem y1, smallfelem z1, 1413280304Sjkim smallfelem x2, smallfelem y2, smallfelem z2) 1414280304Sjkim{ 1415280304Sjkim felem felem_x3, felem_y3, felem_z3; 1416280304Sjkim felem felem_x1, felem_y1, felem_z1; 1417280304Sjkim smallfelem_expand(felem_x1, x1); 1418280304Sjkim smallfelem_expand(felem_y1, y1); 1419280304Sjkim smallfelem_expand(felem_z1, z1); 1420280304Sjkim point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, 1421280304Sjkim x2, y2, z2); 1422280304Sjkim felem_shrink(x3, felem_x3); 1423280304Sjkim felem_shrink(y3, felem_y3); 1424280304Sjkim felem_shrink(z3, felem_z3); 1425280304Sjkim} 1426238384Sjkim 1427280304Sjkim/*- 1428280304Sjkim * Base point pre computation 1429238384Sjkim * -------------------------- 1430238384Sjkim * 1431238384Sjkim * Two different sorts of precomputed tables are used in the following code. 1432238384Sjkim * Each contain various points on the curve, where each point is three field 1433238384Sjkim * elements (x, y, z). 1434238384Sjkim * 1435238384Sjkim * For the base point table, z is usually 1 (0 for the point at infinity). 1436238384Sjkim * This table has 2 * 16 elements, starting with the following: 1437238384Sjkim * index | bits | point 1438238384Sjkim * ------+---------+------------------------------ 1439238384Sjkim * 0 | 0 0 0 0 | 0G 1440238384Sjkim * 1 | 0 0 0 1 | 1G 1441238384Sjkim * 2 | 0 0 1 0 | 2^64G 1442238384Sjkim * 3 | 0 0 1 1 | (2^64 + 1)G 1443238384Sjkim * 4 | 0 1 0 0 | 2^128G 1444238384Sjkim * 5 | 0 1 0 1 | (2^128 + 1)G 1445238384Sjkim * 6 | 0 1 1 0 | (2^128 + 2^64)G 1446238384Sjkim * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G 1447238384Sjkim * 8 | 1 0 0 0 | 2^192G 1448238384Sjkim * 9 | 1 0 0 1 | (2^192 + 1)G 1449238384Sjkim * 10 | 1 0 1 0 | (2^192 + 2^64)G 1450238384Sjkim * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G 1451238384Sjkim * 12 | 1 1 0 0 | (2^192 + 2^128)G 1452238384Sjkim * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G 1453238384Sjkim * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G 1454238384Sjkim * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G 1455238384Sjkim * followed by a copy of this with each element multiplied by 2^32. 1456238384Sjkim * 1457238384Sjkim * The reason for this is so that we can clock bits into four different 1458238384Sjkim * locations when doing simple scalar multiplies against the base point, 1459238384Sjkim * and then another four locations using the second 16 elements. 1460238384Sjkim * 1461238384Sjkim * Tables for other points have table[i] = iG for i in 0 .. 16. */ 1462238384Sjkim 1463238384Sjkim/* gmul is the table of precomputed base points */ 1464280304Sjkimstatic const smallfelem gmul[2][16][3] = { 1465280304Sjkim {{{0, 0, 0, 0}, 1466280304Sjkim {0, 0, 0, 0}, 1467280304Sjkim {0, 0, 0, 0}}, 1468280304Sjkim {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 1469280304Sjkim 0x6b17d1f2e12c4247}, 1470280304Sjkim {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 1471280304Sjkim 0x4fe342e2fe1a7f9b}, 1472280304Sjkim {1, 0, 0, 0}}, 1473280304Sjkim {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 1474280304Sjkim 0x0fa822bc2811aaa5}, 1475280304Sjkim {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 1476280304Sjkim 0xbff44ae8f5dba80d}, 1477280304Sjkim {1, 0, 0, 0}}, 1478280304Sjkim {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 1479280304Sjkim 0x300a4bbc89d6726f}, 1480280304Sjkim {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 1481280304Sjkim 0x72aac7e0d09b4644}, 1482280304Sjkim {1, 0, 0, 0}}, 1483280304Sjkim {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 1484280304Sjkim 0x447d739beedb5e67}, 1485280304Sjkim {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 1486280304Sjkim 0x2d4825ab834131ee}, 1487280304Sjkim {1, 0, 0, 0}}, 1488280304Sjkim {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 1489280304Sjkim 0xef9519328a9c72ff}, 1490280304Sjkim {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 1491280304Sjkim 0x611e9fc37dbb2c9b}, 1492280304Sjkim {1, 0, 0, 0}}, 1493280304Sjkim {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 1494280304Sjkim 0x550663797b51f5d8}, 1495280304Sjkim {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 1496280304Sjkim 0x157164848aecb851}, 1497280304Sjkim {1, 0, 0, 0}}, 1498280304Sjkim {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 1499280304Sjkim 0xeb5d7745b21141ea}, 1500280304Sjkim {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 1501280304Sjkim 0xeafd72ebdbecc17b}, 1502280304Sjkim {1, 0, 0, 0}}, 1503280304Sjkim {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 1504280304Sjkim 0xa6d39677a7849276}, 1505280304Sjkim {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 1506280304Sjkim 0x674f84749b0b8816}, 1507280304Sjkim {1, 0, 0, 0}}, 1508280304Sjkim {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 1509280304Sjkim 0x4e769e7672c9ddad}, 1510280304Sjkim {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 1511280304Sjkim 0x42b99082de830663}, 1512280304Sjkim {1, 0, 0, 0}}, 1513280304Sjkim {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 1514280304Sjkim 0x78878ef61c6ce04d}, 1515280304Sjkim {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 1516280304Sjkim 0xb6cb3f5d7b72c321}, 1517280304Sjkim {1, 0, 0, 0}}, 1518280304Sjkim {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 1519280304Sjkim 0x0c88bc4d716b1287}, 1520280304Sjkim {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 1521280304Sjkim 0xdd5ddea3f3901dc6}, 1522280304Sjkim {1, 0, 0, 0}}, 1523280304Sjkim {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 1524280304Sjkim 0x68f344af6b317466}, 1525280304Sjkim {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 1526280304Sjkim 0x31b9c405f8540a20}, 1527280304Sjkim {1, 0, 0, 0}}, 1528280304Sjkim {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 1529280304Sjkim 0x4052bf4b6f461db9}, 1530280304Sjkim {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 1531280304Sjkim 0xfecf4d5190b0fc61}, 1532280304Sjkim {1, 0, 0, 0}}, 1533280304Sjkim {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 1534280304Sjkim 0x1eddbae2c802e41a}, 1535280304Sjkim {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 1536280304Sjkim 0x43104d86560ebcfc}, 1537280304Sjkim {1, 0, 0, 0}}, 1538280304Sjkim {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 1539280304Sjkim 0xb48e26b484f7a21c}, 1540280304Sjkim {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 1541280304Sjkim 0xfac015404d4d3dab}, 1542280304Sjkim {1, 0, 0, 0}}}, 1543280304Sjkim {{{0, 0, 0, 0}, 1544280304Sjkim {0, 0, 0, 0}, 1545280304Sjkim {0, 0, 0, 0}}, 1546280304Sjkim {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 1547280304Sjkim 0x7fe36b40af22af89}, 1548280304Sjkim {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 1549280304Sjkim 0xe697d45825b63624}, 1550280304Sjkim {1, 0, 0, 0}}, 1551280304Sjkim {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 1552280304Sjkim 0x4a5b506612a677a6}, 1553280304Sjkim {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 1554280304Sjkim 0xeb13461ceac089f1}, 1555280304Sjkim {1, 0, 0, 0}}, 1556280304Sjkim {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 1557280304Sjkim 0x0781b8291c6a220a}, 1558280304Sjkim {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 1559280304Sjkim 0x690cde8df0151593}, 1560280304Sjkim {1, 0, 0, 0}}, 1561280304Sjkim {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 1562280304Sjkim 0x8a535f566ec73617}, 1563280304Sjkim {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 1564280304Sjkim 0x0455c08468b08bd7}, 1565280304Sjkim {1, 0, 0, 0}}, 1566280304Sjkim {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 1567280304Sjkim 0x06bada7ab77f8276}, 1568280304Sjkim {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 1569280304Sjkim 0x5b476dfd0e6cb18a}, 1570280304Sjkim {1, 0, 0, 0}}, 1571280304Sjkim {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 1572280304Sjkim 0x3e29864e8a2ec908}, 1573280304Sjkim {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 1574280304Sjkim 0x239b90ea3dc31e7e}, 1575280304Sjkim {1, 0, 0, 0}}, 1576280304Sjkim {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 1577280304Sjkim 0x820f4dd949f72ff7}, 1578280304Sjkim {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 1579280304Sjkim 0x140406ec783a05ec}, 1580280304Sjkim {1, 0, 0, 0}}, 1581280304Sjkim {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 1582280304Sjkim 0x68f6b8542783dfee}, 1583280304Sjkim {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 1584280304Sjkim 0xcbe1feba92e40ce6}, 1585280304Sjkim {1, 0, 0, 0}}, 1586280304Sjkim {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 1587280304Sjkim 0xd0b2f94d2f420109}, 1588280304Sjkim {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 1589280304Sjkim 0x971459828b0719e5}, 1590280304Sjkim {1, 0, 0, 0}}, 1591280304Sjkim {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 1592280304Sjkim 0x961610004a866aba}, 1593280304Sjkim {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 1594280304Sjkim 0x7acb9fadcee75e44}, 1595280304Sjkim {1, 0, 0, 0}}, 1596280304Sjkim {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 1597280304Sjkim 0x24eb9acca333bf5b}, 1598280304Sjkim {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 1599280304Sjkim 0x69f891c5acd079cc}, 1600280304Sjkim {1, 0, 0, 0}}, 1601280304Sjkim {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 1602280304Sjkim 0xe51f547c5972a107}, 1603280304Sjkim {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 1604280304Sjkim 0x1c309a2b25bb1387}, 1605280304Sjkim {1, 0, 0, 0}}, 1606280304Sjkim {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 1607280304Sjkim 0x20b87b8aa2c4e503}, 1608280304Sjkim {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 1609280304Sjkim 0xf5c6fa49919776be}, 1610280304Sjkim {1, 0, 0, 0}}, 1611280304Sjkim {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 1612280304Sjkim 0x1ed7d1b9332010b9}, 1613280304Sjkim {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 1614280304Sjkim 0x3a2b03f03217257a}, 1615280304Sjkim {1, 0, 0, 0}}, 1616280304Sjkim {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 1617280304Sjkim 0x15fee545c78dd9f6}, 1618280304Sjkim {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 1619280304Sjkim 0x4ab5b6b2b8753f81}, 1620280304Sjkim {1, 0, 0, 0}}} 1621280304Sjkim}; 1622238384Sjkim 1623280304Sjkim/* 1624280304Sjkim * select_point selects the |idx|th point from a precomputation table and 1625280304Sjkim * copies it to out. 1626280304Sjkim */ 1627280304Sjkimstatic void select_point(const u64 idx, unsigned int size, 1628280304Sjkim const smallfelem pre_comp[16][3], smallfelem out[3]) 1629280304Sjkim{ 1630280304Sjkim unsigned i, j; 1631280304Sjkim u64 *outlimbs = &out[0][0]; 1632280304Sjkim memset(outlimbs, 0, 3 * sizeof(smallfelem)); 1633238384Sjkim 1634280304Sjkim for (i = 0; i < size; i++) { 1635280304Sjkim const u64 *inlimbs = (u64 *)&pre_comp[i][0][0]; 1636280304Sjkim u64 mask = i ^ idx; 1637280304Sjkim mask |= mask >> 4; 1638280304Sjkim mask |= mask >> 2; 1639280304Sjkim mask |= mask >> 1; 1640280304Sjkim mask &= 1; 1641280304Sjkim mask--; 1642280304Sjkim for (j = 0; j < NLIMBS * 3; j++) 1643280304Sjkim outlimbs[j] |= inlimbs[j] & mask; 1644280304Sjkim } 1645280304Sjkim} 1646238384Sjkim 1647238384Sjkim/* get_bit returns the |i|th bit in |in| */ 1648238384Sjkimstatic char get_bit(const felem_bytearray in, int i) 1649280304Sjkim{ 1650280304Sjkim if ((i < 0) || (i >= 256)) 1651280304Sjkim return 0; 1652280304Sjkim return (in[i >> 3] >> (i & 7)) & 1; 1653280304Sjkim} 1654238384Sjkim 1655280304Sjkim/* 1656280304Sjkim * Interleaved point multiplication using precomputed point multiples: The 1657280304Sjkim * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars 1658280304Sjkim * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the 1659280304Sjkim * generator, using certain (large) precomputed multiples in g_pre_comp. 1660280304Sjkim * Output point (X, Y, Z) is stored in x_out, y_out, z_out 1661280304Sjkim */ 1662238384Sjkimstatic void batch_mul(felem x_out, felem y_out, felem z_out, 1663280304Sjkim const felem_bytearray scalars[], 1664280304Sjkim const unsigned num_points, const u8 *g_scalar, 1665280304Sjkim const int mixed, const smallfelem pre_comp[][17][3], 1666280304Sjkim const smallfelem g_pre_comp[2][16][3]) 1667280304Sjkim{ 1668280304Sjkim int i, skip; 1669280304Sjkim unsigned num, gen_mul = (g_scalar != NULL); 1670280304Sjkim felem nq[3], ftmp; 1671280304Sjkim smallfelem tmp[3]; 1672280304Sjkim u64 bits; 1673280304Sjkim u8 sign, digit; 1674238384Sjkim 1675280304Sjkim /* set nq to the point at infinity */ 1676280304Sjkim memset(nq, 0, 3 * sizeof(felem)); 1677238384Sjkim 1678280304Sjkim /* 1679280304Sjkim * Loop over all scalars msb-to-lsb, interleaving additions of multiples 1680280304Sjkim * of the generator (two in each of the last 32 rounds) and additions of 1681280304Sjkim * other points multiples (every 5th round). 1682280304Sjkim */ 1683280304Sjkim skip = 1; /* save two point operations in the first 1684280304Sjkim * round */ 1685280304Sjkim for (i = (num_points ? 255 : 31); i >= 0; --i) { 1686280304Sjkim /* double */ 1687280304Sjkim if (!skip) 1688280304Sjkim point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); 1689238384Sjkim 1690280304Sjkim /* add multiples of the generator */ 1691280304Sjkim if (gen_mul && (i <= 31)) { 1692280304Sjkim /* first, look 32 bits upwards */ 1693280304Sjkim bits = get_bit(g_scalar, i + 224) << 3; 1694280304Sjkim bits |= get_bit(g_scalar, i + 160) << 2; 1695280304Sjkim bits |= get_bit(g_scalar, i + 96) << 1; 1696280304Sjkim bits |= get_bit(g_scalar, i + 32); 1697280304Sjkim /* select the point to add, in constant time */ 1698280304Sjkim select_point(bits, 16, g_pre_comp[1], tmp); 1699238384Sjkim 1700280304Sjkim if (!skip) { 1701280304Sjkim /* Arg 1 below is for "mixed" */ 1702280304Sjkim point_add(nq[0], nq[1], nq[2], 1703280304Sjkim nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]); 1704280304Sjkim } else { 1705280304Sjkim smallfelem_expand(nq[0], tmp[0]); 1706280304Sjkim smallfelem_expand(nq[1], tmp[1]); 1707280304Sjkim smallfelem_expand(nq[2], tmp[2]); 1708280304Sjkim skip = 0; 1709280304Sjkim } 1710238384Sjkim 1711280304Sjkim /* second, look at the current position */ 1712280304Sjkim bits = get_bit(g_scalar, i + 192) << 3; 1713280304Sjkim bits |= get_bit(g_scalar, i + 128) << 2; 1714280304Sjkim bits |= get_bit(g_scalar, i + 64) << 1; 1715280304Sjkim bits |= get_bit(g_scalar, i); 1716280304Sjkim /* select the point to add, in constant time */ 1717280304Sjkim select_point(bits, 16, g_pre_comp[0], tmp); 1718280304Sjkim /* Arg 1 below is for "mixed" */ 1719280304Sjkim point_add(nq[0], nq[1], nq[2], 1720280304Sjkim nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]); 1721280304Sjkim } 1722238384Sjkim 1723280304Sjkim /* do other additions every 5 doublings */ 1724280304Sjkim if (num_points && (i % 5 == 0)) { 1725280304Sjkim /* loop over all scalars */ 1726280304Sjkim for (num = 0; num < num_points; ++num) { 1727280304Sjkim bits = get_bit(scalars[num], i + 4) << 5; 1728280304Sjkim bits |= get_bit(scalars[num], i + 3) << 4; 1729280304Sjkim bits |= get_bit(scalars[num], i + 2) << 3; 1730280304Sjkim bits |= get_bit(scalars[num], i + 1) << 2; 1731280304Sjkim bits |= get_bit(scalars[num], i) << 1; 1732280304Sjkim bits |= get_bit(scalars[num], i - 1); 1733280304Sjkim ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); 1734238384Sjkim 1735280304Sjkim /* 1736280304Sjkim * select the point to add or subtract, in constant time 1737280304Sjkim */ 1738280304Sjkim select_point(digit, 17, pre_comp[num], tmp); 1739280304Sjkim smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative 1740280304Sjkim * point */ 1741280304Sjkim copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1)); 1742280304Sjkim felem_contract(tmp[1], ftmp); 1743238384Sjkim 1744280304Sjkim if (!skip) { 1745280304Sjkim point_add(nq[0], nq[1], nq[2], 1746280304Sjkim nq[0], nq[1], nq[2], 1747280304Sjkim mixed, tmp[0], tmp[1], tmp[2]); 1748280304Sjkim } else { 1749280304Sjkim smallfelem_expand(nq[0], tmp[0]); 1750280304Sjkim smallfelem_expand(nq[1], tmp[1]); 1751280304Sjkim smallfelem_expand(nq[2], tmp[2]); 1752280304Sjkim skip = 0; 1753280304Sjkim } 1754280304Sjkim } 1755280304Sjkim } 1756280304Sjkim } 1757280304Sjkim felem_assign(x_out, nq[0]); 1758280304Sjkim felem_assign(y_out, nq[1]); 1759280304Sjkim felem_assign(z_out, nq[2]); 1760280304Sjkim} 1761238384Sjkim 1762238384Sjkim/* Precomputation for the group generator. */ 1763238384Sjkimtypedef struct { 1764280304Sjkim smallfelem g_pre_comp[2][16][3]; 1765280304Sjkim int references; 1766238384Sjkim} NISTP256_PRE_COMP; 1767238384Sjkim 1768238384Sjkimconst EC_METHOD *EC_GFp_nistp256_method(void) 1769280304Sjkim{ 1770280304Sjkim static const EC_METHOD ret = { 1771280304Sjkim EC_FLAGS_DEFAULT_OCT, 1772280304Sjkim NID_X9_62_prime_field, 1773280304Sjkim ec_GFp_nistp256_group_init, 1774280304Sjkim ec_GFp_simple_group_finish, 1775280304Sjkim ec_GFp_simple_group_clear_finish, 1776280304Sjkim ec_GFp_nist_group_copy, 1777280304Sjkim ec_GFp_nistp256_group_set_curve, 1778280304Sjkim ec_GFp_simple_group_get_curve, 1779280304Sjkim ec_GFp_simple_group_get_degree, 1780280304Sjkim ec_GFp_simple_group_check_discriminant, 1781280304Sjkim ec_GFp_simple_point_init, 1782280304Sjkim ec_GFp_simple_point_finish, 1783280304Sjkim ec_GFp_simple_point_clear_finish, 1784280304Sjkim ec_GFp_simple_point_copy, 1785280304Sjkim ec_GFp_simple_point_set_to_infinity, 1786280304Sjkim ec_GFp_simple_set_Jprojective_coordinates_GFp, 1787280304Sjkim ec_GFp_simple_get_Jprojective_coordinates_GFp, 1788280304Sjkim ec_GFp_simple_point_set_affine_coordinates, 1789280304Sjkim ec_GFp_nistp256_point_get_affine_coordinates, 1790280304Sjkim 0 /* point_set_compressed_coordinates */ , 1791280304Sjkim 0 /* point2oct */ , 1792280304Sjkim 0 /* oct2point */ , 1793280304Sjkim ec_GFp_simple_add, 1794280304Sjkim ec_GFp_simple_dbl, 1795280304Sjkim ec_GFp_simple_invert, 1796280304Sjkim ec_GFp_simple_is_at_infinity, 1797280304Sjkim ec_GFp_simple_is_on_curve, 1798280304Sjkim ec_GFp_simple_cmp, 1799280304Sjkim ec_GFp_simple_make_affine, 1800280304Sjkim ec_GFp_simple_points_make_affine, 1801280304Sjkim ec_GFp_nistp256_points_mul, 1802280304Sjkim ec_GFp_nistp256_precompute_mult, 1803280304Sjkim ec_GFp_nistp256_have_precompute_mult, 1804280304Sjkim ec_GFp_nist_field_mul, 1805280304Sjkim ec_GFp_nist_field_sqr, 1806280304Sjkim 0 /* field_div */ , 1807280304Sjkim 0 /* field_encode */ , 1808280304Sjkim 0 /* field_decode */ , 1809280304Sjkim 0 /* field_set_to_one */ 1810280304Sjkim }; 1811238384Sjkim 1812280304Sjkim return &ret; 1813280304Sjkim} 1814238384Sjkim 1815238384Sjkim/******************************************************************************/ 1816280304Sjkim/* 1817280304Sjkim * FUNCTIONS TO MANAGE PRECOMPUTATION 1818238384Sjkim */ 1819238384Sjkim 1820238384Sjkimstatic NISTP256_PRE_COMP *nistp256_pre_comp_new() 1821280304Sjkim{ 1822280304Sjkim NISTP256_PRE_COMP *ret = NULL; 1823280304Sjkim ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret); 1824280304Sjkim if (!ret) { 1825280304Sjkim ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1826280304Sjkim return ret; 1827280304Sjkim } 1828280304Sjkim memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); 1829280304Sjkim ret->references = 1; 1830280304Sjkim return ret; 1831280304Sjkim} 1832238384Sjkim 1833238384Sjkimstatic void *nistp256_pre_comp_dup(void *src_) 1834280304Sjkim{ 1835280304Sjkim NISTP256_PRE_COMP *src = src_; 1836238384Sjkim 1837280304Sjkim /* no need to actually copy, these objects never change! */ 1838280304Sjkim CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); 1839238384Sjkim 1840280304Sjkim return src_; 1841280304Sjkim} 1842238384Sjkim 1843238384Sjkimstatic void nistp256_pre_comp_free(void *pre_) 1844280304Sjkim{ 1845280304Sjkim int i; 1846280304Sjkim NISTP256_PRE_COMP *pre = pre_; 1847238384Sjkim 1848280304Sjkim if (!pre) 1849280304Sjkim return; 1850238384Sjkim 1851280304Sjkim i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); 1852280304Sjkim if (i > 0) 1853280304Sjkim return; 1854238384Sjkim 1855280304Sjkim OPENSSL_free(pre); 1856280304Sjkim} 1857238384Sjkim 1858238384Sjkimstatic void nistp256_pre_comp_clear_free(void *pre_) 1859280304Sjkim{ 1860280304Sjkim int i; 1861280304Sjkim NISTP256_PRE_COMP *pre = pre_; 1862238384Sjkim 1863280304Sjkim if (!pre) 1864280304Sjkim return; 1865238384Sjkim 1866280304Sjkim i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); 1867280304Sjkim if (i > 0) 1868280304Sjkim return; 1869238384Sjkim 1870280304Sjkim OPENSSL_cleanse(pre, sizeof *pre); 1871280304Sjkim OPENSSL_free(pre); 1872280304Sjkim} 1873238384Sjkim 1874238384Sjkim/******************************************************************************/ 1875280304Sjkim/* 1876280304Sjkim * OPENSSL EC_METHOD FUNCTIONS 1877238384Sjkim */ 1878238384Sjkim 1879238384Sjkimint ec_GFp_nistp256_group_init(EC_GROUP *group) 1880280304Sjkim{ 1881280304Sjkim int ret; 1882280304Sjkim ret = ec_GFp_simple_group_init(group); 1883280304Sjkim group->a_is_minus3 = 1; 1884280304Sjkim return ret; 1885280304Sjkim} 1886238384Sjkim 1887238384Sjkimint ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, 1888280304Sjkim const BIGNUM *a, const BIGNUM *b, 1889280304Sjkim BN_CTX *ctx) 1890280304Sjkim{ 1891280304Sjkim int ret = 0; 1892280304Sjkim BN_CTX *new_ctx = NULL; 1893280304Sjkim BIGNUM *curve_p, *curve_a, *curve_b; 1894238384Sjkim 1895280304Sjkim if (ctx == NULL) 1896280304Sjkim if ((ctx = new_ctx = BN_CTX_new()) == NULL) 1897280304Sjkim return 0; 1898280304Sjkim BN_CTX_start(ctx); 1899280304Sjkim if (((curve_p = BN_CTX_get(ctx)) == NULL) || 1900280304Sjkim ((curve_a = BN_CTX_get(ctx)) == NULL) || 1901280304Sjkim ((curve_b = BN_CTX_get(ctx)) == NULL)) 1902280304Sjkim goto err; 1903280304Sjkim BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p); 1904280304Sjkim BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a); 1905280304Sjkim BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b); 1906280304Sjkim if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) { 1907280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE, 1908280304Sjkim EC_R_WRONG_CURVE_PARAMETERS); 1909280304Sjkim goto err; 1910280304Sjkim } 1911280304Sjkim group->field_mod_func = BN_nist_mod_256; 1912280304Sjkim ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); 1913280304Sjkim err: 1914280304Sjkim BN_CTX_end(ctx); 1915280304Sjkim if (new_ctx != NULL) 1916280304Sjkim BN_CTX_free(new_ctx); 1917280304Sjkim return ret; 1918280304Sjkim} 1919238384Sjkim 1920280304Sjkim/* 1921280304Sjkim * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') = 1922280304Sjkim * (X/Z^2, Y/Z^3) 1923280304Sjkim */ 1924238384Sjkimint ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, 1925280304Sjkim const EC_POINT *point, 1926280304Sjkim BIGNUM *x, BIGNUM *y, 1927280304Sjkim BN_CTX *ctx) 1928280304Sjkim{ 1929280304Sjkim felem z1, z2, x_in, y_in; 1930280304Sjkim smallfelem x_out, y_out; 1931280304Sjkim longfelem tmp; 1932238384Sjkim 1933280304Sjkim if (EC_POINT_is_at_infinity(group, point)) { 1934280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, 1935280304Sjkim EC_R_POINT_AT_INFINITY); 1936280304Sjkim return 0; 1937280304Sjkim } 1938280304Sjkim if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || 1939280304Sjkim (!BN_to_felem(z1, &point->Z))) 1940280304Sjkim return 0; 1941280304Sjkim felem_inv(z2, z1); 1942280304Sjkim felem_square(tmp, z2); 1943280304Sjkim felem_reduce(z1, tmp); 1944280304Sjkim felem_mul(tmp, x_in, z1); 1945280304Sjkim felem_reduce(x_in, tmp); 1946280304Sjkim felem_contract(x_out, x_in); 1947280304Sjkim if (x != NULL) { 1948280304Sjkim if (!smallfelem_to_BN(x, x_out)) { 1949280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, 1950280304Sjkim ERR_R_BN_LIB); 1951280304Sjkim return 0; 1952280304Sjkim } 1953280304Sjkim } 1954280304Sjkim felem_mul(tmp, z1, z2); 1955280304Sjkim felem_reduce(z1, tmp); 1956280304Sjkim felem_mul(tmp, y_in, z1); 1957280304Sjkim felem_reduce(y_in, tmp); 1958280304Sjkim felem_contract(y_out, y_in); 1959280304Sjkim if (y != NULL) { 1960280304Sjkim if (!smallfelem_to_BN(y, y_out)) { 1961280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, 1962280304Sjkim ERR_R_BN_LIB); 1963280304Sjkim return 0; 1964280304Sjkim } 1965280304Sjkim } 1966280304Sjkim return 1; 1967280304Sjkim} 1968238384Sjkim 1969280304Sjkim/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */ 1970280304Sjkimstatic void make_points_affine(size_t num, smallfelem points[][3], 1971280304Sjkim smallfelem tmp_smallfelems[]) 1972280304Sjkim{ 1973280304Sjkim /* 1974280304Sjkim * Runs in constant time, unless an input is the point at infinity (which 1975280304Sjkim * normally shouldn't happen). 1976280304Sjkim */ 1977280304Sjkim ec_GFp_nistp_points_make_affine_internal(num, 1978280304Sjkim points, 1979280304Sjkim sizeof(smallfelem), 1980280304Sjkim tmp_smallfelems, 1981280304Sjkim (void (*)(void *))smallfelem_one, 1982280304Sjkim (int (*)(const void *)) 1983280304Sjkim smallfelem_is_zero_int, 1984280304Sjkim (void (*)(void *, const void *)) 1985280304Sjkim smallfelem_assign, 1986280304Sjkim (void (*)(void *, const void *)) 1987280304Sjkim smallfelem_square_contract, 1988280304Sjkim (void (*) 1989280304Sjkim (void *, const void *, 1990280304Sjkim const void *)) 1991280304Sjkim smallfelem_mul_contract, 1992280304Sjkim (void (*)(void *, const void *)) 1993280304Sjkim smallfelem_inv_contract, 1994280304Sjkim /* nothing to contract */ 1995280304Sjkim (void (*)(void *, const void *)) 1996280304Sjkim smallfelem_assign); 1997280304Sjkim} 1998238384Sjkim 1999280304Sjkim/* 2000280304Sjkim * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL 2001280304Sjkim * values Result is stored in r (r can equal one of the inputs). 2002280304Sjkim */ 2003238384Sjkimint ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, 2004280304Sjkim const BIGNUM *scalar, size_t num, 2005280304Sjkim const EC_POINT *points[], 2006280304Sjkim const BIGNUM *scalars[], BN_CTX *ctx) 2007280304Sjkim{ 2008280304Sjkim int ret = 0; 2009280304Sjkim int j; 2010280304Sjkim int mixed = 0; 2011280304Sjkim BN_CTX *new_ctx = NULL; 2012280304Sjkim BIGNUM *x, *y, *z, *tmp_scalar; 2013280304Sjkim felem_bytearray g_secret; 2014280304Sjkim felem_bytearray *secrets = NULL; 2015280304Sjkim smallfelem(*pre_comp)[17][3] = NULL; 2016280304Sjkim smallfelem *tmp_smallfelems = NULL; 2017280304Sjkim felem_bytearray tmp; 2018280304Sjkim unsigned i, num_bytes; 2019280304Sjkim int have_pre_comp = 0; 2020280304Sjkim size_t num_points = num; 2021280304Sjkim smallfelem x_in, y_in, z_in; 2022280304Sjkim felem x_out, y_out, z_out; 2023280304Sjkim NISTP256_PRE_COMP *pre = NULL; 2024280304Sjkim const smallfelem(*g_pre_comp)[16][3] = NULL; 2025280304Sjkim EC_POINT *generator = NULL; 2026280304Sjkim const EC_POINT *p = NULL; 2027280304Sjkim const BIGNUM *p_scalar = NULL; 2028238384Sjkim 2029280304Sjkim if (ctx == NULL) 2030280304Sjkim if ((ctx = new_ctx = BN_CTX_new()) == NULL) 2031280304Sjkim return 0; 2032280304Sjkim BN_CTX_start(ctx); 2033280304Sjkim if (((x = BN_CTX_get(ctx)) == NULL) || 2034280304Sjkim ((y = BN_CTX_get(ctx)) == NULL) || 2035280304Sjkim ((z = BN_CTX_get(ctx)) == NULL) || 2036280304Sjkim ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) 2037280304Sjkim goto err; 2038238384Sjkim 2039280304Sjkim if (scalar != NULL) { 2040280304Sjkim pre = EC_EX_DATA_get_data(group->extra_data, 2041280304Sjkim nistp256_pre_comp_dup, 2042280304Sjkim nistp256_pre_comp_free, 2043280304Sjkim nistp256_pre_comp_clear_free); 2044280304Sjkim if (pre) 2045280304Sjkim /* we have precomputation, try to use it */ 2046280304Sjkim g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp; 2047280304Sjkim else 2048280304Sjkim /* try to use the standard precomputation */ 2049280304Sjkim g_pre_comp = &gmul[0]; 2050280304Sjkim generator = EC_POINT_new(group); 2051280304Sjkim if (generator == NULL) 2052280304Sjkim goto err; 2053280304Sjkim /* get the generator from precomputation */ 2054280304Sjkim if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) || 2055280304Sjkim !smallfelem_to_BN(y, g_pre_comp[0][1][1]) || 2056280304Sjkim !smallfelem_to_BN(z, g_pre_comp[0][1][2])) { 2057280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); 2058280304Sjkim goto err; 2059280304Sjkim } 2060280304Sjkim if (!EC_POINT_set_Jprojective_coordinates_GFp(group, 2061280304Sjkim generator, x, y, z, 2062280304Sjkim ctx)) 2063280304Sjkim goto err; 2064280304Sjkim if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) 2065280304Sjkim /* precomputation matches generator */ 2066280304Sjkim have_pre_comp = 1; 2067280304Sjkim else 2068280304Sjkim /* 2069280304Sjkim * we don't have valid precomputation: treat the generator as a 2070280304Sjkim * random point 2071280304Sjkim */ 2072280304Sjkim num_points++; 2073280304Sjkim } 2074280304Sjkim if (num_points > 0) { 2075280304Sjkim if (num_points >= 3) { 2076280304Sjkim /* 2077280304Sjkim * unless we precompute multiples for just one or two points, 2078280304Sjkim * converting those into affine form is time well spent 2079280304Sjkim */ 2080280304Sjkim mixed = 1; 2081280304Sjkim } 2082280304Sjkim secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); 2083280304Sjkim pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem)); 2084280304Sjkim if (mixed) 2085280304Sjkim tmp_smallfelems = 2086280304Sjkim OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem)); 2087280304Sjkim if ((secrets == NULL) || (pre_comp == NULL) 2088280304Sjkim || (mixed && (tmp_smallfelems == NULL))) { 2089280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 2090280304Sjkim goto err; 2091280304Sjkim } 2092238384Sjkim 2093280304Sjkim /* 2094280304Sjkim * we treat NULL scalars as 0, and NULL points as points at infinity, 2095280304Sjkim * i.e., they contribute nothing to the linear combination 2096280304Sjkim */ 2097280304Sjkim memset(secrets, 0, num_points * sizeof(felem_bytearray)); 2098280304Sjkim memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem)); 2099280304Sjkim for (i = 0; i < num_points; ++i) { 2100280304Sjkim if (i == num) 2101280304Sjkim /* 2102280304Sjkim * we didn't have a valid precomputation, so we pick the 2103280304Sjkim * generator 2104280304Sjkim */ 2105280304Sjkim { 2106280304Sjkim p = EC_GROUP_get0_generator(group); 2107280304Sjkim p_scalar = scalar; 2108280304Sjkim } else 2109280304Sjkim /* the i^th point */ 2110280304Sjkim { 2111280304Sjkim p = points[i]; 2112280304Sjkim p_scalar = scalars[i]; 2113280304Sjkim } 2114280304Sjkim if ((p_scalar != NULL) && (p != NULL)) { 2115280304Sjkim /* reduce scalar to 0 <= scalar < 2^256 */ 2116280304Sjkim if ((BN_num_bits(p_scalar) > 256) 2117280304Sjkim || (BN_is_negative(p_scalar))) { 2118280304Sjkim /* 2119280304Sjkim * this is an unusual input, and we don't guarantee 2120280304Sjkim * constant-timeness 2121280304Sjkim */ 2122280304Sjkim if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) { 2123280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); 2124280304Sjkim goto err; 2125280304Sjkim } 2126280304Sjkim num_bytes = BN_bn2bin(tmp_scalar, tmp); 2127280304Sjkim } else 2128280304Sjkim num_bytes = BN_bn2bin(p_scalar, tmp); 2129280304Sjkim flip_endian(secrets[i], tmp, num_bytes); 2130280304Sjkim /* precompute multiples */ 2131280304Sjkim if ((!BN_to_felem(x_out, &p->X)) || 2132280304Sjkim (!BN_to_felem(y_out, &p->Y)) || 2133280304Sjkim (!BN_to_felem(z_out, &p->Z))) 2134280304Sjkim goto err; 2135280304Sjkim felem_shrink(pre_comp[i][1][0], x_out); 2136280304Sjkim felem_shrink(pre_comp[i][1][1], y_out); 2137280304Sjkim felem_shrink(pre_comp[i][1][2], z_out); 2138280304Sjkim for (j = 2; j <= 16; ++j) { 2139280304Sjkim if (j & 1) { 2140280304Sjkim point_add_small(pre_comp[i][j][0], pre_comp[i][j][1], 2141280304Sjkim pre_comp[i][j][2], pre_comp[i][1][0], 2142280304Sjkim pre_comp[i][1][1], pre_comp[i][1][2], 2143280304Sjkim pre_comp[i][j - 1][0], 2144280304Sjkim pre_comp[i][j - 1][1], 2145280304Sjkim pre_comp[i][j - 1][2]); 2146280304Sjkim } else { 2147280304Sjkim point_double_small(pre_comp[i][j][0], 2148280304Sjkim pre_comp[i][j][1], 2149280304Sjkim pre_comp[i][j][2], 2150280304Sjkim pre_comp[i][j / 2][0], 2151280304Sjkim pre_comp[i][j / 2][1], 2152280304Sjkim pre_comp[i][j / 2][2]); 2153280304Sjkim } 2154280304Sjkim } 2155280304Sjkim } 2156280304Sjkim } 2157280304Sjkim if (mixed) 2158280304Sjkim make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems); 2159280304Sjkim } 2160238384Sjkim 2161280304Sjkim /* the scalar for the generator */ 2162280304Sjkim if ((scalar != NULL) && (have_pre_comp)) { 2163280304Sjkim memset(g_secret, 0, sizeof(g_secret)); 2164280304Sjkim /* reduce scalar to 0 <= scalar < 2^256 */ 2165280304Sjkim if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) { 2166280304Sjkim /* 2167280304Sjkim * this is an unusual input, and we don't guarantee 2168280304Sjkim * constant-timeness 2169280304Sjkim */ 2170280304Sjkim if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) { 2171280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); 2172280304Sjkim goto err; 2173280304Sjkim } 2174280304Sjkim num_bytes = BN_bn2bin(tmp_scalar, tmp); 2175280304Sjkim } else 2176280304Sjkim num_bytes = BN_bn2bin(scalar, tmp); 2177280304Sjkim flip_endian(g_secret, tmp, num_bytes); 2178280304Sjkim /* do the multiplication with generator precomputation */ 2179280304Sjkim batch_mul(x_out, y_out, z_out, 2180280304Sjkim (const felem_bytearray(*))secrets, num_points, 2181280304Sjkim g_secret, 2182280304Sjkim mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp); 2183280304Sjkim } else 2184280304Sjkim /* do the multiplication without generator precomputation */ 2185280304Sjkim batch_mul(x_out, y_out, z_out, 2186280304Sjkim (const felem_bytearray(*))secrets, num_points, 2187280304Sjkim NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL); 2188280304Sjkim /* reduce the output to its unique minimal representation */ 2189280304Sjkim felem_contract(x_in, x_out); 2190280304Sjkim felem_contract(y_in, y_out); 2191280304Sjkim felem_contract(z_in, z_out); 2192280304Sjkim if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) || 2193280304Sjkim (!smallfelem_to_BN(z, z_in))) { 2194280304Sjkim ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); 2195280304Sjkim goto err; 2196280304Sjkim } 2197280304Sjkim ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); 2198238384Sjkim 2199280304Sjkim err: 2200280304Sjkim BN_CTX_end(ctx); 2201280304Sjkim if (generator != NULL) 2202280304Sjkim EC_POINT_free(generator); 2203280304Sjkim if (new_ctx != NULL) 2204280304Sjkim BN_CTX_free(new_ctx); 2205280304Sjkim if (secrets != NULL) 2206280304Sjkim OPENSSL_free(secrets); 2207280304Sjkim if (pre_comp != NULL) 2208280304Sjkim OPENSSL_free(pre_comp); 2209280304Sjkim if (tmp_smallfelems != NULL) 2210280304Sjkim OPENSSL_free(tmp_smallfelems); 2211280304Sjkim return ret; 2212280304Sjkim} 2213238384Sjkim 2214238384Sjkimint ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) 2215280304Sjkim{ 2216280304Sjkim int ret = 0; 2217280304Sjkim NISTP256_PRE_COMP *pre = NULL; 2218280304Sjkim int i, j; 2219280304Sjkim BN_CTX *new_ctx = NULL; 2220280304Sjkim BIGNUM *x, *y; 2221280304Sjkim EC_POINT *generator = NULL; 2222280304Sjkim smallfelem tmp_smallfelems[32]; 2223280304Sjkim felem x_tmp, y_tmp, z_tmp; 2224238384Sjkim 2225280304Sjkim /* throw away old precomputation */ 2226280304Sjkim EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup, 2227280304Sjkim nistp256_pre_comp_free, 2228280304Sjkim nistp256_pre_comp_clear_free); 2229280304Sjkim if (ctx == NULL) 2230280304Sjkim if ((ctx = new_ctx = BN_CTX_new()) == NULL) 2231280304Sjkim return 0; 2232280304Sjkim BN_CTX_start(ctx); 2233280304Sjkim if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL)) 2234280304Sjkim goto err; 2235280304Sjkim /* get the generator */ 2236280304Sjkim if (group->generator == NULL) 2237280304Sjkim goto err; 2238280304Sjkim generator = EC_POINT_new(group); 2239280304Sjkim if (generator == NULL) 2240280304Sjkim goto err; 2241280304Sjkim BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x); 2242280304Sjkim BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y); 2243280304Sjkim if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) 2244280304Sjkim goto err; 2245280304Sjkim if ((pre = nistp256_pre_comp_new()) == NULL) 2246280304Sjkim goto err; 2247280304Sjkim /* 2248280304Sjkim * if the generator is the standard one, use built-in precomputation 2249280304Sjkim */ 2250280304Sjkim if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { 2251280304Sjkim memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); 2252280304Sjkim ret = 1; 2253280304Sjkim goto err; 2254280304Sjkim } 2255280304Sjkim if ((!BN_to_felem(x_tmp, &group->generator->X)) || 2256280304Sjkim (!BN_to_felem(y_tmp, &group->generator->Y)) || 2257280304Sjkim (!BN_to_felem(z_tmp, &group->generator->Z))) 2258280304Sjkim goto err; 2259280304Sjkim felem_shrink(pre->g_pre_comp[0][1][0], x_tmp); 2260280304Sjkim felem_shrink(pre->g_pre_comp[0][1][1], y_tmp); 2261280304Sjkim felem_shrink(pre->g_pre_comp[0][1][2], z_tmp); 2262280304Sjkim /* 2263280304Sjkim * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G, 2264280304Sjkim * 2^160*G, 2^224*G for the second one 2265280304Sjkim */ 2266280304Sjkim for (i = 1; i <= 8; i <<= 1) { 2267280304Sjkim point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], 2268280304Sjkim pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0], 2269280304Sjkim pre->g_pre_comp[0][i][1], 2270280304Sjkim pre->g_pre_comp[0][i][2]); 2271280304Sjkim for (j = 0; j < 31; ++j) { 2272280304Sjkim point_double_small(pre->g_pre_comp[1][i][0], 2273280304Sjkim pre->g_pre_comp[1][i][1], 2274280304Sjkim pre->g_pre_comp[1][i][2], 2275280304Sjkim pre->g_pre_comp[1][i][0], 2276280304Sjkim pre->g_pre_comp[1][i][1], 2277280304Sjkim pre->g_pre_comp[1][i][2]); 2278280304Sjkim } 2279280304Sjkim if (i == 8) 2280280304Sjkim break; 2281280304Sjkim point_double_small(pre->g_pre_comp[0][2 * i][0], 2282280304Sjkim pre->g_pre_comp[0][2 * i][1], 2283280304Sjkim pre->g_pre_comp[0][2 * i][2], 2284280304Sjkim pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], 2285280304Sjkim pre->g_pre_comp[1][i][2]); 2286280304Sjkim for (j = 0; j < 31; ++j) { 2287280304Sjkim point_double_small(pre->g_pre_comp[0][2 * i][0], 2288280304Sjkim pre->g_pre_comp[0][2 * i][1], 2289280304Sjkim pre->g_pre_comp[0][2 * i][2], 2290280304Sjkim pre->g_pre_comp[0][2 * i][0], 2291280304Sjkim pre->g_pre_comp[0][2 * i][1], 2292280304Sjkim pre->g_pre_comp[0][2 * i][2]); 2293280304Sjkim } 2294280304Sjkim } 2295280304Sjkim for (i = 0; i < 2; i++) { 2296280304Sjkim /* g_pre_comp[i][0] is the point at infinity */ 2297280304Sjkim memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); 2298280304Sjkim /* the remaining multiples */ 2299280304Sjkim /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */ 2300280304Sjkim point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], 2301280304Sjkim pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0], 2302280304Sjkim pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], 2303280304Sjkim pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], 2304280304Sjkim pre->g_pre_comp[i][2][2]); 2305280304Sjkim /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */ 2306280304Sjkim point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], 2307280304Sjkim pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0], 2308280304Sjkim pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], 2309280304Sjkim pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], 2310280304Sjkim pre->g_pre_comp[i][2][2]); 2311280304Sjkim /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */ 2312280304Sjkim point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], 2313280304Sjkim pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0], 2314280304Sjkim pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], 2315280304Sjkim pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], 2316280304Sjkim pre->g_pre_comp[i][4][2]); 2317280304Sjkim /* 2318280304Sjkim * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G 2319280304Sjkim */ 2320280304Sjkim point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], 2321280304Sjkim pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0], 2322280304Sjkim pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], 2323280304Sjkim pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], 2324280304Sjkim pre->g_pre_comp[i][2][2]); 2325280304Sjkim for (j = 1; j < 8; ++j) { 2326280304Sjkim /* odd multiples: add G resp. 2^32*G */ 2327280304Sjkim point_add_small(pre->g_pre_comp[i][2 * j + 1][0], 2328280304Sjkim pre->g_pre_comp[i][2 * j + 1][1], 2329280304Sjkim pre->g_pre_comp[i][2 * j + 1][2], 2330280304Sjkim pre->g_pre_comp[i][2 * j][0], 2331280304Sjkim pre->g_pre_comp[i][2 * j][1], 2332280304Sjkim pre->g_pre_comp[i][2 * j][2], 2333280304Sjkim pre->g_pre_comp[i][1][0], 2334280304Sjkim pre->g_pre_comp[i][1][1], 2335280304Sjkim pre->g_pre_comp[i][1][2]); 2336280304Sjkim } 2337280304Sjkim } 2338280304Sjkim make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems); 2339238384Sjkim 2340280304Sjkim if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup, 2341280304Sjkim nistp256_pre_comp_free, 2342280304Sjkim nistp256_pre_comp_clear_free)) 2343280304Sjkim goto err; 2344280304Sjkim ret = 1; 2345280304Sjkim pre = NULL; 2346238384Sjkim err: 2347280304Sjkim BN_CTX_end(ctx); 2348280304Sjkim if (generator != NULL) 2349280304Sjkim EC_POINT_free(generator); 2350280304Sjkim if (new_ctx != NULL) 2351280304Sjkim BN_CTX_free(new_ctx); 2352280304Sjkim if (pre) 2353280304Sjkim nistp256_pre_comp_free(pre); 2354280304Sjkim return ret; 2355280304Sjkim} 2356238384Sjkim 2357238384Sjkimint ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group) 2358280304Sjkim{ 2359280304Sjkim if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup, 2360280304Sjkim nistp256_pre_comp_free, 2361280304Sjkim nistp256_pre_comp_clear_free) 2362280304Sjkim != NULL) 2363280304Sjkim return 1; 2364280304Sjkim else 2365280304Sjkim return 0; 2366280304Sjkim} 2367238384Sjkim#else 2368280304Sjkimstatic void *dummy = &dummy; 2369238384Sjkim#endif 2370