1289848Sjkim/****************************************************************************** 2289848Sjkim * * 3289848Sjkim * Copyright 2014 Intel Corporation * 4289848Sjkim * * 5289848Sjkim * Licensed under the Apache License, Version 2.0 (the "License"); * 6289848Sjkim * you may not use this file except in compliance with the License. * 7289848Sjkim * You may obtain a copy of the License at * 8289848Sjkim * * 9289848Sjkim * http://www.apache.org/licenses/LICENSE-2.0 * 10289848Sjkim * * 11289848Sjkim * Unless required by applicable law or agreed to in writing, software * 12289848Sjkim * distributed under the License is distributed on an "AS IS" BASIS, * 13289848Sjkim * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 14289848Sjkim * See the License for the specific language governing permissions and * 15289848Sjkim * limitations under the License. * 16289848Sjkim * * 17289848Sjkim ****************************************************************************** 18289848Sjkim * * 19289848Sjkim * Developers and authors: * 20289848Sjkim * Shay Gueron (1, 2), and Vlad Krasnov (1) * 21289848Sjkim * (1) Intel Corporation, Israel Development Center * 22289848Sjkim * (2) University of Haifa * 23289848Sjkim * Reference: * 24289848Sjkim * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with * 25289848Sjkim * 256 Bit Primes" * 26289848Sjkim * * 27289848Sjkim ******************************************************************************/ 28289848Sjkim 29289848Sjkim#include <string.h> 30289848Sjkim 31289848Sjkim#include <openssl/bn.h> 32289848Sjkim#include <openssl/err.h> 33289848Sjkim#include <openssl/ec.h> 34289848Sjkim#include "cryptlib.h" 35289848Sjkim 36289848Sjkim#include "ec_lcl.h" 37289848Sjkim 38289848Sjkim#if BN_BITS2 != 64 39289848Sjkim# define TOBN(hi,lo) lo,hi 40289848Sjkim#else 41289848Sjkim# define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo) 42289848Sjkim#endif 43289848Sjkim 44289848Sjkim#if defined(__GNUC__) 45289848Sjkim# define ALIGN32 __attribute((aligned(32))) 46289848Sjkim#elif defined(_MSC_VER) 47289848Sjkim# define ALIGN32 __declspec(align(32)) 48289848Sjkim#else 49289848Sjkim# define ALIGN32 50289848Sjkim#endif 51289848Sjkim 52289848Sjkim#define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N) 53289848Sjkim#define P256_LIMBS (256/BN_BITS2) 54289848Sjkim 55289848Sjkimtypedef unsigned short u16; 56289848Sjkim 57289848Sjkimtypedef struct { 58289848Sjkim BN_ULONG X[P256_LIMBS]; 59289848Sjkim BN_ULONG Y[P256_LIMBS]; 60289848Sjkim BN_ULONG Z[P256_LIMBS]; 61289848Sjkim} P256_POINT; 62289848Sjkim 63289848Sjkimtypedef struct { 64289848Sjkim BN_ULONG X[P256_LIMBS]; 65289848Sjkim BN_ULONG Y[P256_LIMBS]; 66289848Sjkim} P256_POINT_AFFINE; 67289848Sjkim 68289848Sjkimtypedef P256_POINT_AFFINE PRECOMP256_ROW[64]; 69289848Sjkim 70289848Sjkim/* structure for precomputed multiples of the generator */ 71289848Sjkimtypedef struct ec_pre_comp_st { 72289848Sjkim const EC_GROUP *group; /* Parent EC_GROUP object */ 73289848Sjkim size_t w; /* Window size */ 74289848Sjkim /* 75289848Sjkim * Constant time access to the X and Y coordinates of the pre-computed, 76289848Sjkim * generator multiplies, in the Montgomery domain. Pre-calculated 77289848Sjkim * multiplies are stored in affine form. 78289848Sjkim */ 79289848Sjkim PRECOMP256_ROW *precomp; 80289848Sjkim void *precomp_storage; 81289848Sjkim int references; 82289848Sjkim} EC_PRE_COMP; 83289848Sjkim 84289848Sjkim/* Functions implemented in assembly */ 85306195Sjkim/* 86306195Sjkim * Most of below mentioned functions *preserve* the property of inputs 87306195Sjkim * being fully reduced, i.e. being in [0, modulus) range. Simply put if 88306195Sjkim * inputs are fully reduced, then output is too. Note that reverse is 89306195Sjkim * not true, in sense that given partially reduced inputs output can be 90306195Sjkim * either, not unlikely reduced. And "most" in first sentence refers to 91306195Sjkim * the fact that given the calculations flow one can tolerate that 92306195Sjkim * addition, 1st function below, produces partially reduced result *if* 93306195Sjkim * multiplications by 2 and 3, which customarily use addition, fully 94306195Sjkim * reduce it. This effectively gives two options: a) addition produces 95306195Sjkim * fully reduced result [as long as inputs are, just like remaining 96306195Sjkim * functions]; b) addition is allowed to produce partially reduced 97306195Sjkim * result, but multiplications by 2 and 3 perform additional reduction 98306195Sjkim * step. Choice between the two can be platform-specific, but it was a) 99306195Sjkim * in all cases so far... 100306195Sjkim */ 101306195Sjkim/* Modular add: res = a+b mod P */ 102306195Sjkimvoid ecp_nistz256_add(BN_ULONG res[P256_LIMBS], 103306195Sjkim const BN_ULONG a[P256_LIMBS], 104306195Sjkim const BN_ULONG b[P256_LIMBS]); 105289848Sjkim/* Modular mul by 2: res = 2*a mod P */ 106289848Sjkimvoid ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS], 107289848Sjkim const BN_ULONG a[P256_LIMBS]); 108306195Sjkim/* Modular mul by 3: res = 3*a mod P */ 109306195Sjkimvoid ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS], 110306195Sjkim const BN_ULONG a[P256_LIMBS]); 111306195Sjkim 112289848Sjkim/* Modular div by 2: res = a/2 mod P */ 113289848Sjkimvoid ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS], 114289848Sjkim const BN_ULONG a[P256_LIMBS]); 115289848Sjkim/* Modular sub: res = a-b mod P */ 116289848Sjkimvoid ecp_nistz256_sub(BN_ULONG res[P256_LIMBS], 117289848Sjkim const BN_ULONG a[P256_LIMBS], 118289848Sjkim const BN_ULONG b[P256_LIMBS]); 119289848Sjkim/* Modular neg: res = -a mod P */ 120289848Sjkimvoid ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); 121289848Sjkim/* Montgomery mul: res = a*b*2^-256 mod P */ 122289848Sjkimvoid ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], 123289848Sjkim const BN_ULONG a[P256_LIMBS], 124289848Sjkim const BN_ULONG b[P256_LIMBS]); 125289848Sjkim/* Montgomery sqr: res = a*a*2^-256 mod P */ 126289848Sjkimvoid ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], 127289848Sjkim const BN_ULONG a[P256_LIMBS]); 128289848Sjkim/* Convert a number from Montgomery domain, by multiplying with 1 */ 129289848Sjkimvoid ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], 130289848Sjkim const BN_ULONG in[P256_LIMBS]); 131289848Sjkim/* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/ 132289848Sjkimvoid ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], 133289848Sjkim const BN_ULONG in[P256_LIMBS]); 134289848Sjkim/* Functions that perform constant time access to the precomputed tables */ 135289848Sjkimvoid ecp_nistz256_select_w5(P256_POINT * val, 136289848Sjkim const P256_POINT * in_t, int index); 137289848Sjkimvoid ecp_nistz256_select_w7(P256_POINT_AFFINE * val, 138289848Sjkim const P256_POINT_AFFINE * in_t, int index); 139289848Sjkim 140289848Sjkim/* One converted into the Montgomery domain */ 141289848Sjkimstatic const BN_ULONG ONE[P256_LIMBS] = { 142289848Sjkim TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), 143289848Sjkim TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe) 144289848Sjkim}; 145289848Sjkim 146289848Sjkimstatic void *ecp_nistz256_pre_comp_dup(void *); 147289848Sjkimstatic void ecp_nistz256_pre_comp_free(void *); 148289848Sjkimstatic void ecp_nistz256_pre_comp_clear_free(void *); 149289848Sjkimstatic EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group); 150289848Sjkim 151289848Sjkim/* Precomputed tables for the default generator */ 152289848Sjkim#include "ecp_nistz256_table.c" 153289848Sjkim 154289848Sjkim/* Recode window to a signed digit, see ecp_nistputil.c for details */ 155289848Sjkimstatic unsigned int _booth_recode_w5(unsigned int in) 156289848Sjkim{ 157289848Sjkim unsigned int s, d; 158289848Sjkim 159289848Sjkim s = ~((in >> 5) - 1); 160289848Sjkim d = (1 << 6) - in - 1; 161289848Sjkim d = (d & s) | (in & ~s); 162289848Sjkim d = (d >> 1) + (d & 1); 163289848Sjkim 164289848Sjkim return (d << 1) + (s & 1); 165289848Sjkim} 166289848Sjkim 167289848Sjkimstatic unsigned int _booth_recode_w7(unsigned int in) 168289848Sjkim{ 169289848Sjkim unsigned int s, d; 170289848Sjkim 171289848Sjkim s = ~((in >> 7) - 1); 172289848Sjkim d = (1 << 8) - in - 1; 173289848Sjkim d = (d & s) | (in & ~s); 174289848Sjkim d = (d >> 1) + (d & 1); 175289848Sjkim 176289848Sjkim return (d << 1) + (s & 1); 177289848Sjkim} 178289848Sjkim 179289848Sjkimstatic void copy_conditional(BN_ULONG dst[P256_LIMBS], 180289848Sjkim const BN_ULONG src[P256_LIMBS], BN_ULONG move) 181289848Sjkim{ 182289848Sjkim BN_ULONG mask1 = -move; 183289848Sjkim BN_ULONG mask2 = ~mask1; 184289848Sjkim 185289848Sjkim dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); 186289848Sjkim dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); 187289848Sjkim dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); 188289848Sjkim dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); 189289848Sjkim if (P256_LIMBS == 8) { 190289848Sjkim dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); 191289848Sjkim dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); 192289848Sjkim dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); 193289848Sjkim dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); 194289848Sjkim } 195289848Sjkim} 196289848Sjkim 197289848Sjkimstatic BN_ULONG is_zero(BN_ULONG in) 198289848Sjkim{ 199289848Sjkim in |= (0 - in); 200289848Sjkim in = ~in; 201289848Sjkim in &= BN_MASK2; 202289848Sjkim in >>= BN_BITS2 - 1; 203289848Sjkim return in; 204289848Sjkim} 205289848Sjkim 206289848Sjkimstatic BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS], 207289848Sjkim const BN_ULONG b[P256_LIMBS]) 208289848Sjkim{ 209289848Sjkim BN_ULONG res; 210289848Sjkim 211289848Sjkim res = a[0] ^ b[0]; 212289848Sjkim res |= a[1] ^ b[1]; 213289848Sjkim res |= a[2] ^ b[2]; 214289848Sjkim res |= a[3] ^ b[3]; 215289848Sjkim if (P256_LIMBS == 8) { 216289848Sjkim res |= a[4] ^ b[4]; 217289848Sjkim res |= a[5] ^ b[5]; 218289848Sjkim res |= a[6] ^ b[6]; 219289848Sjkim res |= a[7] ^ b[7]; 220289848Sjkim } 221289848Sjkim 222289848Sjkim return is_zero(res); 223289848Sjkim} 224289848Sjkim 225306195Sjkimstatic BN_ULONG is_one(const BIGNUM *z) 226289848Sjkim{ 227306195Sjkim BN_ULONG res = 0; 228306195Sjkim BN_ULONG *a = z->d; 229289848Sjkim 230306195Sjkim if (z->top == (P256_LIMBS - P256_LIMBS / 8)) { 231306195Sjkim res = a[0] ^ ONE[0]; 232306195Sjkim res |= a[1] ^ ONE[1]; 233306195Sjkim res |= a[2] ^ ONE[2]; 234306195Sjkim res |= a[3] ^ ONE[3]; 235306195Sjkim if (P256_LIMBS == 8) { 236306195Sjkim res |= a[4] ^ ONE[4]; 237306195Sjkim res |= a[5] ^ ONE[5]; 238306195Sjkim res |= a[6] ^ ONE[6]; 239306195Sjkim /* 240306195Sjkim * no check for a[7] (being zero) on 32-bit platforms, 241306195Sjkim * because value of "one" takes only 7 limbs. 242306195Sjkim */ 243306195Sjkim } 244306195Sjkim res = is_zero(res); 245289848Sjkim } 246289848Sjkim 247306195Sjkim return res; 248289848Sjkim} 249289848Sjkim 250289848Sjkimstatic int ecp_nistz256_set_words(BIGNUM *a, BN_ULONG words[P256_LIMBS]) 251289848Sjkim { 252289848Sjkim if (bn_wexpand(a, P256_LIMBS) == NULL) { 253289848Sjkim ECerr(EC_F_ECP_NISTZ256_SET_WORDS, ERR_R_MALLOC_FAILURE); 254289848Sjkim return 0; 255289848Sjkim } 256289848Sjkim memcpy(a->d, words, sizeof(BN_ULONG) * P256_LIMBS); 257289848Sjkim a->top = P256_LIMBS; 258289848Sjkim bn_correct_top(a); 259289848Sjkim return 1; 260289848Sjkim} 261289848Sjkim 262289848Sjkim#ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION 263289848Sjkimvoid ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); 264289848Sjkimvoid ecp_nistz256_point_add(P256_POINT *r, 265289848Sjkim const P256_POINT *a, const P256_POINT *b); 266289848Sjkimvoid ecp_nistz256_point_add_affine(P256_POINT *r, 267289848Sjkim const P256_POINT *a, 268289848Sjkim const P256_POINT_AFFINE *b); 269289848Sjkim#else 270289848Sjkim/* Point double: r = 2*a */ 271289848Sjkimstatic void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) 272289848Sjkim{ 273289848Sjkim BN_ULONG S[P256_LIMBS]; 274289848Sjkim BN_ULONG M[P256_LIMBS]; 275289848Sjkim BN_ULONG Zsqr[P256_LIMBS]; 276289848Sjkim BN_ULONG tmp0[P256_LIMBS]; 277289848Sjkim 278289848Sjkim const BN_ULONG *in_x = a->X; 279289848Sjkim const BN_ULONG *in_y = a->Y; 280289848Sjkim const BN_ULONG *in_z = a->Z; 281289848Sjkim 282289848Sjkim BN_ULONG *res_x = r->X; 283289848Sjkim BN_ULONG *res_y = r->Y; 284289848Sjkim BN_ULONG *res_z = r->Z; 285289848Sjkim 286289848Sjkim ecp_nistz256_mul_by_2(S, in_y); 287289848Sjkim 288289848Sjkim ecp_nistz256_sqr_mont(Zsqr, in_z); 289289848Sjkim 290289848Sjkim ecp_nistz256_sqr_mont(S, S); 291289848Sjkim 292289848Sjkim ecp_nistz256_mul_mont(res_z, in_z, in_y); 293289848Sjkim ecp_nistz256_mul_by_2(res_z, res_z); 294289848Sjkim 295289848Sjkim ecp_nistz256_add(M, in_x, Zsqr); 296289848Sjkim ecp_nistz256_sub(Zsqr, in_x, Zsqr); 297289848Sjkim 298289848Sjkim ecp_nistz256_sqr_mont(res_y, S); 299289848Sjkim ecp_nistz256_div_by_2(res_y, res_y); 300289848Sjkim 301289848Sjkim ecp_nistz256_mul_mont(M, M, Zsqr); 302289848Sjkim ecp_nistz256_mul_by_3(M, M); 303289848Sjkim 304289848Sjkim ecp_nistz256_mul_mont(S, S, in_x); 305289848Sjkim ecp_nistz256_mul_by_2(tmp0, S); 306289848Sjkim 307289848Sjkim ecp_nistz256_sqr_mont(res_x, M); 308289848Sjkim 309289848Sjkim ecp_nistz256_sub(res_x, res_x, tmp0); 310289848Sjkim ecp_nistz256_sub(S, S, res_x); 311289848Sjkim 312289848Sjkim ecp_nistz256_mul_mont(S, S, M); 313289848Sjkim ecp_nistz256_sub(res_y, S, res_y); 314289848Sjkim} 315289848Sjkim 316289848Sjkim/* Point addition: r = a+b */ 317289848Sjkimstatic void ecp_nistz256_point_add(P256_POINT *r, 318289848Sjkim const P256_POINT *a, const P256_POINT *b) 319289848Sjkim{ 320289848Sjkim BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 321289848Sjkim BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS]; 322289848Sjkim BN_ULONG Z1sqr[P256_LIMBS]; 323289848Sjkim BN_ULONG Z2sqr[P256_LIMBS]; 324289848Sjkim BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 325289848Sjkim BN_ULONG Hsqr[P256_LIMBS]; 326289848Sjkim BN_ULONG Rsqr[P256_LIMBS]; 327289848Sjkim BN_ULONG Hcub[P256_LIMBS]; 328289848Sjkim 329289848Sjkim BN_ULONG res_x[P256_LIMBS]; 330289848Sjkim BN_ULONG res_y[P256_LIMBS]; 331289848Sjkim BN_ULONG res_z[P256_LIMBS]; 332289848Sjkim 333289848Sjkim BN_ULONG in1infty, in2infty; 334289848Sjkim 335289848Sjkim const BN_ULONG *in1_x = a->X; 336289848Sjkim const BN_ULONG *in1_y = a->Y; 337289848Sjkim const BN_ULONG *in1_z = a->Z; 338289848Sjkim 339289848Sjkim const BN_ULONG *in2_x = b->X; 340289848Sjkim const BN_ULONG *in2_y = b->Y; 341289848Sjkim const BN_ULONG *in2_z = b->Z; 342289848Sjkim 343306195Sjkim /* 344306195Sjkim * Infinity in encoded as (,,0) 345306195Sjkim */ 346306195Sjkim in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 347289848Sjkim if (P256_LIMBS == 8) 348306195Sjkim in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 349289848Sjkim 350306195Sjkim in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]); 351289848Sjkim if (P256_LIMBS == 8) 352306195Sjkim in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]); 353289848Sjkim 354289848Sjkim in1infty = is_zero(in1infty); 355289848Sjkim in2infty = is_zero(in2infty); 356289848Sjkim 357289848Sjkim ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ 358289848Sjkim ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 359289848Sjkim 360289848Sjkim ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ 361289848Sjkim ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 362289848Sjkim 363289848Sjkim ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ 364289848Sjkim ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 365289848Sjkim ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */ 366289848Sjkim 367289848Sjkim ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ 368289848Sjkim ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 369289848Sjkim ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */ 370289848Sjkim 371289848Sjkim /* 372289848Sjkim * This should not happen during sign/ecdh, so no constant time violation 373289848Sjkim */ 374289848Sjkim if (is_equal(U1, U2) && !in1infty && !in2infty) { 375289848Sjkim if (is_equal(S1, S2)) { 376289848Sjkim ecp_nistz256_point_double(r, a); 377289848Sjkim return; 378289848Sjkim } else { 379289848Sjkim memset(r, 0, sizeof(*r)); 380289848Sjkim return; 381289848Sjkim } 382289848Sjkim } 383289848Sjkim 384289848Sjkim ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 385289848Sjkim ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 386289848Sjkim ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 387289848Sjkim ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ 388289848Sjkim ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 389289848Sjkim 390289848Sjkim ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ 391289848Sjkim ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 392289848Sjkim 393289848Sjkim ecp_nistz256_sub(res_x, Rsqr, Hsqr); 394289848Sjkim ecp_nistz256_sub(res_x, res_x, Hcub); 395289848Sjkim 396289848Sjkim ecp_nistz256_sub(res_y, U2, res_x); 397289848Sjkim 398289848Sjkim ecp_nistz256_mul_mont(S2, S1, Hcub); 399289848Sjkim ecp_nistz256_mul_mont(res_y, R, res_y); 400289848Sjkim ecp_nistz256_sub(res_y, res_y, S2); 401289848Sjkim 402289848Sjkim copy_conditional(res_x, in2_x, in1infty); 403289848Sjkim copy_conditional(res_y, in2_y, in1infty); 404289848Sjkim copy_conditional(res_z, in2_z, in1infty); 405289848Sjkim 406289848Sjkim copy_conditional(res_x, in1_x, in2infty); 407289848Sjkim copy_conditional(res_y, in1_y, in2infty); 408289848Sjkim copy_conditional(res_z, in1_z, in2infty); 409289848Sjkim 410289848Sjkim memcpy(r->X, res_x, sizeof(res_x)); 411289848Sjkim memcpy(r->Y, res_y, sizeof(res_y)); 412289848Sjkim memcpy(r->Z, res_z, sizeof(res_z)); 413289848Sjkim} 414289848Sjkim 415289848Sjkim/* Point addition when b is known to be affine: r = a+b */ 416289848Sjkimstatic void ecp_nistz256_point_add_affine(P256_POINT *r, 417289848Sjkim const P256_POINT *a, 418289848Sjkim const P256_POINT_AFFINE *b) 419289848Sjkim{ 420289848Sjkim BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 421289848Sjkim BN_ULONG Z1sqr[P256_LIMBS]; 422289848Sjkim BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 423289848Sjkim BN_ULONG Hsqr[P256_LIMBS]; 424289848Sjkim BN_ULONG Rsqr[P256_LIMBS]; 425289848Sjkim BN_ULONG Hcub[P256_LIMBS]; 426289848Sjkim 427289848Sjkim BN_ULONG res_x[P256_LIMBS]; 428289848Sjkim BN_ULONG res_y[P256_LIMBS]; 429289848Sjkim BN_ULONG res_z[P256_LIMBS]; 430289848Sjkim 431289848Sjkim BN_ULONG in1infty, in2infty; 432289848Sjkim 433289848Sjkim const BN_ULONG *in1_x = a->X; 434289848Sjkim const BN_ULONG *in1_y = a->Y; 435289848Sjkim const BN_ULONG *in1_z = a->Z; 436289848Sjkim 437289848Sjkim const BN_ULONG *in2_x = b->X; 438289848Sjkim const BN_ULONG *in2_y = b->Y; 439289848Sjkim 440289848Sjkim /* 441306195Sjkim * Infinity in encoded as (,,0) 442289848Sjkim */ 443306195Sjkim in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 444289848Sjkim if (P256_LIMBS == 8) 445306195Sjkim in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 446289848Sjkim 447306195Sjkim /* 448306195Sjkim * In affine representation we encode infinity as (0,0), which is 449306195Sjkim * not on the curve, so it is OK 450306195Sjkim */ 451289848Sjkim in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] | 452289848Sjkim in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]); 453289848Sjkim if (P256_LIMBS == 8) 454289848Sjkim in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] | 455289848Sjkim in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]); 456289848Sjkim 457289848Sjkim in1infty = is_zero(in1infty); 458289848Sjkim in2infty = is_zero(in2infty); 459289848Sjkim 460289848Sjkim ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 461289848Sjkim 462289848Sjkim ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 463289848Sjkim ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */ 464289848Sjkim 465289848Sjkim ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 466289848Sjkim 467289848Sjkim ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 468289848Sjkim 469289848Sjkim ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 470289848Sjkim ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */ 471289848Sjkim 472289848Sjkim ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 473289848Sjkim ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 474289848Sjkim ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 475289848Sjkim 476289848Sjkim ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */ 477289848Sjkim ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 478289848Sjkim 479289848Sjkim ecp_nistz256_sub(res_x, Rsqr, Hsqr); 480289848Sjkim ecp_nistz256_sub(res_x, res_x, Hcub); 481289848Sjkim ecp_nistz256_sub(H, U2, res_x); 482289848Sjkim 483289848Sjkim ecp_nistz256_mul_mont(S2, in1_y, Hcub); 484289848Sjkim ecp_nistz256_mul_mont(H, H, R); 485289848Sjkim ecp_nistz256_sub(res_y, H, S2); 486289848Sjkim 487289848Sjkim copy_conditional(res_x, in2_x, in1infty); 488289848Sjkim copy_conditional(res_x, in1_x, in2infty); 489289848Sjkim 490289848Sjkim copy_conditional(res_y, in2_y, in1infty); 491289848Sjkim copy_conditional(res_y, in1_y, in2infty); 492289848Sjkim 493289848Sjkim copy_conditional(res_z, ONE, in1infty); 494289848Sjkim copy_conditional(res_z, in1_z, in2infty); 495289848Sjkim 496289848Sjkim memcpy(r->X, res_x, sizeof(res_x)); 497289848Sjkim memcpy(r->Y, res_y, sizeof(res_y)); 498289848Sjkim memcpy(r->Z, res_z, sizeof(res_z)); 499289848Sjkim} 500289848Sjkim#endif 501289848Sjkim 502289848Sjkim/* r = in^-1 mod p */ 503289848Sjkimstatic void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS], 504289848Sjkim const BN_ULONG in[P256_LIMBS]) 505289848Sjkim{ 506289848Sjkim /* 507289848Sjkim * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff 508289848Sjkim * ffffffff ffffffff We use FLT and used poly-2 as exponent 509289848Sjkim */ 510289848Sjkim BN_ULONG p2[P256_LIMBS]; 511289848Sjkim BN_ULONG p4[P256_LIMBS]; 512289848Sjkim BN_ULONG p8[P256_LIMBS]; 513289848Sjkim BN_ULONG p16[P256_LIMBS]; 514289848Sjkim BN_ULONG p32[P256_LIMBS]; 515289848Sjkim BN_ULONG res[P256_LIMBS]; 516289848Sjkim int i; 517289848Sjkim 518289848Sjkim ecp_nistz256_sqr_mont(res, in); 519289848Sjkim ecp_nistz256_mul_mont(p2, res, in); /* 3*p */ 520289848Sjkim 521289848Sjkim ecp_nistz256_sqr_mont(res, p2); 522289848Sjkim ecp_nistz256_sqr_mont(res, res); 523289848Sjkim ecp_nistz256_mul_mont(p4, res, p2); /* f*p */ 524289848Sjkim 525289848Sjkim ecp_nistz256_sqr_mont(res, p4); 526289848Sjkim ecp_nistz256_sqr_mont(res, res); 527289848Sjkim ecp_nistz256_sqr_mont(res, res); 528289848Sjkim ecp_nistz256_sqr_mont(res, res); 529289848Sjkim ecp_nistz256_mul_mont(p8, res, p4); /* ff*p */ 530289848Sjkim 531289848Sjkim ecp_nistz256_sqr_mont(res, p8); 532289848Sjkim for (i = 0; i < 7; i++) 533289848Sjkim ecp_nistz256_sqr_mont(res, res); 534289848Sjkim ecp_nistz256_mul_mont(p16, res, p8); /* ffff*p */ 535289848Sjkim 536289848Sjkim ecp_nistz256_sqr_mont(res, p16); 537289848Sjkim for (i = 0; i < 15; i++) 538289848Sjkim ecp_nistz256_sqr_mont(res, res); 539289848Sjkim ecp_nistz256_mul_mont(p32, res, p16); /* ffffffff*p */ 540289848Sjkim 541289848Sjkim ecp_nistz256_sqr_mont(res, p32); 542289848Sjkim for (i = 0; i < 31; i++) 543289848Sjkim ecp_nistz256_sqr_mont(res, res); 544289848Sjkim ecp_nistz256_mul_mont(res, res, in); 545289848Sjkim 546289848Sjkim for (i = 0; i < 32 * 4; i++) 547289848Sjkim ecp_nistz256_sqr_mont(res, res); 548289848Sjkim ecp_nistz256_mul_mont(res, res, p32); 549289848Sjkim 550289848Sjkim for (i = 0; i < 32; i++) 551289848Sjkim ecp_nistz256_sqr_mont(res, res); 552289848Sjkim ecp_nistz256_mul_mont(res, res, p32); 553289848Sjkim 554289848Sjkim for (i = 0; i < 16; i++) 555289848Sjkim ecp_nistz256_sqr_mont(res, res); 556289848Sjkim ecp_nistz256_mul_mont(res, res, p16); 557289848Sjkim 558289848Sjkim for (i = 0; i < 8; i++) 559289848Sjkim ecp_nistz256_sqr_mont(res, res); 560289848Sjkim ecp_nistz256_mul_mont(res, res, p8); 561289848Sjkim 562289848Sjkim ecp_nistz256_sqr_mont(res, res); 563289848Sjkim ecp_nistz256_sqr_mont(res, res); 564289848Sjkim ecp_nistz256_sqr_mont(res, res); 565289848Sjkim ecp_nistz256_sqr_mont(res, res); 566289848Sjkim ecp_nistz256_mul_mont(res, res, p4); 567289848Sjkim 568289848Sjkim ecp_nistz256_sqr_mont(res, res); 569289848Sjkim ecp_nistz256_sqr_mont(res, res); 570289848Sjkim ecp_nistz256_mul_mont(res, res, p2); 571289848Sjkim 572289848Sjkim ecp_nistz256_sqr_mont(res, res); 573289848Sjkim ecp_nistz256_sqr_mont(res, res); 574289848Sjkim ecp_nistz256_mul_mont(res, res, in); 575289848Sjkim 576289848Sjkim memcpy(r, res, sizeof(res)); 577289848Sjkim} 578289848Sjkim 579289848Sjkim/* 580289848Sjkim * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and 581289848Sjkim * returns one if it fits. Otherwise it returns zero. 582289848Sjkim */ 583289848Sjkimstatic int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS], 584289848Sjkim const BIGNUM *in) 585289848Sjkim{ 586289848Sjkim if (in->top > P256_LIMBS) 587289848Sjkim return 0; 588289848Sjkim 589289848Sjkim memset(out, 0, sizeof(BN_ULONG) * P256_LIMBS); 590289848Sjkim memcpy(out, in->d, sizeof(BN_ULONG) * in->top); 591289848Sjkim return 1; 592289848Sjkim} 593289848Sjkim 594289848Sjkim/* r = sum(scalar[i]*point[i]) */ 595289848Sjkimstatic int ecp_nistz256_windowed_mul(const EC_GROUP *group, 596289848Sjkim P256_POINT *r, 597289848Sjkim const BIGNUM **scalar, 598289848Sjkim const EC_POINT **point, 599289848Sjkim int num, BN_CTX *ctx) 600289848Sjkim{ 601289848Sjkim 602289848Sjkim int i, j, ret = 0; 603289848Sjkim unsigned int index; 604289848Sjkim unsigned char (*p_str)[33] = NULL; 605289848Sjkim const unsigned int window_size = 5; 606289848Sjkim const unsigned int mask = (1 << (window_size + 1)) - 1; 607289848Sjkim unsigned int wvalue; 608289848Sjkim BN_ULONG tmp[P256_LIMBS]; 609289848Sjkim ALIGN32 P256_POINT h; 610289848Sjkim const BIGNUM **scalars = NULL; 611289848Sjkim P256_POINT (*table)[16] = NULL; 612289848Sjkim void *table_storage = NULL; 613289848Sjkim 614289848Sjkim if ((table_storage = 615289848Sjkim OPENSSL_malloc(num * 16 * sizeof(P256_POINT) + 64)) == NULL 616289848Sjkim || (p_str = 617289848Sjkim OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL 618289848Sjkim || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) { 619289848Sjkim ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE); 620289848Sjkim goto err; 621289848Sjkim } else { 622289848Sjkim table = (void *)ALIGNPTR(table_storage, 64); 623289848Sjkim } 624289848Sjkim 625289848Sjkim for (i = 0; i < num; i++) { 626289848Sjkim P256_POINT *row = table[i]; 627289848Sjkim 628289848Sjkim /* This is an unusual input, we don't guarantee constant-timeness. */ 629289848Sjkim if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) { 630289848Sjkim BIGNUM *mod; 631289848Sjkim 632289848Sjkim if ((mod = BN_CTX_get(ctx)) == NULL) 633289848Sjkim goto err; 634289848Sjkim if (!BN_nnmod(mod, scalar[i], &group->order, ctx)) { 635289848Sjkim ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB); 636289848Sjkim goto err; 637289848Sjkim } 638289848Sjkim scalars[i] = mod; 639289848Sjkim } else 640289848Sjkim scalars[i] = scalar[i]; 641289848Sjkim 642289848Sjkim for (j = 0; j < scalars[i]->top * BN_BYTES; j += BN_BYTES) { 643289848Sjkim BN_ULONG d = scalars[i]->d[j / BN_BYTES]; 644289848Sjkim 645289848Sjkim p_str[i][j + 0] = d & 0xff; 646289848Sjkim p_str[i][j + 1] = (d >> 8) & 0xff; 647289848Sjkim p_str[i][j + 2] = (d >> 16) & 0xff; 648289848Sjkim p_str[i][j + 3] = (d >>= 24) & 0xff; 649289848Sjkim if (BN_BYTES == 8) { 650289848Sjkim d >>= 8; 651289848Sjkim p_str[i][j + 4] = d & 0xff; 652289848Sjkim p_str[i][j + 5] = (d >> 8) & 0xff; 653289848Sjkim p_str[i][j + 6] = (d >> 16) & 0xff; 654289848Sjkim p_str[i][j + 7] = (d >> 24) & 0xff; 655289848Sjkim } 656289848Sjkim } 657289848Sjkim for (; j < 33; j++) 658289848Sjkim p_str[i][j] = 0; 659289848Sjkim 660289848Sjkim /* table[0] is implicitly (0,0,0) (the point at infinity), 661289848Sjkim * therefore it is not stored. All other values are actually 662289848Sjkim * stored with an offset of -1 in table. 663289848Sjkim */ 664289848Sjkim 665289848Sjkim if (!ecp_nistz256_bignum_to_field_elem(row[1 - 1].X, &point[i]->X) 666289848Sjkim || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Y, &point[i]->Y) 667289848Sjkim || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Z, &point[i]->Z)) { 668289848Sjkim ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, EC_R_COORDINATES_OUT_OF_RANGE); 669289848Sjkim goto err; 670289848Sjkim } 671289848Sjkim 672289848Sjkim ecp_nistz256_point_double(&row[ 2 - 1], &row[ 1 - 1]); 673289848Sjkim ecp_nistz256_point_add (&row[ 3 - 1], &row[ 2 - 1], &row[1 - 1]); 674289848Sjkim ecp_nistz256_point_double(&row[ 4 - 1], &row[ 2 - 1]); 675289848Sjkim ecp_nistz256_point_double(&row[ 6 - 1], &row[ 3 - 1]); 676289848Sjkim ecp_nistz256_point_double(&row[ 8 - 1], &row[ 4 - 1]); 677289848Sjkim ecp_nistz256_point_double(&row[12 - 1], &row[ 6 - 1]); 678289848Sjkim ecp_nistz256_point_add (&row[ 5 - 1], &row[ 4 - 1], &row[1 - 1]); 679289848Sjkim ecp_nistz256_point_add (&row[ 7 - 1], &row[ 6 - 1], &row[1 - 1]); 680289848Sjkim ecp_nistz256_point_add (&row[ 9 - 1], &row[ 8 - 1], &row[1 - 1]); 681289848Sjkim ecp_nistz256_point_add (&row[13 - 1], &row[12 - 1], &row[1 - 1]); 682289848Sjkim ecp_nistz256_point_double(&row[14 - 1], &row[ 7 - 1]); 683289848Sjkim ecp_nistz256_point_double(&row[10 - 1], &row[ 5 - 1]); 684289848Sjkim ecp_nistz256_point_add (&row[15 - 1], &row[14 - 1], &row[1 - 1]); 685289848Sjkim ecp_nistz256_point_add (&row[11 - 1], &row[10 - 1], &row[1 - 1]); 686289848Sjkim ecp_nistz256_point_add (&row[16 - 1], &row[15 - 1], &row[1 - 1]); 687289848Sjkim } 688289848Sjkim 689289848Sjkim index = 255; 690289848Sjkim 691289848Sjkim wvalue = p_str[0][(index - 1) / 8]; 692289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 693289848Sjkim 694289848Sjkim ecp_nistz256_select_w5(r, table[0], _booth_recode_w5(wvalue) >> 1); 695289848Sjkim 696289848Sjkim while (index >= 5) { 697289848Sjkim for (i = (index == 255 ? 1 : 0); i < num; i++) { 698289848Sjkim unsigned int off = (index - 1) / 8; 699289848Sjkim 700289848Sjkim wvalue = p_str[i][off] | p_str[i][off + 1] << 8; 701289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 702289848Sjkim 703289848Sjkim wvalue = _booth_recode_w5(wvalue); 704289848Sjkim 705289848Sjkim ecp_nistz256_select_w5(&h, table[i], wvalue >> 1); 706289848Sjkim 707289848Sjkim ecp_nistz256_neg(tmp, h.Y); 708289848Sjkim copy_conditional(h.Y, tmp, (wvalue & 1)); 709289848Sjkim 710289848Sjkim ecp_nistz256_point_add(r, r, &h); 711289848Sjkim } 712289848Sjkim 713289848Sjkim index -= window_size; 714289848Sjkim 715289848Sjkim ecp_nistz256_point_double(r, r); 716289848Sjkim ecp_nistz256_point_double(r, r); 717289848Sjkim ecp_nistz256_point_double(r, r); 718289848Sjkim ecp_nistz256_point_double(r, r); 719289848Sjkim ecp_nistz256_point_double(r, r); 720289848Sjkim } 721289848Sjkim 722289848Sjkim /* Final window */ 723289848Sjkim for (i = 0; i < num; i++) { 724289848Sjkim wvalue = p_str[i][0]; 725289848Sjkim wvalue = (wvalue << 1) & mask; 726289848Sjkim 727289848Sjkim wvalue = _booth_recode_w5(wvalue); 728289848Sjkim 729289848Sjkim ecp_nistz256_select_w5(&h, table[i], wvalue >> 1); 730289848Sjkim 731289848Sjkim ecp_nistz256_neg(tmp, h.Y); 732289848Sjkim copy_conditional(h.Y, tmp, wvalue & 1); 733289848Sjkim 734289848Sjkim ecp_nistz256_point_add(r, r, &h); 735289848Sjkim } 736289848Sjkim 737289848Sjkim ret = 1; 738289848Sjkim err: 739289848Sjkim if (table_storage) 740289848Sjkim OPENSSL_free(table_storage); 741289848Sjkim if (p_str) 742289848Sjkim OPENSSL_free(p_str); 743289848Sjkim if (scalars) 744289848Sjkim OPENSSL_free(scalars); 745289848Sjkim return ret; 746289848Sjkim} 747289848Sjkim 748289848Sjkim/* Coordinates of G, for which we have precomputed tables */ 749289848Sjkimconst static BN_ULONG def_xG[P256_LIMBS] = { 750289848Sjkim TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), 751289848Sjkim TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6) 752289848Sjkim}; 753289848Sjkim 754289848Sjkimconst static BN_ULONG def_yG[P256_LIMBS] = { 755289848Sjkim TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), 756289848Sjkim TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85) 757289848Sjkim}; 758289848Sjkim 759289848Sjkim/* 760289848Sjkim * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256 761289848Sjkim * generator. 762289848Sjkim */ 763289848Sjkimstatic int ecp_nistz256_is_affine_G(const EC_POINT *generator) 764289848Sjkim{ 765289848Sjkim return (generator->X.top == P256_LIMBS) && 766289848Sjkim (generator->Y.top == P256_LIMBS) && 767289848Sjkim is_equal(generator->X.d, def_xG) && 768306195Sjkim is_equal(generator->Y.d, def_yG) && is_one(&generator->Z); 769289848Sjkim} 770289848Sjkim 771289848Sjkimstatic int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx) 772289848Sjkim{ 773289848Sjkim /* 774289848Sjkim * We precompute a table for a Booth encoded exponent (wNAF) based 775289848Sjkim * computation. Each table holds 64 values for safe access, with an 776289848Sjkim * implicit value of infinity at index zero. We use window of size 7, and 777289848Sjkim * therefore require ceil(256/7) = 37 tables. 778289848Sjkim */ 779289848Sjkim BIGNUM *order; 780289848Sjkim EC_POINT *P = NULL, *T = NULL; 781289848Sjkim const EC_POINT *generator; 782289848Sjkim EC_PRE_COMP *pre_comp; 783289848Sjkim BN_CTX *new_ctx = NULL; 784289848Sjkim int i, j, k, ret = 0; 785289848Sjkim size_t w; 786289848Sjkim 787289848Sjkim PRECOMP256_ROW *preComputedTable = NULL; 788289848Sjkim unsigned char *precomp_storage = NULL; 789289848Sjkim 790289848Sjkim /* if there is an old EC_PRE_COMP object, throw it away */ 791289848Sjkim EC_EX_DATA_free_data(&group->extra_data, ecp_nistz256_pre_comp_dup, 792289848Sjkim ecp_nistz256_pre_comp_free, 793289848Sjkim ecp_nistz256_pre_comp_clear_free); 794289848Sjkim 795289848Sjkim generator = EC_GROUP_get0_generator(group); 796289848Sjkim if (generator == NULL) { 797289848Sjkim ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR); 798289848Sjkim return 0; 799289848Sjkim } 800289848Sjkim 801289848Sjkim if (ecp_nistz256_is_affine_G(generator)) { 802289848Sjkim /* 803289848Sjkim * No need to calculate tables for the standard generator because we 804289848Sjkim * have them statically. 805289848Sjkim */ 806289848Sjkim return 1; 807289848Sjkim } 808289848Sjkim 809289848Sjkim if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL) 810289848Sjkim return 0; 811289848Sjkim 812289848Sjkim if (ctx == NULL) { 813289848Sjkim ctx = new_ctx = BN_CTX_new(); 814289848Sjkim if (ctx == NULL) 815289848Sjkim goto err; 816289848Sjkim } 817289848Sjkim 818289848Sjkim BN_CTX_start(ctx); 819289848Sjkim order = BN_CTX_get(ctx); 820289848Sjkim 821289848Sjkim if (order == NULL) 822289848Sjkim goto err; 823289848Sjkim 824289848Sjkim if (!EC_GROUP_get_order(group, order, ctx)) 825289848Sjkim goto err; 826289848Sjkim 827289848Sjkim if (BN_is_zero(order)) { 828289848Sjkim ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER); 829289848Sjkim goto err; 830289848Sjkim } 831289848Sjkim 832289848Sjkim w = 7; 833289848Sjkim 834289848Sjkim if ((precomp_storage = 835289848Sjkim OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) { 836289848Sjkim ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE); 837289848Sjkim goto err; 838289848Sjkim } else { 839289848Sjkim preComputedTable = (void *)ALIGNPTR(precomp_storage, 64); 840289848Sjkim } 841289848Sjkim 842289848Sjkim P = EC_POINT_new(group); 843289848Sjkim T = EC_POINT_new(group); 844289848Sjkim if (P == NULL || T == NULL) 845289848Sjkim goto err; 846289848Sjkim 847289848Sjkim /* 848289848Sjkim * The zero entry is implicitly infinity, and we skip it, storing other 849289848Sjkim * values with -1 offset. 850289848Sjkim */ 851289848Sjkim if (!EC_POINT_copy(T, generator)) 852289848Sjkim goto err; 853289848Sjkim 854289848Sjkim for (k = 0; k < 64; k++) { 855289848Sjkim if (!EC_POINT_copy(P, T)) 856289848Sjkim goto err; 857289848Sjkim for (j = 0; j < 37; j++) { 858289848Sjkim /* 859289848Sjkim * It would be faster to use EC_POINTs_make_affine and 860289848Sjkim * make multiple points affine at the same time. 861289848Sjkim */ 862289848Sjkim if (!EC_POINT_make_affine(group, P, ctx)) 863289848Sjkim goto err; 864289848Sjkim if (!ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].X, 865289848Sjkim &P->X) || 866289848Sjkim !ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].Y, 867289848Sjkim &P->Y)) { 868289848Sjkim ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 869289848Sjkim EC_R_COORDINATES_OUT_OF_RANGE); 870289848Sjkim goto err; 871289848Sjkim } 872289848Sjkim for (i = 0; i < 7; i++) { 873289848Sjkim if (!EC_POINT_dbl(group, P, P, ctx)) 874289848Sjkim goto err; 875289848Sjkim } 876289848Sjkim } 877289848Sjkim if (!EC_POINT_add(group, T, T, generator, ctx)) 878289848Sjkim goto err; 879289848Sjkim } 880289848Sjkim 881289848Sjkim pre_comp->group = group; 882289848Sjkim pre_comp->w = w; 883289848Sjkim pre_comp->precomp = preComputedTable; 884289848Sjkim pre_comp->precomp_storage = precomp_storage; 885289848Sjkim 886289848Sjkim precomp_storage = NULL; 887289848Sjkim 888289848Sjkim if (!EC_EX_DATA_set_data(&group->extra_data, pre_comp, 889289848Sjkim ecp_nistz256_pre_comp_dup, 890289848Sjkim ecp_nistz256_pre_comp_free, 891289848Sjkim ecp_nistz256_pre_comp_clear_free)) { 892289848Sjkim goto err; 893289848Sjkim } 894289848Sjkim 895289848Sjkim pre_comp = NULL; 896289848Sjkim 897289848Sjkim ret = 1; 898289848Sjkim 899289848Sjkim err: 900289848Sjkim if (ctx != NULL) 901289848Sjkim BN_CTX_end(ctx); 902289848Sjkim BN_CTX_free(new_ctx); 903289848Sjkim 904289848Sjkim if (pre_comp) 905289848Sjkim ecp_nistz256_pre_comp_free(pre_comp); 906289848Sjkim if (precomp_storage) 907289848Sjkim OPENSSL_free(precomp_storage); 908289848Sjkim if (P) 909289848Sjkim EC_POINT_free(P); 910289848Sjkim if (T) 911289848Sjkim EC_POINT_free(T); 912289848Sjkim return ret; 913289848Sjkim} 914289848Sjkim 915289848Sjkim/* 916289848Sjkim * Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great 917289848Sjkim * code processing 4 points in parallel, corresponding serial operation 918289848Sjkim * is several times slower, because it uses 29x29=58-bit multiplication 919289848Sjkim * as opposite to 64x64=128-bit in integer-only scalar case. As result 920289848Sjkim * it doesn't provide *significant* performance improvement. Note that 921289848Sjkim * just defining ECP_NISTZ256_AVX2 is not sufficient to make it work, 922289848Sjkim * you'd need to compile even asm/ecp_nistz256-avx.pl module. 923289848Sjkim */ 924289848Sjkim#if defined(ECP_NISTZ256_AVX2) 925289848Sjkim# if !(defined(__x86_64) || defined(__x86_64__)) || \ 926289848Sjkim defined(_M_AMD64) || defined(_MX64)) || \ 927289848Sjkim !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */ 928289848Sjkim# undef ECP_NISTZ256_AVX2 929289848Sjkim# else 930289848Sjkim/* Constant time access, loading four values, from four consecutive tables */ 931289848Sjkimvoid ecp_nistz256_avx2_select_w7(P256_POINT_AFFINE * val, 932289848Sjkim const P256_POINT_AFFINE * in_t, int index); 933289848Sjkimvoid ecp_nistz256_avx2_multi_select_w7(void *result, const void *in, int index0, 934289848Sjkim int index1, int index2, int index3); 935289848Sjkimvoid ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in); 936289848Sjkimvoid ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4); 937289848Sjkimvoid ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4, 938289848Sjkim const void *Bx4); 939289848Sjkimvoid ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4, 940289848Sjkim const void *Bx4); 941289848Sjkimvoid ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4); 942289848Sjkimvoid ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4); 943289848Sjkimvoid ecp_nistz256_avx2_set1(void *RESULTx4); 944289848Sjkimint ecp_nistz_avx2_eligible(void); 945289848Sjkim 946289848Sjkimstatic void booth_recode_w7(unsigned char *sign, 947289848Sjkim unsigned char *digit, unsigned char in) 948289848Sjkim{ 949289848Sjkim unsigned char s, d; 950289848Sjkim 951289848Sjkim s = ~((in >> 7) - 1); 952289848Sjkim d = (1 << 8) - in - 1; 953289848Sjkim d = (d & s) | (in & ~s); 954289848Sjkim d = (d >> 1) + (d & 1); 955289848Sjkim 956289848Sjkim *sign = s & 1; 957289848Sjkim *digit = d; 958289848Sjkim} 959289848Sjkim 960289848Sjkim/* 961289848Sjkim * ecp_nistz256_avx2_mul_g performs multiplication by G, using only the 962289848Sjkim * precomputed table. It does 4 affine point additions in parallel, 963289848Sjkim * significantly speeding up point multiplication for a fixed value. 964289848Sjkim */ 965289848Sjkimstatic void ecp_nistz256_avx2_mul_g(P256_POINT *r, 966289848Sjkim unsigned char p_str[33], 967289848Sjkim const P256_POINT_AFFINE(*preComputedTable)[64]) 968289848Sjkim{ 969289848Sjkim const unsigned int window_size = 7; 970289848Sjkim const unsigned int mask = (1 << (window_size + 1)) - 1; 971289848Sjkim unsigned int wvalue; 972289848Sjkim /* Using 4 windows at a time */ 973289848Sjkim unsigned char sign0, digit0; 974289848Sjkim unsigned char sign1, digit1; 975289848Sjkim unsigned char sign2, digit2; 976289848Sjkim unsigned char sign3, digit3; 977289848Sjkim unsigned int index = 0; 978289848Sjkim BN_ULONG tmp[P256_LIMBS]; 979289848Sjkim int i; 980289848Sjkim 981289848Sjkim ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 }; 982289848Sjkim ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 }; 983289848Sjkim ALIGN32 P256_POINT_AFFINE point_arr[P256_LIMBS]; 984289848Sjkim ALIGN32 P256_POINT res_point_arr[P256_LIMBS]; 985289848Sjkim 986289848Sjkim /* Initial four windows */ 987289848Sjkim wvalue = *((u16 *) & p_str[0]); 988289848Sjkim wvalue = (wvalue << 1) & mask; 989289848Sjkim index += window_size; 990289848Sjkim booth_recode_w7(&sign0, &digit0, wvalue); 991289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 992289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 993289848Sjkim index += window_size; 994289848Sjkim booth_recode_w7(&sign1, &digit1, wvalue); 995289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 996289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 997289848Sjkim index += window_size; 998289848Sjkim booth_recode_w7(&sign2, &digit2, wvalue); 999289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1000289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1001289848Sjkim index += window_size; 1002289848Sjkim booth_recode_w7(&sign3, &digit3, wvalue); 1003289848Sjkim 1004289848Sjkim ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[0], 1005289848Sjkim digit0, digit1, digit2, digit3); 1006289848Sjkim 1007289848Sjkim ecp_nistz256_neg(tmp, point_arr[0].Y); 1008289848Sjkim copy_conditional(point_arr[0].Y, tmp, sign0); 1009289848Sjkim ecp_nistz256_neg(tmp, point_arr[1].Y); 1010289848Sjkim copy_conditional(point_arr[1].Y, tmp, sign1); 1011289848Sjkim ecp_nistz256_neg(tmp, point_arr[2].Y); 1012289848Sjkim copy_conditional(point_arr[2].Y, tmp, sign2); 1013289848Sjkim ecp_nistz256_neg(tmp, point_arr[3].Y); 1014289848Sjkim copy_conditional(point_arr[3].Y, tmp, sign3); 1015289848Sjkim 1016289848Sjkim ecp_nistz256_avx2_transpose_convert(aX4, point_arr); 1017289848Sjkim ecp_nistz256_avx2_to_mont(aX4, aX4); 1018289848Sjkim ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]); 1019289848Sjkim ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]); 1020289848Sjkim 1021289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1022289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1023289848Sjkim index += window_size; 1024289848Sjkim booth_recode_w7(&sign0, &digit0, wvalue); 1025289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1026289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1027289848Sjkim index += window_size; 1028289848Sjkim booth_recode_w7(&sign1, &digit1, wvalue); 1029289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1030289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1031289848Sjkim index += window_size; 1032289848Sjkim booth_recode_w7(&sign2, &digit2, wvalue); 1033289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1034289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1035289848Sjkim index += window_size; 1036289848Sjkim booth_recode_w7(&sign3, &digit3, wvalue); 1037289848Sjkim 1038289848Sjkim ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[4 * 1], 1039289848Sjkim digit0, digit1, digit2, digit3); 1040289848Sjkim 1041289848Sjkim ecp_nistz256_neg(tmp, point_arr[0].Y); 1042289848Sjkim copy_conditional(point_arr[0].Y, tmp, sign0); 1043289848Sjkim ecp_nistz256_neg(tmp, point_arr[1].Y); 1044289848Sjkim copy_conditional(point_arr[1].Y, tmp, sign1); 1045289848Sjkim ecp_nistz256_neg(tmp, point_arr[2].Y); 1046289848Sjkim copy_conditional(point_arr[2].Y, tmp, sign2); 1047289848Sjkim ecp_nistz256_neg(tmp, point_arr[3].Y); 1048289848Sjkim copy_conditional(point_arr[3].Y, tmp, sign3); 1049289848Sjkim 1050289848Sjkim ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1051289848Sjkim ecp_nistz256_avx2_to_mont(bX4, bX4); 1052289848Sjkim ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1053289848Sjkim /* Optimized when both inputs are affine */ 1054289848Sjkim ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4); 1055289848Sjkim 1056289848Sjkim for (i = 2; i < 9; i++) { 1057289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1058289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1059289848Sjkim index += window_size; 1060289848Sjkim booth_recode_w7(&sign0, &digit0, wvalue); 1061289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1062289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1063289848Sjkim index += window_size; 1064289848Sjkim booth_recode_w7(&sign1, &digit1, wvalue); 1065289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1066289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1067289848Sjkim index += window_size; 1068289848Sjkim booth_recode_w7(&sign2, &digit2, wvalue); 1069289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1070289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1071289848Sjkim index += window_size; 1072289848Sjkim booth_recode_w7(&sign3, &digit3, wvalue); 1073289848Sjkim 1074289848Sjkim ecp_nistz256_avx2_multi_select_w7(point_arr, 1075289848Sjkim preComputedTable[4 * i], 1076289848Sjkim digit0, digit1, digit2, digit3); 1077289848Sjkim 1078289848Sjkim ecp_nistz256_neg(tmp, point_arr[0].Y); 1079289848Sjkim copy_conditional(point_arr[0].Y, tmp, sign0); 1080289848Sjkim ecp_nistz256_neg(tmp, point_arr[1].Y); 1081289848Sjkim copy_conditional(point_arr[1].Y, tmp, sign1); 1082289848Sjkim ecp_nistz256_neg(tmp, point_arr[2].Y); 1083289848Sjkim copy_conditional(point_arr[2].Y, tmp, sign2); 1084289848Sjkim ecp_nistz256_neg(tmp, point_arr[3].Y); 1085289848Sjkim copy_conditional(point_arr[3].Y, tmp, sign3); 1086289848Sjkim 1087289848Sjkim ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1088289848Sjkim ecp_nistz256_avx2_to_mont(bX4, bX4); 1089289848Sjkim ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1090289848Sjkim 1091289848Sjkim ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4); 1092289848Sjkim } 1093289848Sjkim 1094289848Sjkim ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]); 1095289848Sjkim ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]); 1096289848Sjkim ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]); 1097289848Sjkim 1098289848Sjkim ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4); 1099289848Sjkim /* Last window is performed serially */ 1100289848Sjkim wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1101289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1102289848Sjkim booth_recode_w7(&sign0, &digit0, wvalue); 1103289848Sjkim ecp_nistz256_avx2_select_w7((P256_POINT_AFFINE *) r, 1104289848Sjkim preComputedTable[36], digit0); 1105289848Sjkim ecp_nistz256_neg(tmp, r->Y); 1106289848Sjkim copy_conditional(r->Y, tmp, sign0); 1107289848Sjkim memcpy(r->Z, ONE, sizeof(ONE)); 1108289848Sjkim /* Sum the four windows */ 1109289848Sjkim ecp_nistz256_point_add(r, r, &res_point_arr[0]); 1110289848Sjkim ecp_nistz256_point_add(r, r, &res_point_arr[1]); 1111289848Sjkim ecp_nistz256_point_add(r, r, &res_point_arr[2]); 1112289848Sjkim ecp_nistz256_point_add(r, r, &res_point_arr[3]); 1113289848Sjkim} 1114289848Sjkim# endif 1115289848Sjkim#endif 1116289848Sjkim 1117289848Sjkimstatic int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group, 1118289848Sjkim const P256_POINT_AFFINE *in, 1119289848Sjkim BN_CTX *ctx) 1120289848Sjkim{ 1121337982Sjkim BIGNUM x, y, z; 1122289848Sjkim int ret = 0; 1123289848Sjkim 1124337982Sjkim /* 1125337982Sjkim * |const| qualifier omission is compensated by BN_FLG_STATIC_DATA 1126337982Sjkim * flag, which effectively means "read-only data". 1127337982Sjkim */ 1128337982Sjkim x.d = (BN_ULONG *)in->X; 1129289848Sjkim x.dmax = x.top = P256_LIMBS; 1130289848Sjkim x.neg = 0; 1131289848Sjkim x.flags = BN_FLG_STATIC_DATA; 1132289848Sjkim 1133337982Sjkim y.d = (BN_ULONG *)in->Y; 1134289848Sjkim y.dmax = y.top = P256_LIMBS; 1135289848Sjkim y.neg = 0; 1136289848Sjkim y.flags = BN_FLG_STATIC_DATA; 1137289848Sjkim 1138337982Sjkim z.d = (BN_ULONG *)ONE; 1139337982Sjkim z.dmax = z.top = P256_LIMBS; 1140337982Sjkim z.neg = 0; 1141337982Sjkim z.flags = BN_FLG_STATIC_DATA; 1142289848Sjkim 1143337982Sjkim if ((ret = (BN_copy(&out->X, &x) != NULL)) 1144337982Sjkim && (ret = (BN_copy(&out->Y, &y) != NULL)) 1145337982Sjkim && (ret = (BN_copy(&out->Z, &z) != NULL))) 1146337982Sjkim out->Z_is_one = 1; 1147337982Sjkim 1148289848Sjkim return ret; 1149289848Sjkim} 1150289848Sjkim 1151289848Sjkim/* r = scalar*G + sum(scalars[i]*points[i]) */ 1152289848Sjkimstatic int ecp_nistz256_points_mul(const EC_GROUP *group, 1153289848Sjkim EC_POINT *r, 1154289848Sjkim const BIGNUM *scalar, 1155289848Sjkim size_t num, 1156289848Sjkim const EC_POINT *points[], 1157289848Sjkim const BIGNUM *scalars[], BN_CTX *ctx) 1158289848Sjkim{ 1159289848Sjkim int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0; 1160289848Sjkim size_t j; 1161289848Sjkim unsigned char p_str[33] = { 0 }; 1162289848Sjkim const PRECOMP256_ROW *preComputedTable = NULL; 1163289848Sjkim const EC_PRE_COMP *pre_comp = NULL; 1164289848Sjkim const EC_POINT *generator = NULL; 1165289848Sjkim unsigned int index = 0; 1166289848Sjkim BN_CTX *new_ctx = NULL; 1167289848Sjkim const BIGNUM **new_scalars = NULL; 1168289848Sjkim const EC_POINT **new_points = NULL; 1169289848Sjkim const unsigned int window_size = 7; 1170289848Sjkim const unsigned int mask = (1 << (window_size + 1)) - 1; 1171289848Sjkim unsigned int wvalue; 1172289848Sjkim ALIGN32 union { 1173289848Sjkim P256_POINT p; 1174289848Sjkim P256_POINT_AFFINE a; 1175289848Sjkim } t, p; 1176289848Sjkim BIGNUM *tmp_scalar; 1177289848Sjkim 1178289848Sjkim if (group->meth != r->meth) { 1179289848Sjkim ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS); 1180289848Sjkim return 0; 1181289848Sjkim } 1182289848Sjkim 1183289848Sjkim if ((scalar == NULL) && (num == 0)) 1184289848Sjkim return EC_POINT_set_to_infinity(group, r); 1185289848Sjkim 1186289848Sjkim for (j = 0; j < num; j++) { 1187289848Sjkim if (group->meth != points[j]->meth) { 1188289848Sjkim ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS); 1189289848Sjkim return 0; 1190289848Sjkim } 1191289848Sjkim } 1192289848Sjkim 1193289848Sjkim if (ctx == NULL) { 1194289848Sjkim ctx = new_ctx = BN_CTX_new(); 1195289848Sjkim if (ctx == NULL) 1196289848Sjkim goto err; 1197289848Sjkim } 1198289848Sjkim 1199289848Sjkim BN_CTX_start(ctx); 1200289848Sjkim 1201289848Sjkim if (scalar) { 1202289848Sjkim generator = EC_GROUP_get0_generator(group); 1203289848Sjkim if (generator == NULL) { 1204289848Sjkim ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR); 1205289848Sjkim goto err; 1206289848Sjkim } 1207289848Sjkim 1208289848Sjkim /* look if we can use precomputed multiples of generator */ 1209289848Sjkim pre_comp = 1210289848Sjkim EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup, 1211289848Sjkim ecp_nistz256_pre_comp_free, 1212289848Sjkim ecp_nistz256_pre_comp_clear_free); 1213289848Sjkim 1214289848Sjkim if (pre_comp) { 1215289848Sjkim /* 1216289848Sjkim * If there is a precomputed table for the generator, check that 1217289848Sjkim * it was generated with the same generator. 1218289848Sjkim */ 1219289848Sjkim EC_POINT *pre_comp_generator = EC_POINT_new(group); 1220289848Sjkim if (pre_comp_generator == NULL) 1221289848Sjkim goto err; 1222289848Sjkim 1223289848Sjkim if (!ecp_nistz256_set_from_affine 1224289848Sjkim (pre_comp_generator, group, pre_comp->precomp[0], ctx)) { 1225289848Sjkim EC_POINT_free(pre_comp_generator); 1226289848Sjkim goto err; 1227289848Sjkim } 1228289848Sjkim 1229289848Sjkim if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx)) 1230289848Sjkim preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp; 1231289848Sjkim 1232289848Sjkim EC_POINT_free(pre_comp_generator); 1233289848Sjkim } 1234289848Sjkim 1235289848Sjkim if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) { 1236289848Sjkim /* 1237289848Sjkim * If there is no precomputed data, but the generator 1238289848Sjkim * is the default, a hardcoded table of precomputed 1239289848Sjkim * data is used. This is because applications, such as 1240289848Sjkim * Apache, do not use EC_KEY_precompute_mult. 1241289848Sjkim */ 1242289848Sjkim preComputedTable = (const PRECOMP256_ROW *)ecp_nistz256_precomputed; 1243289848Sjkim } 1244289848Sjkim 1245289848Sjkim if (preComputedTable) { 1246289848Sjkim if ((BN_num_bits(scalar) > 256) 1247289848Sjkim || BN_is_negative(scalar)) { 1248289848Sjkim if ((tmp_scalar = BN_CTX_get(ctx)) == NULL) 1249289848Sjkim goto err; 1250289848Sjkim 1251289848Sjkim if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) { 1252289848Sjkim ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB); 1253289848Sjkim goto err; 1254289848Sjkim } 1255289848Sjkim scalar = tmp_scalar; 1256289848Sjkim } 1257289848Sjkim 1258289848Sjkim for (i = 0; i < scalar->top * BN_BYTES; i += BN_BYTES) { 1259289848Sjkim BN_ULONG d = scalar->d[i / BN_BYTES]; 1260289848Sjkim 1261289848Sjkim p_str[i + 0] = d & 0xff; 1262289848Sjkim p_str[i + 1] = (d >> 8) & 0xff; 1263289848Sjkim p_str[i + 2] = (d >> 16) & 0xff; 1264289848Sjkim p_str[i + 3] = (d >>= 24) & 0xff; 1265289848Sjkim if (BN_BYTES == 8) { 1266289848Sjkim d >>= 8; 1267289848Sjkim p_str[i + 4] = d & 0xff; 1268289848Sjkim p_str[i + 5] = (d >> 8) & 0xff; 1269289848Sjkim p_str[i + 6] = (d >> 16) & 0xff; 1270289848Sjkim p_str[i + 7] = (d >> 24) & 0xff; 1271289848Sjkim } 1272289848Sjkim } 1273289848Sjkim 1274289848Sjkim for (; i < 33; i++) 1275289848Sjkim p_str[i] = 0; 1276289848Sjkim 1277289848Sjkim#if defined(ECP_NISTZ256_AVX2) 1278289848Sjkim if (ecp_nistz_avx2_eligible()) { 1279289848Sjkim ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable); 1280289848Sjkim } else 1281289848Sjkim#endif 1282289848Sjkim { 1283306195Sjkim BN_ULONG infty; 1284306195Sjkim 1285289848Sjkim /* First window */ 1286289848Sjkim wvalue = (p_str[0] << 1) & mask; 1287289848Sjkim index += window_size; 1288289848Sjkim 1289289848Sjkim wvalue = _booth_recode_w7(wvalue); 1290289848Sjkim 1291289848Sjkim ecp_nistz256_select_w7(&p.a, preComputedTable[0], wvalue >> 1); 1292289848Sjkim 1293289848Sjkim ecp_nistz256_neg(p.p.Z, p.p.Y); 1294289848Sjkim copy_conditional(p.p.Y, p.p.Z, wvalue & 1); 1295289848Sjkim 1296306195Sjkim /* 1297306195Sjkim * Since affine infinity is encoded as (0,0) and 1298306195Sjkim * Jacobian ias (,,0), we need to harmonize them 1299306195Sjkim * by assigning "one" or zero to Z. 1300306195Sjkim */ 1301306195Sjkim infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] | 1302306195Sjkim p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]); 1303306195Sjkim if (P256_LIMBS == 8) 1304306195Sjkim infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] | 1305306195Sjkim p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]); 1306289848Sjkim 1307306195Sjkim infty = 0 - is_zero(infty); 1308306195Sjkim infty = ~infty; 1309306195Sjkim 1310306195Sjkim p.p.Z[0] = ONE[0] & infty; 1311306195Sjkim p.p.Z[1] = ONE[1] & infty; 1312306195Sjkim p.p.Z[2] = ONE[2] & infty; 1313306195Sjkim p.p.Z[3] = ONE[3] & infty; 1314306195Sjkim if (P256_LIMBS == 8) { 1315306195Sjkim p.p.Z[4] = ONE[4] & infty; 1316306195Sjkim p.p.Z[5] = ONE[5] & infty; 1317306195Sjkim p.p.Z[6] = ONE[6] & infty; 1318306195Sjkim p.p.Z[7] = ONE[7] & infty; 1319306195Sjkim } 1320306195Sjkim 1321289848Sjkim for (i = 1; i < 37; i++) { 1322289848Sjkim unsigned int off = (index - 1) / 8; 1323289848Sjkim wvalue = p_str[off] | p_str[off + 1] << 8; 1324289848Sjkim wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1325289848Sjkim index += window_size; 1326289848Sjkim 1327289848Sjkim wvalue = _booth_recode_w7(wvalue); 1328289848Sjkim 1329289848Sjkim ecp_nistz256_select_w7(&t.a, 1330289848Sjkim preComputedTable[i], wvalue >> 1); 1331289848Sjkim 1332289848Sjkim ecp_nistz256_neg(t.p.Z, t.a.Y); 1333289848Sjkim copy_conditional(t.a.Y, t.p.Z, wvalue & 1); 1334289848Sjkim 1335289848Sjkim ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); 1336289848Sjkim } 1337289848Sjkim } 1338289848Sjkim } else { 1339289848Sjkim p_is_infinity = 1; 1340289848Sjkim no_precomp_for_generator = 1; 1341289848Sjkim } 1342289848Sjkim } else 1343289848Sjkim p_is_infinity = 1; 1344289848Sjkim 1345289848Sjkim if (no_precomp_for_generator) { 1346289848Sjkim /* 1347289848Sjkim * Without a precomputed table for the generator, it has to be 1348289848Sjkim * handled like a normal point. 1349289848Sjkim */ 1350289848Sjkim new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *)); 1351289848Sjkim if (!new_scalars) { 1352289848Sjkim ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1353289848Sjkim goto err; 1354289848Sjkim } 1355289848Sjkim 1356289848Sjkim new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *)); 1357289848Sjkim if (!new_points) { 1358289848Sjkim ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1359289848Sjkim goto err; 1360289848Sjkim } 1361289848Sjkim 1362289848Sjkim memcpy(new_scalars, scalars, num * sizeof(BIGNUM *)); 1363289848Sjkim new_scalars[num] = scalar; 1364289848Sjkim memcpy(new_points, points, num * sizeof(EC_POINT *)); 1365289848Sjkim new_points[num] = generator; 1366289848Sjkim 1367289848Sjkim scalars = new_scalars; 1368289848Sjkim points = new_points; 1369289848Sjkim num++; 1370289848Sjkim } 1371289848Sjkim 1372289848Sjkim if (num) { 1373289848Sjkim P256_POINT *out = &t.p; 1374289848Sjkim if (p_is_infinity) 1375289848Sjkim out = &p.p; 1376289848Sjkim 1377289848Sjkim if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx)) 1378289848Sjkim goto err; 1379289848Sjkim 1380289848Sjkim if (!p_is_infinity) 1381289848Sjkim ecp_nistz256_point_add(&p.p, &p.p, out); 1382289848Sjkim } 1383289848Sjkim 1384289848Sjkim /* Not constant-time, but we're only operating on the public output. */ 1385289848Sjkim if (!ecp_nistz256_set_words(&r->X, p.p.X) || 1386289848Sjkim !ecp_nistz256_set_words(&r->Y, p.p.Y) || 1387289848Sjkim !ecp_nistz256_set_words(&r->Z, p.p.Z)) { 1388289848Sjkim goto err; 1389289848Sjkim } 1390306195Sjkim r->Z_is_one = is_one(&r->Z) & 1; 1391289848Sjkim 1392289848Sjkim ret = 1; 1393289848Sjkim 1394289848Sjkimerr: 1395289848Sjkim if (ctx) 1396289848Sjkim BN_CTX_end(ctx); 1397289848Sjkim BN_CTX_free(new_ctx); 1398289848Sjkim if (new_points) 1399289848Sjkim OPENSSL_free(new_points); 1400289848Sjkim if (new_scalars) 1401289848Sjkim OPENSSL_free(new_scalars); 1402289848Sjkim return ret; 1403289848Sjkim} 1404289848Sjkim 1405289848Sjkimstatic int ecp_nistz256_get_affine(const EC_GROUP *group, 1406289848Sjkim const EC_POINT *point, 1407289848Sjkim BIGNUM *x, BIGNUM *y, BN_CTX *ctx) 1408289848Sjkim{ 1409289848Sjkim BN_ULONG z_inv2[P256_LIMBS]; 1410289848Sjkim BN_ULONG z_inv3[P256_LIMBS]; 1411289848Sjkim BN_ULONG x_aff[P256_LIMBS]; 1412289848Sjkim BN_ULONG y_aff[P256_LIMBS]; 1413289848Sjkim BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS]; 1414289848Sjkim BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS]; 1415289848Sjkim 1416289848Sjkim if (EC_POINT_is_at_infinity(group, point)) { 1417289848Sjkim ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY); 1418289848Sjkim return 0; 1419289848Sjkim } 1420289848Sjkim 1421289848Sjkim if (!ecp_nistz256_bignum_to_field_elem(point_x, &point->X) || 1422289848Sjkim !ecp_nistz256_bignum_to_field_elem(point_y, &point->Y) || 1423289848Sjkim !ecp_nistz256_bignum_to_field_elem(point_z, &point->Z)) { 1424289848Sjkim ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE); 1425289848Sjkim return 0; 1426289848Sjkim } 1427289848Sjkim 1428289848Sjkim ecp_nistz256_mod_inverse(z_inv3, point_z); 1429289848Sjkim ecp_nistz256_sqr_mont(z_inv2, z_inv3); 1430289848Sjkim ecp_nistz256_mul_mont(x_aff, z_inv2, point_x); 1431289848Sjkim 1432289848Sjkim if (x != NULL) { 1433289848Sjkim ecp_nistz256_from_mont(x_ret, x_aff); 1434289848Sjkim if (!ecp_nistz256_set_words(x, x_ret)) 1435289848Sjkim return 0; 1436289848Sjkim } 1437289848Sjkim 1438289848Sjkim if (y != NULL) { 1439289848Sjkim ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2); 1440289848Sjkim ecp_nistz256_mul_mont(y_aff, z_inv3, point_y); 1441289848Sjkim ecp_nistz256_from_mont(y_ret, y_aff); 1442289848Sjkim if (!ecp_nistz256_set_words(y, y_ret)) 1443289848Sjkim return 0; 1444289848Sjkim } 1445289848Sjkim 1446289848Sjkim return 1; 1447289848Sjkim} 1448289848Sjkim 1449289848Sjkimstatic EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group) 1450289848Sjkim{ 1451289848Sjkim EC_PRE_COMP *ret = NULL; 1452289848Sjkim 1453289848Sjkim if (!group) 1454289848Sjkim return NULL; 1455289848Sjkim 1456289848Sjkim ret = (EC_PRE_COMP *)OPENSSL_malloc(sizeof(EC_PRE_COMP)); 1457289848Sjkim 1458289848Sjkim if (!ret) { 1459289848Sjkim ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1460289848Sjkim return ret; 1461289848Sjkim } 1462289848Sjkim 1463289848Sjkim ret->group = group; 1464289848Sjkim ret->w = 6; /* default */ 1465289848Sjkim ret->precomp = NULL; 1466289848Sjkim ret->precomp_storage = NULL; 1467289848Sjkim ret->references = 1; 1468289848Sjkim return ret; 1469289848Sjkim} 1470289848Sjkim 1471289848Sjkimstatic void *ecp_nistz256_pre_comp_dup(void *src_) 1472289848Sjkim{ 1473289848Sjkim EC_PRE_COMP *src = src_; 1474289848Sjkim 1475289848Sjkim /* no need to actually copy, these objects never change! */ 1476289848Sjkim CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); 1477289848Sjkim 1478289848Sjkim return src_; 1479289848Sjkim} 1480289848Sjkim 1481289848Sjkimstatic void ecp_nistz256_pre_comp_free(void *pre_) 1482289848Sjkim{ 1483289848Sjkim int i; 1484289848Sjkim EC_PRE_COMP *pre = pre_; 1485289848Sjkim 1486289848Sjkim if (!pre) 1487289848Sjkim return; 1488289848Sjkim 1489289848Sjkim i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); 1490289848Sjkim if (i > 0) 1491289848Sjkim return; 1492289848Sjkim 1493289848Sjkim if (pre->precomp_storage) 1494289848Sjkim OPENSSL_free(pre->precomp_storage); 1495289848Sjkim 1496289848Sjkim OPENSSL_free(pre); 1497289848Sjkim} 1498289848Sjkim 1499289848Sjkimstatic void ecp_nistz256_pre_comp_clear_free(void *pre_) 1500289848Sjkim{ 1501289848Sjkim int i; 1502289848Sjkim EC_PRE_COMP *pre = pre_; 1503289848Sjkim 1504289848Sjkim if (!pre) 1505289848Sjkim return; 1506289848Sjkim 1507289848Sjkim i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); 1508289848Sjkim if (i > 0) 1509289848Sjkim return; 1510289848Sjkim 1511289848Sjkim if (pre->precomp_storage) { 1512289848Sjkim OPENSSL_cleanse(pre->precomp, 1513289848Sjkim 32 * sizeof(unsigned char) * (1 << pre->w) * 2 * 37); 1514289848Sjkim OPENSSL_free(pre->precomp_storage); 1515289848Sjkim } 1516331638Sjkim OPENSSL_cleanse(pre, sizeof(*pre)); 1517289848Sjkim OPENSSL_free(pre); 1518289848Sjkim} 1519289848Sjkim 1520289848Sjkimstatic int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group) 1521289848Sjkim{ 1522289848Sjkim /* There is a hard-coded table for the default generator. */ 1523289848Sjkim const EC_POINT *generator = EC_GROUP_get0_generator(group); 1524289848Sjkim if (generator != NULL && ecp_nistz256_is_affine_G(generator)) { 1525289848Sjkim /* There is a hard-coded table for the default generator. */ 1526289848Sjkim return 1; 1527289848Sjkim } 1528289848Sjkim 1529289848Sjkim return EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup, 1530289848Sjkim ecp_nistz256_pre_comp_free, 1531289848Sjkim ecp_nistz256_pre_comp_clear_free) != NULL; 1532289848Sjkim} 1533289848Sjkim 1534289848Sjkimconst EC_METHOD *EC_GFp_nistz256_method(void) 1535289848Sjkim{ 1536289848Sjkim static const EC_METHOD ret = { 1537289848Sjkim EC_FLAGS_DEFAULT_OCT, 1538289848Sjkim NID_X9_62_prime_field, 1539289848Sjkim ec_GFp_mont_group_init, 1540289848Sjkim ec_GFp_mont_group_finish, 1541289848Sjkim ec_GFp_mont_group_clear_finish, 1542289848Sjkim ec_GFp_mont_group_copy, 1543289848Sjkim ec_GFp_mont_group_set_curve, 1544289848Sjkim ec_GFp_simple_group_get_curve, 1545289848Sjkim ec_GFp_simple_group_get_degree, 1546289848Sjkim ec_GFp_simple_group_check_discriminant, 1547289848Sjkim ec_GFp_simple_point_init, 1548289848Sjkim ec_GFp_simple_point_finish, 1549289848Sjkim ec_GFp_simple_point_clear_finish, 1550289848Sjkim ec_GFp_simple_point_copy, 1551289848Sjkim ec_GFp_simple_point_set_to_infinity, 1552289848Sjkim ec_GFp_simple_set_Jprojective_coordinates_GFp, 1553289848Sjkim ec_GFp_simple_get_Jprojective_coordinates_GFp, 1554289848Sjkim ec_GFp_simple_point_set_affine_coordinates, 1555289848Sjkim ecp_nistz256_get_affine, 1556289848Sjkim 0, 0, 0, 1557289848Sjkim ec_GFp_simple_add, 1558289848Sjkim ec_GFp_simple_dbl, 1559289848Sjkim ec_GFp_simple_invert, 1560289848Sjkim ec_GFp_simple_is_at_infinity, 1561289848Sjkim ec_GFp_simple_is_on_curve, 1562289848Sjkim ec_GFp_simple_cmp, 1563289848Sjkim ec_GFp_simple_make_affine, 1564289848Sjkim ec_GFp_simple_points_make_affine, 1565289848Sjkim ecp_nistz256_points_mul, /* mul */ 1566289848Sjkim ecp_nistz256_mult_precompute, /* precompute_mult */ 1567289848Sjkim ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */ 1568289848Sjkim ec_GFp_mont_field_mul, 1569289848Sjkim ec_GFp_mont_field_sqr, 1570289848Sjkim 0, /* field_div */ 1571289848Sjkim ec_GFp_mont_field_encode, 1572289848Sjkim ec_GFp_mont_field_decode, 1573289848Sjkim ec_GFp_mont_field_set_to_one 1574289848Sjkim }; 1575289848Sjkim 1576289848Sjkim return &ret; 1577289848Sjkim} 1578