1/****************************************************************************** 2 * * 3 * Copyright 2014 Intel Corporation * 4 * * 5 * Licensed under the Apache License, Version 2.0 (the "License"); * 6 * you may not use this file except in compliance with the License. * 7 * You may obtain a copy of the License at * 8 * * 9 * http://www.apache.org/licenses/LICENSE-2.0 * 10 * * 11 * Unless required by applicable law or agreed to in writing, software * 12 * distributed under the License is distributed on an "AS IS" BASIS, * 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 14 * See the License for the specific language governing permissions and * 15 * limitations under the License. * 16 * * 17 ****************************************************************************** 18 * * 19 * Developers and authors: * 20 * Shay Gueron (1, 2), and Vlad Krasnov (1) * 21 * (1) Intel Corporation, Israel Development Center * 22 * (2) University of Haifa * 23 * Reference: * 24 * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with * 25 * 256 Bit Primes" * 26 * * 27 ******************************************************************************/ 28 29#include <string.h> 30 31#include <openssl/bn.h> 32#include <openssl/err.h> 33#include <openssl/ec.h> 34#include "cryptlib.h" 35 36#include "ec_lcl.h" 37 38#if BN_BITS2 != 64 39# define TOBN(hi,lo) lo,hi 40#else 41# define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo) 42#endif 43 44#if defined(__GNUC__) 45# define ALIGN32 __attribute((aligned(32))) 46#elif defined(_MSC_VER) 47# define ALIGN32 __declspec(align(32)) 48#else 49# define ALIGN32 50#endif 51 52#define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N) 53#define P256_LIMBS (256/BN_BITS2) 54 55typedef unsigned short u16; 56 57typedef struct { 58 BN_ULONG X[P256_LIMBS]; 59 BN_ULONG Y[P256_LIMBS]; 60 BN_ULONG Z[P256_LIMBS]; 61} P256_POINT; 62 63typedef struct { 64 BN_ULONG X[P256_LIMBS]; 65 BN_ULONG Y[P256_LIMBS]; 66} P256_POINT_AFFINE; 67 68typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; 69 70/* structure for precomputed multiples of the generator */ 71typedef struct ec_pre_comp_st { 72 const EC_GROUP *group; /* Parent EC_GROUP object */ 73 size_t w; /* Window size */ 74 /* 75 * Constant time access to the X and Y coordinates of the pre-computed, 76 * generator multiplies, in the Montgomery domain. Pre-calculated 77 * multiplies are stored in affine form. 78 */ 79 PRECOMP256_ROW *precomp; 80 void *precomp_storage; 81 int references; 82} EC_PRE_COMP; 83 84/* Functions implemented in assembly */ 85/* 86 * Most of below mentioned functions *preserve* the property of inputs 87 * being fully reduced, i.e. being in [0, modulus) range. Simply put if 88 * inputs are fully reduced, then output is too. Note that reverse is 89 * not true, in sense that given partially reduced inputs output can be 90 * either, not unlikely reduced. And "most" in first sentence refers to 91 * the fact that given the calculations flow one can tolerate that 92 * addition, 1st function below, produces partially reduced result *if* 93 * multiplications by 2 and 3, which customarily use addition, fully 94 * reduce it. This effectively gives two options: a) addition produces 95 * fully reduced result [as long as inputs are, just like remaining 96 * functions]; b) addition is allowed to produce partially reduced 97 * result, but multiplications by 2 and 3 perform additional reduction 98 * step. Choice between the two can be platform-specific, but it was a) 99 * in all cases so far... 100 */ 101/* Modular add: res = a+b mod P */ 102void ecp_nistz256_add(BN_ULONG res[P256_LIMBS], 103 const BN_ULONG a[P256_LIMBS], 104 const BN_ULONG b[P256_LIMBS]); 105/* Modular mul by 2: res = 2*a mod P */ 106void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS], 107 const BN_ULONG a[P256_LIMBS]); 108/* Modular mul by 3: res = 3*a mod P */ 109void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS], 110 const BN_ULONG a[P256_LIMBS]); 111 112/* Modular div by 2: res = a/2 mod P */ 113void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS], 114 const BN_ULONG a[P256_LIMBS]); 115/* Modular sub: res = a-b mod P */ 116void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS], 117 const BN_ULONG a[P256_LIMBS], 118 const BN_ULONG b[P256_LIMBS]); 119/* Modular neg: res = -a mod P */ 120void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); 121/* Montgomery mul: res = a*b*2^-256 mod P */ 122void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], 123 const BN_ULONG a[P256_LIMBS], 124 const BN_ULONG b[P256_LIMBS]); 125/* Montgomery sqr: res = a*a*2^-256 mod P */ 126void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], 127 const BN_ULONG a[P256_LIMBS]); 128/* Convert a number from Montgomery domain, by multiplying with 1 */ 129void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], 130 const BN_ULONG in[P256_LIMBS]); 131/* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/ 132void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], 133 const BN_ULONG in[P256_LIMBS]); 134/* Functions that perform constant time access to the precomputed tables */ 135void ecp_nistz256_select_w5(P256_POINT * val, 136 const P256_POINT * in_t, int index); 137void ecp_nistz256_select_w7(P256_POINT_AFFINE * val, 138 const P256_POINT_AFFINE * in_t, int index); 139 140/* One converted into the Montgomery domain */ 141static const BN_ULONG ONE[P256_LIMBS] = { 142 TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), 143 TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe) 144}; 145 146static void *ecp_nistz256_pre_comp_dup(void *); 147static void ecp_nistz256_pre_comp_free(void *); 148static void ecp_nistz256_pre_comp_clear_free(void *); 149static EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group); 150 151/* Precomputed tables for the default generator */ 152#include "ecp_nistz256_table.c" 153 154/* Recode window to a signed digit, see ecp_nistputil.c for details */ 155static unsigned int _booth_recode_w5(unsigned int in) 156{ 157 unsigned int s, d; 158 159 s = ~((in >> 5) - 1); 160 d = (1 << 6) - in - 1; 161 d = (d & s) | (in & ~s); 162 d = (d >> 1) + (d & 1); 163 164 return (d << 1) + (s & 1); 165} 166 167static unsigned int _booth_recode_w7(unsigned int in) 168{ 169 unsigned int s, d; 170 171 s = ~((in >> 7) - 1); 172 d = (1 << 8) - in - 1; 173 d = (d & s) | (in & ~s); 174 d = (d >> 1) + (d & 1); 175 176 return (d << 1) + (s & 1); 177} 178 179static void copy_conditional(BN_ULONG dst[P256_LIMBS], 180 const BN_ULONG src[P256_LIMBS], BN_ULONG move) 181{ 182 BN_ULONG mask1 = -move; 183 BN_ULONG mask2 = ~mask1; 184 185 dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); 186 dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); 187 dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); 188 dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); 189 if (P256_LIMBS == 8) { 190 dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); 191 dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); 192 dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); 193 dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); 194 } 195} 196 197static BN_ULONG is_zero(BN_ULONG in) 198{ 199 in |= (0 - in); 200 in = ~in; 201 in &= BN_MASK2; 202 in >>= BN_BITS2 - 1; 203 return in; 204} 205 206static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS], 207 const BN_ULONG b[P256_LIMBS]) 208{ 209 BN_ULONG res; 210 211 res = a[0] ^ b[0]; 212 res |= a[1] ^ b[1]; 213 res |= a[2] ^ b[2]; 214 res |= a[3] ^ b[3]; 215 if (P256_LIMBS == 8) { 216 res |= a[4] ^ b[4]; 217 res |= a[5] ^ b[5]; 218 res |= a[6] ^ b[6]; 219 res |= a[7] ^ b[7]; 220 } 221 222 return is_zero(res); 223} 224 225static BN_ULONG is_one(const BIGNUM *z) 226{ 227 BN_ULONG res = 0; 228 BN_ULONG *a = z->d; 229 230 if (z->top == (P256_LIMBS - P256_LIMBS / 8)) { 231 res = a[0] ^ ONE[0]; 232 res |= a[1] ^ ONE[1]; 233 res |= a[2] ^ ONE[2]; 234 res |= a[3] ^ ONE[3]; 235 if (P256_LIMBS == 8) { 236 res |= a[4] ^ ONE[4]; 237 res |= a[5] ^ ONE[5]; 238 res |= a[6] ^ ONE[6]; 239 /* 240 * no check for a[7] (being zero) on 32-bit platforms, 241 * because value of "one" takes only 7 limbs. 242 */ 243 } 244 res = is_zero(res); 245 } 246 247 return res; 248} 249 250static int ecp_nistz256_set_words(BIGNUM *a, BN_ULONG words[P256_LIMBS]) 251 { 252 if (bn_wexpand(a, P256_LIMBS) == NULL) { 253 ECerr(EC_F_ECP_NISTZ256_SET_WORDS, ERR_R_MALLOC_FAILURE); 254 return 0; 255 } 256 memcpy(a->d, words, sizeof(BN_ULONG) * P256_LIMBS); 257 a->top = P256_LIMBS; 258 bn_correct_top(a); 259 return 1; 260} 261 262#ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION 263void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); 264void ecp_nistz256_point_add(P256_POINT *r, 265 const P256_POINT *a, const P256_POINT *b); 266void ecp_nistz256_point_add_affine(P256_POINT *r, 267 const P256_POINT *a, 268 const P256_POINT_AFFINE *b); 269#else 270/* Point double: r = 2*a */ 271static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) 272{ 273 BN_ULONG S[P256_LIMBS]; 274 BN_ULONG M[P256_LIMBS]; 275 BN_ULONG Zsqr[P256_LIMBS]; 276 BN_ULONG tmp0[P256_LIMBS]; 277 278 const BN_ULONG *in_x = a->X; 279 const BN_ULONG *in_y = a->Y; 280 const BN_ULONG *in_z = a->Z; 281 282 BN_ULONG *res_x = r->X; 283 BN_ULONG *res_y = r->Y; 284 BN_ULONG *res_z = r->Z; 285 286 ecp_nistz256_mul_by_2(S, in_y); 287 288 ecp_nistz256_sqr_mont(Zsqr, in_z); 289 290 ecp_nistz256_sqr_mont(S, S); 291 292 ecp_nistz256_mul_mont(res_z, in_z, in_y); 293 ecp_nistz256_mul_by_2(res_z, res_z); 294 295 ecp_nistz256_add(M, in_x, Zsqr); 296 ecp_nistz256_sub(Zsqr, in_x, Zsqr); 297 298 ecp_nistz256_sqr_mont(res_y, S); 299 ecp_nistz256_div_by_2(res_y, res_y); 300 301 ecp_nistz256_mul_mont(M, M, Zsqr); 302 ecp_nistz256_mul_by_3(M, M); 303 304 ecp_nistz256_mul_mont(S, S, in_x); 305 ecp_nistz256_mul_by_2(tmp0, S); 306 307 ecp_nistz256_sqr_mont(res_x, M); 308 309 ecp_nistz256_sub(res_x, res_x, tmp0); 310 ecp_nistz256_sub(S, S, res_x); 311 312 ecp_nistz256_mul_mont(S, S, M); 313 ecp_nistz256_sub(res_y, S, res_y); 314} 315 316/* Point addition: r = a+b */ 317static void ecp_nistz256_point_add(P256_POINT *r, 318 const P256_POINT *a, const P256_POINT *b) 319{ 320 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 321 BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS]; 322 BN_ULONG Z1sqr[P256_LIMBS]; 323 BN_ULONG Z2sqr[P256_LIMBS]; 324 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 325 BN_ULONG Hsqr[P256_LIMBS]; 326 BN_ULONG Rsqr[P256_LIMBS]; 327 BN_ULONG Hcub[P256_LIMBS]; 328 329 BN_ULONG res_x[P256_LIMBS]; 330 BN_ULONG res_y[P256_LIMBS]; 331 BN_ULONG res_z[P256_LIMBS]; 332 333 BN_ULONG in1infty, in2infty; 334 335 const BN_ULONG *in1_x = a->X; 336 const BN_ULONG *in1_y = a->Y; 337 const BN_ULONG *in1_z = a->Z; 338 339 const BN_ULONG *in2_x = b->X; 340 const BN_ULONG *in2_y = b->Y; 341 const BN_ULONG *in2_z = b->Z; 342 343 /* 344 * Infinity in encoded as (,,0) 345 */ 346 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 347 if (P256_LIMBS == 8) 348 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 349 350 in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]); 351 if (P256_LIMBS == 8) 352 in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]); 353 354 in1infty = is_zero(in1infty); 355 in2infty = is_zero(in2infty); 356 357 ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ 358 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 359 360 ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ 361 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 362 363 ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ 364 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 365 ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */ 366 367 ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ 368 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 369 ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */ 370 371 /* 372 * This should not happen during sign/ecdh, so no constant time violation 373 */ 374 if (is_equal(U1, U2) && !in1infty && !in2infty) { 375 if (is_equal(S1, S2)) { 376 ecp_nistz256_point_double(r, a); 377 return; 378 } else { 379 memset(r, 0, sizeof(*r)); 380 return; 381 } 382 } 383 384 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 385 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 386 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 387 ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ 388 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 389 390 ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ 391 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 392 393 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 394 ecp_nistz256_sub(res_x, res_x, Hcub); 395 396 ecp_nistz256_sub(res_y, U2, res_x); 397 398 ecp_nistz256_mul_mont(S2, S1, Hcub); 399 ecp_nistz256_mul_mont(res_y, R, res_y); 400 ecp_nistz256_sub(res_y, res_y, S2); 401 402 copy_conditional(res_x, in2_x, in1infty); 403 copy_conditional(res_y, in2_y, in1infty); 404 copy_conditional(res_z, in2_z, in1infty); 405 406 copy_conditional(res_x, in1_x, in2infty); 407 copy_conditional(res_y, in1_y, in2infty); 408 copy_conditional(res_z, in1_z, in2infty); 409 410 memcpy(r->X, res_x, sizeof(res_x)); 411 memcpy(r->Y, res_y, sizeof(res_y)); 412 memcpy(r->Z, res_z, sizeof(res_z)); 413} 414 415/* Point addition when b is known to be affine: r = a+b */ 416static void ecp_nistz256_point_add_affine(P256_POINT *r, 417 const P256_POINT *a, 418 const P256_POINT_AFFINE *b) 419{ 420 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 421 BN_ULONG Z1sqr[P256_LIMBS]; 422 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 423 BN_ULONG Hsqr[P256_LIMBS]; 424 BN_ULONG Rsqr[P256_LIMBS]; 425 BN_ULONG Hcub[P256_LIMBS]; 426 427 BN_ULONG res_x[P256_LIMBS]; 428 BN_ULONG res_y[P256_LIMBS]; 429 BN_ULONG res_z[P256_LIMBS]; 430 431 BN_ULONG in1infty, in2infty; 432 433 const BN_ULONG *in1_x = a->X; 434 const BN_ULONG *in1_y = a->Y; 435 const BN_ULONG *in1_z = a->Z; 436 437 const BN_ULONG *in2_x = b->X; 438 const BN_ULONG *in2_y = b->Y; 439 440 /* 441 * Infinity in encoded as (,,0) 442 */ 443 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 444 if (P256_LIMBS == 8) 445 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 446 447 /* 448 * In affine representation we encode infinity as (0,0), which is 449 * not on the curve, so it is OK 450 */ 451 in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] | 452 in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]); 453 if (P256_LIMBS == 8) 454 in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] | 455 in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]); 456 457 in1infty = is_zero(in1infty); 458 in2infty = is_zero(in2infty); 459 460 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 461 462 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 463 ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */ 464 465 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 466 467 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 468 469 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 470 ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */ 471 472 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 473 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 474 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 475 476 ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */ 477 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 478 479 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 480 ecp_nistz256_sub(res_x, res_x, Hcub); 481 ecp_nistz256_sub(H, U2, res_x); 482 483 ecp_nistz256_mul_mont(S2, in1_y, Hcub); 484 ecp_nistz256_mul_mont(H, H, R); 485 ecp_nistz256_sub(res_y, H, S2); 486 487 copy_conditional(res_x, in2_x, in1infty); 488 copy_conditional(res_x, in1_x, in2infty); 489 490 copy_conditional(res_y, in2_y, in1infty); 491 copy_conditional(res_y, in1_y, in2infty); 492 493 copy_conditional(res_z, ONE, in1infty); 494 copy_conditional(res_z, in1_z, in2infty); 495 496 memcpy(r->X, res_x, sizeof(res_x)); 497 memcpy(r->Y, res_y, sizeof(res_y)); 498 memcpy(r->Z, res_z, sizeof(res_z)); 499} 500#endif 501 502/* r = in^-1 mod p */ 503static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS], 504 const BN_ULONG in[P256_LIMBS]) 505{ 506 /* 507 * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff 508 * ffffffff ffffffff We use FLT and used poly-2 as exponent 509 */ 510 BN_ULONG p2[P256_LIMBS]; 511 BN_ULONG p4[P256_LIMBS]; 512 BN_ULONG p8[P256_LIMBS]; 513 BN_ULONG p16[P256_LIMBS]; 514 BN_ULONG p32[P256_LIMBS]; 515 BN_ULONG res[P256_LIMBS]; 516 int i; 517 518 ecp_nistz256_sqr_mont(res, in); 519 ecp_nistz256_mul_mont(p2, res, in); /* 3*p */ 520 521 ecp_nistz256_sqr_mont(res, p2); 522 ecp_nistz256_sqr_mont(res, res); 523 ecp_nistz256_mul_mont(p4, res, p2); /* f*p */ 524 525 ecp_nistz256_sqr_mont(res, p4); 526 ecp_nistz256_sqr_mont(res, res); 527 ecp_nistz256_sqr_mont(res, res); 528 ecp_nistz256_sqr_mont(res, res); 529 ecp_nistz256_mul_mont(p8, res, p4); /* ff*p */ 530 531 ecp_nistz256_sqr_mont(res, p8); 532 for (i = 0; i < 7; i++) 533 ecp_nistz256_sqr_mont(res, res); 534 ecp_nistz256_mul_mont(p16, res, p8); /* ffff*p */ 535 536 ecp_nistz256_sqr_mont(res, p16); 537 for (i = 0; i < 15; i++) 538 ecp_nistz256_sqr_mont(res, res); 539 ecp_nistz256_mul_mont(p32, res, p16); /* ffffffff*p */ 540 541 ecp_nistz256_sqr_mont(res, p32); 542 for (i = 0; i < 31; i++) 543 ecp_nistz256_sqr_mont(res, res); 544 ecp_nistz256_mul_mont(res, res, in); 545 546 for (i = 0; i < 32 * 4; i++) 547 ecp_nistz256_sqr_mont(res, res); 548 ecp_nistz256_mul_mont(res, res, p32); 549 550 for (i = 0; i < 32; i++) 551 ecp_nistz256_sqr_mont(res, res); 552 ecp_nistz256_mul_mont(res, res, p32); 553 554 for (i = 0; i < 16; i++) 555 ecp_nistz256_sqr_mont(res, res); 556 ecp_nistz256_mul_mont(res, res, p16); 557 558 for (i = 0; i < 8; i++) 559 ecp_nistz256_sqr_mont(res, res); 560 ecp_nistz256_mul_mont(res, res, p8); 561 562 ecp_nistz256_sqr_mont(res, res); 563 ecp_nistz256_sqr_mont(res, res); 564 ecp_nistz256_sqr_mont(res, res); 565 ecp_nistz256_sqr_mont(res, res); 566 ecp_nistz256_mul_mont(res, res, p4); 567 568 ecp_nistz256_sqr_mont(res, res); 569 ecp_nistz256_sqr_mont(res, res); 570 ecp_nistz256_mul_mont(res, res, p2); 571 572 ecp_nistz256_sqr_mont(res, res); 573 ecp_nistz256_sqr_mont(res, res); 574 ecp_nistz256_mul_mont(res, res, in); 575 576 memcpy(r, res, sizeof(res)); 577} 578 579/* 580 * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and 581 * returns one if it fits. Otherwise it returns zero. 582 */ 583static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS], 584 const BIGNUM *in) 585{ 586 if (in->top > P256_LIMBS) 587 return 0; 588 589 memset(out, 0, sizeof(BN_ULONG) * P256_LIMBS); 590 memcpy(out, in->d, sizeof(BN_ULONG) * in->top); 591 return 1; 592} 593 594/* r = sum(scalar[i]*point[i]) */ 595static int ecp_nistz256_windowed_mul(const EC_GROUP *group, 596 P256_POINT *r, 597 const BIGNUM **scalar, 598 const EC_POINT **point, 599 int num, BN_CTX *ctx) 600{ 601 602 int i, j, ret = 0; 603 unsigned int index; 604 unsigned char (*p_str)[33] = NULL; 605 const unsigned int window_size = 5; 606 const unsigned int mask = (1 << (window_size + 1)) - 1; 607 unsigned int wvalue; 608 BN_ULONG tmp[P256_LIMBS]; 609 ALIGN32 P256_POINT h; 610 const BIGNUM **scalars = NULL; 611 P256_POINT (*table)[16] = NULL; 612 void *table_storage = NULL; 613 614 if ((table_storage = 615 OPENSSL_malloc(num * 16 * sizeof(P256_POINT) + 64)) == NULL 616 || (p_str = 617 OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL 618 || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) { 619 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE); 620 goto err; 621 } else { 622 table = (void *)ALIGNPTR(table_storage, 64); 623 } 624 625 for (i = 0; i < num; i++) { 626 P256_POINT *row = table[i]; 627 628 /* This is an unusual input, we don't guarantee constant-timeness. */ 629 if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) { 630 BIGNUM *mod; 631 632 if ((mod = BN_CTX_get(ctx)) == NULL) 633 goto err; 634 if (!BN_nnmod(mod, scalar[i], &group->order, ctx)) { 635 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB); 636 goto err; 637 } 638 scalars[i] = mod; 639 } else 640 scalars[i] = scalar[i]; 641 642 for (j = 0; j < scalars[i]->top * BN_BYTES; j += BN_BYTES) { 643 BN_ULONG d = scalars[i]->d[j / BN_BYTES]; 644 645 p_str[i][j + 0] = d & 0xff; 646 p_str[i][j + 1] = (d >> 8) & 0xff; 647 p_str[i][j + 2] = (d >> 16) & 0xff; 648 p_str[i][j + 3] = (d >>= 24) & 0xff; 649 if (BN_BYTES == 8) { 650 d >>= 8; 651 p_str[i][j + 4] = d & 0xff; 652 p_str[i][j + 5] = (d >> 8) & 0xff; 653 p_str[i][j + 6] = (d >> 16) & 0xff; 654 p_str[i][j + 7] = (d >> 24) & 0xff; 655 } 656 } 657 for (; j < 33; j++) 658 p_str[i][j] = 0; 659 660 /* table[0] is implicitly (0,0,0) (the point at infinity), 661 * therefore it is not stored. All other values are actually 662 * stored with an offset of -1 in table. 663 */ 664 665 if (!ecp_nistz256_bignum_to_field_elem(row[1 - 1].X, &point[i]->X) 666 || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Y, &point[i]->Y) 667 || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Z, &point[i]->Z)) { 668 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, EC_R_COORDINATES_OUT_OF_RANGE); 669 goto err; 670 } 671 672 ecp_nistz256_point_double(&row[ 2 - 1], &row[ 1 - 1]); 673 ecp_nistz256_point_add (&row[ 3 - 1], &row[ 2 - 1], &row[1 - 1]); 674 ecp_nistz256_point_double(&row[ 4 - 1], &row[ 2 - 1]); 675 ecp_nistz256_point_double(&row[ 6 - 1], &row[ 3 - 1]); 676 ecp_nistz256_point_double(&row[ 8 - 1], &row[ 4 - 1]); 677 ecp_nistz256_point_double(&row[12 - 1], &row[ 6 - 1]); 678 ecp_nistz256_point_add (&row[ 5 - 1], &row[ 4 - 1], &row[1 - 1]); 679 ecp_nistz256_point_add (&row[ 7 - 1], &row[ 6 - 1], &row[1 - 1]); 680 ecp_nistz256_point_add (&row[ 9 - 1], &row[ 8 - 1], &row[1 - 1]); 681 ecp_nistz256_point_add (&row[13 - 1], &row[12 - 1], &row[1 - 1]); 682 ecp_nistz256_point_double(&row[14 - 1], &row[ 7 - 1]); 683 ecp_nistz256_point_double(&row[10 - 1], &row[ 5 - 1]); 684 ecp_nistz256_point_add (&row[15 - 1], &row[14 - 1], &row[1 - 1]); 685 ecp_nistz256_point_add (&row[11 - 1], &row[10 - 1], &row[1 - 1]); 686 ecp_nistz256_point_add (&row[16 - 1], &row[15 - 1], &row[1 - 1]); 687 } 688 689 index = 255; 690 691 wvalue = p_str[0][(index - 1) / 8]; 692 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 693 694 ecp_nistz256_select_w5(r, table[0], _booth_recode_w5(wvalue) >> 1); 695 696 while (index >= 5) { 697 for (i = (index == 255 ? 1 : 0); i < num; i++) { 698 unsigned int off = (index - 1) / 8; 699 700 wvalue = p_str[i][off] | p_str[i][off + 1] << 8; 701 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 702 703 wvalue = _booth_recode_w5(wvalue); 704 705 ecp_nistz256_select_w5(&h, table[i], wvalue >> 1); 706 707 ecp_nistz256_neg(tmp, h.Y); 708 copy_conditional(h.Y, tmp, (wvalue & 1)); 709 710 ecp_nistz256_point_add(r, r, &h); 711 } 712 713 index -= window_size; 714 715 ecp_nistz256_point_double(r, r); 716 ecp_nistz256_point_double(r, r); 717 ecp_nistz256_point_double(r, r); 718 ecp_nistz256_point_double(r, r); 719 ecp_nistz256_point_double(r, r); 720 } 721 722 /* Final window */ 723 for (i = 0; i < num; i++) { 724 wvalue = p_str[i][0]; 725 wvalue = (wvalue << 1) & mask; 726 727 wvalue = _booth_recode_w5(wvalue); 728 729 ecp_nistz256_select_w5(&h, table[i], wvalue >> 1); 730 731 ecp_nistz256_neg(tmp, h.Y); 732 copy_conditional(h.Y, tmp, wvalue & 1); 733 734 ecp_nistz256_point_add(r, r, &h); 735 } 736 737 ret = 1; 738 err: 739 if (table_storage) 740 OPENSSL_free(table_storage); 741 if (p_str) 742 OPENSSL_free(p_str); 743 if (scalars) 744 OPENSSL_free(scalars); 745 return ret; 746} 747 748/* Coordinates of G, for which we have precomputed tables */ 749const static BN_ULONG def_xG[P256_LIMBS] = { 750 TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), 751 TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6) 752}; 753 754const static BN_ULONG def_yG[P256_LIMBS] = { 755 TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), 756 TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85) 757}; 758 759/* 760 * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256 761 * generator. 762 */ 763static int ecp_nistz256_is_affine_G(const EC_POINT *generator) 764{ 765 return (generator->X.top == P256_LIMBS) && 766 (generator->Y.top == P256_LIMBS) && 767 is_equal(generator->X.d, def_xG) && 768 is_equal(generator->Y.d, def_yG) && is_one(&generator->Z); 769} 770 771static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx) 772{ 773 /* 774 * We precompute a table for a Booth encoded exponent (wNAF) based 775 * computation. Each table holds 64 values for safe access, with an 776 * implicit value of infinity at index zero. We use window of size 7, and 777 * therefore require ceil(256/7) = 37 tables. 778 */ 779 BIGNUM *order; 780 EC_POINT *P = NULL, *T = NULL; 781 const EC_POINT *generator; 782 EC_PRE_COMP *pre_comp; 783 BN_CTX *new_ctx = NULL; 784 int i, j, k, ret = 0; 785 size_t w; 786 787 PRECOMP256_ROW *preComputedTable = NULL; 788 unsigned char *precomp_storage = NULL; 789 790 /* if there is an old EC_PRE_COMP object, throw it away */ 791 EC_EX_DATA_free_data(&group->extra_data, ecp_nistz256_pre_comp_dup, 792 ecp_nistz256_pre_comp_free, 793 ecp_nistz256_pre_comp_clear_free); 794 795 generator = EC_GROUP_get0_generator(group); 796 if (generator == NULL) { 797 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR); 798 return 0; 799 } 800 801 if (ecp_nistz256_is_affine_G(generator)) { 802 /* 803 * No need to calculate tables for the standard generator because we 804 * have them statically. 805 */ 806 return 1; 807 } 808 809 if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL) 810 return 0; 811 812 if (ctx == NULL) { 813 ctx = new_ctx = BN_CTX_new(); 814 if (ctx == NULL) 815 goto err; 816 } 817 818 BN_CTX_start(ctx); 819 order = BN_CTX_get(ctx); 820 821 if (order == NULL) 822 goto err; 823 824 if (!EC_GROUP_get_order(group, order, ctx)) 825 goto err; 826 827 if (BN_is_zero(order)) { 828 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER); 829 goto err; 830 } 831 832 w = 7; 833 834 if ((precomp_storage = 835 OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) { 836 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE); 837 goto err; 838 } else { 839 preComputedTable = (void *)ALIGNPTR(precomp_storage, 64); 840 } 841 842 P = EC_POINT_new(group); 843 T = EC_POINT_new(group); 844 if (P == NULL || T == NULL) 845 goto err; 846 847 /* 848 * The zero entry is implicitly infinity, and we skip it, storing other 849 * values with -1 offset. 850 */ 851 if (!EC_POINT_copy(T, generator)) 852 goto err; 853 854 for (k = 0; k < 64; k++) { 855 if (!EC_POINT_copy(P, T)) 856 goto err; 857 for (j = 0; j < 37; j++) { 858 /* 859 * It would be faster to use EC_POINTs_make_affine and 860 * make multiple points affine at the same time. 861 */ 862 if (!EC_POINT_make_affine(group, P, ctx)) 863 goto err; 864 if (!ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].X, 865 &P->X) || 866 !ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].Y, 867 &P->Y)) { 868 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 869 EC_R_COORDINATES_OUT_OF_RANGE); 870 goto err; 871 } 872 for (i = 0; i < 7; i++) { 873 if (!EC_POINT_dbl(group, P, P, ctx)) 874 goto err; 875 } 876 } 877 if (!EC_POINT_add(group, T, T, generator, ctx)) 878 goto err; 879 } 880 881 pre_comp->group = group; 882 pre_comp->w = w; 883 pre_comp->precomp = preComputedTable; 884 pre_comp->precomp_storage = precomp_storage; 885 886 precomp_storage = NULL; 887 888 if (!EC_EX_DATA_set_data(&group->extra_data, pre_comp, 889 ecp_nistz256_pre_comp_dup, 890 ecp_nistz256_pre_comp_free, 891 ecp_nistz256_pre_comp_clear_free)) { 892 goto err; 893 } 894 895 pre_comp = NULL; 896 897 ret = 1; 898 899 err: 900 if (ctx != NULL) 901 BN_CTX_end(ctx); 902 BN_CTX_free(new_ctx); 903 904 if (pre_comp) 905 ecp_nistz256_pre_comp_free(pre_comp); 906 if (precomp_storage) 907 OPENSSL_free(precomp_storage); 908 if (P) 909 EC_POINT_free(P); 910 if (T) 911 EC_POINT_free(T); 912 return ret; 913} 914 915/* 916 * Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great 917 * code processing 4 points in parallel, corresponding serial operation 918 * is several times slower, because it uses 29x29=58-bit multiplication 919 * as opposite to 64x64=128-bit in integer-only scalar case. As result 920 * it doesn't provide *significant* performance improvement. Note that 921 * just defining ECP_NISTZ256_AVX2 is not sufficient to make it work, 922 * you'd need to compile even asm/ecp_nistz256-avx.pl module. 923 */ 924#if defined(ECP_NISTZ256_AVX2) 925# if !(defined(__x86_64) || defined(__x86_64__)) || \ 926 defined(_M_AMD64) || defined(_MX64)) || \ 927 !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */ 928# undef ECP_NISTZ256_AVX2 929# else 930/* Constant time access, loading four values, from four consecutive tables */ 931void ecp_nistz256_avx2_select_w7(P256_POINT_AFFINE * val, 932 const P256_POINT_AFFINE * in_t, int index); 933void ecp_nistz256_avx2_multi_select_w7(void *result, const void *in, int index0, 934 int index1, int index2, int index3); 935void ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in); 936void ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4); 937void ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4, 938 const void *Bx4); 939void ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4, 940 const void *Bx4); 941void ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4); 942void ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4); 943void ecp_nistz256_avx2_set1(void *RESULTx4); 944int ecp_nistz_avx2_eligible(void); 945 946static void booth_recode_w7(unsigned char *sign, 947 unsigned char *digit, unsigned char in) 948{ 949 unsigned char s, d; 950 951 s = ~((in >> 7) - 1); 952 d = (1 << 8) - in - 1; 953 d = (d & s) | (in & ~s); 954 d = (d >> 1) + (d & 1); 955 956 *sign = s & 1; 957 *digit = d; 958} 959 960/* 961 * ecp_nistz256_avx2_mul_g performs multiplication by G, using only the 962 * precomputed table. It does 4 affine point additions in parallel, 963 * significantly speeding up point multiplication for a fixed value. 964 */ 965static void ecp_nistz256_avx2_mul_g(P256_POINT *r, 966 unsigned char p_str[33], 967 const P256_POINT_AFFINE(*preComputedTable)[64]) 968{ 969 const unsigned int window_size = 7; 970 const unsigned int mask = (1 << (window_size + 1)) - 1; 971 unsigned int wvalue; 972 /* Using 4 windows at a time */ 973 unsigned char sign0, digit0; 974 unsigned char sign1, digit1; 975 unsigned char sign2, digit2; 976 unsigned char sign3, digit3; 977 unsigned int index = 0; 978 BN_ULONG tmp[P256_LIMBS]; 979 int i; 980 981 ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 }; 982 ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 }; 983 ALIGN32 P256_POINT_AFFINE point_arr[P256_LIMBS]; 984 ALIGN32 P256_POINT res_point_arr[P256_LIMBS]; 985 986 /* Initial four windows */ 987 wvalue = *((u16 *) & p_str[0]); 988 wvalue = (wvalue << 1) & mask; 989 index += window_size; 990 booth_recode_w7(&sign0, &digit0, wvalue); 991 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 992 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 993 index += window_size; 994 booth_recode_w7(&sign1, &digit1, wvalue); 995 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 996 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 997 index += window_size; 998 booth_recode_w7(&sign2, &digit2, wvalue); 999 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1000 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1001 index += window_size; 1002 booth_recode_w7(&sign3, &digit3, wvalue); 1003 1004 ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[0], 1005 digit0, digit1, digit2, digit3); 1006 1007 ecp_nistz256_neg(tmp, point_arr[0].Y); 1008 copy_conditional(point_arr[0].Y, tmp, sign0); 1009 ecp_nistz256_neg(tmp, point_arr[1].Y); 1010 copy_conditional(point_arr[1].Y, tmp, sign1); 1011 ecp_nistz256_neg(tmp, point_arr[2].Y); 1012 copy_conditional(point_arr[2].Y, tmp, sign2); 1013 ecp_nistz256_neg(tmp, point_arr[3].Y); 1014 copy_conditional(point_arr[3].Y, tmp, sign3); 1015 1016 ecp_nistz256_avx2_transpose_convert(aX4, point_arr); 1017 ecp_nistz256_avx2_to_mont(aX4, aX4); 1018 ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]); 1019 ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]); 1020 1021 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1022 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1023 index += window_size; 1024 booth_recode_w7(&sign0, &digit0, wvalue); 1025 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1026 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1027 index += window_size; 1028 booth_recode_w7(&sign1, &digit1, wvalue); 1029 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1030 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1031 index += window_size; 1032 booth_recode_w7(&sign2, &digit2, wvalue); 1033 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1034 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1035 index += window_size; 1036 booth_recode_w7(&sign3, &digit3, wvalue); 1037 1038 ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[4 * 1], 1039 digit0, digit1, digit2, digit3); 1040 1041 ecp_nistz256_neg(tmp, point_arr[0].Y); 1042 copy_conditional(point_arr[0].Y, tmp, sign0); 1043 ecp_nistz256_neg(tmp, point_arr[1].Y); 1044 copy_conditional(point_arr[1].Y, tmp, sign1); 1045 ecp_nistz256_neg(tmp, point_arr[2].Y); 1046 copy_conditional(point_arr[2].Y, tmp, sign2); 1047 ecp_nistz256_neg(tmp, point_arr[3].Y); 1048 copy_conditional(point_arr[3].Y, tmp, sign3); 1049 1050 ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1051 ecp_nistz256_avx2_to_mont(bX4, bX4); 1052 ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1053 /* Optimized when both inputs are affine */ 1054 ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4); 1055 1056 for (i = 2; i < 9; i++) { 1057 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1058 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1059 index += window_size; 1060 booth_recode_w7(&sign0, &digit0, wvalue); 1061 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1062 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1063 index += window_size; 1064 booth_recode_w7(&sign1, &digit1, wvalue); 1065 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1066 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1067 index += window_size; 1068 booth_recode_w7(&sign2, &digit2, wvalue); 1069 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1070 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1071 index += window_size; 1072 booth_recode_w7(&sign3, &digit3, wvalue); 1073 1074 ecp_nistz256_avx2_multi_select_w7(point_arr, 1075 preComputedTable[4 * i], 1076 digit0, digit1, digit2, digit3); 1077 1078 ecp_nistz256_neg(tmp, point_arr[0].Y); 1079 copy_conditional(point_arr[0].Y, tmp, sign0); 1080 ecp_nistz256_neg(tmp, point_arr[1].Y); 1081 copy_conditional(point_arr[1].Y, tmp, sign1); 1082 ecp_nistz256_neg(tmp, point_arr[2].Y); 1083 copy_conditional(point_arr[2].Y, tmp, sign2); 1084 ecp_nistz256_neg(tmp, point_arr[3].Y); 1085 copy_conditional(point_arr[3].Y, tmp, sign3); 1086 1087 ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1088 ecp_nistz256_avx2_to_mont(bX4, bX4); 1089 ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1090 1091 ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4); 1092 } 1093 1094 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]); 1095 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]); 1096 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]); 1097 1098 ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4); 1099 /* Last window is performed serially */ 1100 wvalue = *((u16 *) & p_str[(index - 1) / 8]); 1101 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1102 booth_recode_w7(&sign0, &digit0, wvalue); 1103 ecp_nistz256_avx2_select_w7((P256_POINT_AFFINE *) r, 1104 preComputedTable[36], digit0); 1105 ecp_nistz256_neg(tmp, r->Y); 1106 copy_conditional(r->Y, tmp, sign0); 1107 memcpy(r->Z, ONE, sizeof(ONE)); 1108 /* Sum the four windows */ 1109 ecp_nistz256_point_add(r, r, &res_point_arr[0]); 1110 ecp_nistz256_point_add(r, r, &res_point_arr[1]); 1111 ecp_nistz256_point_add(r, r, &res_point_arr[2]); 1112 ecp_nistz256_point_add(r, r, &res_point_arr[3]); 1113} 1114# endif 1115#endif 1116 1117static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group, 1118 const P256_POINT_AFFINE *in, 1119 BN_CTX *ctx) 1120{ 1121 BIGNUM x, y, z; 1122 int ret = 0; 1123 1124 /* 1125 * |const| qualifier omission is compensated by BN_FLG_STATIC_DATA 1126 * flag, which effectively means "read-only data". 1127 */ 1128 x.d = (BN_ULONG *)in->X; 1129 x.dmax = x.top = P256_LIMBS; 1130 x.neg = 0; 1131 x.flags = BN_FLG_STATIC_DATA; 1132 1133 y.d = (BN_ULONG *)in->Y; 1134 y.dmax = y.top = P256_LIMBS; 1135 y.neg = 0; 1136 y.flags = BN_FLG_STATIC_DATA; 1137 1138 z.d = (BN_ULONG *)ONE; 1139 z.dmax = z.top = P256_LIMBS; 1140 z.neg = 0; 1141 z.flags = BN_FLG_STATIC_DATA; 1142 1143 if ((ret = (BN_copy(&out->X, &x) != NULL)) 1144 && (ret = (BN_copy(&out->Y, &y) != NULL)) 1145 && (ret = (BN_copy(&out->Z, &z) != NULL))) 1146 out->Z_is_one = 1; 1147 1148 return ret; 1149} 1150 1151/* r = scalar*G + sum(scalars[i]*points[i]) */ 1152static int ecp_nistz256_points_mul(const EC_GROUP *group, 1153 EC_POINT *r, 1154 const BIGNUM *scalar, 1155 size_t num, 1156 const EC_POINT *points[], 1157 const BIGNUM *scalars[], BN_CTX *ctx) 1158{ 1159 int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0; 1160 size_t j; 1161 unsigned char p_str[33] = { 0 }; 1162 const PRECOMP256_ROW *preComputedTable = NULL; 1163 const EC_PRE_COMP *pre_comp = NULL; 1164 const EC_POINT *generator = NULL; 1165 unsigned int index = 0; 1166 BN_CTX *new_ctx = NULL; 1167 const BIGNUM **new_scalars = NULL; 1168 const EC_POINT **new_points = NULL; 1169 const unsigned int window_size = 7; 1170 const unsigned int mask = (1 << (window_size + 1)) - 1; 1171 unsigned int wvalue; 1172 ALIGN32 union { 1173 P256_POINT p; 1174 P256_POINT_AFFINE a; 1175 } t, p; 1176 BIGNUM *tmp_scalar; 1177 1178 if (group->meth != r->meth) { 1179 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS); 1180 return 0; 1181 } 1182 1183 if ((scalar == NULL) && (num == 0)) 1184 return EC_POINT_set_to_infinity(group, r); 1185 1186 for (j = 0; j < num; j++) { 1187 if (group->meth != points[j]->meth) { 1188 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS); 1189 return 0; 1190 } 1191 } 1192 1193 if (ctx == NULL) { 1194 ctx = new_ctx = BN_CTX_new(); 1195 if (ctx == NULL) 1196 goto err; 1197 } 1198 1199 BN_CTX_start(ctx); 1200 1201 if (scalar) { 1202 generator = EC_GROUP_get0_generator(group); 1203 if (generator == NULL) { 1204 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR); 1205 goto err; 1206 } 1207 1208 /* look if we can use precomputed multiples of generator */ 1209 pre_comp = 1210 EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup, 1211 ecp_nistz256_pre_comp_free, 1212 ecp_nistz256_pre_comp_clear_free); 1213 1214 if (pre_comp) { 1215 /* 1216 * If there is a precomputed table for the generator, check that 1217 * it was generated with the same generator. 1218 */ 1219 EC_POINT *pre_comp_generator = EC_POINT_new(group); 1220 if (pre_comp_generator == NULL) 1221 goto err; 1222 1223 if (!ecp_nistz256_set_from_affine 1224 (pre_comp_generator, group, pre_comp->precomp[0], ctx)) { 1225 EC_POINT_free(pre_comp_generator); 1226 goto err; 1227 } 1228 1229 if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx)) 1230 preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp; 1231 1232 EC_POINT_free(pre_comp_generator); 1233 } 1234 1235 if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) { 1236 /* 1237 * If there is no precomputed data, but the generator 1238 * is the default, a hardcoded table of precomputed 1239 * data is used. This is because applications, such as 1240 * Apache, do not use EC_KEY_precompute_mult. 1241 */ 1242 preComputedTable = (const PRECOMP256_ROW *)ecp_nistz256_precomputed; 1243 } 1244 1245 if (preComputedTable) { 1246 if ((BN_num_bits(scalar) > 256) 1247 || BN_is_negative(scalar)) { 1248 if ((tmp_scalar = BN_CTX_get(ctx)) == NULL) 1249 goto err; 1250 1251 if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) { 1252 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB); 1253 goto err; 1254 } 1255 scalar = tmp_scalar; 1256 } 1257 1258 for (i = 0; i < scalar->top * BN_BYTES; i += BN_BYTES) { 1259 BN_ULONG d = scalar->d[i / BN_BYTES]; 1260 1261 p_str[i + 0] = d & 0xff; 1262 p_str[i + 1] = (d >> 8) & 0xff; 1263 p_str[i + 2] = (d >> 16) & 0xff; 1264 p_str[i + 3] = (d >>= 24) & 0xff; 1265 if (BN_BYTES == 8) { 1266 d >>= 8; 1267 p_str[i + 4] = d & 0xff; 1268 p_str[i + 5] = (d >> 8) & 0xff; 1269 p_str[i + 6] = (d >> 16) & 0xff; 1270 p_str[i + 7] = (d >> 24) & 0xff; 1271 } 1272 } 1273 1274 for (; i < 33; i++) 1275 p_str[i] = 0; 1276 1277#if defined(ECP_NISTZ256_AVX2) 1278 if (ecp_nistz_avx2_eligible()) { 1279 ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable); 1280 } else 1281#endif 1282 { 1283 BN_ULONG infty; 1284 1285 /* First window */ 1286 wvalue = (p_str[0] << 1) & mask; 1287 index += window_size; 1288 1289 wvalue = _booth_recode_w7(wvalue); 1290 1291 ecp_nistz256_select_w7(&p.a, preComputedTable[0], wvalue >> 1); 1292 1293 ecp_nistz256_neg(p.p.Z, p.p.Y); 1294 copy_conditional(p.p.Y, p.p.Z, wvalue & 1); 1295 1296 /* 1297 * Since affine infinity is encoded as (0,0) and 1298 * Jacobian ias (,,0), we need to harmonize them 1299 * by assigning "one" or zero to Z. 1300 */ 1301 infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] | 1302 p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]); 1303 if (P256_LIMBS == 8) 1304 infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] | 1305 p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]); 1306 1307 infty = 0 - is_zero(infty); 1308 infty = ~infty; 1309 1310 p.p.Z[0] = ONE[0] & infty; 1311 p.p.Z[1] = ONE[1] & infty; 1312 p.p.Z[2] = ONE[2] & infty; 1313 p.p.Z[3] = ONE[3] & infty; 1314 if (P256_LIMBS == 8) { 1315 p.p.Z[4] = ONE[4] & infty; 1316 p.p.Z[5] = ONE[5] & infty; 1317 p.p.Z[6] = ONE[6] & infty; 1318 p.p.Z[7] = ONE[7] & infty; 1319 } 1320 1321 for (i = 1; i < 37; i++) { 1322 unsigned int off = (index - 1) / 8; 1323 wvalue = p_str[off] | p_str[off + 1] << 8; 1324 wvalue = (wvalue >> ((index - 1) % 8)) & mask; 1325 index += window_size; 1326 1327 wvalue = _booth_recode_w7(wvalue); 1328 1329 ecp_nistz256_select_w7(&t.a, 1330 preComputedTable[i], wvalue >> 1); 1331 1332 ecp_nistz256_neg(t.p.Z, t.a.Y); 1333 copy_conditional(t.a.Y, t.p.Z, wvalue & 1); 1334 1335 ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); 1336 } 1337 } 1338 } else { 1339 p_is_infinity = 1; 1340 no_precomp_for_generator = 1; 1341 } 1342 } else 1343 p_is_infinity = 1; 1344 1345 if (no_precomp_for_generator) { 1346 /* 1347 * Without a precomputed table for the generator, it has to be 1348 * handled like a normal point. 1349 */ 1350 new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *)); 1351 if (!new_scalars) { 1352 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1353 goto err; 1354 } 1355 1356 new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *)); 1357 if (!new_points) { 1358 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1359 goto err; 1360 } 1361 1362 memcpy(new_scalars, scalars, num * sizeof(BIGNUM *)); 1363 new_scalars[num] = scalar; 1364 memcpy(new_points, points, num * sizeof(EC_POINT *)); 1365 new_points[num] = generator; 1366 1367 scalars = new_scalars; 1368 points = new_points; 1369 num++; 1370 } 1371 1372 if (num) { 1373 P256_POINT *out = &t.p; 1374 if (p_is_infinity) 1375 out = &p.p; 1376 1377 if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx)) 1378 goto err; 1379 1380 if (!p_is_infinity) 1381 ecp_nistz256_point_add(&p.p, &p.p, out); 1382 } 1383 1384 /* Not constant-time, but we're only operating on the public output. */ 1385 if (!ecp_nistz256_set_words(&r->X, p.p.X) || 1386 !ecp_nistz256_set_words(&r->Y, p.p.Y) || 1387 !ecp_nistz256_set_words(&r->Z, p.p.Z)) { 1388 goto err; 1389 } 1390 r->Z_is_one = is_one(&r->Z) & 1; 1391 1392 ret = 1; 1393 1394err: 1395 if (ctx) 1396 BN_CTX_end(ctx); 1397 BN_CTX_free(new_ctx); 1398 if (new_points) 1399 OPENSSL_free(new_points); 1400 if (new_scalars) 1401 OPENSSL_free(new_scalars); 1402 return ret; 1403} 1404 1405static int ecp_nistz256_get_affine(const EC_GROUP *group, 1406 const EC_POINT *point, 1407 BIGNUM *x, BIGNUM *y, BN_CTX *ctx) 1408{ 1409 BN_ULONG z_inv2[P256_LIMBS]; 1410 BN_ULONG z_inv3[P256_LIMBS]; 1411 BN_ULONG x_aff[P256_LIMBS]; 1412 BN_ULONG y_aff[P256_LIMBS]; 1413 BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS]; 1414 BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS]; 1415 1416 if (EC_POINT_is_at_infinity(group, point)) { 1417 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY); 1418 return 0; 1419 } 1420 1421 if (!ecp_nistz256_bignum_to_field_elem(point_x, &point->X) || 1422 !ecp_nistz256_bignum_to_field_elem(point_y, &point->Y) || 1423 !ecp_nistz256_bignum_to_field_elem(point_z, &point->Z)) { 1424 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE); 1425 return 0; 1426 } 1427 1428 ecp_nistz256_mod_inverse(z_inv3, point_z); 1429 ecp_nistz256_sqr_mont(z_inv2, z_inv3); 1430 ecp_nistz256_mul_mont(x_aff, z_inv2, point_x); 1431 1432 if (x != NULL) { 1433 ecp_nistz256_from_mont(x_ret, x_aff); 1434 if (!ecp_nistz256_set_words(x, x_ret)) 1435 return 0; 1436 } 1437 1438 if (y != NULL) { 1439 ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2); 1440 ecp_nistz256_mul_mont(y_aff, z_inv3, point_y); 1441 ecp_nistz256_from_mont(y_ret, y_aff); 1442 if (!ecp_nistz256_set_words(y, y_ret)) 1443 return 0; 1444 } 1445 1446 return 1; 1447} 1448 1449static EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group) 1450{ 1451 EC_PRE_COMP *ret = NULL; 1452 1453 if (!group) 1454 return NULL; 1455 1456 ret = (EC_PRE_COMP *)OPENSSL_malloc(sizeof(EC_PRE_COMP)); 1457 1458 if (!ret) { 1459 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1460 return ret; 1461 } 1462 1463 ret->group = group; 1464 ret->w = 6; /* default */ 1465 ret->precomp = NULL; 1466 ret->precomp_storage = NULL; 1467 ret->references = 1; 1468 return ret; 1469} 1470 1471static void *ecp_nistz256_pre_comp_dup(void *src_) 1472{ 1473 EC_PRE_COMP *src = src_; 1474 1475 /* no need to actually copy, these objects never change! */ 1476 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); 1477 1478 return src_; 1479} 1480 1481static void ecp_nistz256_pre_comp_free(void *pre_) 1482{ 1483 int i; 1484 EC_PRE_COMP *pre = pre_; 1485 1486 if (!pre) 1487 return; 1488 1489 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); 1490 if (i > 0) 1491 return; 1492 1493 if (pre->precomp_storage) 1494 OPENSSL_free(pre->precomp_storage); 1495 1496 OPENSSL_free(pre); 1497} 1498 1499static void ecp_nistz256_pre_comp_clear_free(void *pre_) 1500{ 1501 int i; 1502 EC_PRE_COMP *pre = pre_; 1503 1504 if (!pre) 1505 return; 1506 1507 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); 1508 if (i > 0) 1509 return; 1510 1511 if (pre->precomp_storage) { 1512 OPENSSL_cleanse(pre->precomp, 1513 32 * sizeof(unsigned char) * (1 << pre->w) * 2 * 37); 1514 OPENSSL_free(pre->precomp_storage); 1515 } 1516 OPENSSL_cleanse(pre, sizeof(*pre)); 1517 OPENSSL_free(pre); 1518} 1519 1520static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group) 1521{ 1522 /* There is a hard-coded table for the default generator. */ 1523 const EC_POINT *generator = EC_GROUP_get0_generator(group); 1524 if (generator != NULL && ecp_nistz256_is_affine_G(generator)) { 1525 /* There is a hard-coded table for the default generator. */ 1526 return 1; 1527 } 1528 1529 return EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup, 1530 ecp_nistz256_pre_comp_free, 1531 ecp_nistz256_pre_comp_clear_free) != NULL; 1532} 1533 1534const EC_METHOD *EC_GFp_nistz256_method(void) 1535{ 1536 static const EC_METHOD ret = { 1537 EC_FLAGS_DEFAULT_OCT, 1538 NID_X9_62_prime_field, 1539 ec_GFp_mont_group_init, 1540 ec_GFp_mont_group_finish, 1541 ec_GFp_mont_group_clear_finish, 1542 ec_GFp_mont_group_copy, 1543 ec_GFp_mont_group_set_curve, 1544 ec_GFp_simple_group_get_curve, 1545 ec_GFp_simple_group_get_degree, 1546 ec_GFp_simple_group_check_discriminant, 1547 ec_GFp_simple_point_init, 1548 ec_GFp_simple_point_finish, 1549 ec_GFp_simple_point_clear_finish, 1550 ec_GFp_simple_point_copy, 1551 ec_GFp_simple_point_set_to_infinity, 1552 ec_GFp_simple_set_Jprojective_coordinates_GFp, 1553 ec_GFp_simple_get_Jprojective_coordinates_GFp, 1554 ec_GFp_simple_point_set_affine_coordinates, 1555 ecp_nistz256_get_affine, 1556 0, 0, 0, 1557 ec_GFp_simple_add, 1558 ec_GFp_simple_dbl, 1559 ec_GFp_simple_invert, 1560 ec_GFp_simple_is_at_infinity, 1561 ec_GFp_simple_is_on_curve, 1562 ec_GFp_simple_cmp, 1563 ec_GFp_simple_make_affine, 1564 ec_GFp_simple_points_make_affine, 1565 ecp_nistz256_points_mul, /* mul */ 1566 ecp_nistz256_mult_precompute, /* precompute_mult */ 1567 ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */ 1568 ec_GFp_mont_field_mul, 1569 ec_GFp_mont_field_sqr, 1570 0, /* field_div */ 1571 ec_GFp_mont_field_encode, 1572 ec_GFp_mont_field_decode, 1573 ec_GFp_mont_field_set_to_one 1574 }; 1575 1576 return &ret; 1577} 1578