1/* 2 * Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved. 3 * Copyright (c) 2014, Intel Corporation. All Rights Reserved. 4 * Copyright (c) 2015, CloudFlare, Inc. 5 * 6 * Licensed under the Apache License 2.0 (the "License"). You may not use 7 * this file except in compliance with the License. You can obtain a copy 8 * in the file LICENSE in the source distribution or at 9 * https://www.openssl.org/source/license.html 10 * 11 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) 12 * (1) Intel Corporation, Israel Development Center, Haifa, Israel 13 * (2) University of Haifa, Israel 14 * (3) CloudFlare, Inc. 15 * 16 * Reference: 17 * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 18 * 256 Bit Primes" 19 */ 20 21/* 22 * ECDSA low level APIs are deprecated for public use, but still ok for 23 * internal use. 24 */ 25#include "internal/deprecated.h" 26 27#include <string.h> 28 29#include "internal/cryptlib.h" 30#include "crypto/bn.h" 31#include "ec_local.h" 32#include "internal/refcount.h" 33 34#if BN_BITS2 != 64 35# define TOBN(hi,lo) lo,hi 36#else 37# define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo) 38#endif 39 40#if defined(__GNUC__) 41# define ALIGN32 __attribute((aligned(32))) 42#elif defined(_MSC_VER) 43# define ALIGN32 __declspec(align(32)) 44#else 45# define ALIGN32 46#endif 47 48#define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N) 49#define P256_LIMBS (256/BN_BITS2) 50 51typedef unsigned short u16; 52 53typedef struct { 54 BN_ULONG X[P256_LIMBS]; 55 BN_ULONG Y[P256_LIMBS]; 56 BN_ULONG Z[P256_LIMBS]; 57} P256_POINT; 58 59typedef struct { 60 BN_ULONG X[P256_LIMBS]; 61 BN_ULONG Y[P256_LIMBS]; 62} P256_POINT_AFFINE; 63 64typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; 65 66/* structure for precomputed multiples of the generator */ 67struct nistz256_pre_comp_st { 68 const EC_GROUP *group; /* Parent EC_GROUP object */ 69 size_t w; /* Window size */ 70 /* 71 * Constant time access to the X and Y coordinates of the pre-computed, 72 * generator multiplies, in the Montgomery domain. Pre-calculated 73 * multiplies are stored in affine form. 74 */ 75 PRECOMP256_ROW *precomp; 76 void *precomp_storage; 77 CRYPTO_REF_COUNT references; 78 CRYPTO_RWLOCK *lock; 79}; 80 81/* Functions implemented in assembly */ 82/* 83 * Most of below mentioned functions *preserve* the property of inputs 84 * being fully reduced, i.e. being in [0, modulus) range. Simply put if 85 * inputs are fully reduced, then output is too. Note that reverse is 86 * not true, in sense that given partially reduced inputs output can be 87 * either, not unlikely reduced. And "most" in first sentence refers to 88 * the fact that given the calculations flow one can tolerate that 89 * addition, 1st function below, produces partially reduced result *if* 90 * multiplications by 2 and 3, which customarily use addition, fully 91 * reduce it. This effectively gives two options: a) addition produces 92 * fully reduced result [as long as inputs are, just like remaining 93 * functions]; b) addition is allowed to produce partially reduced 94 * result, but multiplications by 2 and 3 perform additional reduction 95 * step. Choice between the two can be platform-specific, but it was a) 96 * in all cases so far... 97 */ 98/* Modular add: res = a+b mod P */ 99void ecp_nistz256_add(BN_ULONG res[P256_LIMBS], 100 const BN_ULONG a[P256_LIMBS], 101 const BN_ULONG b[P256_LIMBS]); 102/* Modular mul by 2: res = 2*a mod P */ 103void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS], 104 const BN_ULONG a[P256_LIMBS]); 105/* Modular mul by 3: res = 3*a mod P */ 106void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS], 107 const BN_ULONG a[P256_LIMBS]); 108 109/* Modular div by 2: res = a/2 mod P */ 110void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS], 111 const BN_ULONG a[P256_LIMBS]); 112/* Modular sub: res = a-b mod P */ 113void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS], 114 const BN_ULONG a[P256_LIMBS], 115 const BN_ULONG b[P256_LIMBS]); 116/* Modular neg: res = -a mod P */ 117void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); 118/* Montgomery mul: res = a*b*2^-256 mod P */ 119void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], 120 const BN_ULONG a[P256_LIMBS], 121 const BN_ULONG b[P256_LIMBS]); 122/* Montgomery sqr: res = a*a*2^-256 mod P */ 123void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], 124 const BN_ULONG a[P256_LIMBS]); 125/* Convert a number from Montgomery domain, by multiplying with 1 */ 126void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], 127 const BN_ULONG in[P256_LIMBS]); 128/* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/ 129void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], 130 const BN_ULONG in[P256_LIMBS]); 131/* Functions that perform constant time access to the precomputed tables */ 132void ecp_nistz256_scatter_w5(P256_POINT *val, 133 const P256_POINT *in_t, int idx); 134void ecp_nistz256_gather_w5(P256_POINT *val, 135 const P256_POINT *in_t, int idx); 136void ecp_nistz256_scatter_w7(P256_POINT_AFFINE *val, 137 const P256_POINT_AFFINE *in_t, int idx); 138void ecp_nistz256_gather_w7(P256_POINT_AFFINE *val, 139 const P256_POINT_AFFINE *in_t, int idx); 140 141/* One converted into the Montgomery domain */ 142static const BN_ULONG ONE[P256_LIMBS] = { 143 TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), 144 TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe) 145}; 146 147static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group); 148 149/* Precomputed tables for the default generator */ 150extern const PRECOMP256_ROW ecp_nistz256_precomputed[37]; 151 152/* Recode window to a signed digit, see ecp_nistputil.c for details */ 153static unsigned int _booth_recode_w5(unsigned int in) 154{ 155 unsigned int s, d; 156 157 s = ~((in >> 5) - 1); 158 d = (1 << 6) - in - 1; 159 d = (d & s) | (in & ~s); 160 d = (d >> 1) + (d & 1); 161 162 return (d << 1) + (s & 1); 163} 164 165static unsigned int _booth_recode_w7(unsigned int in) 166{ 167 unsigned int s, d; 168 169 s = ~((in >> 7) - 1); 170 d = (1 << 8) - in - 1; 171 d = (d & s) | (in & ~s); 172 d = (d >> 1) + (d & 1); 173 174 return (d << 1) + (s & 1); 175} 176 177static void copy_conditional(BN_ULONG dst[P256_LIMBS], 178 const BN_ULONG src[P256_LIMBS], BN_ULONG move) 179{ 180 BN_ULONG mask1 = 0-move; 181 BN_ULONG mask2 = ~mask1; 182 183 dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); 184 dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); 185 dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); 186 dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); 187 if (P256_LIMBS == 8) { 188 dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); 189 dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); 190 dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); 191 dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); 192 } 193} 194 195static BN_ULONG is_zero(BN_ULONG in) 196{ 197 in |= (0 - in); 198 in = ~in; 199 in >>= BN_BITS2 - 1; 200 return in; 201} 202 203static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS], 204 const BN_ULONG b[P256_LIMBS]) 205{ 206 BN_ULONG res; 207 208 res = a[0] ^ b[0]; 209 res |= a[1] ^ b[1]; 210 res |= a[2] ^ b[2]; 211 res |= a[3] ^ b[3]; 212 if (P256_LIMBS == 8) { 213 res |= a[4] ^ b[4]; 214 res |= a[5] ^ b[5]; 215 res |= a[6] ^ b[6]; 216 res |= a[7] ^ b[7]; 217 } 218 219 return is_zero(res); 220} 221 222static BN_ULONG is_one(const BIGNUM *z) 223{ 224 BN_ULONG res = 0; 225 BN_ULONG *a = bn_get_words(z); 226 227 if (bn_get_top(z) == (P256_LIMBS - P256_LIMBS / 8)) { 228 res = a[0] ^ ONE[0]; 229 res |= a[1] ^ ONE[1]; 230 res |= a[2] ^ ONE[2]; 231 res |= a[3] ^ ONE[3]; 232 if (P256_LIMBS == 8) { 233 res |= a[4] ^ ONE[4]; 234 res |= a[5] ^ ONE[5]; 235 res |= a[6] ^ ONE[6]; 236 /* 237 * no check for a[7] (being zero) on 32-bit platforms, 238 * because value of "one" takes only 7 limbs. 239 */ 240 } 241 res = is_zero(res); 242 } 243 244 return res; 245} 246 247/* 248 * For reference, this macro is used only when new ecp_nistz256 assembly 249 * module is being developed. For example, configure with 250 * -DECP_NISTZ256_REFERENCE_IMPLEMENTATION and implement only functions 251 * performing simplest arithmetic operations on 256-bit vectors. Then 252 * work on implementation of higher-level functions performing point 253 * operations. Then remove ECP_NISTZ256_REFERENCE_IMPLEMENTATION 254 * and never define it again. (The correct macro denoting presence of 255 * ecp_nistz256 module is ECP_NISTZ256_ASM.) 256 */ 257#ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION 258void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); 259void ecp_nistz256_point_add(P256_POINT *r, 260 const P256_POINT *a, const P256_POINT *b); 261void ecp_nistz256_point_add_affine(P256_POINT *r, 262 const P256_POINT *a, 263 const P256_POINT_AFFINE *b); 264#else 265/* Point double: r = 2*a */ 266static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) 267{ 268 BN_ULONG S[P256_LIMBS]; 269 BN_ULONG M[P256_LIMBS]; 270 BN_ULONG Zsqr[P256_LIMBS]; 271 BN_ULONG tmp0[P256_LIMBS]; 272 273 const BN_ULONG *in_x = a->X; 274 const BN_ULONG *in_y = a->Y; 275 const BN_ULONG *in_z = a->Z; 276 277 BN_ULONG *res_x = r->X; 278 BN_ULONG *res_y = r->Y; 279 BN_ULONG *res_z = r->Z; 280 281 ecp_nistz256_mul_by_2(S, in_y); 282 283 ecp_nistz256_sqr_mont(Zsqr, in_z); 284 285 ecp_nistz256_sqr_mont(S, S); 286 287 ecp_nistz256_mul_mont(res_z, in_z, in_y); 288 ecp_nistz256_mul_by_2(res_z, res_z); 289 290 ecp_nistz256_add(M, in_x, Zsqr); 291 ecp_nistz256_sub(Zsqr, in_x, Zsqr); 292 293 ecp_nistz256_sqr_mont(res_y, S); 294 ecp_nistz256_div_by_2(res_y, res_y); 295 296 ecp_nistz256_mul_mont(M, M, Zsqr); 297 ecp_nistz256_mul_by_3(M, M); 298 299 ecp_nistz256_mul_mont(S, S, in_x); 300 ecp_nistz256_mul_by_2(tmp0, S); 301 302 ecp_nistz256_sqr_mont(res_x, M); 303 304 ecp_nistz256_sub(res_x, res_x, tmp0); 305 ecp_nistz256_sub(S, S, res_x); 306 307 ecp_nistz256_mul_mont(S, S, M); 308 ecp_nistz256_sub(res_y, S, res_y); 309} 310 311/* Point addition: r = a+b */ 312static void ecp_nistz256_point_add(P256_POINT *r, 313 const P256_POINT *a, const P256_POINT *b) 314{ 315 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 316 BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS]; 317 BN_ULONG Z1sqr[P256_LIMBS]; 318 BN_ULONG Z2sqr[P256_LIMBS]; 319 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 320 BN_ULONG Hsqr[P256_LIMBS]; 321 BN_ULONG Rsqr[P256_LIMBS]; 322 BN_ULONG Hcub[P256_LIMBS]; 323 324 BN_ULONG res_x[P256_LIMBS]; 325 BN_ULONG res_y[P256_LIMBS]; 326 BN_ULONG res_z[P256_LIMBS]; 327 328 BN_ULONG in1infty, in2infty; 329 330 const BN_ULONG *in1_x = a->X; 331 const BN_ULONG *in1_y = a->Y; 332 const BN_ULONG *in1_z = a->Z; 333 334 const BN_ULONG *in2_x = b->X; 335 const BN_ULONG *in2_y = b->Y; 336 const BN_ULONG *in2_z = b->Z; 337 338 /* 339 * Infinity in encoded as (,,0) 340 */ 341 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 342 if (P256_LIMBS == 8) 343 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 344 345 in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]); 346 if (P256_LIMBS == 8) 347 in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]); 348 349 in1infty = is_zero(in1infty); 350 in2infty = is_zero(in2infty); 351 352 ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ 353 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 354 355 ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ 356 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 357 358 ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ 359 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 360 ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */ 361 362 ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ 363 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 364 ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */ 365 366 /* 367 * The formulae are incorrect if the points are equal so we check for 368 * this and do doubling if this happens. 369 * 370 * Points here are in Jacobian projective coordinates (Xi, Yi, Zi) 371 * that are bound to the affine coordinates (xi, yi) by the following 372 * equations: 373 * - xi = Xi / (Zi)^2 374 * - y1 = Yi / (Zi)^3 375 * 376 * For the sake of optimization, the algorithm operates over 377 * intermediate variables U1, U2 and S1, S2 that are derived from 378 * the projective coordinates: 379 * - U1 = X1 * (Z2)^2 ; U2 = X2 * (Z1)^2 380 * - S1 = Y1 * (Z2)^3 ; S2 = Y2 * (Z1)^3 381 * 382 * It is easy to prove that is_equal(U1, U2) implies that the affine 383 * x-coordinates are equal, or either point is at infinity. 384 * Likewise is_equal(S1, S2) implies that the affine y-coordinates are 385 * equal, or either point is at infinity. 386 * 387 * The special case of either point being the point at infinity (Z1 or Z2 388 * is zero), is handled separately later on in this function, so we avoid 389 * jumping to point_double here in those special cases. 390 * 391 * When both points are inverse of each other, we know that the affine 392 * x-coordinates are equal, and the y-coordinates have different sign. 393 * Therefore since U1 = U2, we know H = 0, and therefore Z3 = H*Z1*Z2 394 * will equal 0, thus the result is infinity, if we simply let this 395 * function continue normally. 396 * 397 * We use bitwise operations to avoid potential side-channels introduced by 398 * the short-circuiting behaviour of boolean operators. 399 */ 400 if (is_equal(U1, U2) & ~in1infty & ~in2infty & is_equal(S1, S2)) { 401 /* 402 * This is obviously not constant-time but it should never happen during 403 * single point multiplication, so there is no timing leak for ECDH or 404 * ECDSA signing. 405 */ 406 ecp_nistz256_point_double(r, a); 407 return; 408 } 409 410 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 411 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 412 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 413 ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ 414 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 415 416 ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ 417 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 418 419 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 420 ecp_nistz256_sub(res_x, res_x, Hcub); 421 422 ecp_nistz256_sub(res_y, U2, res_x); 423 424 ecp_nistz256_mul_mont(S2, S1, Hcub); 425 ecp_nistz256_mul_mont(res_y, R, res_y); 426 ecp_nistz256_sub(res_y, res_y, S2); 427 428 copy_conditional(res_x, in2_x, in1infty); 429 copy_conditional(res_y, in2_y, in1infty); 430 copy_conditional(res_z, in2_z, in1infty); 431 432 copy_conditional(res_x, in1_x, in2infty); 433 copy_conditional(res_y, in1_y, in2infty); 434 copy_conditional(res_z, in1_z, in2infty); 435 436 memcpy(r->X, res_x, sizeof(res_x)); 437 memcpy(r->Y, res_y, sizeof(res_y)); 438 memcpy(r->Z, res_z, sizeof(res_z)); 439} 440 441/* Point addition when b is known to be affine: r = a+b */ 442static void ecp_nistz256_point_add_affine(P256_POINT *r, 443 const P256_POINT *a, 444 const P256_POINT_AFFINE *b) 445{ 446 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 447 BN_ULONG Z1sqr[P256_LIMBS]; 448 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 449 BN_ULONG Hsqr[P256_LIMBS]; 450 BN_ULONG Rsqr[P256_LIMBS]; 451 BN_ULONG Hcub[P256_LIMBS]; 452 453 BN_ULONG res_x[P256_LIMBS]; 454 BN_ULONG res_y[P256_LIMBS]; 455 BN_ULONG res_z[P256_LIMBS]; 456 457 BN_ULONG in1infty, in2infty; 458 459 const BN_ULONG *in1_x = a->X; 460 const BN_ULONG *in1_y = a->Y; 461 const BN_ULONG *in1_z = a->Z; 462 463 const BN_ULONG *in2_x = b->X; 464 const BN_ULONG *in2_y = b->Y; 465 466 /* 467 * Infinity in encoded as (,,0) 468 */ 469 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 470 if (P256_LIMBS == 8) 471 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 472 473 /* 474 * In affine representation we encode infinity as (0,0), which is 475 * not on the curve, so it is OK 476 */ 477 in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] | 478 in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]); 479 if (P256_LIMBS == 8) 480 in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] | 481 in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]); 482 483 in1infty = is_zero(in1infty); 484 in2infty = is_zero(in2infty); 485 486 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 487 488 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 489 ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */ 490 491 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 492 493 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 494 495 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 496 ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */ 497 498 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 499 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 500 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 501 502 ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */ 503 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 504 505 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 506 ecp_nistz256_sub(res_x, res_x, Hcub); 507 ecp_nistz256_sub(H, U2, res_x); 508 509 ecp_nistz256_mul_mont(S2, in1_y, Hcub); 510 ecp_nistz256_mul_mont(H, H, R); 511 ecp_nistz256_sub(res_y, H, S2); 512 513 copy_conditional(res_x, in2_x, in1infty); 514 copy_conditional(res_x, in1_x, in2infty); 515 516 copy_conditional(res_y, in2_y, in1infty); 517 copy_conditional(res_y, in1_y, in2infty); 518 519 copy_conditional(res_z, ONE, in1infty); 520 copy_conditional(res_z, in1_z, in2infty); 521 522 memcpy(r->X, res_x, sizeof(res_x)); 523 memcpy(r->Y, res_y, sizeof(res_y)); 524 memcpy(r->Z, res_z, sizeof(res_z)); 525} 526#endif 527 528/* r = in^-1 mod p */ 529static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS], 530 const BN_ULONG in[P256_LIMBS]) 531{ 532 /* 533 * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff 534 * ffffffff ffffffff We use FLT and used poly-2 as exponent 535 */ 536 BN_ULONG p2[P256_LIMBS]; 537 BN_ULONG p4[P256_LIMBS]; 538 BN_ULONG p8[P256_LIMBS]; 539 BN_ULONG p16[P256_LIMBS]; 540 BN_ULONG p32[P256_LIMBS]; 541 BN_ULONG res[P256_LIMBS]; 542 int i; 543 544 ecp_nistz256_sqr_mont(res, in); 545 ecp_nistz256_mul_mont(p2, res, in); /* 3*p */ 546 547 ecp_nistz256_sqr_mont(res, p2); 548 ecp_nistz256_sqr_mont(res, res); 549 ecp_nistz256_mul_mont(p4, res, p2); /* f*p */ 550 551 ecp_nistz256_sqr_mont(res, p4); 552 ecp_nistz256_sqr_mont(res, res); 553 ecp_nistz256_sqr_mont(res, res); 554 ecp_nistz256_sqr_mont(res, res); 555 ecp_nistz256_mul_mont(p8, res, p4); /* ff*p */ 556 557 ecp_nistz256_sqr_mont(res, p8); 558 for (i = 0; i < 7; i++) 559 ecp_nistz256_sqr_mont(res, res); 560 ecp_nistz256_mul_mont(p16, res, p8); /* ffff*p */ 561 562 ecp_nistz256_sqr_mont(res, p16); 563 for (i = 0; i < 15; i++) 564 ecp_nistz256_sqr_mont(res, res); 565 ecp_nistz256_mul_mont(p32, res, p16); /* ffffffff*p */ 566 567 ecp_nistz256_sqr_mont(res, p32); 568 for (i = 0; i < 31; i++) 569 ecp_nistz256_sqr_mont(res, res); 570 ecp_nistz256_mul_mont(res, res, in); 571 572 for (i = 0; i < 32 * 4; i++) 573 ecp_nistz256_sqr_mont(res, res); 574 ecp_nistz256_mul_mont(res, res, p32); 575 576 for (i = 0; i < 32; i++) 577 ecp_nistz256_sqr_mont(res, res); 578 ecp_nistz256_mul_mont(res, res, p32); 579 580 for (i = 0; i < 16; i++) 581 ecp_nistz256_sqr_mont(res, res); 582 ecp_nistz256_mul_mont(res, res, p16); 583 584 for (i = 0; i < 8; i++) 585 ecp_nistz256_sqr_mont(res, res); 586 ecp_nistz256_mul_mont(res, res, p8); 587 588 ecp_nistz256_sqr_mont(res, res); 589 ecp_nistz256_sqr_mont(res, res); 590 ecp_nistz256_sqr_mont(res, res); 591 ecp_nistz256_sqr_mont(res, res); 592 ecp_nistz256_mul_mont(res, res, p4); 593 594 ecp_nistz256_sqr_mont(res, res); 595 ecp_nistz256_sqr_mont(res, res); 596 ecp_nistz256_mul_mont(res, res, p2); 597 598 ecp_nistz256_sqr_mont(res, res); 599 ecp_nistz256_sqr_mont(res, res); 600 ecp_nistz256_mul_mont(res, res, in); 601 602 memcpy(r, res, sizeof(res)); 603} 604 605/* 606 * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and 607 * returns one if it fits. Otherwise it returns zero. 608 */ 609__owur static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS], 610 const BIGNUM *in) 611{ 612 return bn_copy_words(out, in, P256_LIMBS); 613} 614 615/* r = sum(scalar[i]*point[i]) */ 616__owur static int ecp_nistz256_windowed_mul(const EC_GROUP *group, 617 P256_POINT *r, 618 const BIGNUM **scalar, 619 const EC_POINT **point, 620 size_t num, BN_CTX *ctx) 621{ 622 size_t i; 623 int j, ret = 0; 624 unsigned int idx; 625 unsigned char (*p_str)[33] = NULL; 626 const unsigned int window_size = 5; 627 const unsigned int mask = (1 << (window_size + 1)) - 1; 628 unsigned int wvalue; 629 P256_POINT *temp; /* place for 5 temporary points */ 630 const BIGNUM **scalars = NULL; 631 P256_POINT (*table)[16] = NULL; 632 void *table_storage = NULL; 633 634 if ((num * 16 + 6) > OPENSSL_MALLOC_MAX_NELEMS(P256_POINT) 635 || (table_storage = 636 OPENSSL_malloc((num * 16 + 5) * sizeof(P256_POINT) + 64)) == NULL 637 || (p_str = 638 OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL 639 || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) { 640 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 641 goto err; 642 } 643 644 table = (void *)ALIGNPTR(table_storage, 64); 645 temp = (P256_POINT *)(table + num); 646 647 for (i = 0; i < num; i++) { 648 P256_POINT *row = table[i]; 649 650 /* This is an unusual input, we don't guarantee constant-timeness. */ 651 if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) { 652 BIGNUM *mod; 653 654 if ((mod = BN_CTX_get(ctx)) == NULL) 655 goto err; 656 if (!BN_nnmod(mod, scalar[i], group->order, ctx)) { 657 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); 658 goto err; 659 } 660 scalars[i] = mod; 661 } else 662 scalars[i] = scalar[i]; 663 664 for (j = 0; j < bn_get_top(scalars[i]) * BN_BYTES; j += BN_BYTES) { 665 BN_ULONG d = bn_get_words(scalars[i])[j / BN_BYTES]; 666 667 p_str[i][j + 0] = (unsigned char)d; 668 p_str[i][j + 1] = (unsigned char)(d >> 8); 669 p_str[i][j + 2] = (unsigned char)(d >> 16); 670 p_str[i][j + 3] = (unsigned char)(d >>= 24); 671 if (BN_BYTES == 8) { 672 d >>= 8; 673 p_str[i][j + 4] = (unsigned char)d; 674 p_str[i][j + 5] = (unsigned char)(d >> 8); 675 p_str[i][j + 6] = (unsigned char)(d >> 16); 676 p_str[i][j + 7] = (unsigned char)(d >> 24); 677 } 678 } 679 for (; j < 33; j++) 680 p_str[i][j] = 0; 681 682 if (!ecp_nistz256_bignum_to_field_elem(temp[0].X, point[i]->X) 683 || !ecp_nistz256_bignum_to_field_elem(temp[0].Y, point[i]->Y) 684 || !ecp_nistz256_bignum_to_field_elem(temp[0].Z, point[i]->Z)) { 685 ERR_raise(ERR_LIB_EC, EC_R_COORDINATES_OUT_OF_RANGE); 686 goto err; 687 } 688 689 /* 690 * row[0] is implicitly (0,0,0) (the point at infinity), therefore it 691 * is not stored. All other values are actually stored with an offset 692 * of -1 in table. 693 */ 694 695 ecp_nistz256_scatter_w5 (row, &temp[0], 1); 696 ecp_nistz256_point_double(&temp[1], &temp[0]); /*1+1=2 */ 697 ecp_nistz256_scatter_w5 (row, &temp[1], 2); 698 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*2+1=3 */ 699 ecp_nistz256_scatter_w5 (row, &temp[2], 3); 700 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*2=4 */ 701 ecp_nistz256_scatter_w5 (row, &temp[1], 4); 702 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*3=6 */ 703 ecp_nistz256_scatter_w5 (row, &temp[2], 6); 704 ecp_nistz256_point_add (&temp[3], &temp[1], &temp[0]); /*4+1=5 */ 705 ecp_nistz256_scatter_w5 (row, &temp[3], 5); 706 ecp_nistz256_point_add (&temp[4], &temp[2], &temp[0]); /*6+1=7 */ 707 ecp_nistz256_scatter_w5 (row, &temp[4], 7); 708 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*4=8 */ 709 ecp_nistz256_scatter_w5 (row, &temp[1], 8); 710 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*6=12 */ 711 ecp_nistz256_scatter_w5 (row, &temp[2], 12); 712 ecp_nistz256_point_double(&temp[3], &temp[3]); /*2*5=10 */ 713 ecp_nistz256_scatter_w5 (row, &temp[3], 10); 714 ecp_nistz256_point_double(&temp[4], &temp[4]); /*2*7=14 */ 715 ecp_nistz256_scatter_w5 (row, &temp[4], 14); 716 ecp_nistz256_point_add (&temp[2], &temp[2], &temp[0]); /*12+1=13*/ 717 ecp_nistz256_scatter_w5 (row, &temp[2], 13); 718 ecp_nistz256_point_add (&temp[3], &temp[3], &temp[0]); /*10+1=11*/ 719 ecp_nistz256_scatter_w5 (row, &temp[3], 11); 720 ecp_nistz256_point_add (&temp[4], &temp[4], &temp[0]); /*14+1=15*/ 721 ecp_nistz256_scatter_w5 (row, &temp[4], 15); 722 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*8+1=9 */ 723 ecp_nistz256_scatter_w5 (row, &temp[2], 9); 724 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*8=16 */ 725 ecp_nistz256_scatter_w5 (row, &temp[1], 16); 726 } 727 728 idx = 255; 729 730 wvalue = p_str[0][(idx - 1) / 8]; 731 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 732 733 /* 734 * We gather to temp[0], because we know it's position relative 735 * to table 736 */ 737 ecp_nistz256_gather_w5(&temp[0], table[0], _booth_recode_w5(wvalue) >> 1); 738 memcpy(r, &temp[0], sizeof(temp[0])); 739 740 while (idx >= 5) { 741 for (i = (idx == 255 ? 1 : 0); i < num; i++) { 742 unsigned int off = (idx - 1) / 8; 743 744 wvalue = p_str[i][off] | p_str[i][off + 1] << 8; 745 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 746 747 wvalue = _booth_recode_w5(wvalue); 748 749 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 750 751 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 752 copy_conditional(temp[0].Y, temp[1].Y, (wvalue & 1)); 753 754 ecp_nistz256_point_add(r, r, &temp[0]); 755 } 756 757 idx -= window_size; 758 759 ecp_nistz256_point_double(r, r); 760 ecp_nistz256_point_double(r, r); 761 ecp_nistz256_point_double(r, r); 762 ecp_nistz256_point_double(r, r); 763 ecp_nistz256_point_double(r, r); 764 } 765 766 /* Final window */ 767 for (i = 0; i < num; i++) { 768 wvalue = p_str[i][0]; 769 wvalue = (wvalue << 1) & mask; 770 771 wvalue = _booth_recode_w5(wvalue); 772 773 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 774 775 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 776 copy_conditional(temp[0].Y, temp[1].Y, wvalue & 1); 777 778 ecp_nistz256_point_add(r, r, &temp[0]); 779 } 780 781 ret = 1; 782 err: 783 OPENSSL_free(table_storage); 784 OPENSSL_free(p_str); 785 OPENSSL_free(scalars); 786 return ret; 787} 788 789/* Coordinates of G, for which we have precomputed tables */ 790static const BN_ULONG def_xG[P256_LIMBS] = { 791 TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), 792 TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6) 793}; 794 795static const BN_ULONG def_yG[P256_LIMBS] = { 796 TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), 797 TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85) 798}; 799 800/* 801 * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256 802 * generator. 803 */ 804static int ecp_nistz256_is_affine_G(const EC_POINT *generator) 805{ 806 return (bn_get_top(generator->X) == P256_LIMBS) && 807 (bn_get_top(generator->Y) == P256_LIMBS) && 808 is_equal(bn_get_words(generator->X), def_xG) && 809 is_equal(bn_get_words(generator->Y), def_yG) && 810 is_one(generator->Z); 811} 812 813__owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx) 814{ 815 /* 816 * We precompute a table for a Booth encoded exponent (wNAF) based 817 * computation. Each table holds 64 values for safe access, with an 818 * implicit value of infinity at index zero. We use window of size 7, and 819 * therefore require ceil(256/7) = 37 tables. 820 */ 821 const BIGNUM *order; 822 EC_POINT *P = NULL, *T = NULL; 823 const EC_POINT *generator; 824 NISTZ256_PRE_COMP *pre_comp; 825 BN_CTX *new_ctx = NULL; 826 int i, j, k, ret = 0; 827 size_t w; 828 829 PRECOMP256_ROW *preComputedTable = NULL; 830 unsigned char *precomp_storage = NULL; 831 832 /* if there is an old NISTZ256_PRE_COMP object, throw it away */ 833 EC_pre_comp_free(group); 834 generator = EC_GROUP_get0_generator(group); 835 if (generator == NULL) { 836 ERR_raise(ERR_LIB_EC, EC_R_UNDEFINED_GENERATOR); 837 return 0; 838 } 839 840 if (ecp_nistz256_is_affine_G(generator)) { 841 /* 842 * No need to calculate tables for the standard generator because we 843 * have them statically. 844 */ 845 return 1; 846 } 847 848 if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL) 849 return 0; 850 851 if (ctx == NULL) { 852 ctx = new_ctx = BN_CTX_new_ex(group->libctx); 853 if (ctx == NULL) 854 goto err; 855 } 856 857 BN_CTX_start(ctx); 858 859 order = EC_GROUP_get0_order(group); 860 if (order == NULL) 861 goto err; 862 863 if (BN_is_zero(order)) { 864 ERR_raise(ERR_LIB_EC, EC_R_UNKNOWN_ORDER); 865 goto err; 866 } 867 868 w = 7; 869 870 if ((precomp_storage = 871 OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) { 872 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 873 goto err; 874 } 875 876 preComputedTable = (void *)ALIGNPTR(precomp_storage, 64); 877 878 P = EC_POINT_new(group); 879 T = EC_POINT_new(group); 880 if (P == NULL || T == NULL) 881 goto err; 882 883 /* 884 * The zero entry is implicitly infinity, and we skip it, storing other 885 * values with -1 offset. 886 */ 887 if (!EC_POINT_copy(T, generator)) 888 goto err; 889 890 for (k = 0; k < 64; k++) { 891 if (!EC_POINT_copy(P, T)) 892 goto err; 893 for (j = 0; j < 37; j++) { 894 P256_POINT_AFFINE temp; 895 /* 896 * It would be faster to use EC_POINTs_make_affine and 897 * make multiple points affine at the same time. 898 */ 899 if (group->meth->make_affine == NULL 900 || !group->meth->make_affine(group, P, ctx)) 901 goto err; 902 if (!ecp_nistz256_bignum_to_field_elem(temp.X, P->X) || 903 !ecp_nistz256_bignum_to_field_elem(temp.Y, P->Y)) { 904 ERR_raise(ERR_LIB_EC, EC_R_COORDINATES_OUT_OF_RANGE); 905 goto err; 906 } 907 ecp_nistz256_scatter_w7(preComputedTable[j], &temp, k); 908 for (i = 0; i < 7; i++) { 909 if (!EC_POINT_dbl(group, P, P, ctx)) 910 goto err; 911 } 912 } 913 if (!EC_POINT_add(group, T, T, generator, ctx)) 914 goto err; 915 } 916 917 pre_comp->group = group; 918 pre_comp->w = w; 919 pre_comp->precomp = preComputedTable; 920 pre_comp->precomp_storage = precomp_storage; 921 precomp_storage = NULL; 922 SETPRECOMP(group, nistz256, pre_comp); 923 pre_comp = NULL; 924 ret = 1; 925 926 err: 927 BN_CTX_end(ctx); 928 BN_CTX_free(new_ctx); 929 930 EC_nistz256_pre_comp_free(pre_comp); 931 OPENSSL_free(precomp_storage); 932 EC_POINT_free(P); 933 EC_POINT_free(T); 934 return ret; 935} 936 937__owur static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group, 938 const P256_POINT_AFFINE *in, 939 BN_CTX *ctx) 940{ 941 int ret = 0; 942 943 if ((ret = bn_set_words(out->X, in->X, P256_LIMBS)) 944 && (ret = bn_set_words(out->Y, in->Y, P256_LIMBS)) 945 && (ret = bn_set_words(out->Z, ONE, P256_LIMBS))) 946 out->Z_is_one = 1; 947 948 return ret; 949} 950 951/* r = scalar*G + sum(scalars[i]*points[i]) */ 952__owur static int ecp_nistz256_points_mul(const EC_GROUP *group, 953 EC_POINT *r, 954 const BIGNUM *scalar, 955 size_t num, 956 const EC_POINT *points[], 957 const BIGNUM *scalars[], BN_CTX *ctx) 958{ 959 int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0; 960 unsigned char p_str[33] = { 0 }; 961 const PRECOMP256_ROW *preComputedTable = NULL; 962 const NISTZ256_PRE_COMP *pre_comp = NULL; 963 const EC_POINT *generator = NULL; 964 const BIGNUM **new_scalars = NULL; 965 const EC_POINT **new_points = NULL; 966 unsigned int idx = 0; 967 const unsigned int window_size = 7; 968 const unsigned int mask = (1 << (window_size + 1)) - 1; 969 unsigned int wvalue; 970 ALIGN32 union { 971 P256_POINT p; 972 P256_POINT_AFFINE a; 973 } t, p; 974 BIGNUM *tmp_scalar; 975 976 if ((num + 1) == 0 || (num + 1) > OPENSSL_MALLOC_MAX_NELEMS(void *)) { 977 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 978 return 0; 979 } 980 981 memset(&p, 0, sizeof(p)); 982 BN_CTX_start(ctx); 983 984 if (scalar) { 985 generator = EC_GROUP_get0_generator(group); 986 if (generator == NULL) { 987 ERR_raise(ERR_LIB_EC, EC_R_UNDEFINED_GENERATOR); 988 goto err; 989 } 990 991 /* look if we can use precomputed multiples of generator */ 992 pre_comp = group->pre_comp.nistz256; 993 994 if (pre_comp) { 995 /* 996 * If there is a precomputed table for the generator, check that 997 * it was generated with the same generator. 998 */ 999 EC_POINT *pre_comp_generator = EC_POINT_new(group); 1000 if (pre_comp_generator == NULL) 1001 goto err; 1002 1003 ecp_nistz256_gather_w7(&p.a, pre_comp->precomp[0], 1); 1004 if (!ecp_nistz256_set_from_affine(pre_comp_generator, 1005 group, &p.a, ctx)) { 1006 EC_POINT_free(pre_comp_generator); 1007 goto err; 1008 } 1009 1010 if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx)) 1011 preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp; 1012 1013 EC_POINT_free(pre_comp_generator); 1014 } 1015 1016 if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) { 1017 /* 1018 * If there is no precomputed data, but the generator is the 1019 * default, a hardcoded table of precomputed data is used. This 1020 * is because applications, such as Apache, do not use 1021 * EC_KEY_precompute_mult. 1022 */ 1023 preComputedTable = ecp_nistz256_precomputed; 1024 } 1025 1026 if (preComputedTable) { 1027 BN_ULONG infty; 1028 1029 if ((BN_num_bits(scalar) > 256) 1030 || BN_is_negative(scalar)) { 1031 if ((tmp_scalar = BN_CTX_get(ctx)) == NULL) 1032 goto err; 1033 1034 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { 1035 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); 1036 goto err; 1037 } 1038 scalar = tmp_scalar; 1039 } 1040 1041 for (i = 0; i < bn_get_top(scalar) * BN_BYTES; i += BN_BYTES) { 1042 BN_ULONG d = bn_get_words(scalar)[i / BN_BYTES]; 1043 1044 p_str[i + 0] = (unsigned char)d; 1045 p_str[i + 1] = (unsigned char)(d >> 8); 1046 p_str[i + 2] = (unsigned char)(d >> 16); 1047 p_str[i + 3] = (unsigned char)(d >>= 24); 1048 if (BN_BYTES == 8) { 1049 d >>= 8; 1050 p_str[i + 4] = (unsigned char)d; 1051 p_str[i + 5] = (unsigned char)(d >> 8); 1052 p_str[i + 6] = (unsigned char)(d >> 16); 1053 p_str[i + 7] = (unsigned char)(d >> 24); 1054 } 1055 } 1056 1057 for (; i < 33; i++) 1058 p_str[i] = 0; 1059 1060 /* First window */ 1061 wvalue = (p_str[0] << 1) & mask; 1062 idx += window_size; 1063 1064 wvalue = _booth_recode_w7(wvalue); 1065 1066 ecp_nistz256_gather_w7(&p.a, preComputedTable[0], 1067 wvalue >> 1); 1068 1069 ecp_nistz256_neg(p.p.Z, p.p.Y); 1070 copy_conditional(p.p.Y, p.p.Z, wvalue & 1); 1071 1072 /* 1073 * Since affine infinity is encoded as (0,0) and 1074 * Jacobian is (,,0), we need to harmonize them 1075 * by assigning "one" or zero to Z. 1076 */ 1077 infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] | 1078 p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]); 1079 if (P256_LIMBS == 8) 1080 infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] | 1081 p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]); 1082 1083 infty = 0 - is_zero(infty); 1084 infty = ~infty; 1085 1086 p.p.Z[0] = ONE[0] & infty; 1087 p.p.Z[1] = ONE[1] & infty; 1088 p.p.Z[2] = ONE[2] & infty; 1089 p.p.Z[3] = ONE[3] & infty; 1090 if (P256_LIMBS == 8) { 1091 p.p.Z[4] = ONE[4] & infty; 1092 p.p.Z[5] = ONE[5] & infty; 1093 p.p.Z[6] = ONE[6] & infty; 1094 p.p.Z[7] = ONE[7] & infty; 1095 } 1096 1097 for (i = 1; i < 37; i++) { 1098 unsigned int off = (idx - 1) / 8; 1099 wvalue = p_str[off] | p_str[off + 1] << 8; 1100 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1101 idx += window_size; 1102 1103 wvalue = _booth_recode_w7(wvalue); 1104 1105 ecp_nistz256_gather_w7(&t.a, 1106 preComputedTable[i], wvalue >> 1); 1107 1108 ecp_nistz256_neg(t.p.Z, t.a.Y); 1109 copy_conditional(t.a.Y, t.p.Z, wvalue & 1); 1110 1111 ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); 1112 } 1113 } else { 1114 p_is_infinity = 1; 1115 no_precomp_for_generator = 1; 1116 } 1117 } else 1118 p_is_infinity = 1; 1119 1120 if (no_precomp_for_generator) { 1121 /* 1122 * Without a precomputed table for the generator, it has to be 1123 * handled like a normal point. 1124 */ 1125 new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *)); 1126 if (new_scalars == NULL) { 1127 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 1128 goto err; 1129 } 1130 1131 new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *)); 1132 if (new_points == NULL) { 1133 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 1134 goto err; 1135 } 1136 1137 memcpy(new_scalars, scalars, num * sizeof(BIGNUM *)); 1138 new_scalars[num] = scalar; 1139 memcpy(new_points, points, num * sizeof(EC_POINT *)); 1140 new_points[num] = generator; 1141 1142 scalars = new_scalars; 1143 points = new_points; 1144 num++; 1145 } 1146 1147 if (num) { 1148 P256_POINT *out = &t.p; 1149 if (p_is_infinity) 1150 out = &p.p; 1151 1152 if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx)) 1153 goto err; 1154 1155 if (!p_is_infinity) 1156 ecp_nistz256_point_add(&p.p, &p.p, out); 1157 } 1158 1159 /* Not constant-time, but we're only operating on the public output. */ 1160 if (!bn_set_words(r->X, p.p.X, P256_LIMBS) || 1161 !bn_set_words(r->Y, p.p.Y, P256_LIMBS) || 1162 !bn_set_words(r->Z, p.p.Z, P256_LIMBS)) { 1163 goto err; 1164 } 1165 r->Z_is_one = is_one(r->Z) & 1; 1166 1167 ret = 1; 1168 1169err: 1170 BN_CTX_end(ctx); 1171 OPENSSL_free(new_points); 1172 OPENSSL_free(new_scalars); 1173 return ret; 1174} 1175 1176__owur static int ecp_nistz256_get_affine(const EC_GROUP *group, 1177 const EC_POINT *point, 1178 BIGNUM *x, BIGNUM *y, BN_CTX *ctx) 1179{ 1180 BN_ULONG z_inv2[P256_LIMBS]; 1181 BN_ULONG z_inv3[P256_LIMBS]; 1182 BN_ULONG x_aff[P256_LIMBS]; 1183 BN_ULONG y_aff[P256_LIMBS]; 1184 BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS]; 1185 BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS]; 1186 1187 if (EC_POINT_is_at_infinity(group, point)) { 1188 ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY); 1189 return 0; 1190 } 1191 1192 if (!ecp_nistz256_bignum_to_field_elem(point_x, point->X) || 1193 !ecp_nistz256_bignum_to_field_elem(point_y, point->Y) || 1194 !ecp_nistz256_bignum_to_field_elem(point_z, point->Z)) { 1195 ERR_raise(ERR_LIB_EC, EC_R_COORDINATES_OUT_OF_RANGE); 1196 return 0; 1197 } 1198 1199 ecp_nistz256_mod_inverse(z_inv3, point_z); 1200 ecp_nistz256_sqr_mont(z_inv2, z_inv3); 1201 ecp_nistz256_mul_mont(x_aff, z_inv2, point_x); 1202 1203 if (x != NULL) { 1204 ecp_nistz256_from_mont(x_ret, x_aff); 1205 if (!bn_set_words(x, x_ret, P256_LIMBS)) 1206 return 0; 1207 } 1208 1209 if (y != NULL) { 1210 ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2); 1211 ecp_nistz256_mul_mont(y_aff, z_inv3, point_y); 1212 ecp_nistz256_from_mont(y_ret, y_aff); 1213 if (!bn_set_words(y, y_ret, P256_LIMBS)) 1214 return 0; 1215 } 1216 1217 return 1; 1218} 1219 1220static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group) 1221{ 1222 NISTZ256_PRE_COMP *ret = NULL; 1223 1224 if (!group) 1225 return NULL; 1226 1227 ret = OPENSSL_zalloc(sizeof(*ret)); 1228 1229 if (ret == NULL) { 1230 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 1231 return ret; 1232 } 1233 1234 ret->group = group; 1235 ret->w = 6; /* default */ 1236 ret->references = 1; 1237 1238 ret->lock = CRYPTO_THREAD_lock_new(); 1239 if (ret->lock == NULL) { 1240 ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE); 1241 OPENSSL_free(ret); 1242 return NULL; 1243 } 1244 return ret; 1245} 1246 1247NISTZ256_PRE_COMP *EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *p) 1248{ 1249 int i; 1250 if (p != NULL) 1251 CRYPTO_UP_REF(&p->references, &i, p->lock); 1252 return p; 1253} 1254 1255void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *pre) 1256{ 1257 int i; 1258 1259 if (pre == NULL) 1260 return; 1261 1262 CRYPTO_DOWN_REF(&pre->references, &i, pre->lock); 1263 REF_PRINT_COUNT("EC_nistz256", pre); 1264 if (i > 0) 1265 return; 1266 REF_ASSERT_ISNT(i < 0); 1267 1268 OPENSSL_free(pre->precomp_storage); 1269 CRYPTO_THREAD_lock_free(pre->lock); 1270 OPENSSL_free(pre); 1271} 1272 1273 1274static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group) 1275{ 1276 /* There is a hard-coded table for the default generator. */ 1277 const EC_POINT *generator = EC_GROUP_get0_generator(group); 1278 1279 if (generator != NULL && ecp_nistz256_is_affine_G(generator)) { 1280 /* There is a hard-coded table for the default generator. */ 1281 return 1; 1282 } 1283 1284 return HAVEPRECOMP(group, nistz256); 1285} 1286 1287#if defined(__x86_64) || defined(__x86_64__) || \ 1288 defined(_M_AMD64) || defined(_M_X64) || \ 1289 defined(__powerpc64__) || defined(_ARCH_PP64) || \ 1290 defined(__aarch64__) 1291/* 1292 * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P) 1293 */ 1294void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], 1295 const BN_ULONG a[P256_LIMBS], 1296 const BN_ULONG b[P256_LIMBS]); 1297void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], 1298 const BN_ULONG a[P256_LIMBS], 1299 BN_ULONG rep); 1300 1301static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r, 1302 const BIGNUM *x, BN_CTX *ctx) 1303{ 1304 /* RR = 2^512 mod ord(p256) */ 1305 static const BN_ULONG RR[P256_LIMBS] = { 1306 TOBN(0x83244c95,0xbe79eea2), TOBN(0x4699799c,0x49bd6fa6), 1307 TOBN(0x2845b239,0x2b6bec59), TOBN(0x66e12d94,0xf3d95620) 1308 }; 1309 /* The constant 1 (unlike ONE that is one in Montgomery representation) */ 1310 static const BN_ULONG one[P256_LIMBS] = { 1311 TOBN(0,1), TOBN(0,0), TOBN(0,0), TOBN(0,0) 1312 }; 1313 /* 1314 * We don't use entry 0 in the table, so we omit it and address 1315 * with -1 offset. 1316 */ 1317 BN_ULONG table[15][P256_LIMBS]; 1318 BN_ULONG out[P256_LIMBS], t[P256_LIMBS]; 1319 int i, ret = 0; 1320 enum { 1321 i_1 = 0, i_10, i_11, i_101, i_111, i_1010, i_1111, 1322 i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32 1323 }; 1324 1325 /* 1326 * Catch allocation failure early. 1327 */ 1328 if (bn_wexpand(r, P256_LIMBS) == NULL) { 1329 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); 1330 goto err; 1331 } 1332 1333 if ((BN_num_bits(x) > 256) || BN_is_negative(x)) { 1334 BIGNUM *tmp; 1335 1336 if ((tmp = BN_CTX_get(ctx)) == NULL 1337 || !BN_nnmod(tmp, x, group->order, ctx)) { 1338 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); 1339 goto err; 1340 } 1341 x = tmp; 1342 } 1343 1344 if (!ecp_nistz256_bignum_to_field_elem(t, x)) { 1345 ERR_raise(ERR_LIB_EC, EC_R_COORDINATES_OUT_OF_RANGE); 1346 goto err; 1347 } 1348 1349 ecp_nistz256_ord_mul_mont(table[0], t, RR); 1350#if 0 1351 /* 1352 * Original sparse-then-fixed-window algorithm, retained for reference. 1353 */ 1354 for (i = 2; i < 16; i += 2) { 1355 ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1); 1356 ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]); 1357 } 1358 1359 /* 1360 * The top 128bit of the exponent are highly redudndant, so we 1361 * perform an optimized flow 1362 */ 1363 ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */ 1364 ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */ 1365 1366 ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */ 1367 ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */ 1368 1369 ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */ 1370 ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */ 1371 1372 ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */ 1373 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */ 1374 1375 ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */ 1376 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */ 1377 1378 /* 1379 * The bottom 128 bit of the exponent are processed with fixed 4-bit window 1380 */ 1381 for(i = 0; i < 32; i++) { 1382 /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2), 1383 * split into nibbles */ 1384 static const unsigned char expLo[32] = { 1385 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4, 1386 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf 1387 }; 1388 1389 ecp_nistz256_ord_sqr_mont(out, out, 4); 1390 /* The exponent is public, no need in constant-time access */ 1391 ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]); 1392 } 1393#else 1394 /* 1395 * https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion 1396 * 1397 * Even though this code path spares 12 squarings, 4.5%, and 13 1398 * multiplications, 25%, on grand scale sign operation is not that 1399 * much faster, not more that 2%... 1400 */ 1401 1402 /* pre-calculate powers */ 1403 ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); 1404 1405 ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); 1406 1407 ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); 1408 1409 ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); 1410 1411 ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); 1412 1413 ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); 1414 1415 ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); 1416 ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); 1417 1418 ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); 1419 1420 ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); 1421 1422 ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); 1423 1424 ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); 1425 ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); 1426 1427 ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); 1428 ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); 1429 1430 ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); 1431 ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); 1432 1433 /* calculations */ 1434 ecp_nistz256_ord_sqr_mont(out, table[i_x32], 64); 1435 ecp_nistz256_ord_mul_mont(out, out, table[i_x32]); 1436 1437 for (i = 0; i < 27; i++) { 1438 static const struct { unsigned char p, i; } chain[27] = { 1439 { 32, i_x32 }, { 6, i_101111 }, { 5, i_111 }, 1440 { 4, i_11 }, { 5, i_1111 }, { 5, i_10101 }, 1441 { 4, i_101 }, { 3, i_101 }, { 3, i_101 }, 1442 { 5, i_111 }, { 9, i_101111 }, { 6, i_1111 }, 1443 { 2, i_1 }, { 5, i_1 }, { 6, i_1111 }, 1444 { 5, i_111 }, { 4, i_111 }, { 5, i_111 }, 1445 { 5, i_101 }, { 3, i_11 }, { 10, i_101111 }, 1446 { 2, i_11 }, { 5, i_11 }, { 5, i_11 }, 1447 { 3, i_1 }, { 7, i_10101 }, { 6, i_1111 } 1448 }; 1449 1450 ecp_nistz256_ord_sqr_mont(out, out, chain[i].p); 1451 ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]); 1452 } 1453#endif 1454 ecp_nistz256_ord_mul_mont(out, out, one); 1455 1456 /* 1457 * Can't fail, but check return code to be consistent anyway. 1458 */ 1459 if (!bn_set_words(r, out, P256_LIMBS)) 1460 goto err; 1461 1462 ret = 1; 1463err: 1464 return ret; 1465} 1466#else 1467# define ecp_nistz256_inv_mod_ord NULL 1468#endif 1469 1470const EC_METHOD *EC_GFp_nistz256_method(void) 1471{ 1472 static const EC_METHOD ret = { 1473 EC_FLAGS_DEFAULT_OCT, 1474 NID_X9_62_prime_field, 1475 ossl_ec_GFp_mont_group_init, 1476 ossl_ec_GFp_mont_group_finish, 1477 ossl_ec_GFp_mont_group_clear_finish, 1478 ossl_ec_GFp_mont_group_copy, 1479 ossl_ec_GFp_mont_group_set_curve, 1480 ossl_ec_GFp_simple_group_get_curve, 1481 ossl_ec_GFp_simple_group_get_degree, 1482 ossl_ec_group_simple_order_bits, 1483 ossl_ec_GFp_simple_group_check_discriminant, 1484 ossl_ec_GFp_simple_point_init, 1485 ossl_ec_GFp_simple_point_finish, 1486 ossl_ec_GFp_simple_point_clear_finish, 1487 ossl_ec_GFp_simple_point_copy, 1488 ossl_ec_GFp_simple_point_set_to_infinity, 1489 ossl_ec_GFp_simple_point_set_affine_coordinates, 1490 ecp_nistz256_get_affine, 1491 0, 0, 0, 1492 ossl_ec_GFp_simple_add, 1493 ossl_ec_GFp_simple_dbl, 1494 ossl_ec_GFp_simple_invert, 1495 ossl_ec_GFp_simple_is_at_infinity, 1496 ossl_ec_GFp_simple_is_on_curve, 1497 ossl_ec_GFp_simple_cmp, 1498 ossl_ec_GFp_simple_make_affine, 1499 ossl_ec_GFp_simple_points_make_affine, 1500 ecp_nistz256_points_mul, /* mul */ 1501 ecp_nistz256_mult_precompute, /* precompute_mult */ 1502 ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */ 1503 ossl_ec_GFp_mont_field_mul, 1504 ossl_ec_GFp_mont_field_sqr, 1505 0, /* field_div */ 1506 ossl_ec_GFp_mont_field_inv, 1507 ossl_ec_GFp_mont_field_encode, 1508 ossl_ec_GFp_mont_field_decode, 1509 ossl_ec_GFp_mont_field_set_to_one, 1510 ossl_ec_key_simple_priv2oct, 1511 ossl_ec_key_simple_oct2priv, 1512 0, /* set private */ 1513 ossl_ec_key_simple_generate_key, 1514 ossl_ec_key_simple_check_key, 1515 ossl_ec_key_simple_generate_public_key, 1516 0, /* keycopy */ 1517 0, /* keyfinish */ 1518 ossl_ecdh_simple_compute_key, 1519 ossl_ecdsa_simple_sign_setup, 1520 ossl_ecdsa_simple_sign_sig, 1521 ossl_ecdsa_simple_verify_sig, 1522 ecp_nistz256_inv_mod_ord, /* can be #define-d NULL */ 1523 0, /* blind_coordinates */ 1524 0, /* ladder_pre */ 1525 0, /* ladder_step */ 1526 0 /* ladder_post */ 1527 }; 1528 1529 return &ret; 1530} 1531