ecp_nistp521.c revision 238384
1/* crypto/ec/ecp_nistp521.c */
2/*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5/* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *     http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *  Unless required by applicable law or agreed to in writing, software
15 *  distributed under the License is distributed on an "AS IS" BASIS,
16 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 *  See the License for the specific language governing permissions and
18 *  limitations under the License.
19 */
20
21/*
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29#include <openssl/opensslconf.h>
30#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32#ifndef OPENSSL_SYS_VMS
33#include <stdint.h>
34#else
35#include <inttypes.h>
36#endif
37
38#include <string.h>
39#include <openssl/err.h>
40#include "ec_lcl.h"
41
42#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43  /* even with gcc, the typedef won't work for 32-bit platforms */
44  typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
45#else
46  #error "Need GCC 3.1 or later to define type uint128_t"
47#endif
48
49typedef uint8_t u8;
50typedef uint64_t u64;
51typedef int64_t s64;
52
53/* The underlying field.
54 *
55 * P521 operates over GF(2^521-1). We can serialise an element of this field
56 * into 66 bytes where the most significant byte contains only a single bit. We
57 * call this an felem_bytearray. */
58
59typedef u8 felem_bytearray[66];
60
61/* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
62 * These values are big-endian. */
63static const felem_bytearray nistp521_curve_params[5] =
64	{
65	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
66	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73	 0xff, 0xff},
74	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
75	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82	 0xff, 0xfc},
83	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
84	 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
85	 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
86	 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
87	 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
88	 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
89	 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
90	 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
91	 0x3f, 0x00},
92	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
93	 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
94	 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
95	 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
96	 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
97	 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
98	 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
99	 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
100	 0xbd, 0x66},
101	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
102	 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
103	 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
104	 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
105	 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
106	 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
107	 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
108	 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
109	 0x66, 0x50}
110	};
111
112/* The representation of field elements.
113 * ------------------------------------
114 *
115 * We represent field elements with nine values. These values are either 64 or
116 * 128 bits and the field element represented is:
117 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
118 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
119 * 58 bits apart, but are greater than 58 bits in length, the most significant
120 * bits of each limb overlap with the least significant bits of the next.
121 *
122 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
123 * 'largefelem' */
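
/* Worked example of the layout above (illustrative only): the limb bases are
 *   2^0, 2^58, 2^116, 2^174, 2^232, 2^290, 2^348, 2^406, 2^464
 * and the top limb needs only 521 - 464 = 57 bits to reach bit 520. Because a
 * limb may hold more than 58 bits, the representation is redundant: for
 * example, {2^58, 0, 0, ...} and {0, 1, 0, ...} denote the same field element
 * 2^58. Only felem_contract below produces the unique, minimal form. */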
124
125#define NLIMBS 9
126
127typedef uint64_t limb;
128typedef limb felem[NLIMBS];
129typedef uint128_t largefelem[NLIMBS];
130
131static const limb bottom57bits = 0x1ffffffffffffff;
132static const limb bottom58bits = 0x3ffffffffffffff;
133
134/* bin66_to_felem takes a little-endian byte array and converts it into felem
135 * form. This assumes that the CPU is little-endian. */
136static void bin66_to_felem(felem out, const u8 in[66])
137	{
138	out[0] = (*((limb*) &in[0])) & bottom58bits;
139	out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
140	out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
141	out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
142	out[4] = (*((limb*) &in[29])) & bottom58bits;
143	out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
144	out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
145	out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
146	out[8] = (*((limb*) &in[58])) & bottom57bits;
147	}
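
/* Worked example for the offsets used above (illustrative only): limb i starts
 * at bit 58*i of the serialised value, and 58*i = 8*byte + shift gives
 *   i=1: 58  = 8*7  + 2     i=5: 290 = 8*36 + 2
 *   i=2: 116 = 8*14 + 4     i=6: 348 = 8*43 + 4
 *   i=3: 174 = 8*21 + 6     i=7: 406 = 8*50 + 6
 *   i=4: 232 = 8*29 + 0     i=8: 464 = 8*58 + 0
 * so each limb is extracted with one unaligned 64-bit load at byte 58*i/8
 * followed by a right shift of (58*i mod 8). The load at byte 58 ends at byte
 * 65, the last byte of the 66-byte array. */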
148
149/* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
150 * array. This assumes that the CPU is little-endian. */
151static void felem_to_bin66(u8 out[66], const felem in)
152	{
153	memset(out, 0, 66);
154	(*((limb*) &out[0])) = in[0];
155	(*((limb*) &out[7])) |= in[1] << 2;
156	(*((limb*) &out[14])) |= in[2] << 4;
157	(*((limb*) &out[21])) |= in[3] << 6;
158	(*((limb*) &out[29])) = in[4];
159	(*((limb*) &out[36])) |= in[5] << 2;
160	(*((limb*) &out[43])) |= in[6] << 4;
161	(*((limb*) &out[50])) |= in[7] << 6;
162	(*((limb*) &out[58])) = in[8];
163	}
164
165/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
166static void flip_endian(u8 *out, const u8 *in, unsigned len)
167	{
168	unsigned i;
169	for (i = 0; i < len; ++i)
170		out[i] = in[len-1-i];
171	}
172
173/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
174static int BN_to_felem(felem out, const BIGNUM *bn)
175	{
176	felem_bytearray b_in;
177	felem_bytearray b_out;
178	unsigned num_bytes;
179
180	/* BN_bn2bin eats leading zeroes */
181	memset(b_out, 0, sizeof b_out);
182	num_bytes = BN_num_bytes(bn);
183	if (num_bytes > sizeof b_out)
184		{
185		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
186		return 0;
187		}
188	if (BN_is_negative(bn))
189		{
190		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
191		return 0;
192		}
193	num_bytes = BN_bn2bin(bn, b_in);
194	flip_endian(b_out, b_in, num_bytes);
195	bin66_to_felem(out, b_out);
196	return 1;
197	}
198
199/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
200static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
201	{
202	felem_bytearray b_in, b_out;
203	felem_to_bin66(b_in, in);
204	flip_endian(b_out, b_in, sizeof b_out);
205	return BN_bin2bn(b_out, sizeof b_out, out);
206	}
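
/* A minimal round-trip sketch (illustrative only, kept under "#if 0" so it is
 * never compiled): any non-negative BIGNUM of at most 521 bits should survive
 * BN_to_felem followed by felem_to_BN unchanged. The helper name is ours, not
 * part of the OpenSSL API. */
#if 0
static int felem_roundtrip_example(const BIGNUM *bn)
	{
	felem tmp;
	BIGNUM *copy = BN_new();
	int ok = 0;

	if (copy == NULL)
		return 0;
	if (BN_to_felem(tmp, bn) && (felem_to_BN(copy, tmp) != NULL))
		ok = (BN_cmp(copy, bn) == 0);
	BN_free(copy);
	return ok;
	}
#endif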
207
208
209/* Field operations
210 * ---------------- */
211
212static void felem_one(felem out)
213	{
214	out[0] = 1;
215	out[1] = 0;
216	out[2] = 0;
217	out[3] = 0;
218	out[4] = 0;
219	out[5] = 0;
220	out[6] = 0;
221	out[7] = 0;
222	out[8] = 0;
223	}
224
225static void felem_assign(felem out, const felem in)
226	{
227	out[0] = in[0];
228	out[1] = in[1];
229	out[2] = in[2];
230	out[3] = in[3];
231	out[4] = in[4];
232	out[5] = in[5];
233	out[6] = in[6];
234	out[7] = in[7];
235	out[8] = in[8];
236	}
237
238/* felem_sum64 sets out = out + in. */
239static void felem_sum64(felem out, const felem in)
240	{
241	out[0] += in[0];
242	out[1] += in[1];
243	out[2] += in[2];
244	out[3] += in[3];
245	out[4] += in[4];
246	out[5] += in[5];
247	out[6] += in[6];
248	out[7] += in[7];
249	out[8] += in[8];
250	}
251
252/* felem_scalar sets out = in * scalar */
253static void felem_scalar(felem out, const felem in, limb scalar)
254	{
255	out[0] = in[0] * scalar;
256	out[1] = in[1] * scalar;
257	out[2] = in[2] * scalar;
258	out[3] = in[3] * scalar;
259	out[4] = in[4] * scalar;
260	out[5] = in[5] * scalar;
261	out[6] = in[6] * scalar;
262	out[7] = in[7] * scalar;
263	out[8] = in[8] * scalar;
264	}
265
266/* felem_scalar64 sets out = out * scalar */
267static void felem_scalar64(felem out, limb scalar)
268	{
269	out[0] *= scalar;
270	out[1] *= scalar;
271	out[2] *= scalar;
272	out[3] *= scalar;
273	out[4] *= scalar;
274	out[5] *= scalar;
275	out[6] *= scalar;
276	out[7] *= scalar;
277	out[8] *= scalar;
278	}
279
280/* felem_scalar128 sets out = out * scalar */
281static void felem_scalar128(largefelem out, limb scalar)
282	{
283	out[0] *= scalar;
284	out[1] *= scalar;
285	out[2] *= scalar;
286	out[3] *= scalar;
287	out[4] *= scalar;
288	out[5] *= scalar;
289	out[6] *= scalar;
290	out[7] *= scalar;
291	out[8] *= scalar;
292	}
293
294/* felem_neg sets |out| to |-in|
295 * On entry:
296 *   in[i] < 2^59 + 2^14
297 * On exit:
298 *   out[i] < 2^62
299 */
300static void felem_neg(felem out, const felem in)
301	{
302	/* In order to prevent underflow, we subtract from 0 mod p. */
303	static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
304	static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
305
306	out[0] = two62m3 - in[0];
307	out[1] = two62m2 - in[1];
308	out[2] = two62m2 - in[2];
309	out[3] = two62m2 - in[3];
310	out[4] = two62m2 - in[4];
311	out[5] = two62m2 - in[5];
312	out[6] = two62m2 - in[6];
313	out[7] = two62m2 - in[7];
314	out[8] = two62m2 - in[8];
315	}
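
/* Why the constants above are 0 mod p (illustrative check): the pad is
 *   (2^62 - 2^5) + sum_{i=1..8} (2^62 - 2^4)*2^(58*i).
 * Each positive term 2^62 * 2^(58*i) with i <= 7 cancels the negative term
 * 2^4 * 2^(58*(i+1)) because 2^4 * 2^58 = 2^62, and the remaining positive
 * term 2^62 * 2^464 = 2^526 = 2^5 * 2^521 is congruent to 2^5 (mod 2^521 - 1),
 * cancelling the 2^5 subtracted from the first limb. The same cancellation
 * argument applies to the 2^63- and 2^127-based pads used further below. */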
316
317/* felem_diff64 subtracts |in| from |out|
318 * On entry:
319 *   in[i] < 2^59 + 2^14
320 * On exit:
321 *   out[i] < out[i] + 2^62
322 */
323static void felem_diff64(felem out, const felem in)
324	{
325	/* In order to prevent underflow, we add 0 mod p before subtracting. */
326	static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
327	static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
328
329	out[0] += two62m3 - in[0];
330	out[1] += two62m2 - in[1];
331	out[2] += two62m2 - in[2];
332	out[3] += two62m2 - in[3];
333	out[4] += two62m2 - in[4];
334	out[5] += two62m2 - in[5];
335	out[6] += two62m2 - in[6];
336	out[7] += two62m2 - in[7];
337	out[8] += two62m2 - in[8];
338	}
339
340/* felem_diff_128_64 subtracts |in| from |out|
341 * On entry:
342 *   in[i] < 2^62 + 2^17
343 * On exit:
344 *   out[i] < out[i] + 2^63
345 */
346static void felem_diff_128_64(largefelem out, const felem in)
347	{
348	/* In order to prevent underflow, we add 0 mod p before subtracting. */
349	static const limb two63m6 = (((limb)1) << 63) - (((limb)1) << 6);
350	static const limb two63m5 = (((limb)1) << 63) - (((limb)1) << 5);
351
352	out[0] += two63m6 - in[0];
353	out[1] += two63m5 - in[1];
354	out[2] += two63m5 - in[2];
355	out[3] += two63m5 - in[3];
356	out[4] += two63m5 - in[4];
357	out[5] += two63m5 - in[5];
358	out[6] += two63m5 - in[6];
359	out[7] += two63m5 - in[7];
360	out[8] += two63m5 - in[8];
361	}
362
363/* felem_diff128 subtracts |in| from |out|
364 * On entry:
365 *   in[i] < 2^126
366 * On exit:
367 *   out[i] < out[i] + 2^127 - 2^69
368 */
369static void felem_diff128(largefelem out, const largefelem in)
370	{
371	/* In order to prevent underflow, we add 0 mod p before subtracting. */
372	static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
373	static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
374
375	out[0] += (two127m70 - in[0]);
376	out[1] += (two127m69 - in[1]);
377	out[2] += (two127m69 - in[2]);
378	out[3] += (two127m69 - in[3]);
379	out[4] += (two127m69 - in[4]);
380	out[5] += (two127m69 - in[5]);
381	out[6] += (two127m69 - in[6]);
382	out[7] += (two127m69 - in[7]);
383	out[8] += (two127m69 - in[8]);
384	}
385
386/* felem_square sets |out| = |in|^2
387 * On entry:
388 *   in[i] < 2^62
389 * On exit:
390 *   out[i] < 17 * max(in[i]) * max(in[i])
391 */
392static void felem_square(largefelem out, const felem in)
393	{
394	felem inx2, inx4;
395	felem_scalar(inx2, in, 2);
396	felem_scalar(inx4, in, 4);
397
398	/* We have many cases where we want to do
399	 *   in[x] * in[y] +
400	 *   in[y] * in[x]
401	 * This is obviously just
402	 *   2 * in[x] * in[y]
403	 * However, rather than do the doubling on the 128 bit result, we
404	 * double one of the inputs to the multiplication by reading from
405	 * |inx2| */
406
407	out[0] = ((uint128_t) in[0]) * in[0];
408	out[1] = ((uint128_t) in[0]) * inx2[1];
409	out[2] = ((uint128_t) in[0]) * inx2[2] +
410		 ((uint128_t) in[1]) * in[1];
411	out[3] = ((uint128_t) in[0]) * inx2[3] +
412		 ((uint128_t) in[1]) * inx2[2];
413	out[4] = ((uint128_t) in[0]) * inx2[4] +
414		 ((uint128_t) in[1]) * inx2[3] +
415		 ((uint128_t) in[2]) * in[2];
416	out[5] = ((uint128_t) in[0]) * inx2[5] +
417		 ((uint128_t) in[1]) * inx2[4] +
418		 ((uint128_t) in[2]) * inx2[3];
419	out[6] = ((uint128_t) in[0]) * inx2[6] +
420		 ((uint128_t) in[1]) * inx2[5] +
421		 ((uint128_t) in[2]) * inx2[4] +
422		 ((uint128_t) in[3]) * in[3];
423	out[7] = ((uint128_t) in[0]) * inx2[7] +
424		 ((uint128_t) in[1]) * inx2[6] +
425		 ((uint128_t) in[2]) * inx2[5] +
426		 ((uint128_t) in[3]) * inx2[4];
427	out[8] = ((uint128_t) in[0]) * inx2[8] +
428		 ((uint128_t) in[1]) * inx2[7] +
429		 ((uint128_t) in[2]) * inx2[6] +
430		 ((uint128_t) in[3]) * inx2[5] +
431		 ((uint128_t) in[4]) * in[4];
432
433	/* The remaining limbs fall above 2^521, with the first falling at
434	 * 2^522. They correspond to locations one bit up from the limbs
435	 * produced above so we would have to multiply by two to align them.
436	 * Again, rather than operate on the 128-bit result, we double one of
437	 * the inputs to the multiplication. If we want to double for both this
438	 * reason, and the reason above, then we end up multiplying by four. */
439
440	/* 9 */
441	out[0] += ((uint128_t) in[1]) * inx4[8] +
442		  ((uint128_t) in[2]) * inx4[7] +
443		  ((uint128_t) in[3]) * inx4[6] +
444		  ((uint128_t) in[4]) * inx4[5];
445
446	/* 10 */
447	out[1] += ((uint128_t) in[2]) * inx4[8] +
448		  ((uint128_t) in[3]) * inx4[7] +
449		  ((uint128_t) in[4]) * inx4[6] +
450		  ((uint128_t) in[5]) * inx2[5];
451
452	/* 11 */
453	out[2] += ((uint128_t) in[3]) * inx4[8] +
454		  ((uint128_t) in[4]) * inx4[7] +
455		  ((uint128_t) in[5]) * inx4[6];
456
457	/* 12 */
458	out[3] += ((uint128_t) in[4]) * inx4[8] +
459		  ((uint128_t) in[5]) * inx4[7] +
460		  ((uint128_t) in[6]) * inx2[6];
461
462	/* 13 */
463	out[4] += ((uint128_t) in[5]) * inx4[8] +
464		  ((uint128_t) in[6]) * inx4[7];
465
466	/* 14 */
467	out[5] += ((uint128_t) in[6]) * inx4[8] +
468		  ((uint128_t) in[7]) * inx2[7];
469
470	/* 15 */
471	out[6] += ((uint128_t) in[7]) * inx4[8];
472
473	/* 16 */
474	out[7] += ((uint128_t) in[8]) * inx2[8];
475	}
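
/* Why the folded terms are scaled as above (illustrative check): a product
 * term at limb position k >= 9 has weight 2^(58*k) = 2^(58*(k-9)) * 2^522, and
 * 2^522 = 2 * 2^521 == 2 (mod 2^521 - 1), so it is worth twice the same term
 * at position k - 9. Cross terms in[x]*in[y] (x != y) also occur twice, so
 * they are folded in via |inx4| (2 for the wrap, 2 for the symmetry), while
 * squared terms such as in[5]*in[5] only need the wrap factor and use |inx2|. */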
476
477/* felem_mul sets |out| = |in1| * |in2|
478 * On entry:
479 *   in1[i] < 2^64
480 *   in2[i] < 2^63
481 * On exit:
482 *   out[i] < 17 * max(in1[i]) * max(in2[i])
483 */
484static void felem_mul(largefelem out, const felem in1, const felem in2)
485	{
486	felem in2x2;
487	felem_scalar(in2x2, in2, 2);
488
489	out[0] = ((uint128_t) in1[0]) * in2[0];
490
491	out[1] = ((uint128_t) in1[0]) * in2[1] +
492	         ((uint128_t) in1[1]) * in2[0];
493
494	out[2] = ((uint128_t) in1[0]) * in2[2] +
495		 ((uint128_t) in1[1]) * in2[1] +
496	         ((uint128_t) in1[2]) * in2[0];
497
498	out[3] = ((uint128_t) in1[0]) * in2[3] +
499		 ((uint128_t) in1[1]) * in2[2] +
500		 ((uint128_t) in1[2]) * in2[1] +
501		 ((uint128_t) in1[3]) * in2[0];
502
503	out[4] = ((uint128_t) in1[0]) * in2[4] +
504		 ((uint128_t) in1[1]) * in2[3] +
505		 ((uint128_t) in1[2]) * in2[2] +
506		 ((uint128_t) in1[3]) * in2[1] +
507		 ((uint128_t) in1[4]) * in2[0];
508
509	out[5] = ((uint128_t) in1[0]) * in2[5] +
510		 ((uint128_t) in1[1]) * in2[4] +
511		 ((uint128_t) in1[2]) * in2[3] +
512		 ((uint128_t) in1[3]) * in2[2] +
513		 ((uint128_t) in1[4]) * in2[1] +
514		 ((uint128_t) in1[5]) * in2[0];
515
516	out[6] = ((uint128_t) in1[0]) * in2[6] +
517		 ((uint128_t) in1[1]) * in2[5] +
518		 ((uint128_t) in1[2]) * in2[4] +
519		 ((uint128_t) in1[3]) * in2[3] +
520		 ((uint128_t) in1[4]) * in2[2] +
521		 ((uint128_t) in1[5]) * in2[1] +
522		 ((uint128_t) in1[6]) * in2[0];
523
524	out[7] = ((uint128_t) in1[0]) * in2[7] +
525		 ((uint128_t) in1[1]) * in2[6] +
526		 ((uint128_t) in1[2]) * in2[5] +
527		 ((uint128_t) in1[3]) * in2[4] +
528		 ((uint128_t) in1[4]) * in2[3] +
529		 ((uint128_t) in1[5]) * in2[2] +
530		 ((uint128_t) in1[6]) * in2[1] +
531		 ((uint128_t) in1[7]) * in2[0];
532
533	out[8] = ((uint128_t) in1[0]) * in2[8] +
534		 ((uint128_t) in1[1]) * in2[7] +
535		 ((uint128_t) in1[2]) * in2[6] +
536		 ((uint128_t) in1[3]) * in2[5] +
537		 ((uint128_t) in1[4]) * in2[4] +
538		 ((uint128_t) in1[5]) * in2[3] +
539		 ((uint128_t) in1[6]) * in2[2] +
540		 ((uint128_t) in1[7]) * in2[1] +
541		 ((uint128_t) in1[8]) * in2[0];
542
543	/* See comment in felem_square about the use of in2x2 here */
544
545	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
546		  ((uint128_t) in1[2]) * in2x2[7] +
547		  ((uint128_t) in1[3]) * in2x2[6] +
548		  ((uint128_t) in1[4]) * in2x2[5] +
549		  ((uint128_t) in1[5]) * in2x2[4] +
550		  ((uint128_t) in1[6]) * in2x2[3] +
551		  ((uint128_t) in1[7]) * in2x2[2] +
552		  ((uint128_t) in1[8]) * in2x2[1];
553
554	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
555		  ((uint128_t) in1[3]) * in2x2[7] +
556		  ((uint128_t) in1[4]) * in2x2[6] +
557		  ((uint128_t) in1[5]) * in2x2[5] +
558		  ((uint128_t) in1[6]) * in2x2[4] +
559		  ((uint128_t) in1[7]) * in2x2[3] +
560		  ((uint128_t) in1[8]) * in2x2[2];
561
562	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
563		  ((uint128_t) in1[4]) * in2x2[7] +
564		  ((uint128_t) in1[5]) * in2x2[6] +
565		  ((uint128_t) in1[6]) * in2x2[5] +
566		  ((uint128_t) in1[7]) * in2x2[4] +
567		  ((uint128_t) in1[8]) * in2x2[3];
568
569	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
570		  ((uint128_t) in1[5]) * in2x2[7] +
571		  ((uint128_t) in1[6]) * in2x2[6] +
572		  ((uint128_t) in1[7]) * in2x2[5] +
573		  ((uint128_t) in1[8]) * in2x2[4];
574
575	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
576		  ((uint128_t) in1[6]) * in2x2[7] +
577		  ((uint128_t) in1[7]) * in2x2[6] +
578		  ((uint128_t) in1[8]) * in2x2[5];
579
580	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
581		  ((uint128_t) in1[7]) * in2x2[7] +
582		  ((uint128_t) in1[8]) * in2x2[6];
583
584	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
585		  ((uint128_t) in1[8]) * in2x2[7];
586
587	out[7] += ((uint128_t) in1[8]) * in2x2[8];
588	}
589
590static const limb bottom52bits = 0xfffffffffffff;
591
592/* felem_reduce converts a largefelem to an felem.
593 * On entry:
594 *   in[i] < 2^128
595 * On exit:
596 *   out[i] < 2^59 + 2^14
597 */
598static void felem_reduce(felem out, const largefelem in)
599	{
600	u64 overflow1, overflow2;
601
602	out[0] = ((limb) in[0]) & bottom58bits;
603	out[1] = ((limb) in[1]) & bottom58bits;
604	out[2] = ((limb) in[2]) & bottom58bits;
605	out[3] = ((limb) in[3]) & bottom58bits;
606	out[4] = ((limb) in[4]) & bottom58bits;
607	out[5] = ((limb) in[5]) & bottom58bits;
608	out[6] = ((limb) in[6]) & bottom58bits;
609	out[7] = ((limb) in[7]) & bottom58bits;
610	out[8] = ((limb) in[8]) & bottom58bits;
611
612	/* out[i] < 2^58 */
613
614	out[1] += ((limb) in[0]) >> 58;
615	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
616	/* out[1] < 2^58 + 2^6 + 2^58
617	 *        = 2^59 + 2^6 */
618	out[2] += ((limb) (in[0] >> 64)) >> 52;
619
620	out[2] += ((limb) in[1]) >> 58;
621	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
622	out[3] += ((limb) (in[1] >> 64)) >> 52;
623
624	out[3] += ((limb) in[2]) >> 58;
625	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
626	out[4] += ((limb) (in[2] >> 64)) >> 52;
627
628	out[4] += ((limb) in[3]) >> 58;
629	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
630	out[5] += ((limb) (in[3] >> 64)) >> 52;
631
632	out[5] += ((limb) in[4]) >> 58;
633	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
634	out[6] += ((limb) (in[4] >> 64)) >> 52;
635
636	out[6] += ((limb) in[5]) >> 58;
637	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
638	out[7] += ((limb) (in[5] >> 64)) >> 52;
639
640	out[7] += ((limb) in[6]) >> 58;
641	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
642	out[8] += ((limb) (in[6] >> 64)) >> 52;
643
644	out[8] += ((limb) in[7]) >> 58;
645	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
646	/* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
647	 *            < 2^59 + 2^13 */
648	overflow1 = ((limb) (in[7] >> 64)) >> 52;
649
650	overflow1 += ((limb) in[8]) >> 58;
651	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
652	overflow2 = ((limb) (in[8] >> 64)) >> 52;
653
654	overflow1 <<= 1;  /* overflow1 < 2^13 + 2^7 + 2^59 */
655	overflow2 <<= 1;  /* overflow2 < 2^13 */
656
657	out[0] += overflow1;  /* out[0] < 2^60 */
658	out[1] += overflow2;  /* out[1] < 2^59 + 2^6 + 2^13 */
659
660	out[1] += out[0] >> 58; out[0] &= bottom58bits;
661	/* out[0] < 2^58
662	 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
663	 *        < 2^59 + 2^14 */
664	}
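
/* Why the top carries are doubled above (illustrative check): overflow1
 * collects the pieces whose weight is 2^(58*9) = 2^522 (bits 116 and up of
 * in[7] together with bits 58..115 of in[8]), and 2^522 == 2 (mod 2^521 - 1),
 * so it is shifted left once and added back at position 0. overflow2 collects
 * bits 116 and up of in[8], whose weight is 2^580 = 2^58 * 2^522 == 2 * 2^58,
 * so it is shifted left once and added back at position 1. */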
665
666static void felem_square_reduce(felem out, const felem in)
667	{
668	largefelem tmp;
669	felem_square(tmp, in);
670	felem_reduce(out, tmp);
671	}
672
673static void felem_mul_reduce(felem out, const felem in1, const felem in2)
674	{
675	largefelem tmp;
676	felem_mul(tmp, in1, in2);
677	felem_reduce(out, tmp);
678	}
679
680/* felem_inv calculates |out| = |in|^{-1}
681 *
682 * Based on Fermat's Little Theorem:
683 *   a^p = a (mod p)
684 *   a^{p-1} = 1 (mod p)
685 *   a^{p-2} = a^{-1} (mod p)
686 */
687static void felem_inv(felem out, const felem in)
688	{
689	felem ftmp, ftmp2, ftmp3, ftmp4;
690	largefelem tmp;
691	unsigned i;
692
693	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2^1 */
694	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 2^0 */
695	felem_assign(ftmp2, ftmp);
696	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^1 */
697	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^0 */
698	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^4 - 2^1 */
699
700	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^3 - 2^1 */
701	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^2 */
702	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^0 */
703
704	felem_assign(ftmp2, ftmp3);
705	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^5 - 2^1 */
706	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^6 - 2^2 */
707	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^7 - 2^3 */
708	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^4 */
709	felem_assign(ftmp4, ftmp3);
710	felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);	/* 2^8 - 2^1 */
711	felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);	/* 2^9 - 2^2 */
712	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^0 */
713	felem_assign(ftmp2, ftmp3);
714
715	for (i = 0; i < 8; i++)
716		{
717		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^8 */
718		}
719	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^0 */
720	felem_assign(ftmp2, ftmp3);
721
722	for (i = 0; i < 16; i++)
723		{
724		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^16 */
725		}
726	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^0 */
727	felem_assign(ftmp2, ftmp3);
728
729	for (i = 0; i < 32; i++)
730		{
731		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^32 */
732		}
733	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^0 */
734	felem_assign(ftmp2, ftmp3);
735
736	for (i = 0; i < 64; i++)
737		{
738		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^64 */
739		}
740	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^0 */
741	felem_assign(ftmp2, ftmp3);
742
743	for (i = 0; i < 128; i++)
744		{
745		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^128 */
746		}
747	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^0 */
748	felem_assign(ftmp2, ftmp3);
749
750	for (i = 0; i < 256; i++)
751		{
752		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^256 */
753		}
754	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^0 */
755
756	for (i = 0; i < 9; i++)
757		{
758		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^9 */
759		}
760	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^2 */
761	felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);	/* 2^521 - 3 */
762	}
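
/* Worked sketch of the chain above (illustrative only): the code first builds
 * a^(2^k - 1) for k = 2, 4, 8, 16, 32, 64, 128, 256, 512 using the identity
 *   a^(2^(2k) - 1) = (a^(2^k - 1))^(2^k) * a^(2^k - 1),
 * i.e. "square k times, then multiply by the saved value". Nine more squarings
 * give a^(2^521 - 2^9); multiplying by ftmp4 = a^(2^9 - 2^2) gives
 * a^(2^521 - 2^2), and a final multiplication by |in| gives
 * a^(2^521 - 3) = a^(p - 2) = a^(-1) (mod p). */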
763
764/* This is 2^521-1, expressed as an felem */
765static const felem kPrime =
766	{
767	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
768	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
769	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
770	};
771
772/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
773 * otherwise.
774 * On entry:
775 *   in[i] < 2^59 + 2^14
776 */
777static limb felem_is_zero(const felem in)
778	{
779	felem ftmp;
780	limb is_zero, is_p;
781	felem_assign(ftmp, in);
782
783	ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
784	/* ftmp[8] < 2^57 */
785	ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
786	ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
787	ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
788	ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
789	ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
790	ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
791	ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
792	ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
793	/* ftmp[8] < 2^57 + 4 */
794
795	/* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
796	 * greater than our bound for ftmp[8]. Therefore we only have to check
797	 * whether the value is zero or 2^521-1. */
798
799	is_zero = 0;
800	is_zero |= ftmp[0];
801	is_zero |= ftmp[1];
802	is_zero |= ftmp[2];
803	is_zero |= ftmp[3];
804	is_zero |= ftmp[4];
805	is_zero |= ftmp[5];
806	is_zero |= ftmp[6];
807	is_zero |= ftmp[7];
808	is_zero |= ftmp[8];
809
810	is_zero--;
811	/* We know that ftmp[i] < 2^63, therefore the only way that the top bit
812	 * can be set is if is_zero was 0 before the decrement. */
813	is_zero = ((s64) is_zero) >> 63;
814
815	is_p = ftmp[0] ^ kPrime[0];
816	is_p |= ftmp[1] ^ kPrime[1];
817	is_p |= ftmp[2] ^ kPrime[2];
818	is_p |= ftmp[3] ^ kPrime[3];
819	is_p |= ftmp[4] ^ kPrime[4];
820	is_p |= ftmp[5] ^ kPrime[5];
821	is_p |= ftmp[6] ^ kPrime[6];
822	is_p |= ftmp[7] ^ kPrime[7];
823	is_p |= ftmp[8] ^ kPrime[8];
824
825	is_p--;
826	is_p = ((s64) is_p) >> 63;
827
828	is_zero |= is_p;
829	return is_zero;
830	}
831
832static int felem_is_zero_int(const felem in)
833	{
834	return (int) (felem_is_zero(in) & ((limb)1));
835	}
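
/* The branch-free pattern used above, in isolation (illustrative sketch under
 * "#if 0", never compiled; the helper name is ours): for any x < 2^63, x - 1
 * has its top bit set exactly when x == 0, and the arithmetic right shift
 * spreads that bit into an all-ones or all-zeroes mask. Like felem_is_zero,
 * this relies on >> acting as an arithmetic shift on signed values. */
#if 0
static limb is_zero_mask_example(limb x)
	{
	x--;
	return (limb) (((s64) x) >> 63);
	}
#endif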
836
837/* felem_contract converts |in| to its unique, minimal representation.
838 * On entry:
839 *   in[i] < 2^59 + 2^14
840 */
841static void felem_contract(felem out, const felem in)
842	{
843	limb is_p, is_greater, sign;
844	static const limb two58 = ((limb)1) << 58;
845
846	felem_assign(out, in);
847
848	out[0] += out[8] >> 57; out[8] &= bottom57bits;
849	/* out[8] < 2^57 */
850	out[1] += out[0] >> 58; out[0] &= bottom58bits;
851	out[2] += out[1] >> 58; out[1] &= bottom58bits;
852	out[3] += out[2] >> 58; out[2] &= bottom58bits;
853	out[4] += out[3] >> 58; out[3] &= bottom58bits;
854	out[5] += out[4] >> 58; out[4] &= bottom58bits;
855	out[6] += out[5] >> 58; out[5] &= bottom58bits;
856	out[7] += out[6] >> 58; out[6] &= bottom58bits;
857	out[8] += out[7] >> 58; out[7] &= bottom58bits;
858	/* out[8] < 2^57 + 4 */
859
860	/* If the value is greater than 2^521-1 then we have to subtract
861	 * 2^521-1 out. See the comments in felem_is_zero regarding why we
862	 * don't test for other multiples of the prime. */
863
864	/* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
865
866	is_p = out[0] ^ kPrime[0];
867	is_p |= out[1] ^ kPrime[1];
868	is_p |= out[2] ^ kPrime[2];
869	is_p |= out[3] ^ kPrime[3];
870	is_p |= out[4] ^ kPrime[4];
871	is_p |= out[5] ^ kPrime[5];
872	is_p |= out[6] ^ kPrime[6];
873	is_p |= out[7] ^ kPrime[7];
874	is_p |= out[8] ^ kPrime[8];
875
876	is_p--;
877	is_p &= is_p << 32;
878	is_p &= is_p << 16;
879	is_p &= is_p << 8;
880	is_p &= is_p << 4;
881	is_p &= is_p << 2;
882	is_p &= is_p << 1;
883	is_p = ((s64) is_p) >> 63;
884	is_p = ~is_p;
885
886	/* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
887
888	out[0] &= is_p;
889	out[1] &= is_p;
890	out[2] &= is_p;
891	out[3] &= is_p;
892	out[4] &= is_p;
893	out[5] &= is_p;
894	out[6] &= is_p;
895	out[7] &= is_p;
896	out[8] &= is_p;
897
898	/* In order to test that |out| >= 2^521-1 we need only test if out[8]
899	 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
900	is_greater = out[8] >> 57;
901	is_greater |= is_greater << 32;
902	is_greater |= is_greater << 16;
903	is_greater |= is_greater << 8;
904	is_greater |= is_greater << 4;
905	is_greater |= is_greater << 2;
906	is_greater |= is_greater << 1;
907	is_greater = ((s64) is_greater) >> 63;
908
909	out[0] -= kPrime[0] & is_greater;
910	out[1] -= kPrime[1] & is_greater;
911	out[2] -= kPrime[2] & is_greater;
912	out[3] -= kPrime[3] & is_greater;
913	out[4] -= kPrime[4] & is_greater;
914	out[5] -= kPrime[5] & is_greater;
915	out[6] -= kPrime[6] & is_greater;
916	out[7] -= kPrime[7] & is_greater;
917	out[8] -= kPrime[8] & is_greater;
918
919	/* Eliminate negative coefficients */
920	sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
921	sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
922	sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
923	sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
924	sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
925	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
926	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
927	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
928	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
929	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
930	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
931	}
932
933/* Group operations
934 * ----------------
935 *
936 * Building on top of the field operations we have the operations on the
937 * elliptic curve group itself. Points on the curve are represented in Jacobian
938 * coordinates */
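
/* As a reminder (illustrative only): a Jacobian triple (X, Y, Z) with Z != 0
 * represents the affine point (X/Z^2, Y/Z^3), and Z == 0 represents the point
 * at infinity. In these coordinates the curve equation
 *   y^2 = x^3 - 3*x + b
 * becomes
 *   Y^2 = X^3 - 3*X*Z^4 + b*Z^6. */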
939
940/* point_double calculates 2*(x_in, y_in, z_in)
941 *
942 * The method is taken from:
943 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
944 *
945 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
946 * while x_out == y_in is not (maybe this works, but it's not tested). */
947static void
948point_double(felem x_out, felem y_out, felem z_out,
949	     const felem x_in, const felem y_in, const felem z_in)
950	{
951	largefelem tmp, tmp2;
952	felem delta, gamma, beta, alpha, ftmp, ftmp2;
953
954	felem_assign(ftmp, x_in);
955	felem_assign(ftmp2, x_in);
956
957	/* delta = z^2 */
958	felem_square(tmp, z_in);
959	felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */
960
961	/* gamma = y^2 */
962	felem_square(tmp, y_in);
963	felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */
964
965	/* beta = x*gamma */
966	felem_mul(tmp, x_in, gamma);
967	felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */
968
969	/* alpha = 3*(x-delta)*(x+delta) */
970	felem_diff64(ftmp, delta);
971	/* ftmp[i] < 2^61 */
972	felem_sum64(ftmp2, delta);
973	/* ftmp2[i] < 2^60 + 2^15 */
974	felem_scalar64(ftmp2, 3);
975	/* ftmp2[i] < 3*2^60 + 3*2^15 */
976	felem_mul(tmp, ftmp, ftmp2);
977	/* tmp[i] < 17(3*2^121 + 3*2^76)
978	 *        = 51*2^121 + 51*2^76
979	 *        < 64*2^121 + 64*2^76
980	 *        = 2^127 + 2^82
981	 *        < 2^128 */
982	felem_reduce(alpha, tmp);
983
984	/* x' = alpha^2 - 8*beta */
985	felem_square(tmp, alpha);
986	/* tmp[i] < 17*2^120
987	 *        < 2^125 */
988	felem_assign(ftmp, beta);
989	felem_scalar64(ftmp, 8);
990	/* ftmp[i] < 2^62 + 2^17 */
991	felem_diff_128_64(tmp, ftmp);
992	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
993	felem_reduce(x_out, tmp);
994
995	/* z' = (y + z)^2 - gamma - delta */
996	felem_sum64(delta, gamma);
997	/* delta[i] < 2^60 + 2^15 */
998	felem_assign(ftmp, y_in);
999	felem_sum64(ftmp, z_in);
1000	/* ftmp[i] < 2^60 + 2^15 */
1001	felem_square(tmp, ftmp);
1002	/* tmp[i] < 17(2^122)
1003	 *        < 2^127 */
1004	felem_diff_128_64(tmp, delta);
1005	/* tmp[i] < 2^127 + 2^63 */
1006	felem_reduce(z_out, tmp);
1007
1008	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
1009	felem_scalar64(beta, 4);
1010	/* beta[i] < 2^61 + 2^16 */
1011	felem_diff64(beta, x_out);
1012	/* beta[i] < 2^61 + 2^60 + 2^16 */
1013	felem_mul(tmp, alpha, beta);
1014	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1015	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1016	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1017	 *        < 2^128 */
1018	felem_square(tmp2, gamma);
1019	/* tmp2[i] < 17*(2^59 + 2^14)^2
1020	 *         = 17*(2^118 + 2^74 + 2^28) */
1021	felem_scalar128(tmp2, 8);
1022	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1023	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1024	 *         < 2^126 */
1025	felem_diff128(tmp, tmp2);
1026	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1027	 *        = 2^127 + 2^124 + 2^123 + 2^120 + 2^119 + 2^80 + 2^78 + 2^76 +
1028	 *          2^74 + 2^34 + 2^30 - 2^69
1029	 *        < 2^128 */
1030	felem_reduce(y_out, tmp);
1031	}
1032
1033/* copy_conditional copies in to out iff mask is all ones. */
1034static void
1035copy_conditional(felem out, const felem in, limb mask)
1036	{
1037	unsigned i;
1038	for (i = 0; i < NLIMBS; ++i)
1039		{
1040		const limb tmp = mask & (in[i] ^ out[i]);
1041		out[i] ^= tmp;
1042		}
1043	}
1044
1045/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1046 *
1047 * The method is taken from
1048 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1049 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1050 *
1051 * This function includes a branch for checking whether the two input points
1052 * are equal (while not equal to the point at infinity). This case never
1053 * happens during single point multiplication, so there is no timing leak for
1054 * ECDH or ECDSA signing. */
1055static void point_add(felem x3, felem y3, felem z3,
1056	const felem x1, const felem y1, const felem z1,
1057	const int mixed, const felem x2, const felem y2, const felem z2)
1058	{
1059	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1060	largefelem tmp, tmp2;
1061	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1062
1063	z1_is_zero = felem_is_zero(z1);
1064	z2_is_zero = felem_is_zero(z2);
1065
1066	/* ftmp = z1z1 = z1**2 */
1067	felem_square(tmp, z1);
1068	felem_reduce(ftmp, tmp);
1069
1070	if (!mixed)
1071		{
1072		/* ftmp2 = z2z2 = z2**2 */
1073		felem_square(tmp, z2);
1074		felem_reduce(ftmp2, tmp);
1075
1076		/* u1 = ftmp3 = x1*z2z2 */
1077		felem_mul(tmp, x1, ftmp2);
1078		felem_reduce(ftmp3, tmp);
1079
1080		/* ftmp5 = z1 + z2 */
1081		felem_assign(ftmp5, z1);
1082		felem_sum64(ftmp5, z2);
1083		/* ftmp5[i] < 2^61 */
1084
1085		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1086		felem_square(tmp, ftmp5);
1087		/* tmp[i] < 17*2^122 */
1088		felem_diff_128_64(tmp, ftmp);
1089		/* tmp[i] < 17*2^122 + 2^63 */
1090		felem_diff_128_64(tmp, ftmp2);
1091		/* tmp[i] < 17*2^122 + 2^64 */
1092		felem_reduce(ftmp5, tmp);
1093
1094		/* ftmp2 = z2 * z2z2 */
1095		felem_mul(tmp, ftmp2, z2);
1096		felem_reduce(ftmp2, tmp);
1097
1098		/* s1 = ftmp6 = y1 * z2**3 */
1099		felem_mul(tmp, y1, ftmp2);
1100		felem_reduce(ftmp6, tmp);
1101		}
1102	else
1103		{
1104		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1105
1106		/* u1 = ftmp3 = x1*z2z2 */
1107		felem_assign(ftmp3, x1);
1108
1109		/* ftmp5 = 2*z1z2 */
1110		felem_scalar(ftmp5, z1, 2);
1111
1112		/* s1 = ftmp6 = y1 * z2**3 */
1113		felem_assign(ftmp6, y1);
1114		}
1115
1116	/* u2 = x2*z1z1 */
1117	felem_mul(tmp, x2, ftmp);
1118	/* tmp[i] < 17*2^120 */
1119
1120	/* h = ftmp4 = u2 - u1 */
1121	felem_diff_128_64(tmp, ftmp3);
1122	/* tmp[i] < 17*2^120 + 2^63 */
1123	felem_reduce(ftmp4, tmp);
1124
1125	x_equal = felem_is_zero(ftmp4);
1126
1127	/* z_out = ftmp5 * h */
1128	felem_mul(tmp, ftmp5, ftmp4);
1129	felem_reduce(z_out, tmp);
1130
1131	/* ftmp = z1 * z1z1 */
1132	felem_mul(tmp, ftmp, z1);
1133	felem_reduce(ftmp, tmp);
1134
1135	/* s2 = tmp = y2 * z1**3 */
1136	felem_mul(tmp, y2, ftmp);
1137	/* tmp[i] < 17*2^120 */
1138
1139	/* r = ftmp5 = (s2 - s1)*2 */
1140	felem_diff_128_64(tmp, ftmp6);
1141	/* tmp[i] < 17*2^120 + 2^63 */
1142	felem_reduce(ftmp5, tmp);
1143	y_equal = felem_is_zero(ftmp5);
1144	felem_scalar64(ftmp5, 2);
1145	/* ftmp5[i] < 2^61 */
1146
1147	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1148		{
1149		point_double(x3, y3, z3, x1, y1, z1);
1150		return;
1151		}
1152
1153	/* I = ftmp = (2h)**2 */
1154	felem_assign(ftmp, ftmp4);
1155	felem_scalar64(ftmp, 2);
1156	/* ftmp[i] < 2^61 */
1157	felem_square(tmp, ftmp);
1158	/* tmp[i] < 17*2^122 */
1159	felem_reduce(ftmp, tmp);
1160
1161	/* J = ftmp2 = h * I */
1162	felem_mul(tmp, ftmp4, ftmp);
1163	felem_reduce(ftmp2, tmp);
1164
1165	/* V = ftmp4 = U1 * I */
1166	felem_mul(tmp, ftmp3, ftmp);
1167	felem_reduce(ftmp4, tmp);
1168
1169	/* x_out = r**2 - J - 2V */
1170	felem_square(tmp, ftmp5);
1171	/* tmp[i] < 17*2^122 */
1172	felem_diff_128_64(tmp, ftmp2);
1173	/* tmp[i] < 17*2^122 + 2^63 */
1174	felem_assign(ftmp3, ftmp4);
1175	felem_scalar64(ftmp4, 2);
1176	/* ftmp4[i] < 2^61 */
1177	felem_diff_128_64(tmp, ftmp4);
1178	/* tmp[i] < 17*2^122 + 2^64 */
1179	felem_reduce(x_out, tmp);
1180
1181	/* y_out = r(V-x_out) - 2 * s1 * J */
1182	felem_diff64(ftmp3, x_out);
1183	/* ftmp3[i] < 2^60 + 2^60
1184	 *          = 2^61 */
1185	felem_mul(tmp, ftmp5, ftmp3);
1186	/* tmp[i] < 17*2^122 */
1187	felem_mul(tmp2, ftmp6, ftmp2);
1188	/* tmp2[i] < 17*2^120 */
1189	felem_scalar128(tmp2, 2);
1190	/* tmp2[i] < 17*2^121 */
1191	felem_diff128(tmp, tmp2);
1192	/* tmp[i] < 2^127 - 2^69 + 17*2^122
1193	 *        = 2^127 + 2^126 + 2^122 - 2^69
1194	 *        < 2^128 */
1195	felem_reduce(y_out, tmp);
1196
1197	copy_conditional(x_out, x2, z1_is_zero);
1198	copy_conditional(x_out, x1, z2_is_zero);
1199	copy_conditional(y_out, y2, z1_is_zero);
1200	copy_conditional(y_out, y1, z2_is_zero);
1201	copy_conditional(z_out, z2, z1_is_zero);
1202	copy_conditional(z_out, z1, z2_is_zero);
1203	felem_assign(x3, x_out);
1204	felem_assign(y3, y_out);
1205	felem_assign(z3, z_out);
1206	}
1207
1208/* Base point pre computation
1209 * --------------------------
1210 *
1211 * Two different sorts of precomputed tables are used in the following code.
1212 * Each contain various points on the curve, where each point is three field
1213 * elements (x, y, z).
1214 *
1215 * For the base point table, z is usually 1 (0 for the point at infinity).
1216 * This table has 16 elements:
1217 * index | bits    | point
1218 * ------+---------+------------------------------
1219 *     0 | 0 0 0 0 | 0G
1220 *     1 | 0 0 0 1 | 1G
1221 *     2 | 0 0 1 0 | 2^130G
1222 *     3 | 0 0 1 1 | (2^130 + 1)G
1223 *     4 | 0 1 0 0 | 2^260G
1224 *     5 | 0 1 0 1 | (2^260 + 1)G
1225 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1226 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1227 *     8 | 1 0 0 0 | 2^390G
1228 *     9 | 1 0 0 1 | (2^390 + 1)G
1229 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1230 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1231 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1232 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1233 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1234 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1235 *
1236 * The reason for this is so that we can clock bits into four different
1237 * locations when doing simple scalar multiplies against the base point.
1238 *
1239 * Tables for other points have table[i] = iG for i in 0 .. 16. */
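
/* Worked example of the "four locations" above (illustrative only): split a
 * 521-bit scalar s into four windows,
 *   s = s0 + 2^130*s1 + 2^260*s2 + 2^390*s3,
 * where s0, s1, s2 are 130 bits and s3 is 131 bits. In round i of batch_mul
 * below, the 4-bit index
 *   bit(s, i+390)*8 + bit(s, i+260)*4 + bit(s, i+130)*2 + bit(s, i)
 * selects exactly the table entry
 *   (bit(s, i+390)*2^390 + bit(s, i+260)*2^260 + bit(s, i+130)*2^130 + bit(s, i))*G,
 * so each doubling consumes one bit from each window at once. Because s3 has
 * one extra bit, the first generator round (i == 130) uses only the top
 * window's bit. */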
1240
1241/* gmul is the table of precomputed base points */
1242static const felem gmul[16][3] =
1243	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1244	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1245	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1246	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1247	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1248	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1249	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1250	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1251	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1252	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1253	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1254	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1255	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1256	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1257	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1258	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1259	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1260	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1261	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1262	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1263	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1264	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1265	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1266	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1267	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1268	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1269	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1270	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1271	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1272	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1273	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1274	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1275	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1276	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1277	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1278	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1279	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1280	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1281	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1282	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1283	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1284	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1285	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1286	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1287	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1288	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1289	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1290	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1291	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1292	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1293	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1294	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1295	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1296	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1297	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1298	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1299	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1300	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1301	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1302	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1303	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1304	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1305	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1306	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1307	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1308	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1309	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1310	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1311	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1312	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1313	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1314	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1315	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1316	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1317	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1318	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1319	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1320	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1321	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1322	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1323	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1324	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1325	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1326	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1327	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1328	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1329	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1330	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1331	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1332	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1333	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1334	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1335	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1336	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1337	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1338	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1339	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1340	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1341	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1342	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1343	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1344	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1345	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1346	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1347	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1348	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1349	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1350	 {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1351
1352/* select_point selects the |idx|th point from a precomputation table and
1353 * copies it to out. */
1354static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
1355			 felem out[3])
1356	{
1357	unsigned i, j;
1358	limb *outlimbs = &out[0][0];
1359	memset(outlimbs, 0, 3 * sizeof(felem));
1360
1361	for (i = 0; i < size; i++)
1362		{
1363		const limb *inlimbs = &pre_comp[i][0][0];
1364		limb mask = i ^ idx;
1365		mask |= mask >> 4;
1366		mask |= mask >> 2;
1367		mask |= mask >> 1;
1368		mask &= 1;
1369		mask--;
1370		for (j = 0; j < NLIMBS * 3; j++)
1371			outlimbs[j] |= inlimbs[j] & mask;
1372		}
1373	}
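
/* How the mask above stays constant-time (illustrative note): i ^ idx is zero
 * only when i == idx; OR-ing its bits down into bit 0 and masking with 1
 * leaves 0 for a match and 1 otherwise, and the decrement turns that into an
 * all-ones limb for the match and 0 for everything else. Every table entry is
 * read on every iteration, so the memory access pattern does not depend on
 * |idx|. */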
1374
1375/* get_bit returns the |i|th bit in |in| */
1376static char get_bit(const felem_bytearray in, int i)
1377	{
1378	if (i < 0)
1379		return 0;
1380	return (in[i >> 3] >> (i & 7)) & 1;
1381	}
1382
1383/* Interleaved point multiplication using precomputed point multiples:
1384 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1385 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1386 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1387 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1388static void batch_mul(felem x_out, felem y_out, felem z_out,
1389	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1390	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1391	{
1392	int i, skip;
1393	unsigned num, gen_mul = (g_scalar != NULL);
1394	felem nq[3], tmp[4];
1395	limb bits;
1396	u8 sign, digit;
1397
1398	/* set nq to the point at infinity */
1399	memset(nq, 0, 3 * sizeof(felem));
1400
1401	/* Loop over all scalars msb-to-lsb, interleaving additions
1402	 * of multiples of the generator (last quarter of rounds)
1403	 * and additions of other points multiples (every 5th round).
1404	 */
1405	skip = 1; /* save two point operations in the first round */
1406	for (i = (num_points ? 520 : 130); i >= 0; --i)
1407		{
1408		/* double */
1409		if (!skip)
1410			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1411
1412		/* add multiples of the generator */
1413		if (gen_mul && (i <= 130))
1414			{
1415			bits = get_bit(g_scalar, i + 390) << 3;
1416			if (i < 130)
1417				{
1418				bits |= get_bit(g_scalar, i + 260) << 2;
1419				bits |= get_bit(g_scalar, i + 130) << 1;
1420				bits |= get_bit(g_scalar, i);
1421				}
1422			/* select the point to add, in constant time */
1423			select_point(bits, 16, g_pre_comp, tmp);
1424			if (!skip)
1425				{
1426				point_add(nq[0], nq[1], nq[2],
1427					nq[0], nq[1], nq[2],
1428					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1429				}
1430			else
1431				{
1432				memcpy(nq, tmp, 3 * sizeof(felem));
1433				skip = 0;
1434				}
1435			}
1436
1437		/* do other additions every 5 doublings */
1438		if (num_points && (i % 5 == 0))
1439			{
1440			/* loop over all scalars */
1441			for (num = 0; num < num_points; ++num)
1442				{
1443				bits = get_bit(scalars[num], i + 4) << 5;
1444				bits |= get_bit(scalars[num], i + 3) << 4;
1445				bits |= get_bit(scalars[num], i + 2) << 3;
1446				bits |= get_bit(scalars[num], i + 1) << 2;
1447				bits |= get_bit(scalars[num], i) << 1;
1448				bits |= get_bit(scalars[num], i - 1);
1449				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1450
1451				/* select the point to add or subtract, in constant time */
1452				select_point(digit, 17, pre_comp[num], tmp);
1453				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
1454				copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1455
1456				if (!skip)
1457					{
1458					point_add(nq[0], nq[1], nq[2],
1459						nq[0], nq[1], nq[2],
1460						mixed, tmp[0], tmp[1], tmp[2]);
1461					}
1462				else
1463					{
1464					memcpy(nq, tmp, 3 * sizeof(felem));
1465					skip = 0;
1466					}
1467				}
1468			}
1469		}
1470	felem_assign(x_out, nq[0]);
1471	felem_assign(y_out, nq[1]);
1472	felem_assign(z_out, nq[2]);
1473	}
1474
1475
1476/* Precomputation for the group generator. */
1477typedef struct {
1478	felem g_pre_comp[16][3];
1479	int references;
1480} NISTP521_PRE_COMP;
1481
1482const EC_METHOD *EC_GFp_nistp521_method(void)
1483	{
1484	static const EC_METHOD ret = {
1485		EC_FLAGS_DEFAULT_OCT,
1486		NID_X9_62_prime_field,
1487		ec_GFp_nistp521_group_init,
1488		ec_GFp_simple_group_finish,
1489		ec_GFp_simple_group_clear_finish,
1490		ec_GFp_nist_group_copy,
1491		ec_GFp_nistp521_group_set_curve,
1492		ec_GFp_simple_group_get_curve,
1493		ec_GFp_simple_group_get_degree,
1494		ec_GFp_simple_group_check_discriminant,
1495		ec_GFp_simple_point_init,
1496		ec_GFp_simple_point_finish,
1497		ec_GFp_simple_point_clear_finish,
1498		ec_GFp_simple_point_copy,
1499		ec_GFp_simple_point_set_to_infinity,
1500		ec_GFp_simple_set_Jprojective_coordinates_GFp,
1501		ec_GFp_simple_get_Jprojective_coordinates_GFp,
1502		ec_GFp_simple_point_set_affine_coordinates,
1503		ec_GFp_nistp521_point_get_affine_coordinates,
1504		0 /* point_set_compressed_coordinates */,
1505		0 /* point2oct */,
1506		0 /* oct2point */,
1507		ec_GFp_simple_add,
1508		ec_GFp_simple_dbl,
1509		ec_GFp_simple_invert,
1510		ec_GFp_simple_is_at_infinity,
1511		ec_GFp_simple_is_on_curve,
1512		ec_GFp_simple_cmp,
1513		ec_GFp_simple_make_affine,
1514		ec_GFp_simple_points_make_affine,
1515		ec_GFp_nistp521_points_mul,
1516		ec_GFp_nistp521_precompute_mult,
1517		ec_GFp_nistp521_have_precompute_mult,
1518		ec_GFp_nist_field_mul,
1519		ec_GFp_nist_field_sqr,
1520		0 /* field_div */,
1521		0 /* field_encode */,
1522		0 /* field_decode */,
1523		0 /* field_set_to_one */ };
1524
1525	return &ret;
1526	}
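
/* A minimal usage sketch (illustrative only, under "#if 0"): applications do
 * not call EC_GFp_nistp521_method() directly. When OpenSSL is configured with
 * enable-ec_nistp_64_gcc_128, a P-521 group obtained in the usual way is
 * expected to be backed by this implementation; the helper name below is ours.
 */
#if 0
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

static EC_KEY *p521_key_example(void)
	{
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_secp521r1);

	if (key == NULL)
		return NULL;
	if (!EC_KEY_generate_key(key))
		{
		EC_KEY_free(key);
		return NULL;
		}
	return key;
	}
#endif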
1527
1528
1529/******************************************************************************/
1530/*		       FUNCTIONS TO MANAGE PRECOMPUTATION
1531 */
1532
1533static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1534	{
1535	NISTP521_PRE_COMP *ret = NULL;
1536	ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1537	if (!ret)
1538		{
1539		ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1540		return ret;
1541		}
1542	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1543	ret->references = 1;
1544	return ret;
1545	}
1546
1547static void *nistp521_pre_comp_dup(void *src_)
1548	{
1549	NISTP521_PRE_COMP *src = src_;
1550
1551	/* no need to actually copy, these objects never change! */
1552	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1553
1554	return src_;
1555	}
1556
1557static void nistp521_pre_comp_free(void *pre_)
1558	{
1559	int i;
1560	NISTP521_PRE_COMP *pre = pre_;
1561
1562	if (!pre)
1563		return;
1564
1565	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1566	if (i > 0)
1567		return;
1568
1569	OPENSSL_free(pre);
1570	}
1571
1572static void nistp521_pre_comp_clear_free(void *pre_)
1573	{
1574	int i;
1575	NISTP521_PRE_COMP *pre = pre_;
1576
1577	if (!pre)
1578		return;
1579
1580	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1581	if (i > 0)
1582		return;
1583
1584	OPENSSL_cleanse(pre, sizeof(*pre));
1585	OPENSSL_free(pre);
1586	}
1587
1588/******************************************************************************/
1589/*			   OPENSSL EC_METHOD FUNCTIONS
1590 */
1591
1592int ec_GFp_nistp521_group_init(EC_GROUP *group)
1593	{
1594	int ret;
1595	ret = ec_GFp_simple_group_init(group);
1596	group->a_is_minus3 = 1;
1597	return ret;
1598	}
1599
1600int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1601	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1602	{
1603	int ret = 0;
1604	BN_CTX *new_ctx = NULL;
1605	BIGNUM *curve_p, *curve_a, *curve_b;
1606
1607	if (ctx == NULL)
1608		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1609	BN_CTX_start(ctx);
1610	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1611		((curve_a = BN_CTX_get(ctx)) == NULL) ||
1612		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1613	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1614	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1615	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1616	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1617		(BN_cmp(curve_b, b)))
1618		{
1619		ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1620			EC_R_WRONG_CURVE_PARAMETERS);
1621		goto err;
1622		}
1623	group->field_mod_func = BN_nist_mod_521;
1624	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1625err:
1626	BN_CTX_end(ctx);
1627	if (new_ctx != NULL)
1628		BN_CTX_free(new_ctx);
1629	return ret;
1630	}
1631
1632/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1633 * (X', Y') = (X/Z^2, Y/Z^3) */
1634int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1635	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1636	{
1637	felem z1, z2, x_in, y_in, x_out, y_out;
1638	largefelem tmp;
1639
1640	if (EC_POINT_is_at_infinity(group, point))
1641		{
1642		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1643			EC_R_POINT_AT_INFINITY);
1644		return 0;
1645		}
1646	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1647		(!BN_to_felem(z1, &point->Z))) return 0;
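	/* One (expensive) field inversion is shared between both coordinates:
	 * z2 = Z^-1, then z1 = Z^-2 is used for X just below and z1 = Z^-3
	 * for Y further down. */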
1648	felem_inv(z2, z1);
1649	felem_square(tmp, z2); felem_reduce(z1, tmp);
1650	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1651	felem_contract(x_out, x_in);
1652	if (x != NULL)
1653		{
1654		if (!felem_to_BN(x, x_out))
1655			{
1656			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1657			return 0;
1658			}
1659		}
1660	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1661	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1662	felem_contract(y_out, y_in);
1663	if (y != NULL)
1664		{
1665		if (!felem_to_BN(y, y_out))
1666			{
1667			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1668			return 0;
1669			}
1670		}
1671	return 1;
1672	}
1673
1674static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
1675	{
1676	/* Runs in constant time, unless an input is the point at infinity
1677	 * (which normally shouldn't happen). */
1678	ec_GFp_nistp_points_make_affine_internal(
1679		num,
1680		points,
1681		sizeof(felem),
1682		tmp_felems,
1683		(void (*)(void *)) felem_one,
1684		(int (*)(const void *)) felem_is_zero_int,
1685		(void (*)(void *, const void *)) felem_assign,
1686		(void (*)(void *, const void *)) felem_square_reduce,
1687		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
1688		(void (*)(void *, const void *)) felem_inv,
1689		(void (*)(void *, const void *)) felem_contract);
1690	}
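/* With affine (Z = 1) precomputed multiples, the additions performed by
 * batch_mul() can take the cheaper "mixed" path (the mixed flag of
 * point_add), which is why ec_GFp_nistp521_points_mul() below converts its
 * tables whenever more than one point is involved. */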
1691
1692/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values.
1693 * Result is stored in r (r can equal one of the inputs). */
1694int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1695	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1696	const BIGNUM *scalars[], BN_CTX *ctx)
1697	{
1698	int ret = 0;
1699	int j;
1700	int mixed = 0;
1701	BN_CTX *new_ctx = NULL;
1702	BIGNUM *x, *y, *z, *tmp_scalar;
1703	felem_bytearray g_secret;
1704	felem_bytearray *secrets = NULL;
1705	felem (*pre_comp)[17][3] = NULL;
1706	felem *tmp_felems = NULL;
1707	felem_bytearray tmp;
1708	unsigned i, num_bytes;
1709	int have_pre_comp = 0;
1710	size_t num_points = num;
1711	felem x_in, y_in, z_in, x_out, y_out, z_out;
1712	NISTP521_PRE_COMP *pre = NULL;
1713	felem (*g_pre_comp)[3] = NULL;
1714	EC_POINT *generator = NULL;
1715	const EC_POINT *p = NULL;
1716	const BIGNUM *p_scalar = NULL;
1717
1718	if (ctx == NULL)
1719		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1720	BN_CTX_start(ctx);
1721	if (((x = BN_CTX_get(ctx)) == NULL) ||
1722		((y = BN_CTX_get(ctx)) == NULL) ||
1723		((z = BN_CTX_get(ctx)) == NULL) ||
1724		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1725		goto err;
1726
1727	if (scalar != NULL)
1728		{
1729		pre = EC_EX_DATA_get_data(group->extra_data,
1730			nistp521_pre_comp_dup, nistp521_pre_comp_free,
1731			nistp521_pre_comp_clear_free);
1732		if (pre)
1733			/* we have precomputation, try to use it */
1734			g_pre_comp = &pre->g_pre_comp[0];
1735		else
1736			/* try to use the standard precomputation */
1737			g_pre_comp = (felem (*)[3]) gmul;
1738		generator = EC_POINT_new(group);
1739		if (generator == NULL)
1740			goto err;
1741		/* get the generator from precomputation */
1742		if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1743			!felem_to_BN(y, g_pre_comp[1][1]) ||
1744			!felem_to_BN(z, g_pre_comp[1][2]))
1745			{
1746			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1747			goto err;
1748			}
1749		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1750				generator, x, y, z, ctx))
1751			goto err;
1752		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1753			/* precomputation matches generator */
1754			have_pre_comp = 1;
1755		else
1756			/* we don't have valid precomputation:
1757			 * treat the generator as a random point */
1758			num_points++;
1759		}
1760
1761	if (num_points > 0)
1762		{
1763		if (num_points >= 2)
1764			{
1765			/* unless we precompute multiples for just one point,
1766			 * converting those into affine form is time well spent  */
1767			mixed = 1;
1768			}
1769		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1770		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1771		if (mixed)
1772			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1773		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1774			{
1775			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1776			goto err;
1777			}
1778
1779		/* we treat NULL scalars as 0, and NULL points as points at infinity,
1780		 * i.e., they contribute nothing to the linear combination */
1781		memset(secrets, 0, num_points * sizeof(felem_bytearray));
1782		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1783		for (i = 0; i < num_points; ++i)
1784			{
1785			if (i == num)
1786				/* we didn't have a valid precomputation, so we pick
1787				 * the generator */
1788				{
1789				p = EC_GROUP_get0_generator(group);
1790				p_scalar = scalar;
1791				}
1792			else
1793				/* the i^th point */
1794				{
1795				p = points[i];
1796				p_scalar = scalars[i];
1797				}
1798			if ((p_scalar != NULL) && (p != NULL))
1799				{
1800				/* reduce scalar to 0 <= scalar < 2^521 */
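				/* (Reducing modulo the group order does not
				 * change the resulting point, since the order
				 * of any point on the curve divides the group
				 * order.) */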
1801				if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1802					{
1803					/* this is an unusual input, and we don't guarantee
1804					 * constant-timeness */
1805					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1806						{
1807						ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1808						goto err;
1809						}
1810					num_bytes = BN_bn2bin(tmp_scalar, tmp);
1811					}
1812				else
1813					num_bytes = BN_bn2bin(p_scalar, tmp);
1814				flip_endian(secrets[i], tmp, num_bytes);
1815				/* precompute multiples */
1816				if ((!BN_to_felem(x_out, &p->X)) ||
1817					(!BN_to_felem(y_out, &p->Y)) ||
1818					(!BN_to_felem(z_out, &p->Z))) goto err;
1819				memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1820				memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1821				memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
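				/* Fill pre_comp[i][j] = j*P for j = 2..16:
				 * even multiples by doubling (j/2)*P, odd
				 * ones by adding P to (j-1)*P. batch_mul()
				 * later selects one of these 17 entries
				 * (0*P .. 16*P) per window of the scalar. */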
1822				for (j = 2; j <= 16; ++j)
1823					{
1824					if (j & 1)
1825						{
1826						point_add(
1827							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1828							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1829							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1830						}
1831					else
1832						{
1833						point_double(
1834							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1835							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1836						}
1837					}
1838				}
1839			}
1840		if (mixed)
1841			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842		}
1843
1844	/* the scalar for the generator */
1845	if ((scalar != NULL) && (have_pre_comp))
1846		{
1847		memset(g_secret, 0, sizeof(g_secret));
1848		/* reduce scalar to 0 <= scalar < 2^521 */
1849		if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1850			{
1851			/* this is an unusual input, and we don't guarantee
1852			 * constant-timeness */
1853			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1854				{
1855				ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1856				goto err;
1857				}
1858			num_bytes = BN_bn2bin(tmp_scalar, tmp);
1859			}
1860		else
1861			num_bytes = BN_bn2bin(scalar, tmp);
1862		flip_endian(g_secret, tmp, num_bytes);
1863		/* do the multiplication with generator precomputation */
1864		batch_mul(x_out, y_out, z_out,
1865			(const felem_bytearray (*)) secrets, num_points,
1866			g_secret,
1867			mixed, (const felem (*)[17][3]) pre_comp,
1868			(const felem (*)[3]) g_pre_comp);
1869		}
1870	else
1871		/* do the multiplication without generator precomputation */
1872		batch_mul(x_out, y_out, z_out,
1873			(const felem_bytearray (*)) secrets, num_points,
1874			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1875	/* reduce the output to its unique minimal representation */
1876	felem_contract(x_in, x_out);
1877	felem_contract(y_in, y_out);
1878	felem_contract(z_in, z_out);
1879	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1880		(!felem_to_BN(z, z_in)))
1881		{
1882		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1883		goto err;
1884		}
1885	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1886
1887err:
1888	BN_CTX_end(ctx);
1889	if (generator != NULL)
1890		EC_POINT_free(generator);
1891	if (new_ctx != NULL)
1892		BN_CTX_free(new_ctx);
1893	if (secrets != NULL)
1894		OPENSSL_free(secrets);
1895	if (pre_comp != NULL)
1896		OPENSSL_free(pre_comp);
1897	if (tmp_felems != NULL)
1898		OPENSSL_free(tmp_felems);
1899	return ret;
1900	}
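
/* Illustrative sketch only (kept out of the build): applications normally
 * reach ec_GFp_nistp521_points_mul() through the generic EC_POINT_mul() and
 * EC_POINTs_mul() entry points on a P-521 group.  The example assumes the
 * caller includes <openssl/ec.h> and <openssl/obj_mac.h>. */
#if 0
static int example_p521_scalar_mul(const BIGNUM *scalar)
	{
	int ok = 0;
	EC_GROUP *grp = EC_GROUP_new_by_curve_name(NID_secp521r1);
	EC_POINT *res = (grp != NULL) ? EC_POINT_new(grp) : NULL;
	BN_CTX *ctx = BN_CTX_new();

	if ((grp == NULL) || (res == NULL) || (ctx == NULL))
		goto done;
	/* res = scalar*G; EC_POINTs_mul() would additionally accept arrays of
	 * points and scalars for the full linear combination computed above */
	if (!EC_POINT_mul(grp, res, scalar, NULL, NULL, ctx))
		goto done;
	ok = 1;
done:
	BN_CTX_free(ctx);
	EC_POINT_free(res);
	EC_GROUP_free(grp);
	return ok;
	}
#endif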
1901
1902int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1903	{
1904	int ret = 0;
1905	NISTP521_PRE_COMP *pre = NULL;
1906	int i, j;
1907	BN_CTX *new_ctx = NULL;
1908	BIGNUM *x, *y;
1909	EC_POINT *generator = NULL;
1910	felem tmp_felems[16];
1911
1912	/* throw away old precomputation */
1913	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1914		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1915	if (ctx == NULL)
1916		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1917	BN_CTX_start(ctx);
1918	if (((x = BN_CTX_get(ctx)) == NULL) ||
1919		((y = BN_CTX_get(ctx)) == NULL))
1920		goto err;
1921	/* get the generator */
1922	if (group->generator == NULL) goto err;
1923	generator = EC_POINT_new(group);
1924	if (generator == NULL)
1925		goto err;
1926	BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
1927	BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
1928	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1929		goto err;
1930	if ((pre = nistp521_pre_comp_new()) == NULL)
1931		goto err;
1932	/* if the generator is the standard one, use built-in precomputation */
1933	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1934		{
1935		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1936		ret = 1;
1937		goto err;
1938		}
1939	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1940		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1941		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1942		goto err;
1943	/* compute 2^130*G, 2^260*G, 2^390*G */
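	/* Each pass of the loop below doubles g_pre_comp[i] 130 times (one
	 * point_double plus 129 in the inner loop), giving
	 * g_pre_comp[2] = 2^130*G, g_pre_comp[4] = 2^260*G and
	 * g_pre_comp[8] = 2^390*G. */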
1944	for (i = 1; i <= 4; i <<= 1)
1945		{
1946		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1947			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1948			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1949		for (j = 0; j < 129; ++j)
1950			{
1951			point_double(pre->g_pre_comp[2*i][0],
1952				pre->g_pre_comp[2*i][1],
1953				pre->g_pre_comp[2*i][2],
1954				pre->g_pre_comp[2*i][0],
1955				pre->g_pre_comp[2*i][1],
1956				pre->g_pre_comp[2*i][2]);
1957			}
1958		}
1959	/* g_pre_comp[0] is the point at infinity */
1960	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1961	/* the remaining multiples */
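	/* Bit k of the table index selects 2^(130*k)*G, so entry m holds the
	 * sum of those multiples over the set bits of m: the even-indexed
	 * combinations are built here, and the odd ones (entry + G) in the
	 * loop further down. */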
1962	/* 2^130*G + 2^260*G */
1963	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1964		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1965		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1966		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1967		pre->g_pre_comp[2][2]);
1968	/* 2^130*G + 2^390*G */
1969	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1970		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1971		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1972		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1973		pre->g_pre_comp[2][2]);
1974	/* 2^260*G + 2^390*G */
1975	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
1976		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
1977		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1978		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
1979		pre->g_pre_comp[4][2]);
1980	/* 2^130*G + 2^260*G + 2^390*G */
1981	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
1982		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
1983		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1984		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1985		pre->g_pre_comp[2][2]);
1986	for (i = 1; i < 8; ++i)
1987		{
1988		/* odd multiples: add G */
1989		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
1990			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
1991			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
1992			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
1993			pre->g_pre_comp[1][2]);
1994		}
1995	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1996
1997	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
1998			nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
1999		goto err;
2000	ret = 1;
2001	pre = NULL;
2002 err:
2003	BN_CTX_end(ctx);
2004	if (generator != NULL)
2005		EC_POINT_free(generator);
2006	if (new_ctx != NULL)
2007		BN_CTX_free(new_ctx);
2008	if (pre)
2009		nistp521_pre_comp_free(pre);
2010	return ret;
2011	}
2012
2013int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2014	{
2015	if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2016			nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2017		!= NULL)
2018		return 1;
2019	else
2020		return 0;
2021	}
2022
2023#else
2024static void *dummy = &dummy; /* avoid a translation unit with no declarations */
2025#endif
2026