1/*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "inner.h"
26
27/*
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
36 */
37#if BR_NO_ARITH_SHIFT
38#define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
39                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40#define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
41                      | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
42#else
43#define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
44#define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
45#endif
46
47/*
48 * Convert an integer from unsigned big-endian encoding to a sequence of
49 * 30-bit words in little-endian order. The final "partial" word is
50 * returned.
51 */
52static uint32_t
53be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
54{
55	uint32_t acc;
56	int acc_len;
57
58	acc = 0;
59	acc_len = 0;
60	while (len -- > 0) {
61		uint32_t b;
62
63		b = src[len];
64		if (acc_len < 22) {
65			acc |= b << acc_len;
66			acc_len += 8;
67		} else {
68			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
69			acc = b >> (30 - acc_len);
70			acc_len -= 22;
71		}
72	}
73	return acc;
74}
75
76/*
77 * Convert an integer (30-bit words, little-endian) to unsigned
78 * big-endian encoding. The total encoding length is provided; all
79 * the destination bytes will be filled.
80 */
81static void
82le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
83{
84	uint32_t acc;
85	int acc_len;
86
87	acc = 0;
88	acc_len = 0;
89	while (len -- > 0) {
90		if (acc_len < 8) {
91			uint32_t w;
92
93			w = *src ++;
94			dst[len] = (unsigned char)(acc | (w << acc_len));
95			acc = w >> (8 - acc_len);
96			acc_len += 22;
97		} else {
98			dst[len] = (unsigned char)acc;
99			acc >>= 8;
100			acc_len -= 8;
101		}
102	}
103}
104
105/*
106 * Multiply two integers. Source integers are represented as arrays of
107 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
108 * 18 words of 30 bits each.
109 */
110static void
111mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
112{
113	/*
114	 * Maximum intermediate result is no more than
115	 * 10376293531797946367, which fits in 64 bits. Reason:
116	 *
117	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
118	 *   10376293531797946367 < 9663676407 * 2^30
119	 *
120	 * Thus, adding together 9 products of 30-bit integers, with
121	 * a carry of at most 9663676406, yields an integer that fits
122	 * on 64 bits and generates a carry of at most 9663676406.
123	 */
124	uint64_t t[17];
125	uint64_t cc;
126	int i;
127
128	t[ 0] = MUL31(a[0], b[0]);
129	t[ 1] = MUL31(a[0], b[1])
130		+ MUL31(a[1], b[0]);
131	t[ 2] = MUL31(a[0], b[2])
132		+ MUL31(a[1], b[1])
133		+ MUL31(a[2], b[0]);
134	t[ 3] = MUL31(a[0], b[3])
135		+ MUL31(a[1], b[2])
136		+ MUL31(a[2], b[1])
137		+ MUL31(a[3], b[0]);
138	t[ 4] = MUL31(a[0], b[4])
139		+ MUL31(a[1], b[3])
140		+ MUL31(a[2], b[2])
141		+ MUL31(a[3], b[1])
142		+ MUL31(a[4], b[0]);
143	t[ 5] = MUL31(a[0], b[5])
144		+ MUL31(a[1], b[4])
145		+ MUL31(a[2], b[3])
146		+ MUL31(a[3], b[2])
147		+ MUL31(a[4], b[1])
148		+ MUL31(a[5], b[0]);
149	t[ 6] = MUL31(a[0], b[6])
150		+ MUL31(a[1], b[5])
151		+ MUL31(a[2], b[4])
152		+ MUL31(a[3], b[3])
153		+ MUL31(a[4], b[2])
154		+ MUL31(a[5], b[1])
155		+ MUL31(a[6], b[0]);
156	t[ 7] = MUL31(a[0], b[7])
157		+ MUL31(a[1], b[6])
158		+ MUL31(a[2], b[5])
159		+ MUL31(a[3], b[4])
160		+ MUL31(a[4], b[3])
161		+ MUL31(a[5], b[2])
162		+ MUL31(a[6], b[1])
163		+ MUL31(a[7], b[0]);
164	t[ 8] = MUL31(a[0], b[8])
165		+ MUL31(a[1], b[7])
166		+ MUL31(a[2], b[6])
167		+ MUL31(a[3], b[5])
168		+ MUL31(a[4], b[4])
169		+ MUL31(a[5], b[3])
170		+ MUL31(a[6], b[2])
171		+ MUL31(a[7], b[1])
172		+ MUL31(a[8], b[0]);
173	t[ 9] = MUL31(a[1], b[8])
174		+ MUL31(a[2], b[7])
175		+ MUL31(a[3], b[6])
176		+ MUL31(a[4], b[5])
177		+ MUL31(a[5], b[4])
178		+ MUL31(a[6], b[3])
179		+ MUL31(a[7], b[2])
180		+ MUL31(a[8], b[1]);
181	t[10] = MUL31(a[2], b[8])
182		+ MUL31(a[3], b[7])
183		+ MUL31(a[4], b[6])
184		+ MUL31(a[5], b[5])
185		+ MUL31(a[6], b[4])
186		+ MUL31(a[7], b[3])
187		+ MUL31(a[8], b[2]);
188	t[11] = MUL31(a[3], b[8])
189		+ MUL31(a[4], b[7])
190		+ MUL31(a[5], b[6])
191		+ MUL31(a[6], b[5])
192		+ MUL31(a[7], b[4])
193		+ MUL31(a[8], b[3]);
194	t[12] = MUL31(a[4], b[8])
195		+ MUL31(a[5], b[7])
196		+ MUL31(a[6], b[6])
197		+ MUL31(a[7], b[5])
198		+ MUL31(a[8], b[4]);
199	t[13] = MUL31(a[5], b[8])
200		+ MUL31(a[6], b[7])
201		+ MUL31(a[7], b[6])
202		+ MUL31(a[8], b[5]);
203	t[14] = MUL31(a[6], b[8])
204		+ MUL31(a[7], b[7])
205		+ MUL31(a[8], b[6]);
206	t[15] = MUL31(a[7], b[8])
207		+ MUL31(a[8], b[7]);
208	t[16] = MUL31(a[8], b[8]);
209
210	/*
211	 * Propagate carries.
212	 */
213	cc = 0;
214	for (i = 0; i < 17; i ++) {
215		uint64_t w;
216
217		w = t[i] + cc;
218		d[i] = (uint32_t)w & 0x3FFFFFFF;
219		cc = w >> 30;
220	}
221	d[17] = (uint32_t)cc;
222}
223
224/*
225 * Square a 270-bit integer, represented as an array of nine 30-bit words.
226 * Result uses 18 words of 30 bits each.
227 */
228static void
229square9(uint32_t *d, const uint32_t *a)
230{
231	uint64_t t[17];
232	uint64_t cc;
233	int i;
234
235	t[ 0] = MUL31(a[0], a[0]);
236	t[ 1] = ((MUL31(a[0], a[1])) << 1);
237	t[ 2] = MUL31(a[1], a[1])
238		+ ((MUL31(a[0], a[2])) << 1);
239	t[ 3] = ((MUL31(a[0], a[3])
240		+ MUL31(a[1], a[2])) << 1);
241	t[ 4] = MUL31(a[2], a[2])
242		+ ((MUL31(a[0], a[4])
243		+ MUL31(a[1], a[3])) << 1);
244	t[ 5] = ((MUL31(a[0], a[5])
245		+ MUL31(a[1], a[4])
246		+ MUL31(a[2], a[3])) << 1);
247	t[ 6] = MUL31(a[3], a[3])
248		+ ((MUL31(a[0], a[6])
249		+ MUL31(a[1], a[5])
250		+ MUL31(a[2], a[4])) << 1);
251	t[ 7] = ((MUL31(a[0], a[7])
252		+ MUL31(a[1], a[6])
253		+ MUL31(a[2], a[5])
254		+ MUL31(a[3], a[4])) << 1);
255	t[ 8] = MUL31(a[4], a[4])
256		+ ((MUL31(a[0], a[8])
257		+ MUL31(a[1], a[7])
258		+ MUL31(a[2], a[6])
259		+ MUL31(a[3], a[5])) << 1);
260	t[ 9] = ((MUL31(a[1], a[8])
261		+ MUL31(a[2], a[7])
262		+ MUL31(a[3], a[6])
263		+ MUL31(a[4], a[5])) << 1);
264	t[10] = MUL31(a[5], a[5])
265		+ ((MUL31(a[2], a[8])
266		+ MUL31(a[3], a[7])
267		+ MUL31(a[4], a[6])) << 1);
268	t[11] = ((MUL31(a[3], a[8])
269		+ MUL31(a[4], a[7])
270		+ MUL31(a[5], a[6])) << 1);
271	t[12] = MUL31(a[6], a[6])
272		+ ((MUL31(a[4], a[8])
273		+ MUL31(a[5], a[7])) << 1);
274	t[13] = ((MUL31(a[5], a[8])
275		+ MUL31(a[6], a[7])) << 1);
276	t[14] = MUL31(a[7], a[7])
277		+ ((MUL31(a[6], a[8])) << 1);
278	t[15] = ((MUL31(a[7], a[8])) << 1);
279	t[16] = MUL31(a[8], a[8]);
280
281	/*
282	 * Propagate carries.
283	 */
284	cc = 0;
285	for (i = 0; i < 17; i ++) {
286		uint64_t w;
287
288		w = t[i] + cc;
289		d[i] = (uint32_t)w & 0x3FFFFFFF;
290		cc = w >> 30;
291	}
292	d[17] = (uint32_t)cc;
293}
294
295/*
296 * Base field modulus for P-256.
297 */
298static const uint32_t F256[] = {
299
300	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
301	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
302};
303
304/*
305 * The 'b' curve equation coefficient for P-256.
306 */
307static const uint32_t P256_B[] = {
308
309	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
310	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
311};
312
313/*
314 * Addition in the field. Source operands shall fit on 257 bits; output
315 * will be lower than twice the modulus.
316 */
317static void
318add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
319{
320	uint32_t w, cc;
321	int i;
322
323	cc = 0;
324	for (i = 0; i < 9; i ++) {
325		w = a[i] + b[i] + cc;
326		d[i] = w & 0x3FFFFFFF;
327		cc = w >> 30;
328	}
329	w >>= 16;
330	d[8] &= 0xFFFF;
331	d[3] -= w << 6;
332	d[6] -= w << 12;
333	d[7] += w << 14;
334	cc = w;
335	for (i = 0; i < 9; i ++) {
336		w = d[i] + cc;
337		d[i] = w & 0x3FFFFFFF;
338		cc = ARSH(w, 30);
339	}
340}
341
342/*
343 * Subtraction in the field. Source operands shall be smaller than twice
344 * the modulus; the result will fulfil the same property.
345 */
346static void
347sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
348{
349	uint32_t w, cc;
350	int i;
351
352	/*
353	 * We really compute a - b + 2*p to make sure that the result is
354	 * positive.
355	 */
356	w = a[0] - b[0] - 0x00002;
357	d[0] = w & 0x3FFFFFFF;
358	w = a[1] - b[1] + ARSH(w, 30);
359	d[1] = w & 0x3FFFFFFF;
360	w = a[2] - b[2] + ARSH(w, 30);
361	d[2] = w & 0x3FFFFFFF;
362	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
363	d[3] = w & 0x3FFFFFFF;
364	w = a[4] - b[4] + ARSH(w, 30);
365	d[4] = w & 0x3FFFFFFF;
366	w = a[5] - b[5] + ARSH(w, 30);
367	d[5] = w & 0x3FFFFFFF;
368	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
369	d[6] = w & 0x3FFFFFFF;
370	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
371	d[7] = w & 0x3FFFFFFF;
372	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
373	d[8] = w & 0xFFFF;
374	w >>= 16;
375	d[8] &= 0xFFFF;
376	d[3] -= w << 6;
377	d[6] -= w << 12;
378	d[7] += w << 14;
379	cc = w;
380	for (i = 0; i < 9; i ++) {
381		w = d[i] + cc;
382		d[i] = w & 0x3FFFFFFF;
383		cc = ARSH(w, 30);
384	}
385}
386
387/*
388 * Compute a multiplication in F256. Source operands shall be less than
389 * twice the modulus.
390 */
391static void
392mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
393{
394	uint32_t t[18];
395	uint64_t s[18];
396	uint64_t cc, x;
397	uint32_t z, c;
398	int i;
399
400	mul9(t, a, b);
401
402	/*
403	 * Modular reduction: each high word in added/subtracted where
404	 * necessary.
405	 *
406	 * The modulus is:
407	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
408	 * Therefore:
409	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
410	 *
411	 * For a word x at bit offset n (n >= 256), we have:
412	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
413	 *            - x*2^(n - 160) + x*2^(n-256) mod p
414	 *
415	 * Thus, we can nullify the high word if we reinject it at some
416	 * proper emplacements.
417	 *
418	 * We use 64-bit intermediate words to allow for carries to
419	 * accumulate easily, before performing the final propagation.
420	 */
421	for (i = 0; i < 18; i ++) {
422		s[i] = t[i];
423	}
424
425	for (i = 17; i >= 9; i --) {
426		uint64_t y;
427
428		y = s[i];
429		s[i - 1] += ARSHW(y, 2);
430		s[i - 2] += (y << 28) & 0x3FFFFFFF;
431		s[i - 2] -= ARSHW(y, 4);
432		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
433		s[i - 5] -= ARSHW(y, 10);
434		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
435		s[i - 8] += ARSHW(y, 16);
436		s[i - 9] += (y << 14) & 0x3FFFFFFF;
437	}
438
439	/*
440	 * Carry propagation must be signed. Moreover, we may have overdone
441	 * it a bit, and obtain a negative result.
442	 *
443	 * The loop above ran 9 times; each time, each word was augmented
444	 * by at most one extra word (in absolute value). Thus, the top
445	 * word must in fine fit in 39 bits, so the carry below will fit
446	 * on 9 bits.
447	 */
448	cc = 0;
449	for (i = 0; i < 9; i ++) {
450		x = s[i] + cc;
451		d[i] = (uint32_t)x & 0x3FFFFFFF;
452		cc = ARSHW(x, 30);
453	}
454
455	/*
456	 * All nine words fit on 30 bits, but there may be an extra
457	 * carry for a few bits (at most 9), and that carry may be
458	 * negative. Moreover, we want the result to fit on 257 bits.
459	 * The two lines below ensure that the word in d[] has length
460	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
461	 * significant length of cc is less than 24 bits, so we will be
462	 * able to switch to 32-bit operations.
463	 */
464	cc = ARSHW(x, 16);
465	d[8] &= 0xFFFF;
466
467	/*
468	 * One extra round of reduction, for cc*2^256, which means
469	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
470	 * value. If cc is negative, then it may happen (rarely, but
471	 * not neglectibly so) that the result would be negative. In
472	 * order to avoid that, if cc is negative, then we add the
473	 * modulus once. Note that if cc is negative, then propagating
474	 * that carry must yield a value lower than the modulus, so
475	 * adding the modulus once will keep the final result under
476	 * twice the modulus.
477	 */
478	z = (uint32_t)cc;
479	d[3] -= z << 6;
480	d[6] -= (z << 12) & 0x3FFFFFFF;
481	d[7] -= ARSH(z, 18);
482	d[7] += (z << 14) & 0x3FFFFFFF;
483	d[8] += ARSH(z, 16);
484	c = z >> 31;
485	d[0] -= c;
486	d[3] += c << 6;
487	d[6] += c << 12;
488	d[7] -= c << 14;
489	d[8] += c << 16;
490	for (i = 0; i < 9; i ++) {
491		uint32_t w;
492
493		w = d[i] + z;
494		d[i] = w & 0x3FFFFFFF;
495		z = ARSH(w, 30);
496	}
497}
498
499/*
500 * Compute a square in F256. Source operand shall be less than
501 * twice the modulus.
502 */
503static void
504square_f256(uint32_t *d, const uint32_t *a)
505{
506	uint32_t t[18];
507	uint64_t s[18];
508	uint64_t cc, x;
509	uint32_t z, c;
510	int i;
511
512	square9(t, a);
513
514	/*
515	 * Modular reduction: each high word in added/subtracted where
516	 * necessary.
517	 *
518	 * The modulus is:
519	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
520	 * Therefore:
521	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
522	 *
523	 * For a word x at bit offset n (n >= 256), we have:
524	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
525	 *            - x*2^(n - 160) + x*2^(n-256) mod p
526	 *
527	 * Thus, we can nullify the high word if we reinject it at some
528	 * proper emplacements.
529	 *
530	 * We use 64-bit intermediate words to allow for carries to
531	 * accumulate easily, before performing the final propagation.
532	 */
533	for (i = 0; i < 18; i ++) {
534		s[i] = t[i];
535	}
536
537	for (i = 17; i >= 9; i --) {
538		uint64_t y;
539
540		y = s[i];
541		s[i - 1] += ARSHW(y, 2);
542		s[i - 2] += (y << 28) & 0x3FFFFFFF;
543		s[i - 2] -= ARSHW(y, 4);
544		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
545		s[i - 5] -= ARSHW(y, 10);
546		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
547		s[i - 8] += ARSHW(y, 16);
548		s[i - 9] += (y << 14) & 0x3FFFFFFF;
549	}
550
551	/*
552	 * Carry propagation must be signed. Moreover, we may have overdone
553	 * it a bit, and obtain a negative result.
554	 *
555	 * The loop above ran 9 times; each time, each word was augmented
556	 * by at most one extra word (in absolute value). Thus, the top
557	 * word must in fine fit in 39 bits, so the carry below will fit
558	 * on 9 bits.
559	 */
560	cc = 0;
561	for (i = 0; i < 9; i ++) {
562		x = s[i] + cc;
563		d[i] = (uint32_t)x & 0x3FFFFFFF;
564		cc = ARSHW(x, 30);
565	}
566
567	/*
568	 * All nine words fit on 30 bits, but there may be an extra
569	 * carry for a few bits (at most 9), and that carry may be
570	 * negative. Moreover, we want the result to fit on 257 bits.
571	 * The two lines below ensure that the word in d[] has length
572	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
573	 * significant length of cc is less than 24 bits, so we will be
574	 * able to switch to 32-bit operations.
575	 */
576	cc = ARSHW(x, 16);
577	d[8] &= 0xFFFF;
578
579	/*
580	 * One extra round of reduction, for cc*2^256, which means
581	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
582	 * value. If cc is negative, then it may happen (rarely, but
583	 * not neglectibly so) that the result would be negative. In
584	 * order to avoid that, if cc is negative, then we add the
585	 * modulus once. Note that if cc is negative, then propagating
586	 * that carry must yield a value lower than the modulus, so
587	 * adding the modulus once will keep the final result under
588	 * twice the modulus.
589	 */
590	z = (uint32_t)cc;
591	d[3] -= z << 6;
592	d[6] -= (z << 12) & 0x3FFFFFFF;
593	d[7] -= ARSH(z, 18);
594	d[7] += (z << 14) & 0x3FFFFFFF;
595	d[8] += ARSH(z, 16);
596	c = z >> 31;
597	d[0] -= c;
598	d[3] += c << 6;
599	d[6] += c << 12;
600	d[7] -= c << 14;
601	d[8] += c << 16;
602	for (i = 0; i < 9; i ++) {
603		uint32_t w;
604
605		w = d[i] + z;
606		d[i] = w & 0x3FFFFFFF;
607		z = ARSH(w, 30);
608	}
609}
610
611/*
612 * Perform a "final reduction" in field F256 (field for curve P-256).
613 * The source value must be less than twice the modulus. If the value
614 * is not lower than the modulus, then the modulus is subtracted and
615 * this function returns 1; otherwise, it leaves it untouched and it
616 * returns 0.
617 */
618static uint32_t
619reduce_final_f256(uint32_t *d)
620{
621	uint32_t t[9];
622	uint32_t cc;
623	int i;
624
625	cc = 0;
626	for (i = 0; i < 9; i ++) {
627		uint32_t w;
628
629		w = d[i] - F256[i] - cc;
630		cc = w >> 31;
631		t[i] = w & 0x3FFFFFFF;
632	}
633	cc ^= 1;
634	CCOPY(cc, d, t, sizeof t);
635	return cc;
636}
637
638/*
639 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
640 * are such that:
641 *   X = x / z^2
642 *   Y = y / z^3
643 * For the point at infinity, z = 0.
644 * Each point thus admits many possible representations.
645 *
646 * Coordinates are represented in arrays of 32-bit integers, each holding
647 * 30 bits of data. Values may also be slightly greater than the modulus,
648 * but they will always be lower than twice the modulus.
649 */
650typedef struct {
651	uint32_t x[9];
652	uint32_t y[9];
653	uint32_t z[9];
654} p256_jacobian;
655
656/*
657 * Convert a point to affine coordinates:
658 *  - If the point is the point at infinity, then all three coordinates
659 *    are set to 0.
660 *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
661 *    coordinates are the 'X' and 'Y' affine coordinates.
662 * The coordinates are guaranteed to be lower than the modulus.
663 */
664static void
665p256_to_affine(p256_jacobian *P)
666{
667	uint32_t t1[9], t2[9];
668	int i;
669
670	/*
671	 * Invert z with a modular exponentiation: the modulus is
672	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
673	 * p-2. Exponent bit pattern (from high to low) is:
674	 *  - 32 bits of value 1
675	 *  - 31 bits of value 0
676	 *  - 1 bit of value 1
677	 *  - 96 bits of value 0
678	 *  - 94 bits of value 1
679	 *  - 1 bit of value 0
680	 *  - 1 bit of value 1
681	 * Thus, we precompute z^(2^31-1) to speed things up.
682	 *
683	 * If z = 0 (point at infinity) then the modular exponentiation
684	 * will yield 0, which leads to the expected result (all three
685	 * coordinates set to 0).
686	 */
687
688	/*
689	 * A simple square-and-multiply for z^(2^31-1). We could save about
690	 * two dozen multiplications here with an addition chain, but
691	 * this would require a bit more code, and extra stack buffers.
692	 */
693	memcpy(t1, P->z, sizeof P->z);
694	for (i = 0; i < 30; i ++) {
695		square_f256(t1, t1);
696		mul_f256(t1, t1, P->z);
697	}
698
699	/*
700	 * Square-and-multiply. Apart from the squarings, we have a few
701	 * multiplications to set bits to 1; we multiply by the original z
702	 * for setting 1 bit, and by t1 for setting 31 bits.
703	 */
704	memcpy(t2, P->z, sizeof P->z);
705	for (i = 1; i < 256; i ++) {
706		square_f256(t2, t2);
707		switch (i) {
708		case 31:
709		case 190:
710		case 221:
711		case 252:
712			mul_f256(t2, t2, t1);
713			break;
714		case 63:
715		case 253:
716		case 255:
717			mul_f256(t2, t2, P->z);
718			break;
719		}
720	}
721
722	/*
723	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
724	 */
725	mul_f256(t1, t2, t2);
726	mul_f256(P->x, t1, P->x);
727	mul_f256(t1, t1, t2);
728	mul_f256(P->y, t1, P->y);
729	reduce_final_f256(P->x);
730	reduce_final_f256(P->y);
731
732	/*
733	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
734	 * this will set z to 1.
735	 */
736	mul_f256(P->z, P->z, t2);
737	reduce_final_f256(P->z);
738}
739
740/*
741 * Double a point in P-256. This function works for all valid points,
742 * including the point at infinity.
743 */
744static void
745p256_double(p256_jacobian *Q)
746{
747	/*
748	 * Doubling formulas are:
749	 *
750	 *   s = 4*x*y^2
751	 *   m = 3*(x + z^2)*(x - z^2)
752	 *   x' = m^2 - 2*s
753	 *   y' = m*(s - x') - 8*y^4
754	 *   z' = 2*y*z
755	 *
756	 * These formulas work for all points, including points of order 2
757	 * and points at infinity:
758	 *   - If y = 0 then z' = 0. But there is no such point in P-256
759	 *     anyway.
760	 *   - If z = 0 then z' = 0.
761	 */
762	uint32_t t1[9], t2[9], t3[9], t4[9];
763
764	/*
765	 * Compute z^2 in t1.
766	 */
767	square_f256(t1, Q->z);
768
769	/*
770	 * Compute x-z^2 in t2 and x+z^2 in t1.
771	 */
772	add_f256(t2, Q->x, t1);
773	sub_f256(t1, Q->x, t1);
774
775	/*
776	 * Compute 3*(x+z^2)*(x-z^2) in t1.
777	 */
778	mul_f256(t3, t1, t2);
779	add_f256(t1, t3, t3);
780	add_f256(t1, t3, t1);
781
782	/*
783	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
784	 */
785	square_f256(t3, Q->y);
786	add_f256(t3, t3, t3);
787	mul_f256(t2, Q->x, t3);
788	add_f256(t2, t2, t2);
789
790	/*
791	 * Compute x' = m^2 - 2*s.
792	 */
793	square_f256(Q->x, t1);
794	sub_f256(Q->x, Q->x, t2);
795	sub_f256(Q->x, Q->x, t2);
796
797	/*
798	 * Compute z' = 2*y*z.
799	 */
800	mul_f256(t4, Q->y, Q->z);
801	add_f256(Q->z, t4, t4);
802
803	/*
804	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
805	 * 2*y^2 in t3.
806	 */
807	sub_f256(t2, t2, Q->x);
808	mul_f256(Q->y, t1, t2);
809	square_f256(t4, t3);
810	add_f256(t4, t4, t4);
811	sub_f256(Q->y, Q->y, t4);
812}
813
814/*
815 * Add point P2 to point P1.
816 *
817 * This function computes the wrong result in the following cases:
818 *
819 *   - If P1 == 0 but P2 != 0
820 *   - If P1 != 0 but P2 == 0
821 *   - If P1 == P2
822 *
823 * In all three cases, P1 is set to the point at infinity.
824 *
825 * Returned value is 0 if one of the following occurs:
826 *
827 *   - P1 and P2 have the same Y coordinate
828 *   - P1 == 0 and P2 == 0
829 *   - The Y coordinate of one of the points is 0 and the other point is
830 *     the point at infinity.
831 *
832 * The third case cannot actually happen with valid points, since a point
833 * with Y == 0 is a point of order 2, and there is no point of order 2 on
834 * curve P-256.
835 *
836 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
837 * can apply the following:
838 *
839 *   - If the result is not the point at infinity, then it is correct.
840 *   - Otherwise, if the returned value is 1, then this is a case of
841 *     P1+P2 == 0, so the result is indeed the point at infinity.
842 *   - Otherwise, P1 == P2, so a "double" operation should have been
843 *     performed.
844 */
845static uint32_t
846p256_add(p256_jacobian *P1, const p256_jacobian *P2)
847{
848	/*
849	 * Addtions formulas are:
850	 *
851	 *   u1 = x1 * z2^2
852	 *   u2 = x2 * z1^2
853	 *   s1 = y1 * z2^3
854	 *   s2 = y2 * z1^3
855	 *   h = u2 - u1
856	 *   r = s2 - s1
857	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
858	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
859	 *   z3 = h * z1 * z2
860	 */
861	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
862	uint32_t ret;
863	int i;
864
865	/*
866	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
867	 */
868	square_f256(t3, P2->z);
869	mul_f256(t1, P1->x, t3);
870	mul_f256(t4, P2->z, t3);
871	mul_f256(t3, P1->y, t4);
872
873	/*
874	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
875	 */
876	square_f256(t4, P1->z);
877	mul_f256(t2, P2->x, t4);
878	mul_f256(t5, P1->z, t4);
879	mul_f256(t4, P2->y, t5);
880
881	/*
882	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
883	 * We need to test whether r is zero, so we will do some extra
884	 * reduce.
885	 */
886	sub_f256(t2, t2, t1);
887	sub_f256(t4, t4, t3);
888	reduce_final_f256(t4);
889	ret = 0;
890	for (i = 0; i < 9; i ++) {
891		ret |= t4[i];
892	}
893	ret = (ret | -ret) >> 31;
894
895	/*
896	 * Compute u1*h^2 (in t6) and h^3 (in t5);
897	 */
898	square_f256(t7, t2);
899	mul_f256(t6, t1, t7);
900	mul_f256(t5, t7, t2);
901
902	/*
903	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
904	 */
905	square_f256(P1->x, t4);
906	sub_f256(P1->x, P1->x, t5);
907	sub_f256(P1->x, P1->x, t6);
908	sub_f256(P1->x, P1->x, t6);
909
910	/*
911	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
912	 */
913	sub_f256(t6, t6, P1->x);
914	mul_f256(P1->y, t4, t6);
915	mul_f256(t1, t5, t3);
916	sub_f256(P1->y, P1->y, t1);
917
918	/*
919	 * Compute z3 = h*z1*z2.
920	 */
921	mul_f256(t1, P1->z, P2->z);
922	mul_f256(P1->z, t1, t2);
923
924	return ret;
925}
926
927/*
928 * Add point P2 to point P1. This is a specialised function for the
929 * case when P2 is a non-zero point in affine coordinate.
930 *
931 * This function computes the wrong result in the following cases:
932 *
933 *   - If P1 == 0
934 *   - If P1 == P2
935 *
936 * In both cases, P1 is set to the point at infinity.
937 *
938 * Returned value is 0 if one of the following occurs:
939 *
940 *   - P1 and P2 have the same Y coordinate
941 *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
942 *
943 * The second case cannot actually happen with valid points, since a point
944 * with Y == 0 is a point of order 2, and there is no point of order 2 on
945 * curve P-256.
946 *
947 * Therefore, assuming that P1 != 0 on input, then the caller
948 * can apply the following:
949 *
950 *   - If the result is not the point at infinity, then it is correct.
951 *   - Otherwise, if the returned value is 1, then this is a case of
952 *     P1+P2 == 0, so the result is indeed the point at infinity.
953 *   - Otherwise, P1 == P2, so a "double" operation should have been
954 *     performed.
955 */
956static uint32_t
957p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
958{
959	/*
960	 * Addtions formulas are:
961	 *
962	 *   u1 = x1
963	 *   u2 = x2 * z1^2
964	 *   s1 = y1
965	 *   s2 = y2 * z1^3
966	 *   h = u2 - u1
967	 *   r = s2 - s1
968	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
969	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
970	 *   z3 = h * z1
971	 */
972	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
973	uint32_t ret;
974	int i;
975
976	/*
977	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
978	 */
979	memcpy(t1, P1->x, sizeof t1);
980	memcpy(t3, P1->y, sizeof t3);
981
982	/*
983	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
984	 */
985	square_f256(t4, P1->z);
986	mul_f256(t2, P2->x, t4);
987	mul_f256(t5, P1->z, t4);
988	mul_f256(t4, P2->y, t5);
989
990	/*
991	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
992	 * We need to test whether r is zero, so we will do some extra
993	 * reduce.
994	 */
995	sub_f256(t2, t2, t1);
996	sub_f256(t4, t4, t3);
997	reduce_final_f256(t4);
998	ret = 0;
999	for (i = 0; i < 9; i ++) {
1000		ret |= t4[i];
1001	}
1002	ret = (ret | -ret) >> 31;
1003
1004	/*
1005	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1006	 */
1007	square_f256(t7, t2);
1008	mul_f256(t6, t1, t7);
1009	mul_f256(t5, t7, t2);
1010
1011	/*
1012	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013	 */
1014	square_f256(P1->x, t4);
1015	sub_f256(P1->x, P1->x, t5);
1016	sub_f256(P1->x, P1->x, t6);
1017	sub_f256(P1->x, P1->x, t6);
1018
1019	/*
1020	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021	 */
1022	sub_f256(t6, t6, P1->x);
1023	mul_f256(P1->y, t4, t6);
1024	mul_f256(t1, t5, t3);
1025	sub_f256(P1->y, P1->y, t1);
1026
1027	/*
1028	 * Compute z3 = h*z1*z2.
1029	 */
1030	mul_f256(P1->z, P1->z, t2);
1031
1032	return ret;
1033}
1034
1035/*
1036 * Decode a P-256 point. This function does not support the point at
1037 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038 */
1039static uint32_t
1040p256_decode(p256_jacobian *P, const void *src, size_t len)
1041{
1042	const unsigned char *buf;
1043	uint32_t tx[9], ty[9], t1[9], t2[9];
1044	uint32_t bad;
1045	int i;
1046
1047	if (len != 65) {
1048		return 0;
1049	}
1050	buf = src;
1051
1052	/*
1053	 * First byte must be 0x04 (uncompressed format). We could support
1054	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055	 * least significant bit of the Y coordinate), but it is explicitly
1056	 * forbidden by RFC 5480 (section 2.2).
1057	 */
1058	bad = NEQ(buf[0], 0x04);
1059
1060	/*
1061	 * Decode the coordinates, and check that they are both lower
1062	 * than the modulus.
1063	 */
1064	tx[8] = be8_to_le30(tx, buf + 1, 32);
1065	ty[8] = be8_to_le30(ty, buf + 33, 32);
1066	bad |= reduce_final_f256(tx);
1067	bad |= reduce_final_f256(ty);
1068
1069	/*
1070	 * Check curve equation.
1071	 */
1072	square_f256(t1, tx);
1073	mul_f256(t1, tx, t1);
1074	square_f256(t2, ty);
1075	sub_f256(t1, t1, tx);
1076	sub_f256(t1, t1, tx);
1077	sub_f256(t1, t1, tx);
1078	add_f256(t1, t1, P256_B);
1079	sub_f256(t1, t1, t2);
1080	reduce_final_f256(t1);
1081	for (i = 0; i < 9; i ++) {
1082		bad |= t1[i];
1083	}
1084
1085	/*
1086	 * Copy coordinates to the point structure.
1087	 */
1088	memcpy(P->x, tx, sizeof tx);
1089	memcpy(P->y, ty, sizeof ty);
1090	memset(P->z, 0, sizeof P->z);
1091	P->z[0] = 1;
1092	return EQ(bad, 0);
1093}
1094
1095/*
1096 * Encode a point into a buffer. This function assumes that the point is
1097 * valid, in affine coordinates, and not the point at infinity.
1098 */
1099static void
1100p256_encode(void *dst, const p256_jacobian *P)
1101{
1102	unsigned char *buf;
1103
1104	buf = dst;
1105	buf[0] = 0x04;
1106	le30_to_be8(buf + 1, 32, P->x);
1107	le30_to_be8(buf + 33, 32, P->y);
1108}
1109
1110/*
1111 * Multiply a curve point by an integer. The integer is assumed to be
1112 * lower than the curve order, and the base point must not be the point
1113 * at infinity.
1114 */
1115static void
1116p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117{
1118	/*
1119	 * qz is a flag that is initially 1, and remains equal to 1
1120	 * as long as the point is the point at infinity.
1121	 *
1122	 * We use a 2-bit window to handle multiplier bits by pairs.
1123	 * The precomputed window really is the points P2 and P3.
1124	 */
1125	uint32_t qz;
1126	p256_jacobian P2, P3, Q, T, U;
1127
1128	/*
1129	 * Compute window values.
1130	 */
1131	P2 = *P;
1132	p256_double(&P2);
1133	P3 = *P;
1134	p256_add(&P3, &P2);
1135
1136	/*
1137	 * We start with Q = 0. We process multiplier bits 2 by 2.
1138	 */
1139	memset(&Q, 0, sizeof Q);
1140	qz = 1;
1141	while (xlen -- > 0) {
1142		int k;
1143
1144		for (k = 6; k >= 0; k -= 2) {
1145			uint32_t bits;
1146			uint32_t bnz;
1147
1148			p256_double(&Q);
1149			p256_double(&Q);
1150			T = *P;
1151			U = Q;
1152			bits = (*x >> k) & (uint32_t)3;
1153			bnz = NEQ(bits, 0);
1154			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156			p256_add(&U, &T);
1157			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159			qz &= ~bnz;
1160		}
1161		x ++;
1162	}
1163	*P = Q;
1164}
1165
1166/*
1167 * Precomputed window: k*G points, where G is the curve generator, and k
1168 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169 * the point are encoded as 9 words of 30 bits each (little-endian
1170 * order).
1171 */
1172static const uint32_t Gwin[15][18] = {
1173
1174	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178	  0x10B8BF86, 0x00004FE3 },
1179
1180	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184	  0x154436E3, 0x00000777 },
1185
1186	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190	  0x19031266, 0x00008734 },
1191
1192	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196	  0x15D69318, 0x0000E0F1 },
1197
1198	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202	  0x1F6A2412, 0x0000E0C1 },
1203
1204	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208	  0x041D0C8D, 0x0000E85C },
1209
1210	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214	  0x076F780C, 0x000073EB },
1215
1216	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220	  0x332F647A, 0x0000AD5A },
1221
1222	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226	  0x11325CB2, 0x00002A27 },
1227
1228	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232	  0x18A88A6A, 0x00008786 },
1233
1234	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238	  0x0826B331, 0x00009099 },
1239
1240	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244	  0x2D1AA70E, 0x00000770 },
1245
1246	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250	  0x163353AF, 0x000063BB },
1251
1252	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256	  0x3C6ECA7D, 0x0000F599 },
1257
1258	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262	  0x0FB8D64B, 0x0000B5B9 }
1263};
1264
1265/*
1266 * Lookup one of the Gwin[] values, by index. This is constant-time.
1267 */
1268static void
1269lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270{
1271	uint32_t xy[18];
1272	uint32_t k;
1273	size_t u;
1274
1275	memset(xy, 0, sizeof xy);
1276	for (k = 0; k < 15; k ++) {
1277		uint32_t m;
1278
1279		m = -EQ(idx, k + 1);
1280		for (u = 0; u < 18; u ++) {
1281			xy[u] |= m & Gwin[k][u];
1282		}
1283	}
1284	memcpy(T->x, &xy[0], sizeof T->x);
1285	memcpy(T->y, &xy[9], sizeof T->y);
1286	memset(T->z, 0, sizeof T->z);
1287	T->z[0] = 1;
1288}
1289
1290/*
1291 * Multiply the generator by an integer. The integer is assumed non-zero
1292 * and lower than the curve order.
1293 */
1294static void
1295p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296{
1297	/*
1298	 * qz is a flag that is initially 1, and remains equal to 1
1299	 * as long as the point is the point at infinity.
1300	 *
1301	 * We use a 4-bit window to handle multiplier bits by groups
1302	 * of 4. The precomputed window is constant static data, with
1303	 * points in affine coordinates; we use a constant-time lookup.
1304	 */
1305	p256_jacobian Q;
1306	uint32_t qz;
1307
1308	memset(&Q, 0, sizeof Q);
1309	qz = 1;
1310	while (xlen -- > 0) {
1311		int k;
1312		unsigned bx;
1313
1314		bx = *x ++;
1315		for (k = 0; k < 2; k ++) {
1316			uint32_t bits;
1317			uint32_t bnz;
1318			p256_jacobian T, U;
1319
1320			p256_double(&Q);
1321			p256_double(&Q);
1322			p256_double(&Q);
1323			p256_double(&Q);
1324			bits = (bx >> 4) & 0x0F;
1325			bnz = NEQ(bits, 0);
1326			lookup_Gwin(&T, bits);
1327			U = Q;
1328			p256_add_mixed(&U, &T);
1329			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331			qz &= ~bnz;
1332			bx <<= 4;
1333		}
1334	}
1335	*P = Q;
1336}
1337
1338static const unsigned char P256_G[] = {
1339	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345	0x68, 0x37, 0xBF, 0x51, 0xF5
1346};
1347
1348static const unsigned char P256_N[] = {
1349	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352	0x25, 0x51
1353};
1354
1355static const unsigned char *
1356api_generator(int curve, size_t *len)
1357{
1358	(void)curve;
1359	*len = sizeof P256_G;
1360	return P256_G;
1361}
1362
1363static const unsigned char *
1364api_order(int curve, size_t *len)
1365{
1366	(void)curve;
1367	*len = sizeof P256_N;
1368	return P256_N;
1369}
1370
1371static size_t
1372api_xoff(int curve, size_t *len)
1373{
1374	(void)curve;
1375	*len = 32;
1376	return 1;
1377}
1378
1379static uint32_t
1380api_mul(unsigned char *G, size_t Glen,
1381	const unsigned char *x, size_t xlen, int curve)
1382{
1383	uint32_t r;
1384	p256_jacobian P;
1385
1386	(void)curve;
1387	r = p256_decode(&P, G, Glen);
1388	p256_mul(&P, x, xlen);
1389	if (Glen >= 65) {
1390		p256_to_affine(&P);
1391		p256_encode(G, &P);
1392	}
1393	return r;
1394}
1395
1396static size_t
1397api_mulgen(unsigned char *R,
1398	const unsigned char *x, size_t xlen, int curve)
1399{
1400	p256_jacobian P;
1401
1402	(void)curve;
1403	p256_mulgen(&P, x, xlen);
1404	p256_to_affine(&P);
1405	p256_encode(R, &P);
1406	return 65;
1407
1408	/*
1409	const unsigned char *G;
1410	size_t Glen;
1411
1412	G = api_generator(curve, &Glen);
1413	memcpy(R, G, Glen);
1414	api_mul(R, Glen, x, xlen, curve);
1415	return Glen;
1416	*/
1417}
1418
1419static uint32_t
1420api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1421	const unsigned char *x, size_t xlen,
1422	const unsigned char *y, size_t ylen, int curve)
1423{
1424	p256_jacobian P, Q;
1425	uint32_t r, t, z;
1426	int i;
1427
1428	(void)curve;
1429	r = p256_decode(&P, A, len);
1430	p256_mul(&P, x, xlen);
1431	if (B == NULL) {
1432		p256_mulgen(&Q, y, ylen);
1433	} else {
1434		r &= p256_decode(&Q, B, len);
1435		p256_mul(&Q, y, ylen);
1436	}
1437
1438	/*
1439	 * The final addition may fail in case both points are equal.
1440	 */
1441	t = p256_add(&P, &Q);
1442	reduce_final_f256(P.z);
1443	z = 0;
1444	for (i = 0; i < 9; i ++) {
1445		z |= P.z[i];
1446	}
1447	z = EQ(z, 0);
1448	p256_double(&Q);
1449
1450	/*
1451	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1452	 * have the following:
1453	 *
1454	 *   z = 0, t = 0   return P (normal addition)
1455	 *   z = 0, t = 1   return P (normal addition)
1456	 *   z = 1, t = 0   return Q (a 'double' case)
1457	 *   z = 1, t = 1   report an error (P+Q = 0)
1458	 */
1459	CCOPY(z & ~t, &P, &Q, sizeof Q);
1460	p256_to_affine(&P);
1461	p256_encode(A, &P);
1462	r &= ~(z & t);
1463	return r;
1464}
1465
1466/* see bearssl_ec.h */
1467const br_ec_impl br_ec_p256_m31 = {
1468	(uint32_t)0x00800000,
1469	&api_generator,
1470	&api_order,
1471	&api_xoff,
1472	&api_mul,
1473	&api_mulgen,
1474	&api_muladd
1475};
1476