1/*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "inner.h"
26
27/*
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
36 */
37#if BR_NO_ARITH_SHIFT
38#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
39                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40#else
41#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
42#endif
43
44/*
45 * Convert an integer from unsigned big-endian encoding to a sequence of
46 * 13-bit words in little-endian order. The final "partial" word is
47 * returned.
48 */
49static uint32_t
50be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
51{
52	uint32_t acc;
53	int acc_len;
54
55	acc = 0;
56	acc_len = 0;
57	while (len -- > 0) {
58		acc |= (uint32_t)src[len] << acc_len;
59		acc_len += 8;
60		if (acc_len >= 13) {
61			*dst ++ = acc & 0x1FFF;
62			acc >>= 13;
63			acc_len -= 13;
64		}
65	}
66	return acc;
67}
68
69/*
70 * Convert an integer (13-bit words, little-endian) to unsigned
71 * big-endian encoding. The total encoding length is provided; all
72 * the destination bytes will be filled.
73 */
74static void
75le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
76{
77	uint32_t acc;
78	int acc_len;
79
80	acc = 0;
81	acc_len = 0;
82	while (len -- > 0) {
83		if (acc_len < 8) {
84			acc |= (*src ++) << acc_len;
85			acc_len += 13;
86		}
87		dst[len] = (unsigned char)acc;
88		acc >>= 8;
89		acc_len -= 8;
90	}
91}
92
93/*
94 * Normalise an array of words to a strict 13 bits per word. Returned
95 * value is the resulting carry. The source (w) and destination (d)
96 * arrays may be identical, but shall not overlap partially.
97 */
98static inline uint32_t
99norm13(uint32_t *d, const uint32_t *w, size_t len)
100{
101	size_t u;
102	uint32_t cc;
103
104	cc = 0;
105	for (u = 0; u < len; u ++) {
106		int32_t z;
107
108		z = w[u] + cc;
109		d[u] = z & 0x1FFF;
110		cc = ARSH(z, 13);
111	}
112	return cc;
113}
114
115/*
116 * mul20() multiplies two 260-bit integers together. Each word must fit
117 * on 13 bits; source operands use 20 words, destination operand
118 * receives 40 words. All overlaps allowed.
119 *
120 * square20() computes the square of a 260-bit integer. Each word must
121 * fit on 13 bits; source operand uses 20 words, destination operand
122 * receives 40 words. All overlaps allowed.
123 */
124
125#if BR_SLOW_MUL15
126
127static void
128mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
129{
130	/*
131	 * Two-level Karatsuba: turns a 20x20 multiplication into
132	 * nine 5x5 multiplications. We use 13-bit words but do not
133	 * propagate carries immediately, so words may expand:
134	 *
135	 *  - First Karatsuba decomposition turns the 20x20 mul on
136	 *    13-bit words into three 10x10 muls, two on 13-bit words
137	 *    and one on 14-bit words.
138	 *
139	 *  - Second Karatsuba decomposition further splits these into:
140	 *
141	 *     * four 5x5 muls on 13-bit words
142	 *     * four 5x5 muls on 14-bit words
143	 *     * one 5x5 mul on 15-bit words
144	 *
145	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
146	 * or 15-bit words, respectively.
147	 */
148	uint32_t u[45], v[45], w[90];
149	uint32_t cc;
150	int i;
151
152#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
153		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
154			+ (s2w)[5 * (s2_off) + 0]; \
155		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
156			+ (s2w)[5 * (s2_off) + 1]; \
157		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
158			+ (s2w)[5 * (s2_off) + 2]; \
159		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
160			+ (s2w)[5 * (s2_off) + 3]; \
161		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
162			+ (s2w)[5 * (s2_off) + 4]; \
163	} while (0)
164
165#define ZADDT(dw, d_off, sw, s_off)   do { \
166		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
167		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
168		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
169		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
170		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
171	} while (0)
172
173#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
174		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
175			+ (s2w)[5 * (s2_off) + 0]; \
176		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
177			+ (s2w)[5 * (s2_off) + 1]; \
178		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
179			+ (s2w)[5 * (s2_off) + 2]; \
180		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
181			+ (s2w)[5 * (s2_off) + 3]; \
182		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
183			+ (s2w)[5 * (s2_off) + 4]; \
184	} while (0)
185
186#define CPR1(w, cprcc)   do { \
187		uint32_t cprz = (w) + cprcc; \
188		(w) = cprz & 0x1FFF; \
189		cprcc = cprz >> 13; \
190	} while (0)
191
192#define CPR(dw, d_off)   do { \
193		uint32_t cprcc; \
194		cprcc = 0; \
195		CPR1((dw)[(d_off) + 0], cprcc); \
196		CPR1((dw)[(d_off) + 1], cprcc); \
197		CPR1((dw)[(d_off) + 2], cprcc); \
198		CPR1((dw)[(d_off) + 3], cprcc); \
199		CPR1((dw)[(d_off) + 4], cprcc); \
200		CPR1((dw)[(d_off) + 5], cprcc); \
201		CPR1((dw)[(d_off) + 6], cprcc); \
202		CPR1((dw)[(d_off) + 7], cprcc); \
203		CPR1((dw)[(d_off) + 8], cprcc); \
204		(dw)[(d_off) + 9] = cprcc; \
205	} while (0)
206
207	memcpy(u, a, 20 * sizeof *a);
208	ZADD(u, 4, a, 0, a, 1);
209	ZADD(u, 5, a, 2, a, 3);
210	ZADD(u, 6, a, 0, a, 2);
211	ZADD(u, 7, a, 1, a, 3);
212	ZADD(u, 8, u, 6, u, 7);
213
214	memcpy(v, b, 20 * sizeof *b);
215	ZADD(v, 4, b, 0, b, 1);
216	ZADD(v, 5, b, 2, b, 3);
217	ZADD(v, 6, b, 0, b, 2);
218	ZADD(v, 7, b, 1, b, 3);
219	ZADD(v, 8, v, 6, v, 7);
220
221	/*
222	 * Do the eight first 8x8 muls. Source words are at most 16382
223	 * each, so we can add product results together "as is" in 32-bit
224	 * words.
225	 */
226	for (i = 0; i < 40; i += 5) {
227		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
228		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
229			+ MUL15(u[i + 1], v[i + 0]);
230		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
231			+ MUL15(u[i + 1], v[i + 1])
232			+ MUL15(u[i + 2], v[i + 0]);
233		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
234			+ MUL15(u[i + 1], v[i + 2])
235			+ MUL15(u[i + 2], v[i + 1])
236			+ MUL15(u[i + 3], v[i + 0]);
237		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
238			+ MUL15(u[i + 1], v[i + 3])
239			+ MUL15(u[i + 2], v[i + 2])
240			+ MUL15(u[i + 3], v[i + 1])
241			+ MUL15(u[i + 4], v[i + 0]);
242		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
243			+ MUL15(u[i + 2], v[i + 3])
244			+ MUL15(u[i + 3], v[i + 2])
245			+ MUL15(u[i + 4], v[i + 1]);
246		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
247			+ MUL15(u[i + 3], v[i + 3])
248			+ MUL15(u[i + 4], v[i + 2]);
249		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
250			+ MUL15(u[i + 4], v[i + 3]);
251		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
252		w[(i << 1) + 9] = 0;
253	}
254
255	/*
256	 * For the 9th multiplication, source words are up to 32764,
257	 * so we must do some carry propagation. If we add up to
258	 * 4 products and the carry is no more than 524224, then the
259	 * result fits in 32 bits, and the next carry will be no more
260	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
261	 *
262	 * We thus just skip one of the products in the middle word,
263	 * then do a carry propagation (this reduces words to 13 bits
264	 * each, except possibly the last, which may use up to 17 bits
265	 * or so), then add the missing product.
266	 */
267	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
268	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
269		+ MUL15(u[40 + 1], v[40 + 0]);
270	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
271		+ MUL15(u[40 + 1], v[40 + 1])
272		+ MUL15(u[40 + 2], v[40 + 0]);
273	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
274		+ MUL15(u[40 + 1], v[40 + 2])
275		+ MUL15(u[40 + 2], v[40 + 1])
276		+ MUL15(u[40 + 3], v[40 + 0]);
277	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
278		+ MUL15(u[40 + 1], v[40 + 3])
279		+ MUL15(u[40 + 2], v[40 + 2])
280		+ MUL15(u[40 + 3], v[40 + 1]);
281		/* + MUL15(u[40 + 4], v[40 + 0]) */
282	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
283		+ MUL15(u[40 + 2], v[40 + 3])
284		+ MUL15(u[40 + 3], v[40 + 2])
285		+ MUL15(u[40 + 4], v[40 + 1]);
286	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
287		+ MUL15(u[40 + 3], v[40 + 3])
288		+ MUL15(u[40 + 4], v[40 + 2]);
289	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
290		+ MUL15(u[40 + 4], v[40 + 3]);
291	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
292
293	CPR(w, 80);
294
295	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
296
297	/*
298	 * The products on 14-bit words in slots 6 and 7 yield values
299	 * up to 5*(16382^2) each, and we need to subtract two such
300	 * values from the higher word. We need the subtraction to fit
301	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
302	 * However, 10*(16382^2) does not fit. So we must perform a
303	 * bit of reduction here.
304	 */
305	CPR(w, 60);
306	CPR(w, 70);
307
308	/*
309	 * Recompose results.
310	 */
311
312	/* 0..1*0..1 into 0..3 */
313	ZSUB2F(w, 8, w, 0, w, 2);
314	ZSUB2F(w, 9, w, 1, w, 3);
315	ZADDT(w, 1, w, 8);
316	ZADDT(w, 2, w, 9);
317
318	/* 2..3*2..3 into 4..7 */
319	ZSUB2F(w, 10, w, 4, w, 6);
320	ZSUB2F(w, 11, w, 5, w, 7);
321	ZADDT(w, 5, w, 10);
322	ZADDT(w, 6, w, 11);
323
324	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
325	ZSUB2F(w, 16, w, 12, w, 14);
326	ZSUB2F(w, 17, w, 13, w, 15);
327	ZADDT(w, 13, w, 16);
328	ZADDT(w, 14, w, 17);
329
330	/* first-level recomposition */
331	ZSUB2F(w, 12, w, 0, w, 4);
332	ZSUB2F(w, 13, w, 1, w, 5);
333	ZSUB2F(w, 14, w, 2, w, 6);
334	ZSUB2F(w, 15, w, 3, w, 7);
335	ZADDT(w, 2, w, 12);
336	ZADDT(w, 3, w, 13);
337	ZADDT(w, 4, w, 14);
338	ZADDT(w, 5, w, 15);
339
340	/*
341	 * Perform carry propagation to bring all words down to 13 bits.
342	 */
343	cc = norm13(d, w, 40);
344	d[39] += (cc << 13);
345
346#undef ZADD
347#undef ZADDT
348#undef ZSUB2F
349#undef CPR1
350#undef CPR
351}
352
353static inline void
354square20(uint32_t *d, const uint32_t *a)
355{
356	mul20(d, a, a);
357}
358
359#else
360
361static void
362mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
363{
364	uint32_t t[39];
365
366	t[ 0] = MUL15(a[ 0], b[ 0]);
367	t[ 1] = MUL15(a[ 0], b[ 1])
368		+ MUL15(a[ 1], b[ 0]);
369	t[ 2] = MUL15(a[ 0], b[ 2])
370		+ MUL15(a[ 1], b[ 1])
371		+ MUL15(a[ 2], b[ 0]);
372	t[ 3] = MUL15(a[ 0], b[ 3])
373		+ MUL15(a[ 1], b[ 2])
374		+ MUL15(a[ 2], b[ 1])
375		+ MUL15(a[ 3], b[ 0]);
376	t[ 4] = MUL15(a[ 0], b[ 4])
377		+ MUL15(a[ 1], b[ 3])
378		+ MUL15(a[ 2], b[ 2])
379		+ MUL15(a[ 3], b[ 1])
380		+ MUL15(a[ 4], b[ 0]);
381	t[ 5] = MUL15(a[ 0], b[ 5])
382		+ MUL15(a[ 1], b[ 4])
383		+ MUL15(a[ 2], b[ 3])
384		+ MUL15(a[ 3], b[ 2])
385		+ MUL15(a[ 4], b[ 1])
386		+ MUL15(a[ 5], b[ 0]);
387	t[ 6] = MUL15(a[ 0], b[ 6])
388		+ MUL15(a[ 1], b[ 5])
389		+ MUL15(a[ 2], b[ 4])
390		+ MUL15(a[ 3], b[ 3])
391		+ MUL15(a[ 4], b[ 2])
392		+ MUL15(a[ 5], b[ 1])
393		+ MUL15(a[ 6], b[ 0]);
394	t[ 7] = MUL15(a[ 0], b[ 7])
395		+ MUL15(a[ 1], b[ 6])
396		+ MUL15(a[ 2], b[ 5])
397		+ MUL15(a[ 3], b[ 4])
398		+ MUL15(a[ 4], b[ 3])
399		+ MUL15(a[ 5], b[ 2])
400		+ MUL15(a[ 6], b[ 1])
401		+ MUL15(a[ 7], b[ 0]);
402	t[ 8] = MUL15(a[ 0], b[ 8])
403		+ MUL15(a[ 1], b[ 7])
404		+ MUL15(a[ 2], b[ 6])
405		+ MUL15(a[ 3], b[ 5])
406		+ MUL15(a[ 4], b[ 4])
407		+ MUL15(a[ 5], b[ 3])
408		+ MUL15(a[ 6], b[ 2])
409		+ MUL15(a[ 7], b[ 1])
410		+ MUL15(a[ 8], b[ 0]);
411	t[ 9] = MUL15(a[ 0], b[ 9])
412		+ MUL15(a[ 1], b[ 8])
413		+ MUL15(a[ 2], b[ 7])
414		+ MUL15(a[ 3], b[ 6])
415		+ MUL15(a[ 4], b[ 5])
416		+ MUL15(a[ 5], b[ 4])
417		+ MUL15(a[ 6], b[ 3])
418		+ MUL15(a[ 7], b[ 2])
419		+ MUL15(a[ 8], b[ 1])
420		+ MUL15(a[ 9], b[ 0]);
421	t[10] = MUL15(a[ 0], b[10])
422		+ MUL15(a[ 1], b[ 9])
423		+ MUL15(a[ 2], b[ 8])
424		+ MUL15(a[ 3], b[ 7])
425		+ MUL15(a[ 4], b[ 6])
426		+ MUL15(a[ 5], b[ 5])
427		+ MUL15(a[ 6], b[ 4])
428		+ MUL15(a[ 7], b[ 3])
429		+ MUL15(a[ 8], b[ 2])
430		+ MUL15(a[ 9], b[ 1])
431		+ MUL15(a[10], b[ 0]);
432	t[11] = MUL15(a[ 0], b[11])
433		+ MUL15(a[ 1], b[10])
434		+ MUL15(a[ 2], b[ 9])
435		+ MUL15(a[ 3], b[ 8])
436		+ MUL15(a[ 4], b[ 7])
437		+ MUL15(a[ 5], b[ 6])
438		+ MUL15(a[ 6], b[ 5])
439		+ MUL15(a[ 7], b[ 4])
440		+ MUL15(a[ 8], b[ 3])
441		+ MUL15(a[ 9], b[ 2])
442		+ MUL15(a[10], b[ 1])
443		+ MUL15(a[11], b[ 0]);
444	t[12] = MUL15(a[ 0], b[12])
445		+ MUL15(a[ 1], b[11])
446		+ MUL15(a[ 2], b[10])
447		+ MUL15(a[ 3], b[ 9])
448		+ MUL15(a[ 4], b[ 8])
449		+ MUL15(a[ 5], b[ 7])
450		+ MUL15(a[ 6], b[ 6])
451		+ MUL15(a[ 7], b[ 5])
452		+ MUL15(a[ 8], b[ 4])
453		+ MUL15(a[ 9], b[ 3])
454		+ MUL15(a[10], b[ 2])
455		+ MUL15(a[11], b[ 1])
456		+ MUL15(a[12], b[ 0]);
457	t[13] = MUL15(a[ 0], b[13])
458		+ MUL15(a[ 1], b[12])
459		+ MUL15(a[ 2], b[11])
460		+ MUL15(a[ 3], b[10])
461		+ MUL15(a[ 4], b[ 9])
462		+ MUL15(a[ 5], b[ 8])
463		+ MUL15(a[ 6], b[ 7])
464		+ MUL15(a[ 7], b[ 6])
465		+ MUL15(a[ 8], b[ 5])
466		+ MUL15(a[ 9], b[ 4])
467		+ MUL15(a[10], b[ 3])
468		+ MUL15(a[11], b[ 2])
469		+ MUL15(a[12], b[ 1])
470		+ MUL15(a[13], b[ 0]);
471	t[14] = MUL15(a[ 0], b[14])
472		+ MUL15(a[ 1], b[13])
473		+ MUL15(a[ 2], b[12])
474		+ MUL15(a[ 3], b[11])
475		+ MUL15(a[ 4], b[10])
476		+ MUL15(a[ 5], b[ 9])
477		+ MUL15(a[ 6], b[ 8])
478		+ MUL15(a[ 7], b[ 7])
479		+ MUL15(a[ 8], b[ 6])
480		+ MUL15(a[ 9], b[ 5])
481		+ MUL15(a[10], b[ 4])
482		+ MUL15(a[11], b[ 3])
483		+ MUL15(a[12], b[ 2])
484		+ MUL15(a[13], b[ 1])
485		+ MUL15(a[14], b[ 0]);
486	t[15] = MUL15(a[ 0], b[15])
487		+ MUL15(a[ 1], b[14])
488		+ MUL15(a[ 2], b[13])
489		+ MUL15(a[ 3], b[12])
490		+ MUL15(a[ 4], b[11])
491		+ MUL15(a[ 5], b[10])
492		+ MUL15(a[ 6], b[ 9])
493		+ MUL15(a[ 7], b[ 8])
494		+ MUL15(a[ 8], b[ 7])
495		+ MUL15(a[ 9], b[ 6])
496		+ MUL15(a[10], b[ 5])
497		+ MUL15(a[11], b[ 4])
498		+ MUL15(a[12], b[ 3])
499		+ MUL15(a[13], b[ 2])
500		+ MUL15(a[14], b[ 1])
501		+ MUL15(a[15], b[ 0]);
502	t[16] = MUL15(a[ 0], b[16])
503		+ MUL15(a[ 1], b[15])
504		+ MUL15(a[ 2], b[14])
505		+ MUL15(a[ 3], b[13])
506		+ MUL15(a[ 4], b[12])
507		+ MUL15(a[ 5], b[11])
508		+ MUL15(a[ 6], b[10])
509		+ MUL15(a[ 7], b[ 9])
510		+ MUL15(a[ 8], b[ 8])
511		+ MUL15(a[ 9], b[ 7])
512		+ MUL15(a[10], b[ 6])
513		+ MUL15(a[11], b[ 5])
514		+ MUL15(a[12], b[ 4])
515		+ MUL15(a[13], b[ 3])
516		+ MUL15(a[14], b[ 2])
517		+ MUL15(a[15], b[ 1])
518		+ MUL15(a[16], b[ 0]);
519	t[17] = MUL15(a[ 0], b[17])
520		+ MUL15(a[ 1], b[16])
521		+ MUL15(a[ 2], b[15])
522		+ MUL15(a[ 3], b[14])
523		+ MUL15(a[ 4], b[13])
524		+ MUL15(a[ 5], b[12])
525		+ MUL15(a[ 6], b[11])
526		+ MUL15(a[ 7], b[10])
527		+ MUL15(a[ 8], b[ 9])
528		+ MUL15(a[ 9], b[ 8])
529		+ MUL15(a[10], b[ 7])
530		+ MUL15(a[11], b[ 6])
531		+ MUL15(a[12], b[ 5])
532		+ MUL15(a[13], b[ 4])
533		+ MUL15(a[14], b[ 3])
534		+ MUL15(a[15], b[ 2])
535		+ MUL15(a[16], b[ 1])
536		+ MUL15(a[17], b[ 0]);
537	t[18] = MUL15(a[ 0], b[18])
538		+ MUL15(a[ 1], b[17])
539		+ MUL15(a[ 2], b[16])
540		+ MUL15(a[ 3], b[15])
541		+ MUL15(a[ 4], b[14])
542		+ MUL15(a[ 5], b[13])
543		+ MUL15(a[ 6], b[12])
544		+ MUL15(a[ 7], b[11])
545		+ MUL15(a[ 8], b[10])
546		+ MUL15(a[ 9], b[ 9])
547		+ MUL15(a[10], b[ 8])
548		+ MUL15(a[11], b[ 7])
549		+ MUL15(a[12], b[ 6])
550		+ MUL15(a[13], b[ 5])
551		+ MUL15(a[14], b[ 4])
552		+ MUL15(a[15], b[ 3])
553		+ MUL15(a[16], b[ 2])
554		+ MUL15(a[17], b[ 1])
555		+ MUL15(a[18], b[ 0]);
556	t[19] = MUL15(a[ 0], b[19])
557		+ MUL15(a[ 1], b[18])
558		+ MUL15(a[ 2], b[17])
559		+ MUL15(a[ 3], b[16])
560		+ MUL15(a[ 4], b[15])
561		+ MUL15(a[ 5], b[14])
562		+ MUL15(a[ 6], b[13])
563		+ MUL15(a[ 7], b[12])
564		+ MUL15(a[ 8], b[11])
565		+ MUL15(a[ 9], b[10])
566		+ MUL15(a[10], b[ 9])
567		+ MUL15(a[11], b[ 8])
568		+ MUL15(a[12], b[ 7])
569		+ MUL15(a[13], b[ 6])
570		+ MUL15(a[14], b[ 5])
571		+ MUL15(a[15], b[ 4])
572		+ MUL15(a[16], b[ 3])
573		+ MUL15(a[17], b[ 2])
574		+ MUL15(a[18], b[ 1])
575		+ MUL15(a[19], b[ 0]);
576	t[20] = MUL15(a[ 1], b[19])
577		+ MUL15(a[ 2], b[18])
578		+ MUL15(a[ 3], b[17])
579		+ MUL15(a[ 4], b[16])
580		+ MUL15(a[ 5], b[15])
581		+ MUL15(a[ 6], b[14])
582		+ MUL15(a[ 7], b[13])
583		+ MUL15(a[ 8], b[12])
584		+ MUL15(a[ 9], b[11])
585		+ MUL15(a[10], b[10])
586		+ MUL15(a[11], b[ 9])
587		+ MUL15(a[12], b[ 8])
588		+ MUL15(a[13], b[ 7])
589		+ MUL15(a[14], b[ 6])
590		+ MUL15(a[15], b[ 5])
591		+ MUL15(a[16], b[ 4])
592		+ MUL15(a[17], b[ 3])
593		+ MUL15(a[18], b[ 2])
594		+ MUL15(a[19], b[ 1]);
595	t[21] = MUL15(a[ 2], b[19])
596		+ MUL15(a[ 3], b[18])
597		+ MUL15(a[ 4], b[17])
598		+ MUL15(a[ 5], b[16])
599		+ MUL15(a[ 6], b[15])
600		+ MUL15(a[ 7], b[14])
601		+ MUL15(a[ 8], b[13])
602		+ MUL15(a[ 9], b[12])
603		+ MUL15(a[10], b[11])
604		+ MUL15(a[11], b[10])
605		+ MUL15(a[12], b[ 9])
606		+ MUL15(a[13], b[ 8])
607		+ MUL15(a[14], b[ 7])
608		+ MUL15(a[15], b[ 6])
609		+ MUL15(a[16], b[ 5])
610		+ MUL15(a[17], b[ 4])
611		+ MUL15(a[18], b[ 3])
612		+ MUL15(a[19], b[ 2]);
613	t[22] = MUL15(a[ 3], b[19])
614		+ MUL15(a[ 4], b[18])
615		+ MUL15(a[ 5], b[17])
616		+ MUL15(a[ 6], b[16])
617		+ MUL15(a[ 7], b[15])
618		+ MUL15(a[ 8], b[14])
619		+ MUL15(a[ 9], b[13])
620		+ MUL15(a[10], b[12])
621		+ MUL15(a[11], b[11])
622		+ MUL15(a[12], b[10])
623		+ MUL15(a[13], b[ 9])
624		+ MUL15(a[14], b[ 8])
625		+ MUL15(a[15], b[ 7])
626		+ MUL15(a[16], b[ 6])
627		+ MUL15(a[17], b[ 5])
628		+ MUL15(a[18], b[ 4])
629		+ MUL15(a[19], b[ 3]);
630	t[23] = MUL15(a[ 4], b[19])
631		+ MUL15(a[ 5], b[18])
632		+ MUL15(a[ 6], b[17])
633		+ MUL15(a[ 7], b[16])
634		+ MUL15(a[ 8], b[15])
635		+ MUL15(a[ 9], b[14])
636		+ MUL15(a[10], b[13])
637		+ MUL15(a[11], b[12])
638		+ MUL15(a[12], b[11])
639		+ MUL15(a[13], b[10])
640		+ MUL15(a[14], b[ 9])
641		+ MUL15(a[15], b[ 8])
642		+ MUL15(a[16], b[ 7])
643		+ MUL15(a[17], b[ 6])
644		+ MUL15(a[18], b[ 5])
645		+ MUL15(a[19], b[ 4]);
646	t[24] = MUL15(a[ 5], b[19])
647		+ MUL15(a[ 6], b[18])
648		+ MUL15(a[ 7], b[17])
649		+ MUL15(a[ 8], b[16])
650		+ MUL15(a[ 9], b[15])
651		+ MUL15(a[10], b[14])
652		+ MUL15(a[11], b[13])
653		+ MUL15(a[12], b[12])
654		+ MUL15(a[13], b[11])
655		+ MUL15(a[14], b[10])
656		+ MUL15(a[15], b[ 9])
657		+ MUL15(a[16], b[ 8])
658		+ MUL15(a[17], b[ 7])
659		+ MUL15(a[18], b[ 6])
660		+ MUL15(a[19], b[ 5]);
661	t[25] = MUL15(a[ 6], b[19])
662		+ MUL15(a[ 7], b[18])
663		+ MUL15(a[ 8], b[17])
664		+ MUL15(a[ 9], b[16])
665		+ MUL15(a[10], b[15])
666		+ MUL15(a[11], b[14])
667		+ MUL15(a[12], b[13])
668		+ MUL15(a[13], b[12])
669		+ MUL15(a[14], b[11])
670		+ MUL15(a[15], b[10])
671		+ MUL15(a[16], b[ 9])
672		+ MUL15(a[17], b[ 8])
673		+ MUL15(a[18], b[ 7])
674		+ MUL15(a[19], b[ 6]);
675	t[26] = MUL15(a[ 7], b[19])
676		+ MUL15(a[ 8], b[18])
677		+ MUL15(a[ 9], b[17])
678		+ MUL15(a[10], b[16])
679		+ MUL15(a[11], b[15])
680		+ MUL15(a[12], b[14])
681		+ MUL15(a[13], b[13])
682		+ MUL15(a[14], b[12])
683		+ MUL15(a[15], b[11])
684		+ MUL15(a[16], b[10])
685		+ MUL15(a[17], b[ 9])
686		+ MUL15(a[18], b[ 8])
687		+ MUL15(a[19], b[ 7]);
688	t[27] = MUL15(a[ 8], b[19])
689		+ MUL15(a[ 9], b[18])
690		+ MUL15(a[10], b[17])
691		+ MUL15(a[11], b[16])
692		+ MUL15(a[12], b[15])
693		+ MUL15(a[13], b[14])
694		+ MUL15(a[14], b[13])
695		+ MUL15(a[15], b[12])
696		+ MUL15(a[16], b[11])
697		+ MUL15(a[17], b[10])
698		+ MUL15(a[18], b[ 9])
699		+ MUL15(a[19], b[ 8]);
700	t[28] = MUL15(a[ 9], b[19])
701		+ MUL15(a[10], b[18])
702		+ MUL15(a[11], b[17])
703		+ MUL15(a[12], b[16])
704		+ MUL15(a[13], b[15])
705		+ MUL15(a[14], b[14])
706		+ MUL15(a[15], b[13])
707		+ MUL15(a[16], b[12])
708		+ MUL15(a[17], b[11])
709		+ MUL15(a[18], b[10])
710		+ MUL15(a[19], b[ 9]);
711	t[29] = MUL15(a[10], b[19])
712		+ MUL15(a[11], b[18])
713		+ MUL15(a[12], b[17])
714		+ MUL15(a[13], b[16])
715		+ MUL15(a[14], b[15])
716		+ MUL15(a[15], b[14])
717		+ MUL15(a[16], b[13])
718		+ MUL15(a[17], b[12])
719		+ MUL15(a[18], b[11])
720		+ MUL15(a[19], b[10]);
721	t[30] = MUL15(a[11], b[19])
722		+ MUL15(a[12], b[18])
723		+ MUL15(a[13], b[17])
724		+ MUL15(a[14], b[16])
725		+ MUL15(a[15], b[15])
726		+ MUL15(a[16], b[14])
727		+ MUL15(a[17], b[13])
728		+ MUL15(a[18], b[12])
729		+ MUL15(a[19], b[11]);
730	t[31] = MUL15(a[12], b[19])
731		+ MUL15(a[13], b[18])
732		+ MUL15(a[14], b[17])
733		+ MUL15(a[15], b[16])
734		+ MUL15(a[16], b[15])
735		+ MUL15(a[17], b[14])
736		+ MUL15(a[18], b[13])
737		+ MUL15(a[19], b[12]);
738	t[32] = MUL15(a[13], b[19])
739		+ MUL15(a[14], b[18])
740		+ MUL15(a[15], b[17])
741		+ MUL15(a[16], b[16])
742		+ MUL15(a[17], b[15])
743		+ MUL15(a[18], b[14])
744		+ MUL15(a[19], b[13]);
745	t[33] = MUL15(a[14], b[19])
746		+ MUL15(a[15], b[18])
747		+ MUL15(a[16], b[17])
748		+ MUL15(a[17], b[16])
749		+ MUL15(a[18], b[15])
750		+ MUL15(a[19], b[14]);
751	t[34] = MUL15(a[15], b[19])
752		+ MUL15(a[16], b[18])
753		+ MUL15(a[17], b[17])
754		+ MUL15(a[18], b[16])
755		+ MUL15(a[19], b[15]);
756	t[35] = MUL15(a[16], b[19])
757		+ MUL15(a[17], b[18])
758		+ MUL15(a[18], b[17])
759		+ MUL15(a[19], b[16]);
760	t[36] = MUL15(a[17], b[19])
761		+ MUL15(a[18], b[18])
762		+ MUL15(a[19], b[17]);
763	t[37] = MUL15(a[18], b[19])
764		+ MUL15(a[19], b[18]);
765	t[38] = MUL15(a[19], b[19]);
766	d[39] = norm13(d, t, 39);
767}
768
769static void
770square20(uint32_t *d, const uint32_t *a)
771{
772	uint32_t t[39];
773
774	t[ 0] = MUL15(a[ 0], a[ 0]);
775	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
776	t[ 2] = MUL15(a[ 1], a[ 1])
777		+ ((MUL15(a[ 0], a[ 2])) << 1);
778	t[ 3] = ((MUL15(a[ 0], a[ 3])
779		+ MUL15(a[ 1], a[ 2])) << 1);
780	t[ 4] = MUL15(a[ 2], a[ 2])
781		+ ((MUL15(a[ 0], a[ 4])
782		+ MUL15(a[ 1], a[ 3])) << 1);
783	t[ 5] = ((MUL15(a[ 0], a[ 5])
784		+ MUL15(a[ 1], a[ 4])
785		+ MUL15(a[ 2], a[ 3])) << 1);
786	t[ 6] = MUL15(a[ 3], a[ 3])
787		+ ((MUL15(a[ 0], a[ 6])
788		+ MUL15(a[ 1], a[ 5])
789		+ MUL15(a[ 2], a[ 4])) << 1);
790	t[ 7] = ((MUL15(a[ 0], a[ 7])
791		+ MUL15(a[ 1], a[ 6])
792		+ MUL15(a[ 2], a[ 5])
793		+ MUL15(a[ 3], a[ 4])) << 1);
794	t[ 8] = MUL15(a[ 4], a[ 4])
795		+ ((MUL15(a[ 0], a[ 8])
796		+ MUL15(a[ 1], a[ 7])
797		+ MUL15(a[ 2], a[ 6])
798		+ MUL15(a[ 3], a[ 5])) << 1);
799	t[ 9] = ((MUL15(a[ 0], a[ 9])
800		+ MUL15(a[ 1], a[ 8])
801		+ MUL15(a[ 2], a[ 7])
802		+ MUL15(a[ 3], a[ 6])
803		+ MUL15(a[ 4], a[ 5])) << 1);
804	t[10] = MUL15(a[ 5], a[ 5])
805		+ ((MUL15(a[ 0], a[10])
806		+ MUL15(a[ 1], a[ 9])
807		+ MUL15(a[ 2], a[ 8])
808		+ MUL15(a[ 3], a[ 7])
809		+ MUL15(a[ 4], a[ 6])) << 1);
810	t[11] = ((MUL15(a[ 0], a[11])
811		+ MUL15(a[ 1], a[10])
812		+ MUL15(a[ 2], a[ 9])
813		+ MUL15(a[ 3], a[ 8])
814		+ MUL15(a[ 4], a[ 7])
815		+ MUL15(a[ 5], a[ 6])) << 1);
816	t[12] = MUL15(a[ 6], a[ 6])
817		+ ((MUL15(a[ 0], a[12])
818		+ MUL15(a[ 1], a[11])
819		+ MUL15(a[ 2], a[10])
820		+ MUL15(a[ 3], a[ 9])
821		+ MUL15(a[ 4], a[ 8])
822		+ MUL15(a[ 5], a[ 7])) << 1);
823	t[13] = ((MUL15(a[ 0], a[13])
824		+ MUL15(a[ 1], a[12])
825		+ MUL15(a[ 2], a[11])
826		+ MUL15(a[ 3], a[10])
827		+ MUL15(a[ 4], a[ 9])
828		+ MUL15(a[ 5], a[ 8])
829		+ MUL15(a[ 6], a[ 7])) << 1);
830	t[14] = MUL15(a[ 7], a[ 7])
831		+ ((MUL15(a[ 0], a[14])
832		+ MUL15(a[ 1], a[13])
833		+ MUL15(a[ 2], a[12])
834		+ MUL15(a[ 3], a[11])
835		+ MUL15(a[ 4], a[10])
836		+ MUL15(a[ 5], a[ 9])
837		+ MUL15(a[ 6], a[ 8])) << 1);
838	t[15] = ((MUL15(a[ 0], a[15])
839		+ MUL15(a[ 1], a[14])
840		+ MUL15(a[ 2], a[13])
841		+ MUL15(a[ 3], a[12])
842		+ MUL15(a[ 4], a[11])
843		+ MUL15(a[ 5], a[10])
844		+ MUL15(a[ 6], a[ 9])
845		+ MUL15(a[ 7], a[ 8])) << 1);
846	t[16] = MUL15(a[ 8], a[ 8])
847		+ ((MUL15(a[ 0], a[16])
848		+ MUL15(a[ 1], a[15])
849		+ MUL15(a[ 2], a[14])
850		+ MUL15(a[ 3], a[13])
851		+ MUL15(a[ 4], a[12])
852		+ MUL15(a[ 5], a[11])
853		+ MUL15(a[ 6], a[10])
854		+ MUL15(a[ 7], a[ 9])) << 1);
855	t[17] = ((MUL15(a[ 0], a[17])
856		+ MUL15(a[ 1], a[16])
857		+ MUL15(a[ 2], a[15])
858		+ MUL15(a[ 3], a[14])
859		+ MUL15(a[ 4], a[13])
860		+ MUL15(a[ 5], a[12])
861		+ MUL15(a[ 6], a[11])
862		+ MUL15(a[ 7], a[10])
863		+ MUL15(a[ 8], a[ 9])) << 1);
864	t[18] = MUL15(a[ 9], a[ 9])
865		+ ((MUL15(a[ 0], a[18])
866		+ MUL15(a[ 1], a[17])
867		+ MUL15(a[ 2], a[16])
868		+ MUL15(a[ 3], a[15])
869		+ MUL15(a[ 4], a[14])
870		+ MUL15(a[ 5], a[13])
871		+ MUL15(a[ 6], a[12])
872		+ MUL15(a[ 7], a[11])
873		+ MUL15(a[ 8], a[10])) << 1);
874	t[19] = ((MUL15(a[ 0], a[19])
875		+ MUL15(a[ 1], a[18])
876		+ MUL15(a[ 2], a[17])
877		+ MUL15(a[ 3], a[16])
878		+ MUL15(a[ 4], a[15])
879		+ MUL15(a[ 5], a[14])
880		+ MUL15(a[ 6], a[13])
881		+ MUL15(a[ 7], a[12])
882		+ MUL15(a[ 8], a[11])
883		+ MUL15(a[ 9], a[10])) << 1);
884	t[20] = MUL15(a[10], a[10])
885		+ ((MUL15(a[ 1], a[19])
886		+ MUL15(a[ 2], a[18])
887		+ MUL15(a[ 3], a[17])
888		+ MUL15(a[ 4], a[16])
889		+ MUL15(a[ 5], a[15])
890		+ MUL15(a[ 6], a[14])
891		+ MUL15(a[ 7], a[13])
892		+ MUL15(a[ 8], a[12])
893		+ MUL15(a[ 9], a[11])) << 1);
894	t[21] = ((MUL15(a[ 2], a[19])
895		+ MUL15(a[ 3], a[18])
896		+ MUL15(a[ 4], a[17])
897		+ MUL15(a[ 5], a[16])
898		+ MUL15(a[ 6], a[15])
899		+ MUL15(a[ 7], a[14])
900		+ MUL15(a[ 8], a[13])
901		+ MUL15(a[ 9], a[12])
902		+ MUL15(a[10], a[11])) << 1);
903	t[22] = MUL15(a[11], a[11])
904		+ ((MUL15(a[ 3], a[19])
905		+ MUL15(a[ 4], a[18])
906		+ MUL15(a[ 5], a[17])
907		+ MUL15(a[ 6], a[16])
908		+ MUL15(a[ 7], a[15])
909		+ MUL15(a[ 8], a[14])
910		+ MUL15(a[ 9], a[13])
911		+ MUL15(a[10], a[12])) << 1);
912	t[23] = ((MUL15(a[ 4], a[19])
913		+ MUL15(a[ 5], a[18])
914		+ MUL15(a[ 6], a[17])
915		+ MUL15(a[ 7], a[16])
916		+ MUL15(a[ 8], a[15])
917		+ MUL15(a[ 9], a[14])
918		+ MUL15(a[10], a[13])
919		+ MUL15(a[11], a[12])) << 1);
920	t[24] = MUL15(a[12], a[12])
921		+ ((MUL15(a[ 5], a[19])
922		+ MUL15(a[ 6], a[18])
923		+ MUL15(a[ 7], a[17])
924		+ MUL15(a[ 8], a[16])
925		+ MUL15(a[ 9], a[15])
926		+ MUL15(a[10], a[14])
927		+ MUL15(a[11], a[13])) << 1);
928	t[25] = ((MUL15(a[ 6], a[19])
929		+ MUL15(a[ 7], a[18])
930		+ MUL15(a[ 8], a[17])
931		+ MUL15(a[ 9], a[16])
932		+ MUL15(a[10], a[15])
933		+ MUL15(a[11], a[14])
934		+ MUL15(a[12], a[13])) << 1);
935	t[26] = MUL15(a[13], a[13])
936		+ ((MUL15(a[ 7], a[19])
937		+ MUL15(a[ 8], a[18])
938		+ MUL15(a[ 9], a[17])
939		+ MUL15(a[10], a[16])
940		+ MUL15(a[11], a[15])
941		+ MUL15(a[12], a[14])) << 1);
942	t[27] = ((MUL15(a[ 8], a[19])
943		+ MUL15(a[ 9], a[18])
944		+ MUL15(a[10], a[17])
945		+ MUL15(a[11], a[16])
946		+ MUL15(a[12], a[15])
947		+ MUL15(a[13], a[14])) << 1);
948	t[28] = MUL15(a[14], a[14])
949		+ ((MUL15(a[ 9], a[19])
950		+ MUL15(a[10], a[18])
951		+ MUL15(a[11], a[17])
952		+ MUL15(a[12], a[16])
953		+ MUL15(a[13], a[15])) << 1);
954	t[29] = ((MUL15(a[10], a[19])
955		+ MUL15(a[11], a[18])
956		+ MUL15(a[12], a[17])
957		+ MUL15(a[13], a[16])
958		+ MUL15(a[14], a[15])) << 1);
959	t[30] = MUL15(a[15], a[15])
960		+ ((MUL15(a[11], a[19])
961		+ MUL15(a[12], a[18])
962		+ MUL15(a[13], a[17])
963		+ MUL15(a[14], a[16])) << 1);
964	t[31] = ((MUL15(a[12], a[19])
965		+ MUL15(a[13], a[18])
966		+ MUL15(a[14], a[17])
967		+ MUL15(a[15], a[16])) << 1);
968	t[32] = MUL15(a[16], a[16])
969		+ ((MUL15(a[13], a[19])
970		+ MUL15(a[14], a[18])
971		+ MUL15(a[15], a[17])) << 1);
972	t[33] = ((MUL15(a[14], a[19])
973		+ MUL15(a[15], a[18])
974		+ MUL15(a[16], a[17])) << 1);
975	t[34] = MUL15(a[17], a[17])
976		+ ((MUL15(a[15], a[19])
977		+ MUL15(a[16], a[18])) << 1);
978	t[35] = ((MUL15(a[16], a[19])
979		+ MUL15(a[17], a[18])) << 1);
980	t[36] = MUL15(a[18], a[18])
981		+ ((MUL15(a[17], a[19])) << 1);
982	t[37] = ((MUL15(a[18], a[19])) << 1);
983	t[38] = MUL15(a[19], a[19]);
984	d[39] = norm13(d, t, 39);
985}
986
987#endif
988
989/*
990 * Modulus for field F256 (field for point coordinates in curve P-256).
991 */
992static const uint32_t F256[] = {
993	0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
994	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
995	0x0000, 0x1FF8, 0x1FFF, 0x01FF
996};
997
998/*
999 * The 'b' curve equation coefficient for P-256.
1000 */
1001static const uint32_t P256_B[] = {
1002	0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003	0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004	0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005};
1006
1007/*
1008 * Perform a "short reduction" in field F256 (field for curve P-256).
1009 * The source value should be less than 262 bits; on output, it will
1010 * be at most 257 bits, and less than twice the modulus.
1011 */
1012static void
1013reduce_f256(uint32_t *d)
1014{
1015	uint32_t x;
1016
1017	x = d[19] >> 9;
1018	d[19] &= 0x01FF;
1019	d[17] += x << 3;
1020	d[14] -= x << 10;
1021	d[7] -= x << 5;
1022	d[0] += x;
1023	norm13(d, d, 20);
1024}
1025
1026/*
1027 * Perform a "final reduction" in field F256 (field for curve P-256).
1028 * The source value must be less than twice the modulus. If the value
1029 * is not lower than the modulus, then the modulus is subtracted and
1030 * this function returns 1; otherwise, it leaves it untouched and it
1031 * returns 0.
1032 */
1033static uint32_t
1034reduce_final_f256(uint32_t *d)
1035{
1036	uint32_t t[20];
1037	uint32_t cc;
1038	int i;
1039
1040	memcpy(t, d, sizeof t);
1041	cc = 0;
1042	for (i = 0; i < 20; i ++) {
1043		uint32_t w;
1044
1045		w = t[i] - F256[i] - cc;
1046		cc = w >> 31;
1047		t[i] = w & 0x1FFF;
1048	}
1049	cc ^= 1;
1050	CCOPY(cc, d, t, sizeof t);
1051	return cc;
1052}
1053
1054/*
1055 * Perform a multiplication of two integers modulo
1056 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057 * of 20 words, each containing 13 bits of data, in little-endian order.
1058 * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059 * on output, value fits on 257 bits and is lower than twice the modulus.
1060 */
1061static void
1062mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063{
1064	uint32_t t[40], cc;
1065	int i;
1066
1067	/*
1068	 * Compute raw multiplication. All result words fit in 13 bits
1069	 * each.
1070	 */
1071	mul20(t, a, b);
1072
1073	/*
1074	 * Modular reduction: each high word in added/subtracted where
1075	 * necessary.
1076	 *
1077	 * The modulus is:
1078	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079	 * Therefore:
1080	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081	 *
1082	 * For a word x at bit offset n (n >= 256), we have:
1083	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1085	 *
1086	 * Thus, we can nullify the high word if we reinject it at some
1087	 * proper emplacements.
1088	 */
1089	for (i = 39; i >= 20; i --) {
1090		uint32_t x;
1091
1092		x = t[i];
1093		t[i - 2] += ARSH(x, 6);
1094		t[i - 3] += (x << 7) & 0x1FFF;
1095		t[i - 4] -= ARSH(x, 12);
1096		t[i - 5] -= (x << 1) & 0x1FFF;
1097		t[i - 12] -= ARSH(x, 4);
1098		t[i - 13] -= (x << 9) & 0x1FFF;
1099		t[i - 19] += ARSH(x, 9);
1100		t[i - 20] += (x << 4) & 0x1FFF;
1101	}
1102
1103	/*
1104	 * Propagate carries. This is a signed propagation, and the
1105	 * result may be negative. The loop above may enlarge values,
1106	 * but not two much: worst case is the chain involving t[i - 3],
1107	 * in which a value may be added to itself up to 7 times. Since
1108	 * starting values are 13-bit each, all words fit on 20 bits
1109	 * (21 to account for the sign bit).
1110	 */
1111	cc = norm13(t, t, 20);
1112
1113	/*
1114	 * Perform modular reduction again for the bits beyond 256 (the carry
1115	 * and the bits 256..259). Since the largest shift below is by 10
1116	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117	 * thereby allowing injecting full word values.
1118	 */
1119	cc = (cc << 4) | (t[19] >> 9);
1120	t[19] &= 0x01FF;
1121	t[17] += cc << 3;
1122	t[14] -= cc << 10;
1123	t[7] -= cc << 5;
1124	t[0] += cc;
1125
1126	/*
1127	 * If the carry is negative, then after carry propagation, we may
1128	 * end up with a value which is negative, and we don't want that.
1129	 * Thus, in that case, we add the modulus. Note that the subtraction
1130	 * result, when the carry is negative, is always smaller than the
1131	 * modulus, so the extra addition will not make the value exceed
1132	 * twice the modulus.
1133	 */
1134	cc >>= 31;
1135	t[0] -= cc;
1136	t[7] += cc << 5;
1137	t[14] += cc << 10;
1138	t[17] -= cc << 3;
1139	t[19] += cc << 9;
1140
1141	norm13(d, t, 20);
1142}
1143
1144/*
1145 * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1146 * P-256). Operand is an array of 20 words, each containing 13 bits of
1147 * data, in little-endian order. On input, upper word may be up to 13
1148 * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1149 * and is lower than twice the modulus.
1150 */
1151static void
1152square_f256(uint32_t *d, const uint32_t *a)
1153{
1154	uint32_t t[40], cc;
1155	int i;
1156
1157	/*
1158	 * Compute raw square. All result words fit in 13 bits each.
1159	 */
1160	square20(t, a);
1161
1162	/*
1163	 * Modular reduction: each high word in added/subtracted where
1164	 * necessary.
1165	 *
1166	 * The modulus is:
1167	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1168	 * Therefore:
1169	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1170	 *
1171	 * For a word x at bit offset n (n >= 256), we have:
1172	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1173	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1174	 *
1175	 * Thus, we can nullify the high word if we reinject it at some
1176	 * proper emplacements.
1177	 */
1178	for (i = 39; i >= 20; i --) {
1179		uint32_t x;
1180
1181		x = t[i];
1182		t[i - 2] += ARSH(x, 6);
1183		t[i - 3] += (x << 7) & 0x1FFF;
1184		t[i - 4] -= ARSH(x, 12);
1185		t[i - 5] -= (x << 1) & 0x1FFF;
1186		t[i - 12] -= ARSH(x, 4);
1187		t[i - 13] -= (x << 9) & 0x1FFF;
1188		t[i - 19] += ARSH(x, 9);
1189		t[i - 20] += (x << 4) & 0x1FFF;
1190	}
1191
1192	/*
1193	 * Propagate carries. This is a signed propagation, and the
1194	 * result may be negative. The loop above may enlarge values,
1195	 * but not two much: worst case is the chain involving t[i - 3],
1196	 * in which a value may be added to itself up to 7 times. Since
1197	 * starting values are 13-bit each, all words fit on 20 bits
1198	 * (21 to account for the sign bit).
1199	 */
1200	cc = norm13(t, t, 20);
1201
1202	/*
1203	 * Perform modular reduction again for the bits beyond 256 (the carry
1204	 * and the bits 256..259). Since the largest shift below is by 10
1205	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1206	 * thereby allowing injecting full word values.
1207	 */
1208	cc = (cc << 4) | (t[19] >> 9);
1209	t[19] &= 0x01FF;
1210	t[17] += cc << 3;
1211	t[14] -= cc << 10;
1212	t[7] -= cc << 5;
1213	t[0] += cc;
1214
1215	/*
1216	 * If the carry is negative, then after carry propagation, we may
1217	 * end up with a value which is negative, and we don't want that.
1218	 * Thus, in that case, we add the modulus. Note that the subtraction
1219	 * result, when the carry is negative, is always smaller than the
1220	 * modulus, so the extra addition will not make the value exceed
1221	 * twice the modulus.
1222	 */
1223	cc >>= 31;
1224	t[0] -= cc;
1225	t[7] += cc << 5;
1226	t[14] += cc << 10;
1227	t[17] -= cc << 3;
1228	t[19] += cc << 9;
1229
1230	norm13(d, t, 20);
1231}
1232
1233/*
1234 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1235 * are such that:
1236 *   X = x / z^2
1237 *   Y = y / z^3
1238 * For the point at infinity, z = 0.
1239 * Each point thus admits many possible representations.
1240 *
1241 * Coordinates are represented in arrays of 32-bit integers, each holding
1242 * 13 bits of data. Values may also be slightly greater than the modulus,
1243 * but they will always be lower than twice the modulus.
1244 */
1245typedef struct {
1246	uint32_t x[20];
1247	uint32_t y[20];
1248	uint32_t z[20];
1249} p256_jacobian;
1250
1251/*
1252 * Convert a point to affine coordinates:
1253 *  - If the point is the point at infinity, then all three coordinates
1254 *    are set to 0.
1255 *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1256 *    coordinates are the 'X' and 'Y' affine coordinates.
1257 * The coordinates are guaranteed to be lower than the modulus.
1258 */
1259static void
1260p256_to_affine(p256_jacobian *P)
1261{
1262	uint32_t t1[20], t2[20];
1263	int i;
1264
1265	/*
1266	 * Invert z with a modular exponentiation: the modulus is
1267	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1268	 * p-2. Exponent bit pattern (from high to low) is:
1269	 *  - 32 bits of value 1
1270	 *  - 31 bits of value 0
1271	 *  - 1 bit of value 1
1272	 *  - 96 bits of value 0
1273	 *  - 94 bits of value 1
1274	 *  - 1 bit of value 0
1275	 *  - 1 bit of value 1
1276	 * Thus, we precompute z^(2^31-1) to speed things up.
1277	 *
1278	 * If z = 0 (point at infinity) then the modular exponentiation
1279	 * will yield 0, which leads to the expected result (all three
1280	 * coordinates set to 0).
1281	 */
1282
1283	/*
1284	 * A simple square-and-multiply for z^(2^31-1). We could save about
1285	 * two dozen multiplications here with an addition chain, but
1286	 * this would require a bit more code, and extra stack buffers.
1287	 */
1288	memcpy(t1, P->z, sizeof P->z);
1289	for (i = 0; i < 30; i ++) {
1290		square_f256(t1, t1);
1291		mul_f256(t1, t1, P->z);
1292	}
1293
1294	/*
1295	 * Square-and-multiply. Apart from the squarings, we have a few
1296	 * multiplications to set bits to 1; we multiply by the original z
1297	 * for setting 1 bit, and by t1 for setting 31 bits.
1298	 */
1299	memcpy(t2, P->z, sizeof P->z);
1300	for (i = 1; i < 256; i ++) {
1301		square_f256(t2, t2);
1302		switch (i) {
1303		case 31:
1304		case 190:
1305		case 221:
1306		case 252:
1307			mul_f256(t2, t2, t1);
1308			break;
1309		case 63:
1310		case 253:
1311		case 255:
1312			mul_f256(t2, t2, P->z);
1313			break;
1314		}
1315	}
1316
1317	/*
1318	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1319	 */
1320	mul_f256(t1, t2, t2);
1321	mul_f256(P->x, t1, P->x);
1322	mul_f256(t1, t1, t2);
1323	mul_f256(P->y, t1, P->y);
1324	reduce_final_f256(P->x);
1325	reduce_final_f256(P->y);
1326
1327	/*
1328	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1329	 * this will set z to 1.
1330	 */
1331	mul_f256(P->z, P->z, t2);
1332	reduce_final_f256(P->z);
1333}
1334
1335/*
1336 * Double a point in P-256. This function works for all valid points,
1337 * including the point at infinity.
1338 */
1339static void
1340p256_double(p256_jacobian *Q)
1341{
1342	/*
1343	 * Doubling formulas are:
1344	 *
1345	 *   s = 4*x*y^2
1346	 *   m = 3*(x + z^2)*(x - z^2)
1347	 *   x' = m^2 - 2*s
1348	 *   y' = m*(s - x') - 8*y^4
1349	 *   z' = 2*y*z
1350	 *
1351	 * These formulas work for all points, including points of order 2
1352	 * and points at infinity:
1353	 *   - If y = 0 then z' = 0. But there is no such point in P-256
1354	 *     anyway.
1355	 *   - If z = 0 then z' = 0.
1356	 */
1357	uint32_t t1[20], t2[20], t3[20], t4[20];
1358	int i;
1359
1360	/*
1361	 * Compute z^2 in t1.
1362	 */
1363	square_f256(t1, Q->z);
1364
1365	/*
1366	 * Compute x-z^2 in t2 and x+z^2 in t1.
1367	 */
1368	for (i = 0; i < 20; i ++) {
1369		t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1370		t1[i] += Q->x[i];
1371	}
1372	norm13(t1, t1, 20);
1373	norm13(t2, t2, 20);
1374
1375	/*
1376	 * Compute 3*(x+z^2)*(x-z^2) in t1.
1377	 */
1378	mul_f256(t3, t1, t2);
1379	for (i = 0; i < 20; i ++) {
1380		t1[i] = MUL15(3, t3[i]);
1381	}
1382	norm13(t1, t1, 20);
1383
1384	/*
1385	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1386	 */
1387	square_f256(t3, Q->y);
1388	for (i = 0; i < 20; i ++) {
1389		t3[i] <<= 1;
1390	}
1391	norm13(t3, t3, 20);
1392	mul_f256(t2, Q->x, t3);
1393	for (i = 0; i < 20; i ++) {
1394		t2[i] <<= 1;
1395	}
1396	norm13(t2, t2, 20);
1397	reduce_f256(t2);
1398
1399	/*
1400	 * Compute x' = m^2 - 2*s.
1401	 */
1402	square_f256(Q->x, t1);
1403	for (i = 0; i < 20; i ++) {
1404		Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1405	}
1406	norm13(Q->x, Q->x, 20);
1407	reduce_f256(Q->x);
1408
1409	/*
1410	 * Compute z' = 2*y*z.
1411	 */
1412	mul_f256(t4, Q->y, Q->z);
1413	for (i = 0; i < 20; i ++) {
1414		Q->z[i] = t4[i] << 1;
1415	}
1416	norm13(Q->z, Q->z, 20);
1417	reduce_f256(Q->z);
1418
1419	/*
1420	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1421	 * 2*y^2 in t3.
1422	 */
1423	for (i = 0; i < 20; i ++) {
1424		t2[i] += (F256[i] << 1) - Q->x[i];
1425	}
1426	norm13(t2, t2, 20);
1427	mul_f256(Q->y, t1, t2);
1428	square_f256(t4, t3);
1429	for (i = 0; i < 20; i ++) {
1430		Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1431	}
1432	norm13(Q->y, Q->y, 20);
1433	reduce_f256(Q->y);
1434}
1435
1436/*
1437 * Add point P2 to point P1.
1438 *
1439 * This function computes the wrong result in the following cases:
1440 *
1441 *   - If P1 == 0 but P2 != 0
1442 *   - If P1 != 0 but P2 == 0
1443 *   - If P1 == P2
1444 *
1445 * In all three cases, P1 is set to the point at infinity.
1446 *
1447 * Returned value is 0 if one of the following occurs:
1448 *
1449 *   - P1 and P2 have the same Y coordinate
1450 *   - P1 == 0 and P2 == 0
1451 *   - The Y coordinate of one of the points is 0 and the other point is
1452 *     the point at infinity.
1453 *
1454 * The third case cannot actually happen with valid points, since a point
1455 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1456 * curve P-256.
1457 *
1458 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1459 * can apply the following:
1460 *
1461 *   - If the result is not the point at infinity, then it is correct.
1462 *   - Otherwise, if the returned value is 1, then this is a case of
1463 *     P1+P2 == 0, so the result is indeed the point at infinity.
1464 *   - Otherwise, P1 == P2, so a "double" operation should have been
1465 *     performed.
1466 */
1467static uint32_t
1468p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1469{
1470	/*
1471	 * Addtions formulas are:
1472	 *
1473	 *   u1 = x1 * z2^2
1474	 *   u2 = x2 * z1^2
1475	 *   s1 = y1 * z2^3
1476	 *   s2 = y2 * z1^3
1477	 *   h = u2 - u1
1478	 *   r = s2 - s1
1479	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1480	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1481	 *   z3 = h * z1 * z2
1482	 */
1483	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1484	uint32_t ret;
1485	int i;
1486
1487	/*
1488	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1489	 */
1490	square_f256(t3, P2->z);
1491	mul_f256(t1, P1->x, t3);
1492	mul_f256(t4, P2->z, t3);
1493	mul_f256(t3, P1->y, t4);
1494
1495	/*
1496	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1497	 */
1498	square_f256(t4, P1->z);
1499	mul_f256(t2, P2->x, t4);
1500	mul_f256(t5, P1->z, t4);
1501	mul_f256(t4, P2->y, t5);
1502
1503	/*
1504	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1505	 * We need to test whether r is zero, so we will do some extra
1506	 * reduce.
1507	 */
1508	for (i = 0; i < 20; i ++) {
1509		t2[i] += (F256[i] << 1) - t1[i];
1510		t4[i] += (F256[i] << 1) - t3[i];
1511	}
1512	norm13(t2, t2, 20);
1513	norm13(t4, t4, 20);
1514	reduce_f256(t4);
1515	reduce_final_f256(t4);
1516	ret = 0;
1517	for (i = 0; i < 20; i ++) {
1518		ret |= t4[i];
1519	}
1520	ret = (ret | -ret) >> 31;
1521
1522	/*
1523	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1524	 */
1525	square_f256(t7, t2);
1526	mul_f256(t6, t1, t7);
1527	mul_f256(t5, t7, t2);
1528
1529	/*
1530	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1531	 */
1532	square_f256(P1->x, t4);
1533	for (i = 0; i < 20; i ++) {
1534		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1535	}
1536	norm13(P1->x, P1->x, 20);
1537	reduce_f256(P1->x);
1538
1539	/*
1540	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1541	 */
1542	for (i = 0; i < 20; i ++) {
1543		t6[i] += (F256[i] << 1) - P1->x[i];
1544	}
1545	norm13(t6, t6, 20);
1546	mul_f256(P1->y, t4, t6);
1547	mul_f256(t1, t5, t3);
1548	for (i = 0; i < 20; i ++) {
1549		P1->y[i] += (F256[i] << 1) - t1[i];
1550	}
1551	norm13(P1->y, P1->y, 20);
1552	reduce_f256(P1->y);
1553
1554	/*
1555	 * Compute z3 = h*z1*z2.
1556	 */
1557	mul_f256(t1, P1->z, P2->z);
1558	mul_f256(P1->z, t1, t2);
1559
1560	return ret;
1561}
1562
1563/*
1564 * Add point P2 to point P1. This is a specialised function for the
1565 * case when P2 is a non-zero point in affine coordinate.
1566 *
1567 * This function computes the wrong result in the following cases:
1568 *
1569 *   - If P1 == 0
1570 *   - If P1 == P2
1571 *
1572 * In both cases, P1 is set to the point at infinity.
1573 *
1574 * Returned value is 0 if one of the following occurs:
1575 *
1576 *   - P1 and P2 have the same Y coordinate
1577 *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1578 *
1579 * The second case cannot actually happen with valid points, since a point
1580 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1581 * curve P-256.
1582 *
1583 * Therefore, assuming that P1 != 0 on input, then the caller
1584 * can apply the following:
1585 *
1586 *   - If the result is not the point at infinity, then it is correct.
1587 *   - Otherwise, if the returned value is 1, then this is a case of
1588 *     P1+P2 == 0, so the result is indeed the point at infinity.
1589 *   - Otherwise, P1 == P2, so a "double" operation should have been
1590 *     performed.
1591 */
1592static uint32_t
1593p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1594{
1595	/*
1596	 * Addtions formulas are:
1597	 *
1598	 *   u1 = x1
1599	 *   u2 = x2 * z1^2
1600	 *   s1 = y1
1601	 *   s2 = y2 * z1^3
1602	 *   h = u2 - u1
1603	 *   r = s2 - s1
1604	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1605	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1606	 *   z3 = h * z1
1607	 */
1608	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1609	uint32_t ret;
1610	int i;
1611
1612	/*
1613	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1614	 */
1615	memcpy(t1, P1->x, sizeof t1);
1616	memcpy(t3, P1->y, sizeof t3);
1617
1618	/*
1619	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1620	 */
1621	square_f256(t4, P1->z);
1622	mul_f256(t2, P2->x, t4);
1623	mul_f256(t5, P1->z, t4);
1624	mul_f256(t4, P2->y, t5);
1625
1626	/*
1627	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1628	 * We need to test whether r is zero, so we will do some extra
1629	 * reduce.
1630	 */
1631	for (i = 0; i < 20; i ++) {
1632		t2[i] += (F256[i] << 1) - t1[i];
1633		t4[i] += (F256[i] << 1) - t3[i];
1634	}
1635	norm13(t2, t2, 20);
1636	norm13(t4, t4, 20);
1637	reduce_f256(t4);
1638	reduce_final_f256(t4);
1639	ret = 0;
1640	for (i = 0; i < 20; i ++) {
1641		ret |= t4[i];
1642	}
1643	ret = (ret | -ret) >> 31;
1644
1645	/*
1646	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1647	 */
1648	square_f256(t7, t2);
1649	mul_f256(t6, t1, t7);
1650	mul_f256(t5, t7, t2);
1651
1652	/*
1653	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1654	 */
1655	square_f256(P1->x, t4);
1656	for (i = 0; i < 20; i ++) {
1657		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1658	}
1659	norm13(P1->x, P1->x, 20);
1660	reduce_f256(P1->x);
1661
1662	/*
1663	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1664	 */
1665	for (i = 0; i < 20; i ++) {
1666		t6[i] += (F256[i] << 1) - P1->x[i];
1667	}
1668	norm13(t6, t6, 20);
1669	mul_f256(P1->y, t4, t6);
1670	mul_f256(t1, t5, t3);
1671	for (i = 0; i < 20; i ++) {
1672		P1->y[i] += (F256[i] << 1) - t1[i];
1673	}
1674	norm13(P1->y, P1->y, 20);
1675	reduce_f256(P1->y);
1676
1677	/*
1678	 * Compute z3 = h*z1*z2.
1679	 */
1680	mul_f256(P1->z, P1->z, t2);
1681
1682	return ret;
1683}
1684
1685/*
1686 * Decode a P-256 point. This function does not support the point at
1687 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1688 */
1689static uint32_t
1690p256_decode(p256_jacobian *P, const void *src, size_t len)
1691{
1692	const unsigned char *buf;
1693	uint32_t tx[20], ty[20], t1[20], t2[20];
1694	uint32_t bad;
1695	int i;
1696
1697	if (len != 65) {
1698		return 0;
1699	}
1700	buf = src;
1701
1702	/*
1703	 * First byte must be 0x04 (uncompressed format). We could support
1704	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1705	 * least significant bit of the Y coordinate), but it is explicitly
1706	 * forbidden by RFC 5480 (section 2.2).
1707	 */
1708	bad = NEQ(buf[0], 0x04);
1709
1710	/*
1711	 * Decode the coordinates, and check that they are both lower
1712	 * than the modulus.
1713	 */
1714	tx[19] = be8_to_le13(tx, buf + 1, 32);
1715	ty[19] = be8_to_le13(ty, buf + 33, 32);
1716	bad |= reduce_final_f256(tx);
1717	bad |= reduce_final_f256(ty);
1718
1719	/*
1720	 * Check curve equation.
1721	 */
1722	square_f256(t1, tx);
1723	mul_f256(t1, tx, t1);
1724	square_f256(t2, ty);
1725	for (i = 0; i < 20; i ++) {
1726		t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1727	}
1728	norm13(t1, t1, 20);
1729	reduce_f256(t1);
1730	reduce_final_f256(t1);
1731	for (i = 0; i < 20; i ++) {
1732		bad |= t1[i];
1733	}
1734
1735	/*
1736	 * Copy coordinates to the point structure.
1737	 */
1738	memcpy(P->x, tx, sizeof tx);
1739	memcpy(P->y, ty, sizeof ty);
1740	memset(P->z, 0, sizeof P->z);
1741	P->z[0] = 1;
1742	return EQ(bad, 0);
1743}
1744
1745/*
1746 * Encode a point into a buffer. This function assumes that the point is
1747 * valid, in affine coordinates, and not the point at infinity.
1748 */
1749static void
1750p256_encode(void *dst, const p256_jacobian *P)
1751{
1752	unsigned char *buf;
1753
1754	buf = dst;
1755	buf[0] = 0x04;
1756	le13_to_be8(buf + 1, 32, P->x);
1757	le13_to_be8(buf + 33, 32, P->y);
1758}
1759
1760/*
1761 * Multiply a curve point by an integer. The integer is assumed to be
1762 * lower than the curve order, and the base point must not be the point
1763 * at infinity.
1764 */
1765static void
1766p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1767{
1768	/*
1769	 * qz is a flag that is initially 1, and remains equal to 1
1770	 * as long as the point is the point at infinity.
1771	 *
1772	 * We use a 2-bit window to handle multiplier bits by pairs.
1773	 * The precomputed window really is the points P2 and P3.
1774	 */
1775	uint32_t qz;
1776	p256_jacobian P2, P3, Q, T, U;
1777
1778	/*
1779	 * Compute window values.
1780	 */
1781	P2 = *P;
1782	p256_double(&P2);
1783	P3 = *P;
1784	p256_add(&P3, &P2);
1785
1786	/*
1787	 * We start with Q = 0. We process multiplier bits 2 by 2.
1788	 */
1789	memset(&Q, 0, sizeof Q);
1790	qz = 1;
1791	while (xlen -- > 0) {
1792		int k;
1793
1794		for (k = 6; k >= 0; k -= 2) {
1795			uint32_t bits;
1796			uint32_t bnz;
1797
1798			p256_double(&Q);
1799			p256_double(&Q);
1800			T = *P;
1801			U = Q;
1802			bits = (*x >> k) & (uint32_t)3;
1803			bnz = NEQ(bits, 0);
1804			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1805			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1806			p256_add(&U, &T);
1807			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1808			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1809			qz &= ~bnz;
1810		}
1811		x ++;
1812	}
1813	*P = Q;
1814}
1815
1816/*
1817 * Precomputed window: k*G points, where G is the curve generator, and k
1818 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1819 * the point are encoded as 20 words of 13 bits each (little-endian
1820 * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1821 * (little-endian order within each word).
1822 */
1823static const uint32_t Gwin[15][20] = {
1824
1825	{ 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1826	  0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1827	  0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1828	  0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1829	  0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1830
1831	{ 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1832	  0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1833	  0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1834	  0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1835	  0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1836
1837	{ 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1838	  0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1839	  0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1840	  0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1841	  0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1842
1843	{ 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1844	  0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1845	  0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1846	  0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1847	  0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1848
1849	{ 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1850	  0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1851	  0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1852	  0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1853	  0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1854
1855	{ 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1856	  0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1857	  0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1858	  0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1859	  0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1860
1861	{ 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1862	  0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1863	  0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1864	  0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1865	  0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1866
1867	{ 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1868	  0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1869	  0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1870	  0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1871	  0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1872
1873	{ 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1874	  0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1875	  0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1876	  0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1877	  0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1878
1879	{ 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1880	  0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1881	  0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1882	  0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1883	  0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1884
1885	{ 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1886	  0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1887	  0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1888	  0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1889	  0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1890
1891	{ 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1892	  0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1893	  0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1894	  0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1895	  0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1896
1897	{ 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1898	  0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1899	  0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1900	  0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1901	  0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1902
1903	{ 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1904	  0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1905	  0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1906	  0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1907	  0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1908
1909	{ 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1910	  0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1911	  0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1912	  0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1913	  0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1914};
1915
1916/*
1917 * Lookup one of the Gwin[] values, by index. This is constant-time.
1918 */
1919static void
1920lookup_Gwin(p256_jacobian *T, uint32_t idx)
1921{
1922	uint32_t xy[20];
1923	uint32_t k;
1924	size_t u;
1925
1926	memset(xy, 0, sizeof xy);
1927	for (k = 0; k < 15; k ++) {
1928		uint32_t m;
1929
1930		m = -EQ(idx, k + 1);
1931		for (u = 0; u < 20; u ++) {
1932			xy[u] |= m & Gwin[k][u];
1933		}
1934	}
1935	for (u = 0; u < 10; u ++) {
1936		T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1937		T->x[(u << 1) + 1] = xy[u] >> 16;
1938		T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1939		T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1940	}
1941	memset(T->z, 0, sizeof T->z);
1942	T->z[0] = 1;
1943}
1944
1945/*
1946 * Multiply the generator by an integer. The integer is assumed non-zero
1947 * and lower than the curve order.
1948 */
1949static void
1950p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1951{
1952	/*
1953	 * qz is a flag that is initially 1, and remains equal to 1
1954	 * as long as the point is the point at infinity.
1955	 *
1956	 * We use a 4-bit window to handle multiplier bits by groups
1957	 * of 4. The precomputed window is constant static data, with
1958	 * points in affine coordinates; we use a constant-time lookup.
1959	 */
1960	p256_jacobian Q;
1961	uint32_t qz;
1962
1963	memset(&Q, 0, sizeof Q);
1964	qz = 1;
1965	while (xlen -- > 0) {
1966		int k;
1967		unsigned bx;
1968
1969		bx = *x ++;
1970		for (k = 0; k < 2; k ++) {
1971			uint32_t bits;
1972			uint32_t bnz;
1973			p256_jacobian T, U;
1974
1975			p256_double(&Q);
1976			p256_double(&Q);
1977			p256_double(&Q);
1978			p256_double(&Q);
1979			bits = (bx >> 4) & 0x0F;
1980			bnz = NEQ(bits, 0);
1981			lookup_Gwin(&T, bits);
1982			U = Q;
1983			p256_add_mixed(&U, &T);
1984			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1985			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1986			qz &= ~bnz;
1987			bx <<= 4;
1988		}
1989	}
1990	*P = Q;
1991}
1992
1993static const unsigned char P256_G[] = {
1994	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1995	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1996	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1997	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1998	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1999	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
2000	0x68, 0x37, 0xBF, 0x51, 0xF5
2001};
2002
2003static const unsigned char P256_N[] = {
2004	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
2005	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
2006	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
2007	0x25, 0x51
2008};
2009
2010static const unsigned char *
2011api_generator(int curve, size_t *len)
2012{
2013	(void)curve;
2014	*len = sizeof P256_G;
2015	return P256_G;
2016}
2017
2018static const unsigned char *
2019api_order(int curve, size_t *len)
2020{
2021	(void)curve;
2022	*len = sizeof P256_N;
2023	return P256_N;
2024}
2025
2026static size_t
2027api_xoff(int curve, size_t *len)
2028{
2029	(void)curve;
2030	*len = 32;
2031	return 1;
2032}
2033
2034static uint32_t
2035api_mul(unsigned char *G, size_t Glen,
2036	const unsigned char *x, size_t xlen, int curve)
2037{
2038	uint32_t r;
2039	p256_jacobian P;
2040
2041	(void)curve;
2042	if (Glen != 65) {
2043		return 0;
2044	}
2045	r = p256_decode(&P, G, Glen);
2046	p256_mul(&P, x, xlen);
2047	p256_to_affine(&P);
2048	p256_encode(G, &P);
2049	return r;
2050}
2051
2052static size_t
2053api_mulgen(unsigned char *R,
2054	const unsigned char *x, size_t xlen, int curve)
2055{
2056	p256_jacobian P;
2057
2058	(void)curve;
2059	p256_mulgen(&P, x, xlen);
2060	p256_to_affine(&P);
2061	p256_encode(R, &P);
2062	return 65;
2063}
2064
2065static uint32_t
2066api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2067	const unsigned char *x, size_t xlen,
2068	const unsigned char *y, size_t ylen, int curve)
2069{
2070	p256_jacobian P, Q;
2071	uint32_t r, t, z;
2072	int i;
2073
2074	(void)curve;
2075	if (len != 65) {
2076		return 0;
2077	}
2078	r = p256_decode(&P, A, len);
2079	p256_mul(&P, x, xlen);
2080	if (B == NULL) {
2081		p256_mulgen(&Q, y, ylen);
2082	} else {
2083		r &= p256_decode(&Q, B, len);
2084		p256_mul(&Q, y, ylen);
2085	}
2086
2087	/*
2088	 * The final addition may fail in case both points are equal.
2089	 */
2090	t = p256_add(&P, &Q);
2091	reduce_final_f256(P.z);
2092	z = 0;
2093	for (i = 0; i < 20; i ++) {
2094		z |= P.z[i];
2095	}
2096	z = EQ(z, 0);
2097	p256_double(&Q);
2098
2099	/*
2100	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2101	 * have the following:
2102	 *
2103	 *   z = 0, t = 0   return P (normal addition)
2104	 *   z = 0, t = 1   return P (normal addition)
2105	 *   z = 1, t = 0   return Q (a 'double' case)
2106	 *   z = 1, t = 1   report an error (P+Q = 0)
2107	 */
2108	CCOPY(z & ~t, &P, &Q, sizeof Q);
2109	p256_to_affine(&P);
2110	p256_encode(A, &P);
2111	r &= ~(z & t);
2112	return r;
2113}
2114
2115/* see bearssl_ec.h */
2116const br_ec_impl br_ec_p256_m15 = {
2117	(uint32_t)0x00800000,
2118	&api_generator,
2119	&api_order,
2120	&api_xoff,
2121	&api_mul,
2122	&api_mulgen,
2123	&api_muladd
2124};
2125