1/*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "inner.h"
26
27/* obsolete
28#include <stdio.h>
29#include <stdlib.h>
30static void
31print_int(const char *name, const uint32_t *x)
32{
33	size_t u;
34	unsigned char tmp[36];
35
36	printf("%s = ", name);
37	for (u = 0; u < 20; u ++) {
38		if (x[u] > 0x1FFF) {
39			printf("INVALID:");
40			for (u = 0; u < 20; u ++) {
41				printf(" %04X", x[u]);
42			}
43			printf("\n");
44			return;
45		}
46	}
47	memset(tmp, 0, sizeof tmp);
48	for (u = 0; u < 20; u ++) {
49		uint32_t w;
50		int j, k;
51
52		w = x[u];
53		j = 13 * (int)u;
54		k = j & 7;
55		if (k != 0) {
56			w <<= k;
57			j -= k;
58		}
59		k = j >> 3;
60		tmp[35 - k] |= (unsigned char)w;
61		tmp[34 - k] |= (unsigned char)(w >> 8);
62		tmp[33 - k] |= (unsigned char)(w >> 16);
63		tmp[32 - k] |= (unsigned char)(w >> 24);
64	}
65	for (u = 4; u < 36; u ++) {
66		printf("%02X", tmp[u]);
67	}
68	printf("\n");
69}
70*/
71
72/*
73 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74 * that right-shifting a signed negative integer copies the sign bit
75 * (arithmetic right-shift). This is "implementation-defined behaviour",
76 * i.e. it is not undefined, but it may differ between compilers. Each
77 * compiler is supposed to document its behaviour in that respect. GCC
78 * explicitly defines that an arithmetic right shift is used. We expect
79 * all other compilers to do the same, because underlying CPU offer an
80 * arithmetic right shift opcode that could not be used otherwise.
81 */
82#if BR_NO_ARITH_SHIFT
83#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
84                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85#else
86#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
87#endif
88
89/*
90 * Convert an integer from unsigned little-endian encoding to a sequence of
91 * 13-bit words in little-endian order. The final "partial" word is
92 * returned.
93 */
94static uint32_t
95le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
96{
97	uint32_t acc;
98	int acc_len;
99
100	acc = 0;
101	acc_len = 0;
102	while (len -- > 0) {
103		acc |= (uint32_t)(*src ++) << acc_len;
104		acc_len += 8;
105		if (acc_len >= 13) {
106			*dst ++ = acc & 0x1FFF;
107			acc >>= 13;
108			acc_len -= 13;
109		}
110	}
111	return acc;
112}
113
114/*
115 * Convert an integer (13-bit words, little-endian) to unsigned
116 * little-endian encoding. The total encoding length is provided; all
117 * the destination bytes will be filled.
118 */
119static void
120le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
121{
122	uint32_t acc;
123	int acc_len;
124
125	acc = 0;
126	acc_len = 0;
127	while (len -- > 0) {
128		if (acc_len < 8) {
129			acc |= (*src ++) << acc_len;
130			acc_len += 13;
131		}
132		*dst ++ = (unsigned char)acc;
133		acc >>= 8;
134		acc_len -= 8;
135	}
136}
137
138/*
139 * Normalise an array of words to a strict 13 bits per word. Returned
140 * value is the resulting carry. The source (w) and destination (d)
141 * arrays may be identical, but shall not overlap partially.
142 */
143static inline uint32_t
144norm13(uint32_t *d, const uint32_t *w, size_t len)
145{
146	size_t u;
147	uint32_t cc;
148
149	cc = 0;
150	for (u = 0; u < len; u ++) {
151		int32_t z;
152
153		z = w[u] + cc;
154		d[u] = z & 0x1FFF;
155		cc = ARSH(z, 13);
156	}
157	return cc;
158}
159
160/*
161 * mul20() multiplies two 260-bit integers together. Each word must fit
162 * on 13 bits; source operands use 20 words, destination operand
163 * receives 40 words. All overlaps allowed.
164 *
165 * square20() computes the square of a 260-bit integer. Each word must
166 * fit on 13 bits; source operand uses 20 words, destination operand
167 * receives 40 words. All overlaps allowed.
168 */
169
170#if BR_SLOW_MUL15
171
172static void
173mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
174{
175	/*
176	 * Two-level Karatsuba: turns a 20x20 multiplication into
177	 * nine 5x5 multiplications. We use 13-bit words but do not
178	 * propagate carries immediately, so words may expand:
179	 *
180	 *  - First Karatsuba decomposition turns the 20x20 mul on
181	 *    13-bit words into three 10x10 muls, two on 13-bit words
182	 *    and one on 14-bit words.
183	 *
184	 *  - Second Karatsuba decomposition further splits these into:
185	 *
186	 *     * four 5x5 muls on 13-bit words
187	 *     * four 5x5 muls on 14-bit words
188	 *     * one 5x5 mul on 15-bit words
189	 *
190	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
191	 * or 15-bit words, respectively.
192	 */
193	uint32_t u[45], v[45], w[90];
194	uint32_t cc;
195	int i;
196
197#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
198		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
199			+ (s2w)[5 * (s2_off) + 0]; \
200		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
201			+ (s2w)[5 * (s2_off) + 1]; \
202		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
203			+ (s2w)[5 * (s2_off) + 2]; \
204		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
205			+ (s2w)[5 * (s2_off) + 3]; \
206		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
207			+ (s2w)[5 * (s2_off) + 4]; \
208	} while (0)
209
210#define ZADDT(dw, d_off, sw, s_off)   do { \
211		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
212		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
213		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
214		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
215		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
216	} while (0)
217
218#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
219		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
220			+ (s2w)[5 * (s2_off) + 0]; \
221		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
222			+ (s2w)[5 * (s2_off) + 1]; \
223		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
224			+ (s2w)[5 * (s2_off) + 2]; \
225		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
226			+ (s2w)[5 * (s2_off) + 3]; \
227		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
228			+ (s2w)[5 * (s2_off) + 4]; \
229	} while (0)
230
231#define CPR1(w, cprcc)   do { \
232		uint32_t cprz = (w) + cprcc; \
233		(w) = cprz & 0x1FFF; \
234		cprcc = cprz >> 13; \
235	} while (0)
236
237#define CPR(dw, d_off)   do { \
238		uint32_t cprcc; \
239		cprcc = 0; \
240		CPR1((dw)[(d_off) + 0], cprcc); \
241		CPR1((dw)[(d_off) + 1], cprcc); \
242		CPR1((dw)[(d_off) + 2], cprcc); \
243		CPR1((dw)[(d_off) + 3], cprcc); \
244		CPR1((dw)[(d_off) + 4], cprcc); \
245		CPR1((dw)[(d_off) + 5], cprcc); \
246		CPR1((dw)[(d_off) + 6], cprcc); \
247		CPR1((dw)[(d_off) + 7], cprcc); \
248		CPR1((dw)[(d_off) + 8], cprcc); \
249		(dw)[(d_off) + 9] = cprcc; \
250	} while (0)
251
252	memcpy(u, a, 20 * sizeof *a);
253	ZADD(u, 4, a, 0, a, 1);
254	ZADD(u, 5, a, 2, a, 3);
255	ZADD(u, 6, a, 0, a, 2);
256	ZADD(u, 7, a, 1, a, 3);
257	ZADD(u, 8, u, 6, u, 7);
258
259	memcpy(v, b, 20 * sizeof *b);
260	ZADD(v, 4, b, 0, b, 1);
261	ZADD(v, 5, b, 2, b, 3);
262	ZADD(v, 6, b, 0, b, 2);
263	ZADD(v, 7, b, 1, b, 3);
264	ZADD(v, 8, v, 6, v, 7);
265
266	/*
267	 * Do the eight first 8x8 muls. Source words are at most 16382
268	 * each, so we can add product results together "as is" in 32-bit
269	 * words.
270	 */
271	for (i = 0; i < 40; i += 5) {
272		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
273		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
274			+ MUL15(u[i + 1], v[i + 0]);
275		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
276			+ MUL15(u[i + 1], v[i + 1])
277			+ MUL15(u[i + 2], v[i + 0]);
278		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
279			+ MUL15(u[i + 1], v[i + 2])
280			+ MUL15(u[i + 2], v[i + 1])
281			+ MUL15(u[i + 3], v[i + 0]);
282		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
283			+ MUL15(u[i + 1], v[i + 3])
284			+ MUL15(u[i + 2], v[i + 2])
285			+ MUL15(u[i + 3], v[i + 1])
286			+ MUL15(u[i + 4], v[i + 0]);
287		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
288			+ MUL15(u[i + 2], v[i + 3])
289			+ MUL15(u[i + 3], v[i + 2])
290			+ MUL15(u[i + 4], v[i + 1]);
291		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
292			+ MUL15(u[i + 3], v[i + 3])
293			+ MUL15(u[i + 4], v[i + 2]);
294		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
295			+ MUL15(u[i + 4], v[i + 3]);
296		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
297		w[(i << 1) + 9] = 0;
298	}
299
300	/*
301	 * For the 9th multiplication, source words are up to 32764,
302	 * so we must do some carry propagation. If we add up to
303	 * 4 products and the carry is no more than 524224, then the
304	 * result fits in 32 bits, and the next carry will be no more
305	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
306	 *
307	 * We thus just skip one of the products in the middle word,
308	 * then do a carry propagation (this reduces words to 13 bits
309	 * each, except possibly the last, which may use up to 17 bits
310	 * or so), then add the missing product.
311	 */
312	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
313	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
314		+ MUL15(u[40 + 1], v[40 + 0]);
315	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
316		+ MUL15(u[40 + 1], v[40 + 1])
317		+ MUL15(u[40 + 2], v[40 + 0]);
318	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
319		+ MUL15(u[40 + 1], v[40 + 2])
320		+ MUL15(u[40 + 2], v[40 + 1])
321		+ MUL15(u[40 + 3], v[40 + 0]);
322	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
323		+ MUL15(u[40 + 1], v[40 + 3])
324		+ MUL15(u[40 + 2], v[40 + 2])
325		+ MUL15(u[40 + 3], v[40 + 1]);
326		/* + MUL15(u[40 + 4], v[40 + 0]) */
327	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
328		+ MUL15(u[40 + 2], v[40 + 3])
329		+ MUL15(u[40 + 3], v[40 + 2])
330		+ MUL15(u[40 + 4], v[40 + 1]);
331	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
332		+ MUL15(u[40 + 3], v[40 + 3])
333		+ MUL15(u[40 + 4], v[40 + 2]);
334	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
335		+ MUL15(u[40 + 4], v[40 + 3]);
336	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
337
338	CPR(w, 80);
339
340	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
341
342	/*
343	 * The products on 14-bit words in slots 6 and 7 yield values
344	 * up to 5*(16382^2) each, and we need to subtract two such
345	 * values from the higher word. We need the subtraction to fit
346	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
347	 * However, 10*(16382^2) does not fit. So we must perform a
348	 * bit of reduction here.
349	 */
350	CPR(w, 60);
351	CPR(w, 70);
352
353	/*
354	 * Recompose results.
355	 */
356
357	/* 0..1*0..1 into 0..3 */
358	ZSUB2F(w, 8, w, 0, w, 2);
359	ZSUB2F(w, 9, w, 1, w, 3);
360	ZADDT(w, 1, w, 8);
361	ZADDT(w, 2, w, 9);
362
363	/* 2..3*2..3 into 4..7 */
364	ZSUB2F(w, 10, w, 4, w, 6);
365	ZSUB2F(w, 11, w, 5, w, 7);
366	ZADDT(w, 5, w, 10);
367	ZADDT(w, 6, w, 11);
368
369	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
370	ZSUB2F(w, 16, w, 12, w, 14);
371	ZSUB2F(w, 17, w, 13, w, 15);
372	ZADDT(w, 13, w, 16);
373	ZADDT(w, 14, w, 17);
374
375	/* first-level recomposition */
376	ZSUB2F(w, 12, w, 0, w, 4);
377	ZSUB2F(w, 13, w, 1, w, 5);
378	ZSUB2F(w, 14, w, 2, w, 6);
379	ZSUB2F(w, 15, w, 3, w, 7);
380	ZADDT(w, 2, w, 12);
381	ZADDT(w, 3, w, 13);
382	ZADDT(w, 4, w, 14);
383	ZADDT(w, 5, w, 15);
384
385	/*
386	 * Perform carry propagation to bring all words down to 13 bits.
387	 */
388	cc = norm13(d, w, 40);
389	d[39] += (cc << 13);
390
391#undef ZADD
392#undef ZADDT
393#undef ZSUB2F
394#undef CPR1
395#undef CPR
396}
397
398static inline void
399square20(uint32_t *d, const uint32_t *a)
400{
401	mul20(d, a, a);
402}
403
404#else
405
406static void
407mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
408{
409	uint32_t t[39];
410
411	t[ 0] = MUL15(a[ 0], b[ 0]);
412	t[ 1] = MUL15(a[ 0], b[ 1])
413		+ MUL15(a[ 1], b[ 0]);
414	t[ 2] = MUL15(a[ 0], b[ 2])
415		+ MUL15(a[ 1], b[ 1])
416		+ MUL15(a[ 2], b[ 0]);
417	t[ 3] = MUL15(a[ 0], b[ 3])
418		+ MUL15(a[ 1], b[ 2])
419		+ MUL15(a[ 2], b[ 1])
420		+ MUL15(a[ 3], b[ 0]);
421	t[ 4] = MUL15(a[ 0], b[ 4])
422		+ MUL15(a[ 1], b[ 3])
423		+ MUL15(a[ 2], b[ 2])
424		+ MUL15(a[ 3], b[ 1])
425		+ MUL15(a[ 4], b[ 0]);
426	t[ 5] = MUL15(a[ 0], b[ 5])
427		+ MUL15(a[ 1], b[ 4])
428		+ MUL15(a[ 2], b[ 3])
429		+ MUL15(a[ 3], b[ 2])
430		+ MUL15(a[ 4], b[ 1])
431		+ MUL15(a[ 5], b[ 0]);
432	t[ 6] = MUL15(a[ 0], b[ 6])
433		+ MUL15(a[ 1], b[ 5])
434		+ MUL15(a[ 2], b[ 4])
435		+ MUL15(a[ 3], b[ 3])
436		+ MUL15(a[ 4], b[ 2])
437		+ MUL15(a[ 5], b[ 1])
438		+ MUL15(a[ 6], b[ 0]);
439	t[ 7] = MUL15(a[ 0], b[ 7])
440		+ MUL15(a[ 1], b[ 6])
441		+ MUL15(a[ 2], b[ 5])
442		+ MUL15(a[ 3], b[ 4])
443		+ MUL15(a[ 4], b[ 3])
444		+ MUL15(a[ 5], b[ 2])
445		+ MUL15(a[ 6], b[ 1])
446		+ MUL15(a[ 7], b[ 0]);
447	t[ 8] = MUL15(a[ 0], b[ 8])
448		+ MUL15(a[ 1], b[ 7])
449		+ MUL15(a[ 2], b[ 6])
450		+ MUL15(a[ 3], b[ 5])
451		+ MUL15(a[ 4], b[ 4])
452		+ MUL15(a[ 5], b[ 3])
453		+ MUL15(a[ 6], b[ 2])
454		+ MUL15(a[ 7], b[ 1])
455		+ MUL15(a[ 8], b[ 0]);
456	t[ 9] = MUL15(a[ 0], b[ 9])
457		+ MUL15(a[ 1], b[ 8])
458		+ MUL15(a[ 2], b[ 7])
459		+ MUL15(a[ 3], b[ 6])
460		+ MUL15(a[ 4], b[ 5])
461		+ MUL15(a[ 5], b[ 4])
462		+ MUL15(a[ 6], b[ 3])
463		+ MUL15(a[ 7], b[ 2])
464		+ MUL15(a[ 8], b[ 1])
465		+ MUL15(a[ 9], b[ 0]);
466	t[10] = MUL15(a[ 0], b[10])
467		+ MUL15(a[ 1], b[ 9])
468		+ MUL15(a[ 2], b[ 8])
469		+ MUL15(a[ 3], b[ 7])
470		+ MUL15(a[ 4], b[ 6])
471		+ MUL15(a[ 5], b[ 5])
472		+ MUL15(a[ 6], b[ 4])
473		+ MUL15(a[ 7], b[ 3])
474		+ MUL15(a[ 8], b[ 2])
475		+ MUL15(a[ 9], b[ 1])
476		+ MUL15(a[10], b[ 0]);
477	t[11] = MUL15(a[ 0], b[11])
478		+ MUL15(a[ 1], b[10])
479		+ MUL15(a[ 2], b[ 9])
480		+ MUL15(a[ 3], b[ 8])
481		+ MUL15(a[ 4], b[ 7])
482		+ MUL15(a[ 5], b[ 6])
483		+ MUL15(a[ 6], b[ 5])
484		+ MUL15(a[ 7], b[ 4])
485		+ MUL15(a[ 8], b[ 3])
486		+ MUL15(a[ 9], b[ 2])
487		+ MUL15(a[10], b[ 1])
488		+ MUL15(a[11], b[ 0]);
489	t[12] = MUL15(a[ 0], b[12])
490		+ MUL15(a[ 1], b[11])
491		+ MUL15(a[ 2], b[10])
492		+ MUL15(a[ 3], b[ 9])
493		+ MUL15(a[ 4], b[ 8])
494		+ MUL15(a[ 5], b[ 7])
495		+ MUL15(a[ 6], b[ 6])
496		+ MUL15(a[ 7], b[ 5])
497		+ MUL15(a[ 8], b[ 4])
498		+ MUL15(a[ 9], b[ 3])
499		+ MUL15(a[10], b[ 2])
500		+ MUL15(a[11], b[ 1])
501		+ MUL15(a[12], b[ 0]);
502	t[13] = MUL15(a[ 0], b[13])
503		+ MUL15(a[ 1], b[12])
504		+ MUL15(a[ 2], b[11])
505		+ MUL15(a[ 3], b[10])
506		+ MUL15(a[ 4], b[ 9])
507		+ MUL15(a[ 5], b[ 8])
508		+ MUL15(a[ 6], b[ 7])
509		+ MUL15(a[ 7], b[ 6])
510		+ MUL15(a[ 8], b[ 5])
511		+ MUL15(a[ 9], b[ 4])
512		+ MUL15(a[10], b[ 3])
513		+ MUL15(a[11], b[ 2])
514		+ MUL15(a[12], b[ 1])
515		+ MUL15(a[13], b[ 0]);
516	t[14] = MUL15(a[ 0], b[14])
517		+ MUL15(a[ 1], b[13])
518		+ MUL15(a[ 2], b[12])
519		+ MUL15(a[ 3], b[11])
520		+ MUL15(a[ 4], b[10])
521		+ MUL15(a[ 5], b[ 9])
522		+ MUL15(a[ 6], b[ 8])
523		+ MUL15(a[ 7], b[ 7])
524		+ MUL15(a[ 8], b[ 6])
525		+ MUL15(a[ 9], b[ 5])
526		+ MUL15(a[10], b[ 4])
527		+ MUL15(a[11], b[ 3])
528		+ MUL15(a[12], b[ 2])
529		+ MUL15(a[13], b[ 1])
530		+ MUL15(a[14], b[ 0]);
531	t[15] = MUL15(a[ 0], b[15])
532		+ MUL15(a[ 1], b[14])
533		+ MUL15(a[ 2], b[13])
534		+ MUL15(a[ 3], b[12])
535		+ MUL15(a[ 4], b[11])
536		+ MUL15(a[ 5], b[10])
537		+ MUL15(a[ 6], b[ 9])
538		+ MUL15(a[ 7], b[ 8])
539		+ MUL15(a[ 8], b[ 7])
540		+ MUL15(a[ 9], b[ 6])
541		+ MUL15(a[10], b[ 5])
542		+ MUL15(a[11], b[ 4])
543		+ MUL15(a[12], b[ 3])
544		+ MUL15(a[13], b[ 2])
545		+ MUL15(a[14], b[ 1])
546		+ MUL15(a[15], b[ 0]);
547	t[16] = MUL15(a[ 0], b[16])
548		+ MUL15(a[ 1], b[15])
549		+ MUL15(a[ 2], b[14])
550		+ MUL15(a[ 3], b[13])
551		+ MUL15(a[ 4], b[12])
552		+ MUL15(a[ 5], b[11])
553		+ MUL15(a[ 6], b[10])
554		+ MUL15(a[ 7], b[ 9])
555		+ MUL15(a[ 8], b[ 8])
556		+ MUL15(a[ 9], b[ 7])
557		+ MUL15(a[10], b[ 6])
558		+ MUL15(a[11], b[ 5])
559		+ MUL15(a[12], b[ 4])
560		+ MUL15(a[13], b[ 3])
561		+ MUL15(a[14], b[ 2])
562		+ MUL15(a[15], b[ 1])
563		+ MUL15(a[16], b[ 0]);
564	t[17] = MUL15(a[ 0], b[17])
565		+ MUL15(a[ 1], b[16])
566		+ MUL15(a[ 2], b[15])
567		+ MUL15(a[ 3], b[14])
568		+ MUL15(a[ 4], b[13])
569		+ MUL15(a[ 5], b[12])
570		+ MUL15(a[ 6], b[11])
571		+ MUL15(a[ 7], b[10])
572		+ MUL15(a[ 8], b[ 9])
573		+ MUL15(a[ 9], b[ 8])
574		+ MUL15(a[10], b[ 7])
575		+ MUL15(a[11], b[ 6])
576		+ MUL15(a[12], b[ 5])
577		+ MUL15(a[13], b[ 4])
578		+ MUL15(a[14], b[ 3])
579		+ MUL15(a[15], b[ 2])
580		+ MUL15(a[16], b[ 1])
581		+ MUL15(a[17], b[ 0]);
582	t[18] = MUL15(a[ 0], b[18])
583		+ MUL15(a[ 1], b[17])
584		+ MUL15(a[ 2], b[16])
585		+ MUL15(a[ 3], b[15])
586		+ MUL15(a[ 4], b[14])
587		+ MUL15(a[ 5], b[13])
588		+ MUL15(a[ 6], b[12])
589		+ MUL15(a[ 7], b[11])
590		+ MUL15(a[ 8], b[10])
591		+ MUL15(a[ 9], b[ 9])
592		+ MUL15(a[10], b[ 8])
593		+ MUL15(a[11], b[ 7])
594		+ MUL15(a[12], b[ 6])
595		+ MUL15(a[13], b[ 5])
596		+ MUL15(a[14], b[ 4])
597		+ MUL15(a[15], b[ 3])
598		+ MUL15(a[16], b[ 2])
599		+ MUL15(a[17], b[ 1])
600		+ MUL15(a[18], b[ 0]);
601	t[19] = MUL15(a[ 0], b[19])
602		+ MUL15(a[ 1], b[18])
603		+ MUL15(a[ 2], b[17])
604		+ MUL15(a[ 3], b[16])
605		+ MUL15(a[ 4], b[15])
606		+ MUL15(a[ 5], b[14])
607		+ MUL15(a[ 6], b[13])
608		+ MUL15(a[ 7], b[12])
609		+ MUL15(a[ 8], b[11])
610		+ MUL15(a[ 9], b[10])
611		+ MUL15(a[10], b[ 9])
612		+ MUL15(a[11], b[ 8])
613		+ MUL15(a[12], b[ 7])
614		+ MUL15(a[13], b[ 6])
615		+ MUL15(a[14], b[ 5])
616		+ MUL15(a[15], b[ 4])
617		+ MUL15(a[16], b[ 3])
618		+ MUL15(a[17], b[ 2])
619		+ MUL15(a[18], b[ 1])
620		+ MUL15(a[19], b[ 0]);
621	t[20] = MUL15(a[ 1], b[19])
622		+ MUL15(a[ 2], b[18])
623		+ MUL15(a[ 3], b[17])
624		+ MUL15(a[ 4], b[16])
625		+ MUL15(a[ 5], b[15])
626		+ MUL15(a[ 6], b[14])
627		+ MUL15(a[ 7], b[13])
628		+ MUL15(a[ 8], b[12])
629		+ MUL15(a[ 9], b[11])
630		+ MUL15(a[10], b[10])
631		+ MUL15(a[11], b[ 9])
632		+ MUL15(a[12], b[ 8])
633		+ MUL15(a[13], b[ 7])
634		+ MUL15(a[14], b[ 6])
635		+ MUL15(a[15], b[ 5])
636		+ MUL15(a[16], b[ 4])
637		+ MUL15(a[17], b[ 3])
638		+ MUL15(a[18], b[ 2])
639		+ MUL15(a[19], b[ 1]);
640	t[21] = MUL15(a[ 2], b[19])
641		+ MUL15(a[ 3], b[18])
642		+ MUL15(a[ 4], b[17])
643		+ MUL15(a[ 5], b[16])
644		+ MUL15(a[ 6], b[15])
645		+ MUL15(a[ 7], b[14])
646		+ MUL15(a[ 8], b[13])
647		+ MUL15(a[ 9], b[12])
648		+ MUL15(a[10], b[11])
649		+ MUL15(a[11], b[10])
650		+ MUL15(a[12], b[ 9])
651		+ MUL15(a[13], b[ 8])
652		+ MUL15(a[14], b[ 7])
653		+ MUL15(a[15], b[ 6])
654		+ MUL15(a[16], b[ 5])
655		+ MUL15(a[17], b[ 4])
656		+ MUL15(a[18], b[ 3])
657		+ MUL15(a[19], b[ 2]);
658	t[22] = MUL15(a[ 3], b[19])
659		+ MUL15(a[ 4], b[18])
660		+ MUL15(a[ 5], b[17])
661		+ MUL15(a[ 6], b[16])
662		+ MUL15(a[ 7], b[15])
663		+ MUL15(a[ 8], b[14])
664		+ MUL15(a[ 9], b[13])
665		+ MUL15(a[10], b[12])
666		+ MUL15(a[11], b[11])
667		+ MUL15(a[12], b[10])
668		+ MUL15(a[13], b[ 9])
669		+ MUL15(a[14], b[ 8])
670		+ MUL15(a[15], b[ 7])
671		+ MUL15(a[16], b[ 6])
672		+ MUL15(a[17], b[ 5])
673		+ MUL15(a[18], b[ 4])
674		+ MUL15(a[19], b[ 3]);
675	t[23] = MUL15(a[ 4], b[19])
676		+ MUL15(a[ 5], b[18])
677		+ MUL15(a[ 6], b[17])
678		+ MUL15(a[ 7], b[16])
679		+ MUL15(a[ 8], b[15])
680		+ MUL15(a[ 9], b[14])
681		+ MUL15(a[10], b[13])
682		+ MUL15(a[11], b[12])
683		+ MUL15(a[12], b[11])
684		+ MUL15(a[13], b[10])
685		+ MUL15(a[14], b[ 9])
686		+ MUL15(a[15], b[ 8])
687		+ MUL15(a[16], b[ 7])
688		+ MUL15(a[17], b[ 6])
689		+ MUL15(a[18], b[ 5])
690		+ MUL15(a[19], b[ 4]);
691	t[24] = MUL15(a[ 5], b[19])
692		+ MUL15(a[ 6], b[18])
693		+ MUL15(a[ 7], b[17])
694		+ MUL15(a[ 8], b[16])
695		+ MUL15(a[ 9], b[15])
696		+ MUL15(a[10], b[14])
697		+ MUL15(a[11], b[13])
698		+ MUL15(a[12], b[12])
699		+ MUL15(a[13], b[11])
700		+ MUL15(a[14], b[10])
701		+ MUL15(a[15], b[ 9])
702		+ MUL15(a[16], b[ 8])
703		+ MUL15(a[17], b[ 7])
704		+ MUL15(a[18], b[ 6])
705		+ MUL15(a[19], b[ 5]);
706	t[25] = MUL15(a[ 6], b[19])
707		+ MUL15(a[ 7], b[18])
708		+ MUL15(a[ 8], b[17])
709		+ MUL15(a[ 9], b[16])
710		+ MUL15(a[10], b[15])
711		+ MUL15(a[11], b[14])
712		+ MUL15(a[12], b[13])
713		+ MUL15(a[13], b[12])
714		+ MUL15(a[14], b[11])
715		+ MUL15(a[15], b[10])
716		+ MUL15(a[16], b[ 9])
717		+ MUL15(a[17], b[ 8])
718		+ MUL15(a[18], b[ 7])
719		+ MUL15(a[19], b[ 6]);
720	t[26] = MUL15(a[ 7], b[19])
721		+ MUL15(a[ 8], b[18])
722		+ MUL15(a[ 9], b[17])
723		+ MUL15(a[10], b[16])
724		+ MUL15(a[11], b[15])
725		+ MUL15(a[12], b[14])
726		+ MUL15(a[13], b[13])
727		+ MUL15(a[14], b[12])
728		+ MUL15(a[15], b[11])
729		+ MUL15(a[16], b[10])
730		+ MUL15(a[17], b[ 9])
731		+ MUL15(a[18], b[ 8])
732		+ MUL15(a[19], b[ 7]);
733	t[27] = MUL15(a[ 8], b[19])
734		+ MUL15(a[ 9], b[18])
735		+ MUL15(a[10], b[17])
736		+ MUL15(a[11], b[16])
737		+ MUL15(a[12], b[15])
738		+ MUL15(a[13], b[14])
739		+ MUL15(a[14], b[13])
740		+ MUL15(a[15], b[12])
741		+ MUL15(a[16], b[11])
742		+ MUL15(a[17], b[10])
743		+ MUL15(a[18], b[ 9])
744		+ MUL15(a[19], b[ 8]);
745	t[28] = MUL15(a[ 9], b[19])
746		+ MUL15(a[10], b[18])
747		+ MUL15(a[11], b[17])
748		+ MUL15(a[12], b[16])
749		+ MUL15(a[13], b[15])
750		+ MUL15(a[14], b[14])
751		+ MUL15(a[15], b[13])
752		+ MUL15(a[16], b[12])
753		+ MUL15(a[17], b[11])
754		+ MUL15(a[18], b[10])
755		+ MUL15(a[19], b[ 9]);
756	t[29] = MUL15(a[10], b[19])
757		+ MUL15(a[11], b[18])
758		+ MUL15(a[12], b[17])
759		+ MUL15(a[13], b[16])
760		+ MUL15(a[14], b[15])
761		+ MUL15(a[15], b[14])
762		+ MUL15(a[16], b[13])
763		+ MUL15(a[17], b[12])
764		+ MUL15(a[18], b[11])
765		+ MUL15(a[19], b[10]);
766	t[30] = MUL15(a[11], b[19])
767		+ MUL15(a[12], b[18])
768		+ MUL15(a[13], b[17])
769		+ MUL15(a[14], b[16])
770		+ MUL15(a[15], b[15])
771		+ MUL15(a[16], b[14])
772		+ MUL15(a[17], b[13])
773		+ MUL15(a[18], b[12])
774		+ MUL15(a[19], b[11]);
775	t[31] = MUL15(a[12], b[19])
776		+ MUL15(a[13], b[18])
777		+ MUL15(a[14], b[17])
778		+ MUL15(a[15], b[16])
779		+ MUL15(a[16], b[15])
780		+ MUL15(a[17], b[14])
781		+ MUL15(a[18], b[13])
782		+ MUL15(a[19], b[12]);
783	t[32] = MUL15(a[13], b[19])
784		+ MUL15(a[14], b[18])
785		+ MUL15(a[15], b[17])
786		+ MUL15(a[16], b[16])
787		+ MUL15(a[17], b[15])
788		+ MUL15(a[18], b[14])
789		+ MUL15(a[19], b[13]);
790	t[33] = MUL15(a[14], b[19])
791		+ MUL15(a[15], b[18])
792		+ MUL15(a[16], b[17])
793		+ MUL15(a[17], b[16])
794		+ MUL15(a[18], b[15])
795		+ MUL15(a[19], b[14]);
796	t[34] = MUL15(a[15], b[19])
797		+ MUL15(a[16], b[18])
798		+ MUL15(a[17], b[17])
799		+ MUL15(a[18], b[16])
800		+ MUL15(a[19], b[15]);
801	t[35] = MUL15(a[16], b[19])
802		+ MUL15(a[17], b[18])
803		+ MUL15(a[18], b[17])
804		+ MUL15(a[19], b[16]);
805	t[36] = MUL15(a[17], b[19])
806		+ MUL15(a[18], b[18])
807		+ MUL15(a[19], b[17]);
808	t[37] = MUL15(a[18], b[19])
809		+ MUL15(a[19], b[18]);
810	t[38] = MUL15(a[19], b[19]);
811
812	d[39] = norm13(d, t, 39);
813}
814
815static void
816square20(uint32_t *d, const uint32_t *a)
817{
818	uint32_t t[39];
819
820	t[ 0] = MUL15(a[ 0], a[ 0]);
821	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
822	t[ 2] = MUL15(a[ 1], a[ 1])
823		+ ((MUL15(a[ 0], a[ 2])) << 1);
824	t[ 3] = ((MUL15(a[ 0], a[ 3])
825		+ MUL15(a[ 1], a[ 2])) << 1);
826	t[ 4] = MUL15(a[ 2], a[ 2])
827		+ ((MUL15(a[ 0], a[ 4])
828		+ MUL15(a[ 1], a[ 3])) << 1);
829	t[ 5] = ((MUL15(a[ 0], a[ 5])
830		+ MUL15(a[ 1], a[ 4])
831		+ MUL15(a[ 2], a[ 3])) << 1);
832	t[ 6] = MUL15(a[ 3], a[ 3])
833		+ ((MUL15(a[ 0], a[ 6])
834		+ MUL15(a[ 1], a[ 5])
835		+ MUL15(a[ 2], a[ 4])) << 1);
836	t[ 7] = ((MUL15(a[ 0], a[ 7])
837		+ MUL15(a[ 1], a[ 6])
838		+ MUL15(a[ 2], a[ 5])
839		+ MUL15(a[ 3], a[ 4])) << 1);
840	t[ 8] = MUL15(a[ 4], a[ 4])
841		+ ((MUL15(a[ 0], a[ 8])
842		+ MUL15(a[ 1], a[ 7])
843		+ MUL15(a[ 2], a[ 6])
844		+ MUL15(a[ 3], a[ 5])) << 1);
845	t[ 9] = ((MUL15(a[ 0], a[ 9])
846		+ MUL15(a[ 1], a[ 8])
847		+ MUL15(a[ 2], a[ 7])
848		+ MUL15(a[ 3], a[ 6])
849		+ MUL15(a[ 4], a[ 5])) << 1);
850	t[10] = MUL15(a[ 5], a[ 5])
851		+ ((MUL15(a[ 0], a[10])
852		+ MUL15(a[ 1], a[ 9])
853		+ MUL15(a[ 2], a[ 8])
854		+ MUL15(a[ 3], a[ 7])
855		+ MUL15(a[ 4], a[ 6])) << 1);
856	t[11] = ((MUL15(a[ 0], a[11])
857		+ MUL15(a[ 1], a[10])
858		+ MUL15(a[ 2], a[ 9])
859		+ MUL15(a[ 3], a[ 8])
860		+ MUL15(a[ 4], a[ 7])
861		+ MUL15(a[ 5], a[ 6])) << 1);
862	t[12] = MUL15(a[ 6], a[ 6])
863		+ ((MUL15(a[ 0], a[12])
864		+ MUL15(a[ 1], a[11])
865		+ MUL15(a[ 2], a[10])
866		+ MUL15(a[ 3], a[ 9])
867		+ MUL15(a[ 4], a[ 8])
868		+ MUL15(a[ 5], a[ 7])) << 1);
869	t[13] = ((MUL15(a[ 0], a[13])
870		+ MUL15(a[ 1], a[12])
871		+ MUL15(a[ 2], a[11])
872		+ MUL15(a[ 3], a[10])
873		+ MUL15(a[ 4], a[ 9])
874		+ MUL15(a[ 5], a[ 8])
875		+ MUL15(a[ 6], a[ 7])) << 1);
876	t[14] = MUL15(a[ 7], a[ 7])
877		+ ((MUL15(a[ 0], a[14])
878		+ MUL15(a[ 1], a[13])
879		+ MUL15(a[ 2], a[12])
880		+ MUL15(a[ 3], a[11])
881		+ MUL15(a[ 4], a[10])
882		+ MUL15(a[ 5], a[ 9])
883		+ MUL15(a[ 6], a[ 8])) << 1);
884	t[15] = ((MUL15(a[ 0], a[15])
885		+ MUL15(a[ 1], a[14])
886		+ MUL15(a[ 2], a[13])
887		+ MUL15(a[ 3], a[12])
888		+ MUL15(a[ 4], a[11])
889		+ MUL15(a[ 5], a[10])
890		+ MUL15(a[ 6], a[ 9])
891		+ MUL15(a[ 7], a[ 8])) << 1);
892	t[16] = MUL15(a[ 8], a[ 8])
893		+ ((MUL15(a[ 0], a[16])
894		+ MUL15(a[ 1], a[15])
895		+ MUL15(a[ 2], a[14])
896		+ MUL15(a[ 3], a[13])
897		+ MUL15(a[ 4], a[12])
898		+ MUL15(a[ 5], a[11])
899		+ MUL15(a[ 6], a[10])
900		+ MUL15(a[ 7], a[ 9])) << 1);
901	t[17] = ((MUL15(a[ 0], a[17])
902		+ MUL15(a[ 1], a[16])
903		+ MUL15(a[ 2], a[15])
904		+ MUL15(a[ 3], a[14])
905		+ MUL15(a[ 4], a[13])
906		+ MUL15(a[ 5], a[12])
907		+ MUL15(a[ 6], a[11])
908		+ MUL15(a[ 7], a[10])
909		+ MUL15(a[ 8], a[ 9])) << 1);
910	t[18] = MUL15(a[ 9], a[ 9])
911		+ ((MUL15(a[ 0], a[18])
912		+ MUL15(a[ 1], a[17])
913		+ MUL15(a[ 2], a[16])
914		+ MUL15(a[ 3], a[15])
915		+ MUL15(a[ 4], a[14])
916		+ MUL15(a[ 5], a[13])
917		+ MUL15(a[ 6], a[12])
918		+ MUL15(a[ 7], a[11])
919		+ MUL15(a[ 8], a[10])) << 1);
920	t[19] = ((MUL15(a[ 0], a[19])
921		+ MUL15(a[ 1], a[18])
922		+ MUL15(a[ 2], a[17])
923		+ MUL15(a[ 3], a[16])
924		+ MUL15(a[ 4], a[15])
925		+ MUL15(a[ 5], a[14])
926		+ MUL15(a[ 6], a[13])
927		+ MUL15(a[ 7], a[12])
928		+ MUL15(a[ 8], a[11])
929		+ MUL15(a[ 9], a[10])) << 1);
930	t[20] = MUL15(a[10], a[10])
931		+ ((MUL15(a[ 1], a[19])
932		+ MUL15(a[ 2], a[18])
933		+ MUL15(a[ 3], a[17])
934		+ MUL15(a[ 4], a[16])
935		+ MUL15(a[ 5], a[15])
936		+ MUL15(a[ 6], a[14])
937		+ MUL15(a[ 7], a[13])
938		+ MUL15(a[ 8], a[12])
939		+ MUL15(a[ 9], a[11])) << 1);
940	t[21] = ((MUL15(a[ 2], a[19])
941		+ MUL15(a[ 3], a[18])
942		+ MUL15(a[ 4], a[17])
943		+ MUL15(a[ 5], a[16])
944		+ MUL15(a[ 6], a[15])
945		+ MUL15(a[ 7], a[14])
946		+ MUL15(a[ 8], a[13])
947		+ MUL15(a[ 9], a[12])
948		+ MUL15(a[10], a[11])) << 1);
949	t[22] = MUL15(a[11], a[11])
950		+ ((MUL15(a[ 3], a[19])
951		+ MUL15(a[ 4], a[18])
952		+ MUL15(a[ 5], a[17])
953		+ MUL15(a[ 6], a[16])
954		+ MUL15(a[ 7], a[15])
955		+ MUL15(a[ 8], a[14])
956		+ MUL15(a[ 9], a[13])
957		+ MUL15(a[10], a[12])) << 1);
958	t[23] = ((MUL15(a[ 4], a[19])
959		+ MUL15(a[ 5], a[18])
960		+ MUL15(a[ 6], a[17])
961		+ MUL15(a[ 7], a[16])
962		+ MUL15(a[ 8], a[15])
963		+ MUL15(a[ 9], a[14])
964		+ MUL15(a[10], a[13])
965		+ MUL15(a[11], a[12])) << 1);
966	t[24] = MUL15(a[12], a[12])
967		+ ((MUL15(a[ 5], a[19])
968		+ MUL15(a[ 6], a[18])
969		+ MUL15(a[ 7], a[17])
970		+ MUL15(a[ 8], a[16])
971		+ MUL15(a[ 9], a[15])
972		+ MUL15(a[10], a[14])
973		+ MUL15(a[11], a[13])) << 1);
974	t[25] = ((MUL15(a[ 6], a[19])
975		+ MUL15(a[ 7], a[18])
976		+ MUL15(a[ 8], a[17])
977		+ MUL15(a[ 9], a[16])
978		+ MUL15(a[10], a[15])
979		+ MUL15(a[11], a[14])
980		+ MUL15(a[12], a[13])) << 1);
981	t[26] = MUL15(a[13], a[13])
982		+ ((MUL15(a[ 7], a[19])
983		+ MUL15(a[ 8], a[18])
984		+ MUL15(a[ 9], a[17])
985		+ MUL15(a[10], a[16])
986		+ MUL15(a[11], a[15])
987		+ MUL15(a[12], a[14])) << 1);
988	t[27] = ((MUL15(a[ 8], a[19])
989		+ MUL15(a[ 9], a[18])
990		+ MUL15(a[10], a[17])
991		+ MUL15(a[11], a[16])
992		+ MUL15(a[12], a[15])
993		+ MUL15(a[13], a[14])) << 1);
994	t[28] = MUL15(a[14], a[14])
995		+ ((MUL15(a[ 9], a[19])
996		+ MUL15(a[10], a[18])
997		+ MUL15(a[11], a[17])
998		+ MUL15(a[12], a[16])
999		+ MUL15(a[13], a[15])) << 1);
1000	t[29] = ((MUL15(a[10], a[19])
1001		+ MUL15(a[11], a[18])
1002		+ MUL15(a[12], a[17])
1003		+ MUL15(a[13], a[16])
1004		+ MUL15(a[14], a[15])) << 1);
1005	t[30] = MUL15(a[15], a[15])
1006		+ ((MUL15(a[11], a[19])
1007		+ MUL15(a[12], a[18])
1008		+ MUL15(a[13], a[17])
1009		+ MUL15(a[14], a[16])) << 1);
1010	t[31] = ((MUL15(a[12], a[19])
1011		+ MUL15(a[13], a[18])
1012		+ MUL15(a[14], a[17])
1013		+ MUL15(a[15], a[16])) << 1);
1014	t[32] = MUL15(a[16], a[16])
1015		+ ((MUL15(a[13], a[19])
1016		+ MUL15(a[14], a[18])
1017		+ MUL15(a[15], a[17])) << 1);
1018	t[33] = ((MUL15(a[14], a[19])
1019		+ MUL15(a[15], a[18])
1020		+ MUL15(a[16], a[17])) << 1);
1021	t[34] = MUL15(a[17], a[17])
1022		+ ((MUL15(a[15], a[19])
1023		+ MUL15(a[16], a[18])) << 1);
1024	t[35] = ((MUL15(a[16], a[19])
1025		+ MUL15(a[17], a[18])) << 1);
1026	t[36] = MUL15(a[18], a[18])
1027		+ ((MUL15(a[17], a[19])) << 1);
1028	t[37] = ((MUL15(a[18], a[19])) << 1);
1029	t[38] = MUL15(a[19], a[19]);
1030
1031	d[39] = norm13(d, t, 39);
1032}
1033
1034#endif
1035
1036/*
1037 * Perform a "final reduction" in field F255 (field for Curve25519)
1038 * The source value must be less than twice the modulus. If the value
1039 * is not lower than the modulus, then the modulus is subtracted and
1040 * this function returns 1; otherwise, it leaves it untouched and it
1041 * returns 0.
1042 */
1043static uint32_t
1044reduce_final_f255(uint32_t *d)
1045{
1046	uint32_t t[20];
1047	uint32_t cc;
1048	int i;
1049
1050	memcpy(t, d, sizeof t);
1051	cc = 19;
1052	for (i = 0; i < 20; i ++) {
1053		uint32_t w;
1054
1055		w = t[i] + cc;
1056		cc = w >> 13;
1057		t[i] = w & 0x1FFF;
1058	}
1059	cc = t[19] >> 8;
1060	t[19] &= 0xFF;
1061	CCOPY(cc, d, t, sizeof t);
1062	return cc;
1063}
1064
1065static void
1066f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square)
1067{
1068	uint32_t t[40], cc, w;
1069
1070	/*
1071	 * Compute raw multiplication. All result words fit in 13 bits
1072	 * each; upper word (t[39]) must fit on 5 bits, since the product
1073	 * of two 256-bit integers must fit on 512 bits.
1074	 */
1075	if (square) {
1076		square20(t, a);
1077	} else {
1078		mul20(t, a, b);
1079	}
1080
1081	/*
1082	 * Modular reduction: each high word is added where necessary.
1083	 * Since the modulus is 2^255-19 and word 20 corresponds to
1084	 * offset 20*13 = 260, word 20+k must be added to word k with
1085	 * a factor of 19*2^5 = 608. The extra bits in word 19 are also
1086	 * added that way.
1087	 */
1088	cc = MUL15(t[19] >> 8, 19);
1089	t[19] &= 0xFF;
1090
1091#define MM1(x)   do { \
1092		w = t[x] + cc + MUL15(t[(x) + 20], 608); \
1093		t[x] = w & 0x1FFF; \
1094		cc = w >> 13; \
1095	} while (0)
1096
1097	MM1( 0);
1098	MM1( 1);
1099	MM1( 2);
1100	MM1( 3);
1101	MM1( 4);
1102	MM1( 5);
1103	MM1( 6);
1104	MM1( 7);
1105	MM1( 8);
1106	MM1( 9);
1107	MM1(10);
1108	MM1(11);
1109	MM1(12);
1110	MM1(13);
1111	MM1(14);
1112	MM1(15);
1113	MM1(16);
1114	MM1(17);
1115	MM1(18);
1116	MM1(19);
1117
1118#undef MM1
1119
1120	cc = MUL15(w >> 8, 19);
1121	t[19] &= 0xFF;
1122
1123#define MM2(x)   do { \
1124		w = t[x] + cc; \
1125		d[x] = w & 0x1FFF; \
1126		cc = w >> 13; \
1127	} while (0)
1128
1129	MM2( 0);
1130	MM2( 1);
1131	MM2( 2);
1132	MM2( 3);
1133	MM2( 4);
1134	MM2( 5);
1135	MM2( 6);
1136	MM2( 7);
1137	MM2( 8);
1138	MM2( 9);
1139	MM2(10);
1140	MM2(11);
1141	MM2(12);
1142	MM2(13);
1143	MM2(14);
1144	MM2(15);
1145	MM2(16);
1146	MM2(17);
1147	MM2(18);
1148	MM2(19);
1149
1150#undef MM2
1151}
1152
1153/*
1154 * Perform a multiplication of two integers modulo 2^255-19.
1155 * Operands are arrays of 20 words, each containing 13 bits of data, in
1156 * little-endian order. Input value may be up to 2^256-1; on output, value
1157 * fits on 256 bits and is lower than twice the modulus.
1158 *
1159 * f255_mul() is the general multiplication, f255_square() is specialised
1160 * for squarings.
1161 */
1162#define f255_mul(d, a, b)   f255_mulgen(d, a, b, 0)
1163#define f255_square(d, a)   f255_mulgen(d, a, a, 1)
1164
1165/*
1166 * Add two values in F255. Partial reduction is performed (down to less
1167 * than twice the modulus).
1168 */
1169static void
1170f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
1171{
1172	int i;
1173	uint32_t cc, w;
1174
1175	cc = 0;
1176	for (i = 0; i < 20; i ++) {
1177		w = a[i] + b[i] + cc;
1178		d[i] = w & 0x1FFF;
1179		cc = w >> 13;
1180	}
1181	cc = MUL15(w >> 8, 19);
1182	d[19] &= 0xFF;
1183	for (i = 0; i < 20; i ++) {
1184		w = d[i] + cc;
1185		d[i] = w & 0x1FFF;
1186		cc = w >> 13;
1187	}
1188}
1189
1190/*
1191 * Subtract one value from another in F255. Partial reduction is
1192 * performed (down to less than twice the modulus).
1193 */
1194static void
1195f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
1196{
1197	/*
1198	 * We actually compute a - b + 2*p, so that the final value is
1199	 * necessarily positive.
1200	 */
1201	int i;
1202	uint32_t cc, w;
1203
1204	cc = (uint32_t)-38;
1205	for (i = 0; i < 20; i ++) {
1206		w = a[i] - b[i] + cc;
1207		d[i] = w & 0x1FFF;
1208		cc = ARSH(w, 13);
1209	}
1210	cc = MUL15((w + 0x200) >> 8, 19);
1211	d[19] &= 0xFF;
1212	for (i = 0; i < 20; i ++) {
1213		w = d[i] + cc;
1214		d[i] = w & 0x1FFF;
1215		cc = w >> 13;
1216	}
1217}
1218
1219/*
1220 * Multiply an integer by the 'A24' constant (121665). Partial reduction
1221 * is performed (down to less than twice the modulus).
1222 */
1223static void
1224f255_mul_a24(uint32_t *d, const uint32_t *a)
1225{
1226	int i;
1227	uint32_t cc, w;
1228
1229	cc = 0;
1230	for (i = 0; i < 20; i ++) {
1231		w = MUL15(a[i], 121665) + cc;
1232		d[i] = w & 0x1FFF;
1233		cc = w >> 13;
1234	}
1235	cc = MUL15(w >> 8, 19);
1236	d[19] &= 0xFF;
1237	for (i = 0; i < 20; i ++) {
1238		w = d[i] + cc;
1239		d[i] = w & 0x1FFF;
1240		cc = w >> 13;
1241	}
1242}
1243
1244static const unsigned char GEN[] = {
1245	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1246	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1247	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1248	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1249};
1250
1251static const unsigned char ORDER[] = {
1252	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1253	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1254	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1255	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
1256};
1257
1258static const unsigned char *
1259api_generator(int curve, size_t *len)
1260{
1261	(void)curve;
1262	*len = 32;
1263	return GEN;
1264}
1265
1266static const unsigned char *
1267api_order(int curve, size_t *len)
1268{
1269	(void)curve;
1270	*len = 32;
1271	return ORDER;
1272}
1273
1274static size_t
1275api_xoff(int curve, size_t *len)
1276{
1277	(void)curve;
1278	*len = 32;
1279	return 0;
1280}
1281
1282static void
1283cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
1284{
1285	int i;
1286
1287	ctl = -ctl;
1288	for (i = 0; i < 20; i ++) {
1289		uint32_t aw, bw, tw;
1290
1291		aw = a[i];
1292		bw = b[i];
1293		tw = ctl & (aw ^ bw);
1294		a[i] = aw ^ tw;
1295		b[i] = bw ^ tw;
1296	}
1297}
1298
1299static uint32_t
1300api_mul(unsigned char *G, size_t Glen,
1301	const unsigned char *kb, size_t kblen, int curve)
1302{
1303	uint32_t x1[20], x2[20], x3[20], z2[20], z3[20];
1304	uint32_t a[20], aa[20], b[20], bb[20];
1305	uint32_t c[20], d[20], e[20], da[20], cb[20];
1306	unsigned char k[32];
1307	uint32_t swap;
1308	int i;
1309
1310	(void)curve;
1311
1312	/*
1313	 * Points are encoded over exactly 32 bytes. Multipliers must fit
1314	 * in 32 bytes as well.
1315	 * RFC 7748 mandates that the high bit of the last point byte must
1316	 * be ignored/cleared.
1317	 */
1318	if (Glen != 32 || kblen > 32) {
1319		return 0;
1320	}
1321	G[31] &= 0x7F;
1322
1323	/*
1324	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
1325	 * into Montgomery representation.
1326	 */
1327	x1[19] = le8_to_le13(x1, G, 32);
1328	memcpy(x3, x1, sizeof x1);
1329	memset(z2, 0, sizeof z2);
1330	memset(x2, 0, sizeof x2);
1331	x2[0] = 1;
1332	memset(z3, 0, sizeof z3);
1333	z3[0] = 1;
1334
1335	memset(k, 0, (sizeof k) - kblen);
1336	memcpy(k + (sizeof k) - kblen, kb, kblen);
1337	k[31] &= 0xF8;
1338	k[0] &= 0x7F;
1339	k[0] |= 0x40;
1340
1341	/* obsolete
1342	print_int("x1", x1);
1343	*/
1344
1345	swap = 0;
1346	for (i = 254; i >= 0; i --) {
1347		uint32_t kt;
1348
1349		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
1350		swap ^= kt;
1351		cswap(x2, x3, swap);
1352		cswap(z2, z3, swap);
1353		swap = kt;
1354
1355		/* obsolete
1356		print_int("x2", x2);
1357		print_int("z2", z2);
1358		print_int("x3", x3);
1359		print_int("z3", z3);
1360		*/
1361
1362		f255_add(a, x2, z2);
1363		f255_square(aa, a);
1364		f255_sub(b, x2, z2);
1365		f255_square(bb, b);
1366		f255_sub(e, aa, bb);
1367		f255_add(c, x3, z3);
1368		f255_sub(d, x3, z3);
1369		f255_mul(da, d, a);
1370		f255_mul(cb, c, b);
1371
1372		/* obsolete
1373		print_int("a ", a);
1374		print_int("aa", aa);
1375		print_int("b ", b);
1376		print_int("bb", bb);
1377		print_int("e ", e);
1378		print_int("c ", c);
1379		print_int("d ", d);
1380		print_int("da", da);
1381		print_int("cb", cb);
1382		*/
1383
1384		f255_add(x3, da, cb);
1385		f255_square(x3, x3);
1386		f255_sub(z3, da, cb);
1387		f255_square(z3, z3);
1388		f255_mul(z3, z3, x1);
1389		f255_mul(x2, aa, bb);
1390		f255_mul_a24(z2, e);
1391		f255_add(z2, z2, aa);
1392		f255_mul(z2, e, z2);
1393
1394		/* obsolete
1395		print_int("x2", x2);
1396		print_int("z2", z2);
1397		print_int("x3", x3);
1398		print_int("z3", z3);
1399		*/
1400	}
1401	cswap(x2, x3, swap);
1402	cswap(z2, z3, swap);
1403
1404	/*
1405	 * Inverse z2 with a modular exponentiation. This is a simple
1406	 * square-and-multiply algorithm; we mutualise most non-squarings
1407	 * since the exponent contains almost only ones.
1408	 */
1409	memcpy(a, z2, sizeof z2);
1410	for (i = 0; i < 15; i ++) {
1411		f255_square(a, a);
1412		f255_mul(a, a, z2);
1413	}
1414	memcpy(b, a, sizeof a);
1415	for (i = 0; i < 14; i ++) {
1416		int j;
1417
1418		for (j = 0; j < 16; j ++) {
1419			f255_square(b, b);
1420		}
1421		f255_mul(b, b, a);
1422	}
1423	for (i = 14; i >= 0; i --) {
1424		f255_square(b, b);
1425		if ((0xFFEB >> i) & 1) {
1426			f255_mul(b, z2, b);
1427		}
1428	}
1429	f255_mul(x2, x2, b);
1430	reduce_final_f255(x2);
1431	le13_to_le8(G, 32, x2);
1432	return 1;
1433}
1434
1435static size_t
1436api_mulgen(unsigned char *R,
1437	const unsigned char *x, size_t xlen, int curve)
1438{
1439	const unsigned char *G;
1440	size_t Glen;
1441
1442	G = api_generator(curve, &Glen);
1443	memcpy(R, G, Glen);
1444	api_mul(R, Glen, x, xlen, curve);
1445	return Glen;
1446}
1447
1448static uint32_t
1449api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1450	const unsigned char *x, size_t xlen,
1451	const unsigned char *y, size_t ylen, int curve)
1452{
1453	/*
1454	 * We don't implement this method, since it is used for ECDSA
1455	 * only, and there is no ECDSA over Curve25519 (which instead
1456	 * uses EdDSA).
1457	 */
1458	(void)A;
1459	(void)B;
1460	(void)len;
1461	(void)x;
1462	(void)xlen;
1463	(void)y;
1464	(void)ylen;
1465	(void)curve;
1466	return 0;
1467}
1468
1469/* see bearssl_ec.h */
1470const br_ec_impl br_ec_c25519_m15 = {
1471	(uint32_t)0x20000000,
1472	&api_generator,
1473	&api_order,
1474	&api_xoff,
1475	&api_mul,
1476	&api_mulgen,
1477	&api_muladd
1478};
1479