1/*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
3 *
4 * The contents of this file constitute Original Code as defined in and are
5 * subject to the Apple Public Source License Version 1.2 (the 'License').
6 * You may not use this file except in compliance with the License. Please obtain
7 * a copy of the License at http://www.apple.com/publicsource and read it before
8 * using this file.
9 *
10 * This Original Code and all software distributed under the License are
11 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
12 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
13 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
14 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
15 * specific language governing rights and limitations under the License.
16 */
17
18
19/* rijndael-alg-ref.c   v2.0   August '99
20 * Reference ANSI C code
21 * authors: Paulo Barreto
22 *          Vincent Rijmen
23 *
24 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
25 */
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31#include "rijndael-alg-ref.h"
32#include <cspdebugging.h>
33
34#define SC	((BC - 4) >> 1)
35
36#include "boxes-ref.h"
37
38static const word8 shifts[3][4][2] = {
39 { { 0, 0 },
40   { 1, 3 },
41   { 2, 2 },
42   { 3, 1 }
43 },
44 { { 0, 0 },
45   { 1, 5 },
46   { 2, 4 },
47   { 3, 3 }
48 },
49 { { 0, 0 },
50   { 1, 7 },
51   { 3, 5 },
52   { 4, 4 }
53 }
54};
55
56#if 	!GLADMAN_AES_128_ENABLE
57
58/* 128 bit key/word shift table in bits */
59static const word8 shifts128[4][2] = {
60 { 0,  0 },
61 { 8,  24 },
62 { 16, 16 },
63 { 24, 8 }
64};
65
66#endif	/* GLADMAN_AES_128_ENABLE */
67
68#if		!AES_MUL_BY_LOOKUP
69/*
70 * Profiling measurements showed that the mul routine is where a large propertion of
71 * the time is spent. Since the first argument to  mul is always one of six
72 * constants (2, 3, 0xe, etc.), we implement six 256x256 byte lookup tables to
73 * do the multiplies. This eliminates the need for the log/antilog tables, so
74 * it's only adding one kilobyte of const data. Throughput improvement for this
75 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
76 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
77 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because the
78 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
79 * on a 500 MHz G4 with 1 MB of L2 cache.
80 */
81
82/*
83 * The mod 255 op in mul is really expensive...
84 *
85 * We know that b <= (254 * 2), so there are only two cases. Either return b,
86 * or return b-255.
87 *
88 * On a G4 this single optimization results in a 24% speedup for encrypt and
89 * a 25% speedup for decrypt.
90 */
91static inline word8 mod255(word32 b)
92{
93	if(b >= 255) {
94		b -= 255;
95	}
96	return b;
97}
98
99word8 mul(word8 a, word8 b) {
100   /* multiply two elements of GF(2^m)
101    * needed for MixColumn and InvMixColumn
102    */
103	if (a && b) return Alogtable[mod255(Logtable[a] + Logtable[b])];
104	else return 0;
105}
106#endif	/* !AES_MUL_BY_LOOKUP */
107
108static
109void KeyAddition(word8 a[4][MAXBC], word8 rk[4][MAXBC], word8 BC) {
110	/* Exor corresponding text input and round key input bytes
111	 */
112	int i, j;
113
114	for(i = 0; i < 4; i++)
115   		for(j = 0; j < BC; j++) a[i][j] ^= rk[i][j];
116}
117
118static
119void ShiftRow(word8 a[4][MAXBC], word8 d, word8 BC) {
120	/* Row 0 remains unchanged
121	 * The other three rows are shifted a variable amount
122	 */
123	word8 tmp[MAXBC];
124	int i, j;
125
126	for(i = 1; i < 4; i++) {
127		for(j = 0; j < BC; j++) tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
128		for(j = 0; j < BC; j++) a[i][j] = tmp[j];
129	}
130}
131
132static
133void Substitution(word8 a[4][MAXBC], const word8 box[256], word8 BC) {
134	/* Replace every byte of the input by the byte at that place
135	 * in the nonlinear S-box
136	 */
137	int i, j;
138
139	for(i = 0; i < 4; i++)
140		for(j = 0; j < BC; j++) a[i][j] = box[a[i][j]] ;
141}
142
143static
144void MixColumn(word8 a[4][MAXBC], word8 BC) {
145	/* Mix the four bytes of every column in a linear way
146	 */
147	word8 b[4][MAXBC];
148	int i, j;
149
150	for(j = 0; j < BC; j++) {
151		for(i = 0; i < 4; i++) {
152			#if		AES_MUL_BY_LOOKUP
153			b[i][j] = mulBy0x02[a[i][j]]
154				^ mulBy0x03[a[(i + 1) % 4][j]]
155				^ a[(i + 2) % 4][j]
156				^ a[(i + 3) % 4][j];
157			#else
158			b[i][j] = mul(2,a[i][j])
159				^ mul(3,a[(i + 1) % 4][j])
160				^ a[(i + 2) % 4][j]
161				^ a[(i + 3) % 4][j];
162			#endif
163		}
164	}
165	for(i = 0; i < 4; i++) {
166		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
167	}
168}
169
170static
171void InvMixColumn(word8 a[4][MAXBC], word8 BC) {
172	/* Mix the four bytes of every column in a linear way
173	 * This is the opposite operation of Mixcolumn
174	 */
175	word8 b[4][MAXBC];
176	int i, j;
177
178	for(j = 0; j < BC; j++) {
179		for(i = 0; i < 4; i++) {
180			#if		AES_MUL_BY_LOOKUP
181			b[i][j] = mulBy0x0e[a[i][j]]
182				^ mulBy0x0b[a[(i + 1) % 4][j]]
183				^ mulBy0x0d[a[(i + 2) % 4][j]]
184				^ mulBy0x09[a[(i + 3) % 4][j]];
185			#else
186			b[i][j] = mul(0xe,a[i][j])
187				^ mul(0xb,a[(i + 1) % 4][j])
188				^ mul(0xd,a[(i + 2) % 4][j])
189				^ mul(0x9,a[(i + 3) % 4][j]);
190			#endif
191		}
192	}
193	for(i = 0; i < 4; i++) {
194		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
195	}
196}
197
198int rijndaelKeySched (
199	word8 k[4][MAXKC],
200	int keyBits,
201	int blockBits,
202	word8 W[MAXROUNDS+1][4][MAXBC]) {
203
204	/* Calculate the necessary round keys
205	 * The number of calculations depends on keyBits and blockBits
206	 */
207	int KC, BC, ROUNDS;
208	int i, j, t, rconpointer = 0;
209	word8 tk[4][MAXKC];
210
211	switch (keyBits) {
212	case 128: KC = 4; break;
213	case 192: KC = 6; break;
214	case 256: KC = 8; break;
215	default : return (-1);
216	}
217
218	switch (blockBits) {
219	case 128: BC = 4; break;
220	case 192: BC = 6; break;
221	case 256: BC = 8; break;
222	default : return (-2);
223	}
224
225	switch (keyBits >= blockBits ? keyBits : blockBits) {
226	case 128: ROUNDS = 10; break;
227	case 192: ROUNDS = 12; break;
228	case 256: ROUNDS = 14; break;
229	default : return (-3); /* this cannot happen */
230	}
231
232
233	for(j = 0; j < KC; j++)
234		for(i = 0; i < 4; i++)
235			tk[i][j] = k[i][j];
236	t = 0;
237	/* copy values into round key array */
238	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
239		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];
240
241	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
242		/* calculate new values */
243		for(i = 0; i < 4; i++)
244			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
245		tk[0][0] ^= rcon[rconpointer++];
246
247		if (KC != 8)
248			for(j = 1; j < KC; j++)
249				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
250		else {
251			for(j = 1; j < KC/2; j++)
252				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
253			for(i = 0; i < 4; i++) tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
254			for(j = KC/2 + 1; j < KC; j++)
255				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
256	}
257	/* copy values into round key array */
258	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
259		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];
260	}
261
262	return 0;
263}
264
265int rijndaelEncrypt (
266	word8 a[4][MAXBC],
267	int keyBits,
268	int blockBits,
269	word8 rk[MAXROUNDS+1][4][MAXBC])
270{
271	/* Encryption of one block, general case.
272	 */
273	int r, BC, ROUNDS;
274
275	switch (blockBits) {
276	case 128: BC = 4; break;
277	case 192: BC = 6; break;
278	case 256: BC = 8; break;
279	default : return (-2);
280	}
281
282	switch (keyBits >= blockBits ? keyBits : blockBits) {
283	case 128: ROUNDS = 10; break;
284	case 192: ROUNDS = 12; break;
285	case 256: ROUNDS = 14; break;
286	default : return (-3); /* this cannot happen */
287	}
288
289	/* begin with a key addition
290	 */
291	KeyAddition(a,rk[0],BC);
292
293	/* ROUNDS-1 ordinary rounds
294	 */
295	for(r = 1; r < ROUNDS; r++) {
296		Substitution(a,S,BC);
297		ShiftRow(a,0,BC);
298		MixColumn(a,BC);
299		KeyAddition(a,rk[r],BC);
300	}
301
302	/* Last round is special: there is no MixColumn
303	 */
304	Substitution(a,S,BC);
305	ShiftRow(a,0,BC);
306	KeyAddition(a,rk[ROUNDS],BC);
307
308	return 0;
309}
310
311int rijndaelDecrypt (
312	word8 a[4][MAXBC],
313	int keyBits,
314	int blockBits,
315	word8 rk[MAXROUNDS+1][4][MAXBC])
316{
317	int r, BC, ROUNDS;
318
319	switch (blockBits) {
320	case 128: BC = 4; break;
321	case 192: BC = 6; break;
322	case 256: BC = 8; break;
323	default : return (-2);
324	}
325
326	switch (keyBits >= blockBits ? keyBits : blockBits) {
327	case 128: ROUNDS = 10; break;
328	case 192: ROUNDS = 12; break;
329	case 256: ROUNDS = 14; break;
330	default : return (-3); /* this cannot happen */
331	}
332
333	/* To decrypt: apply the inverse operations of the encrypt routine,
334	 *             in opposite order
335	 *
336	 * (KeyAddition is an involution: it 's equal to its inverse)
337	 * (the inverse of Substitution with table S is Substitution with the
338	 *  inverse table of S)
339	 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
340	 */
341
342	/* First the special round:
343	 *   without InvMixColumn
344	 *   with extra KeyAddition
345	 */
346	KeyAddition(a,rk[ROUNDS],BC);
347	Substitution(a,Si,BC);
348	ShiftRow(a,1,BC);
349
350	/* ROUNDS-1 ordinary rounds
351	 */
352	for(r = ROUNDS-1; r > 0; r--) {
353		KeyAddition(a,rk[r],BC);
354		InvMixColumn(a,BC);
355		Substitution(a,Si,BC);
356		ShiftRow(a,1,BC);
357	}
358
359	/* End with the extra key addition
360	 */
361
362	KeyAddition(a,rk[0],BC);
363
364	return 0;
365}
366
367#if		!GLADMAN_AES_128_ENABLE
368
369/*
370 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
371 * char array pointers.�The key schedule arrays are easy; they come from
372 * keyInstance which has a 4-byte-aligned element preceeding the key schedule.
373 * Others require manual alignment of a local variable by the caller.
374 */
375
376static inline void KeyAddition128(
377	word8 a[4][BC_128_OPT],
378	word8 rk[4][MAXBC]) {
379
380	/* these casts are endian-independent */
381	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
382	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
383	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
384	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
385}
386
387static void Substitution128(
388	word8 a[4][BC_128_OPT],
389	const word8 box[256]) {
390	/* Replace every byte of the input by the byte at that place
391	 * in the nonlinear S-box
392	 */
393	int i, j;
394
395	/* still to be optimized - larger S boxes? */
396	for(i = 0; i < 4; i++) {
397		for(j = 0; j < BC_128_OPT; j++) {
398			a[i][j] = box[a[i][j]];
399		}
400	}
401}
402
403#if	defined(__ppc__) && defined(__GNUC__)
404
405static inline void rotateWordLeft(
406	word8 *word,			// known to be word aligned
407	unsigned rotCount)		// in bits
408{
409	word32 lword = *((word32 *)word);
410	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
411	*((word32 *)word) = lword;
412}
413
414#else
415
416/*
417 * Insert your machine/compiler dependent code here,
418 * or just use this, which works on any platform and compiler
419 * which supports the __attribute__((aligned(4))) directive.
420 */
421static void rotateWordLeft(
422	word8 *word,			// known to be word aligned
423	unsigned rotCount)		// in bits
424{
425	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
426	unsigned bytes = rotCount / 8;
427
428	tmp[0] = word[bytes     & (BC_128_OPT-1)];
429	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
430	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
431	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
432	*((word32 *)word) = *((word32 *)tmp);
433}
434#endif
435
436static inline void ShiftRow128(
437	word8 a[4][BC_128_OPT],
438	word8 d) {
439	/* Row 0 remains unchanged
440	 * The other three rows are shifted (actually rotated) a variable amount
441	 */
442	int i;
443
444	for(i = 1; i < 4; i++) {
445		rotateWordLeft(a[i], shifts128[i][d]);
446	}
447}
448
449/*
450 * The following two routines are where most of the time is spent in this
451 * module. Further optimization would have to focus here.
452 */
453static void MixColumn128(word8 a[4][BC_128_OPT]) {
454	/* Mix the four bytes of every column in a linear way
455	 */
456	word8 b[4][BC_128_OPT];
457	int i, j;
458
459	for(j = 0; j < BC_128_OPT; j++) {
460		for(i = 0; i < 4; i++) {
461			#if		AES_MUL_BY_LOOKUP
462			b[i][j] = mulBy0x02[a[i][j]]
463				^ mulBy0x03[a[(i + 1) % 4][j]]
464				^ a[(i + 2) % 4][j]
465				^ a[(i + 3) % 4][j];
466			#else
467			b[i][j] = mul(2,a[i][j])
468				^ mul(3,a[(i + 1) % 4][j])
469				^ a[(i + 2) % 4][j]
470				^ a[(i + 3) % 4][j];
471			#endif
472		}
473	}
474	memmove(a, b, 4 * BC_128_OPT);
475}
476
477static void InvMixColumn128(word8 a[4][BC_128_OPT]) {
478	/* Mix the four bytes of every column in a linear way
479	 * This is the opposite operation of Mixcolumn
480	 */
481	word8 b[4][BC_128_OPT];
482	int i, j;
483
484	for(j = 0; j < BC_128_OPT; j++) {
485		for(i = 0; i < 4; i++) {
486			#if		AES_MUL_BY_LOOKUP
487			b[i][j] = mulBy0x0e[a[i][j]]
488				^ mulBy0x0b[a[(i + 1) % 4][j]]
489				^ mulBy0x0d[a[(i + 2) % 4][j]]
490				^ mulBy0x09[a[(i + 3) % 4][j]];
491			#else
492			b[i][j] = mul(0xe,a[i][j])
493				^ mul(0xb,a[(i + 1) % 4][j])
494				^ mul(0xd,a[(i + 2) % 4][j])
495				^ mul(0x9,a[(i + 3) % 4][j]);
496			#endif
497		}
498	}
499	memmove(a, b, 4 * BC_128_OPT);
500}
501
502int rijndaelKeySched128 (
503	word8 k[4][KC_128_OPT],
504	word8 W[MAXROUNDS+1][4][MAXBC]) {
505
506	/* Calculate the necessary round keys
507	 * The number of calculations depends on keyBits and blockBits
508	 */
509	int i, j, t, rconpointer = 0;
510	word8 tk[4][KC_128_OPT];
511	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;
512
513	for(j = 0; j < KC_128_OPT; j++)
514		for(i = 0; i < 4; i++)
515			tk[i][j] = k[i][j];
516	t = 0;
517	/* copy values into round key array */
518	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
519		for(i = 0; i < 4; i++) {
520			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
521		}
522	}
523
524	while (t < numSchedRows) {
525		/* while not enough round key material calculated */
526		/* calculate new values */
527		for(i = 0; i < 4; i++) {
528			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
529		}
530		tk[0][0] ^= rcon[rconpointer++];
531
532		for(j = 1; j < KC_128_OPT; j++) {
533			for(i = 0; i < 4; i++) {
534				tk[i][j] ^= tk[i][j-1];
535			}
536		}
537
538		/* copy values into round key array */
539		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
540			for(i = 0; i < 4; i++) {
541				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
542			}
543		}
544	}
545
546	return 0;
547}
548
549int rijndaelEncrypt128 (
550	word8 a[4][BC_128_OPT],
551	word8 rk[MAXROUNDS+1][4][MAXBC])
552{
553	/* Encryption of one block.
554	 */
555	int r;
556
557	/* begin with a key addition
558	 */
559	KeyAddition128(a,rk[0]);
560
561	/* ROUNDS-1 ordinary rounds
562	 */
563	for(r = 1; r < ROUNDS_128_OPT; r++) {
564		Substitution128(a,S);
565		ShiftRow128(a,0);
566		MixColumn128(a);
567		KeyAddition128(a,rk[r]);
568	}
569
570	/* Last round is special: there is no MixColumn
571	 */
572	Substitution128(a,S);
573	ShiftRow128(a,0);
574	KeyAddition128(a,rk[ROUNDS_128_OPT]);
575
576	return 0;
577}
578
579int rijndaelDecrypt128 (
580	word8 a[4][BC_128_OPT],
581	word8 rk[MAXROUNDS+1][4][MAXBC])
582{
583	int r;
584
585	/* To decrypt: apply the inverse operations of the encrypt routine,
586	 *             in opposite order
587	 *
588	 * (KeyAddition is an involution: it 's equal to its inverse)
589	 * (the inverse of Substitution with table S is Substitution with the
590	 *  inverse table of S)
591	 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
592	 */
593
594	/* First the special round:
595	 *   without InvMixColumn
596	 *   with extra KeyAddition
597	 */
598	KeyAddition128(a,rk[ROUNDS_128_OPT]);
599	Substitution128(a,Si);
600	ShiftRow128(a,1);
601
602	/* ROUNDS-1 ordinary rounds
603	 */
604	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
605		KeyAddition128(a,rk[r]);
606		InvMixColumn128(a);
607		Substitution128(a,Si);
608		ShiftRow128(a,1);
609	}
610
611	/* End with the extra key addition
612	 */
613
614	KeyAddition128(a,rk[0]);
615
616	return 0;
617}
618
619#endif		/* !GLADMAN_AES_128_ENABLE */
620
621