1238384Sjkim/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2238384Sjkim/**
3238384Sjkim * rijndael-alg-fst.c
4238384Sjkim *
5238384Sjkim * @version 3.0 (December 2000)
6238384Sjkim *
7238384Sjkim * Optimised ANSI C code for the Rijndael cipher (now AES)
8238384Sjkim *
9238384Sjkim * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10238384Sjkim * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11238384Sjkim * @author Paulo Barreto <paulo.barreto@terra.com.br>
12238384Sjkim *
13238384Sjkim * This code is hereby placed in the public domain.
14238384Sjkim *
15238384Sjkim * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16238384Sjkim * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17238384Sjkim * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18238384Sjkim * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19238384Sjkim * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20238384Sjkim * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21238384Sjkim * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22238384Sjkim * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23238384Sjkim * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24238384Sjkim * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25238384Sjkim * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26238384Sjkim */
27238384Sjkim
28238384Sjkim/*
 * This is an experimental x86[_64] derivative. It assumes little-endian
 * byte order and expects the CPU to sustain unaligned memory references.
 * It is used as a playground for cache-timing attack mitigations and
 * serves as the reference C implementation for the x86[_64] assembler.
33238384Sjkim *
34238384Sjkim *					<appro@fy.chalmers.se>
35238384Sjkim */
36238384Sjkim
37238384Sjkim
/*
 * Unless AES_DEBUG is defined, make sure NDEBUG is set before
 * <assert.h> is included below, so that every assert() in this file
 * expands to nothing in regular builds.
 */
#ifndef AES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
43238384Sjkim#include <assert.h>
44238384Sjkim
45238384Sjkim#include <stdlib.h>
46238384Sjkim#include <openssl/aes.h>
47238384Sjkim#include "aes_locl.h"
48238384Sjkim
49238384Sjkim/*
50238384Sjkim * These two parameters control which table, 256-byte or 2KB, is
51238384Sjkim * referenced in outer and respectively inner rounds.
52238384Sjkim */
/*
 * Effective configuration: AES_COMPACT_IN_OUTER_ROUNDS is defined
 * unconditionally on the next line, and because it is defined,
 * AES_COMPACT_IN_INNER_ROUNDS is explicitly #undef'ed again.
 */
#define AES_COMPACT_IN_OUTER_ROUNDS
#ifdef  AES_COMPACT_IN_OUTER_ROUNDS
/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
 * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
 * by factor of ~2. */
# undef  AES_COMPACT_IN_INNER_ROUNDS
#endif
60238384Sjkim
61238384Sjkim#if 1
/*
 * Touch every cache line of a 256-byte lookup table so that the whole
 * table is resident before data-dependent lookups are made into it.
 *
 * table: start of a readable region of at least 256 bytes.  It is
 *        only read, never written.
 *
 * The table is walked through a "const volatile unsigned char" view:
 * the volatile qualifier forces the compiler to really perform each
 * load, the const qualifier of the argument is preserved, and byte
 * access is valid for a table of any element type and any alignment.
 * The loads are folded into a volatile sink purely to give them a
 * consumer; the value itself is meaningless.
 */
static void prefetch256(const void *table)
{
	const volatile unsigned char *bytes = table;
	volatile unsigned char sink;
	unsigned char acc = 0;
	size_t off;

	/* 32 is common least cache-line size */
	for (off = 0; off < 256; off += 32)
		acc ^= bytes[off];

	sink = acc;
	(void)sink;
}
73238384Sjkim#else
74238384Sjkim# define prefetch256(t)
75238384Sjkim#endif
76238384Sjkim
/*
 * GETU32(p): load a 32-bit word straight from the byte pointer p by
 * dereferencing it as u32, replacing any earlier GETU32 definition.
 * No byte swapping and no alignment fix-up is done, so this relies on
 * the little-endian, unaligned-access-tolerant target that this file
 * declares as its assumption in the header comment.
 * NOTE(review): the cast also drops any const qualifier on p.
 */
#undef GETU32
#define GETU32(p) (*((u32*)(p)))
79238384Sjkim
/*
 * u64 is the 64-bit unsigned type used for the Te/Td tables and
 * U64(C) appends the matching literal suffix to a constant C:
 *  - native Windows compilers (MinGW excluded): unsigned __int64, UI64
 *  - targets defining __arch64__:               unsigned long,    UL
 *  - everything else:                           unsigned long long, ULL
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
#define U64(C)	C##UI64
#elif defined(__arch64__)
typedef unsigned long u64;
#define U64(C)	C##UL
#else
typedef unsigned long long u64;
#define U64(C)	C##ULL
#endif
90238384Sjkim
/*
 * ROTATE(a,n): rotate a left by n bits, when a fast primitive exists.
 *  - MSVC / Intel compiler: the _lrotl() intrinsic.
 *  - GCC >= 2 on i386 / x86_64: a single "roll" instruction emitted
 *    through a statement expression.  The "I" constraint means n has
 *    to be a compile-time constant, and the "0" constraint ties the
 *    input a to the same register as the 32-bit result.
 * On every other toolchain ROTATE is deliberately left undefined;
 * code using it must test defined(ROTATE) and fall back to plain
 * shifts otherwise.
 */
#undef ROTATE
#if defined(_MSC_VER) || defined(__ICC)
# define ROTATE(a,n)	_lrotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
#   define ROTATE(a,n)	({ register unsigned int ret;	\
				asm (			\
				"roll %1,%0"		\
				: "=r"(ret)		\
				: "I"(n), "0"(a)	\
				: "cc");		\
			   ret;				\
			})
# endif
#endif
/*
 * Every 64-bit entry of Te stores the same 32-bit word twice:
 *
 * Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
 *
 * Because of that duplication, the four byte-rotated tables do not
 * need storage of their own; each one is a window into Te starting at
 * a different byte offset (0, 3, 2 and 1 respectively):
 *
 * Te0[x] = S [x].[02, 01, 01, 03];
 * Te1[x] = S [x].[03, 02, 01, 01];
 * Te2[x] = S [x].[01, 03, 02, 01];
 * Te3[x] = S [x].[01, 01, 03, 02];
 *
 * The macros are meant to be subscripted, e.g. Te1[i]: the subscript
 * binds before the (u32) cast, so the expression loads a 64-bit word
 * at byte offset 8*i + 3 and keeps its low 32 bits.  This depends on
 * the little-endian byte order and unaligned loads this file assumes.
 * The pointer casts keep the const qualifier of the table.
 *
 * NOTE(review): for Te1..Te3 the nominal 64-bit load of entry 255
 * reaches up to 3 bytes past the end of Te, although only bytes inside
 * the table survive the truncation to u32.
 */
#define Te0 (u32)((const u64*)((const u8*)Te+0))
#define Te1 (u32)((const u64*)((const u8*)Te+3))
#define Te2 (u32)((const u64*)((const u8*)Te+2))
#define Te3 (u32)((const u64*)((const u8*)Te+1))
/*
 * Every 64-bit entry of Td stores the same 32-bit word twice:
 *
 * Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
 *
 * so the four byte-rotated decryption tables are windows into Td
 * starting at byte offsets 0, 3, 2 and 1 respectively:
 *
 * Td0[x] = Si[x].[0e, 09, 0d, 0b];
 * Td1[x] = Si[x].[0b, 0e, 09, 0d];
 * Td2[x] = Si[x].[0d, 0b, 0e, 09];
 * Td3[x] = Si[x].[09, 0d, 0b, 0e];
 * Td4[x] = Si[x].[01];
 *
 * The macros are meant to be subscripted, e.g. Td1[i]: the subscript
 * binds before the (u32) cast, so the expression loads a 64-bit word
 * at byte offset 8*i + 3 and keeps its low 32 bits.  This depends on
 * the little-endian byte order and unaligned loads this file assumes.
 * The pointer casts keep the const qualifier of the table.
 *
 * NOTE(review): for Td1..Td3 the nominal 64-bit load of entry 255
 * reaches up to 3 bytes past the end of Td, although only bytes inside
 * the table survive the truncation to u32.
 */
#define Td0 (u32)((const u64*)((const u8*)Td+0))
#define Td1 (u32)((const u64*)((const u8*)Td+3))
#define Td2 (u32)((const u64*)((const u8*)Td+2))
#define Td3 (u32)((const u64*)((const u8*)Td+1))
129238384Sjkim
130238384Sjkimstatic const u64 Te[256] = {
131238384Sjkim    U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132238384Sjkim    U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133238384Sjkim    U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134238384Sjkim    U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135238384Sjkim    U64(0x5030306050303060), U64(0x0301010203010102),
136238384Sjkim    U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137238384Sjkim    U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138238384Sjkim    U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139238384Sjkim    U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140238384Sjkim    U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141238384Sjkim    U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142238384Sjkim    U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143238384Sjkim    U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144238384Sjkim    U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145238384Sjkim    U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146238384Sjkim    U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147238384Sjkim    U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148238384Sjkim    U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149238384Sjkim    U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150238384Sjkim    U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151238384Sjkim    U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152238384Sjkim    U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153238384Sjkim    U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154238384Sjkim    U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155238384Sjkim    U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156238384Sjkim    U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157238384Sjkim    U64(0x2818183028181830), U64(0xa1969637a1969637),
158238384Sjkim    U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159238384Sjkim    U64(0x0907070e0907070e), U64(0x3612122436121224),
160238384Sjkim    U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161238384Sjkim    U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162238384Sjkim    U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163238384Sjkim    U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164238384Sjkim    U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165238384Sjkim    U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166238384Sjkim    U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167238384Sjkim    U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168238384Sjkim    U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169238384Sjkim    U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170238384Sjkim    U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171238384Sjkim    U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172238384Sjkim    U64(0x0000000000000000), U64(0x2cededc12cededc1),
173238384Sjkim    U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174238384Sjkim    U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175238384Sjkim    U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176238384Sjkim    U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177238384Sjkim    U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178238384Sjkim    U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179238384Sjkim    U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180238384Sjkim    U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181238384Sjkim    U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182238384Sjkim    U64(0x5533336655333366), U64(0x9485851194858511),
183238384Sjkim    U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184238384Sjkim    U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185238384Sjkim    U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186238384Sjkim    U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187238384Sjkim    U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188238384Sjkim    U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189238384Sjkim    U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190238384Sjkim    U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191238384Sjkim    U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192238384Sjkim    U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193238384Sjkim    U64(0x3010102030101020), U64(0x1affffe51affffe5),
194238384Sjkim    U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195238384Sjkim    U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196238384Sjkim    U64(0x3513132635131326), U64(0x2fececc32fececc3),
197238384Sjkim    U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198238384Sjkim    U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199238384Sjkim    U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200238384Sjkim    U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201238384Sjkim    U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202238384Sjkim    U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203238384Sjkim    U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204238384Sjkim    U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205238384Sjkim    U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206238384Sjkim    U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207238384Sjkim    U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208238384Sjkim    U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209238384Sjkim    U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210238384Sjkim    U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211238384Sjkim    U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212238384Sjkim    U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213238384Sjkim    U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214238384Sjkim    U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215238384Sjkim    U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216238384Sjkim    U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217238384Sjkim    U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218238384Sjkim    U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219238384Sjkim    U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220238384Sjkim    U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221238384Sjkim    U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222238384Sjkim    U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223238384Sjkim    U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224238384Sjkim    U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225238384Sjkim    U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226238384Sjkim    U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227238384Sjkim    U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228238384Sjkim    U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229238384Sjkim    U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230238384Sjkim    U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231238384Sjkim    U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232238384Sjkim    U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233238384Sjkim    U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234238384Sjkim    U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235238384Sjkim    U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236238384Sjkim    U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237238384Sjkim    U64(0xd8484890d8484890), U64(0x0503030605030306),
238238384Sjkim    U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239238384Sjkim    U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240238384Sjkim    U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241238384Sjkim    U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242238384Sjkim    U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243238384Sjkim    U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244238384Sjkim    U64(0xb398982bb398982b), U64(0x3311112233111122),
245238384Sjkim    U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246238384Sjkim    U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247238384Sjkim    U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248238384Sjkim    U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249238384Sjkim    U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250238384Sjkim    U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251238384Sjkim    U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252238384Sjkim    U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253238384Sjkim    U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254238384Sjkim    U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255238384Sjkim    U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256238384Sjkim    U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257238384Sjkim    U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258238384Sjkim    U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259238384Sjkim};
260238384Sjkim
261238384Sjkimstatic const u8 Te4[256] = {
262238384Sjkim    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263238384Sjkim    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264238384Sjkim    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265238384Sjkim    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266238384Sjkim    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267238384Sjkim    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268238384Sjkim    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269238384Sjkim    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270238384Sjkim    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271238384Sjkim    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272238384Sjkim    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273238384Sjkim    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274238384Sjkim    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275238384Sjkim    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276238384Sjkim    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277238384Sjkim    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278238384Sjkim    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279238384Sjkim    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280238384Sjkim    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281238384Sjkim    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282238384Sjkim    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283238384Sjkim    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284238384Sjkim    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285238384Sjkim    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286238384Sjkim    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287238384Sjkim    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288238384Sjkim    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289238384Sjkim    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290238384Sjkim    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291238384Sjkim    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292238384Sjkim    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293238384Sjkim    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294238384Sjkim};
295238384Sjkim
296238384Sjkimstatic const u64 Td[256] = {
297238384Sjkim    U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298238384Sjkim    U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299238384Sjkim    U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300238384Sjkim    U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301238384Sjkim    U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302238384Sjkim    U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303238384Sjkim    U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304238384Sjkim    U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305238384Sjkim    U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306238384Sjkim    U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307238384Sjkim    U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308238384Sjkim    U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309238384Sjkim    U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310238384Sjkim    U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311238384Sjkim    U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312238384Sjkim    U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313238384Sjkim    U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314238384Sjkim    U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315238384Sjkim    U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316238384Sjkim    U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317238384Sjkim    U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318238384Sjkim    U64(0x6033519760335197), U64(0x457f5362457f5362),
319238384Sjkim    U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320238384Sjkim    U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321238384Sjkim    U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322238384Sjkim    U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323238384Sjkim    U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324238384Sjkim    U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325238384Sjkim    U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326238384Sjkim    U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327238384Sjkim    U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328238384Sjkim    U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329238384Sjkim    U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330238384Sjkim    U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331238384Sjkim    U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332238384Sjkim    U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333238384Sjkim    U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334238384Sjkim    U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335238384Sjkim    U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336238384Sjkim    U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337238384Sjkim    U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338238384Sjkim    U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339238384Sjkim    U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340238384Sjkim    U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341238384Sjkim    U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342238384Sjkim    U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343238384Sjkim    U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344238384Sjkim    U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345238384Sjkim    U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346238384Sjkim    U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347238384Sjkim    U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348238384Sjkim    U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349238384Sjkim    U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350238384Sjkim    U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351238384Sjkim    U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352238384Sjkim    U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353238384Sjkim    U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354238384Sjkim    U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355238384Sjkim    U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356238384Sjkim    U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357238384Sjkim    U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358238384Sjkim    U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359238384Sjkim    U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360238384Sjkim    U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361238384Sjkim    U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362238384Sjkim    U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363238384Sjkim    U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364238384Sjkim    U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365238384Sjkim    U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366238384Sjkim    U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367238384Sjkim    U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368238384Sjkim    U64(0x4022971340229713), U64(0x2011c6842011c684),
369238384Sjkim    U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370238384Sjkim    U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371238384Sjkim    U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372238384Sjkim    U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373238384Sjkim    U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374238384Sjkim    U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375238384Sjkim    U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376238384Sjkim    U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377238384Sjkim    U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378238384Sjkim    U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379238384Sjkim    U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380238384Sjkim    U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381238384Sjkim    U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382238384Sjkim    U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383238384Sjkim    U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384238384Sjkim    U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385238384Sjkim    U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386238384Sjkim    U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387238384Sjkim    U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388238384Sjkim    U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389238384Sjkim    U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390238384Sjkim    U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391238384Sjkim    U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392238384Sjkim    U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393238384Sjkim    U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394238384Sjkim    U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395238384Sjkim    U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396238384Sjkim    U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397238384Sjkim    U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398238384Sjkim    U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399238384Sjkim    U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400238384Sjkim    U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401238384Sjkim    U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402238384Sjkim    U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403238384Sjkim    U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404238384Sjkim    U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405238384Sjkim    U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406238384Sjkim    U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407238384Sjkim    U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408238384Sjkim    U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409238384Sjkim    U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410238384Sjkim    U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411238384Sjkim    U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412238384Sjkim    U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413238384Sjkim    U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414238384Sjkim    U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415238384Sjkim    U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416238384Sjkim    U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417238384Sjkim    U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418238384Sjkim    U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419238384Sjkim    U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420238384Sjkim    U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421238384Sjkim    U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422238384Sjkim    U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423238384Sjkim    U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424238384Sjkim    U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425238384Sjkim};
426238384Sjkimstatic const u8 Td4[256] = {
427238384Sjkim    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428238384Sjkim    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429238384Sjkim    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430238384Sjkim    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431238384Sjkim    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432238384Sjkim    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433238384Sjkim    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434238384Sjkim    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435238384Sjkim    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436238384Sjkim    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437238384Sjkim    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438238384Sjkim    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439238384Sjkim    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440238384Sjkim    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441238384Sjkim    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442238384Sjkim    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443238384Sjkim    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444238384Sjkim    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445238384Sjkim    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446238384Sjkim    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447238384Sjkim    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448238384Sjkim    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449238384Sjkim    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450238384Sjkim    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451238384Sjkim    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452238384Sjkim    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453238384Sjkim    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454238384Sjkim    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455238384Sjkim    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456238384Sjkim    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457238384Sjkim    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458238384Sjkim    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459238384Sjkim};
460238384Sjkim
461238384Sjkimstatic const u32 rcon[] = {
462238384Sjkim    0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463238384Sjkim    0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464238384Sjkim    0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465238384Sjkim};
466238384Sjkim
467238384Sjkim/**
468238384Sjkim * Expand the cipher key into the encryption key schedule.
469238384Sjkim */
470238384Sjkimint AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471238384Sjkim			AES_KEY *key) {
472238384Sjkim
473238384Sjkim	u32 *rk;
474238384Sjkim   	int i = 0;
475238384Sjkim	u32 temp;
476238384Sjkim
477238384Sjkim	if (!userKey || !key)
478238384Sjkim		return -1;
479238384Sjkim	if (bits != 128 && bits != 192 && bits != 256)
480238384Sjkim		return -2;
481238384Sjkim
482238384Sjkim	rk = key->rd_key;
483238384Sjkim
484238384Sjkim	if (bits==128)
485238384Sjkim		key->rounds = 10;
486238384Sjkim	else if (bits==192)
487238384Sjkim		key->rounds = 12;
488238384Sjkim	else
489238384Sjkim		key->rounds = 14;
490238384Sjkim
491238384Sjkim	rk[0] = GETU32(userKey     );
492238384Sjkim	rk[1] = GETU32(userKey +  4);
493238384Sjkim	rk[2] = GETU32(userKey +  8);
494238384Sjkim	rk[3] = GETU32(userKey + 12);
495238384Sjkim	if (bits == 128) {
496238384Sjkim		while (1) {
497238384Sjkim			temp  = rk[3];
498238384Sjkim			rk[4] = rk[0] ^
499238384Sjkim				(Te4[(temp >>  8) & 0xff]      ) ^
500238384Sjkim				(Te4[(temp >> 16) & 0xff] <<  8) ^
501238384Sjkim				(Te4[(temp >> 24)       ] << 16) ^
502238384Sjkim				(Te4[(temp      ) & 0xff] << 24) ^
503238384Sjkim				rcon[i];
504238384Sjkim			rk[5] = rk[1] ^ rk[4];
505238384Sjkim			rk[6] = rk[2] ^ rk[5];
506238384Sjkim			rk[7] = rk[3] ^ rk[6];
507238384Sjkim			if (++i == 10) {
508238384Sjkim				return 0;
509238384Sjkim			}
510238384Sjkim			rk += 4;
511238384Sjkim		}
512238384Sjkim	}
513238384Sjkim	rk[4] = GETU32(userKey + 16);
514238384Sjkim	rk[5] = GETU32(userKey + 20);
515238384Sjkim	if (bits == 192) {
516238384Sjkim		while (1) {
517238384Sjkim			temp = rk[ 5];
518238384Sjkim			rk[ 6] = rk[ 0] ^
519238384Sjkim				(Te4[(temp >>  8) & 0xff]      ) ^
520238384Sjkim				(Te4[(temp >> 16) & 0xff] <<  8) ^
521238384Sjkim				(Te4[(temp >> 24)       ] << 16) ^
522238384Sjkim				(Te4[(temp      ) & 0xff] << 24) ^
523238384Sjkim				rcon[i];
524238384Sjkim			rk[ 7] = rk[ 1] ^ rk[ 6];
525238384Sjkim			rk[ 8] = rk[ 2] ^ rk[ 7];
526238384Sjkim			rk[ 9] = rk[ 3] ^ rk[ 8];
527238384Sjkim			if (++i == 8) {
528238384Sjkim				return 0;
529238384Sjkim			}
530238384Sjkim			rk[10] = rk[ 4] ^ rk[ 9];
531238384Sjkim			rk[11] = rk[ 5] ^ rk[10];
532238384Sjkim			rk += 6;
533238384Sjkim		}
534238384Sjkim	}
535238384Sjkim	rk[6] = GETU32(userKey + 24);
536238384Sjkim	rk[7] = GETU32(userKey + 28);
537238384Sjkim	if (bits == 256) {
538238384Sjkim		while (1) {
539238384Sjkim			temp = rk[ 7];
540238384Sjkim			rk[ 8] = rk[ 0] ^
541238384Sjkim				(Te4[(temp >>  8) & 0xff]      ) ^
542238384Sjkim				(Te4[(temp >> 16) & 0xff] <<  8) ^
543238384Sjkim				(Te4[(temp >> 24)       ] << 16) ^
544238384Sjkim				(Te4[(temp      ) & 0xff] << 24) ^
545238384Sjkim				rcon[i];
546238384Sjkim			rk[ 9] = rk[ 1] ^ rk[ 8];
547238384Sjkim			rk[10] = rk[ 2] ^ rk[ 9];
548238384Sjkim			rk[11] = rk[ 3] ^ rk[10];
549238384Sjkim			if (++i == 7) {
550238384Sjkim				return 0;
551238384Sjkim			}
552238384Sjkim			temp = rk[11];
553238384Sjkim			rk[12] = rk[ 4] ^
554238384Sjkim				(Te4[(temp      ) & 0xff]      ) ^
555238384Sjkim				(Te4[(temp >>  8) & 0xff] <<  8) ^
556238384Sjkim				(Te4[(temp >> 16) & 0xff] << 16) ^
557238384Sjkim				(Te4[(temp >> 24)       ] << 24);
558238384Sjkim			rk[13] = rk[ 5] ^ rk[12];
559238384Sjkim			rk[14] = rk[ 6] ^ rk[13];
560238384Sjkim			rk[15] = rk[ 7] ^ rk[14];
561238384Sjkim
562238384Sjkim			rk += 8;
563238384Sjkim        	}
564238384Sjkim	}
565238384Sjkim	return 0;
566238384Sjkim}
567238384Sjkim
568238384Sjkim/**
569238384Sjkim * Expand the cipher key into the decryption key schedule.
570238384Sjkim */
571238384Sjkimint AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572238384Sjkim			 AES_KEY *key) {
573238384Sjkim
574238384Sjkim        u32 *rk;
575238384Sjkim	int i, j, status;
576238384Sjkim	u32 temp;
577238384Sjkim
578238384Sjkim	/* first, start with an encryption schedule */
579238384Sjkim	status = AES_set_encrypt_key(userKey, bits, key);
580238384Sjkim	if (status < 0)
581238384Sjkim		return status;
582238384Sjkim
583238384Sjkim	rk = key->rd_key;
584238384Sjkim
585238384Sjkim	/* invert the order of the round keys: */
586238384Sjkim	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
587238384Sjkim		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
588238384Sjkim		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589238384Sjkim		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590238384Sjkim		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591238384Sjkim	}
592238384Sjkim	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
593238384Sjkim	for (i = 1; i < (key->rounds); i++) {
594238384Sjkim		rk += 4;
595238384Sjkim#if 1
596238384Sjkim		for (j = 0; j < 4; j++) {
597238384Sjkim			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
598238384Sjkim
599238384Sjkim			tp1 = rk[j];
600238384Sjkim			m = tp1 & 0x80808080;
601238384Sjkim			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
602238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
603238384Sjkim			m = tp2 & 0x80808080;
604238384Sjkim			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
605238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
606238384Sjkim			m = tp4 & 0x80808080;
607238384Sjkim			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
608238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
609238384Sjkim			tp9 = tp8 ^ tp1;
610238384Sjkim			tpb = tp9 ^ tp2;
611238384Sjkim			tpd = tp9 ^ tp4;
612238384Sjkim			tpe = tp8 ^ tp4 ^ tp2;
613238384Sjkim#if defined(ROTATE)
614238384Sjkim			rk[j] = tpe ^ ROTATE(tpd,16) ^
615238384Sjkim				ROTATE(tp9,8) ^ ROTATE(tpb,24);
616238384Sjkim#else
617238384Sjkim			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
618238384Sjkim				(tp9 >> 24) ^ (tp9 << 8) ^
619238384Sjkim				(tpb >> 8) ^ (tpb << 24);
620238384Sjkim#endif
621238384Sjkim		}
622238384Sjkim#else
623238384Sjkim		rk[0] =
624238384Sjkim			Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
625238384Sjkim			Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
626238384Sjkim			Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
627238384Sjkim			Td3[Te2[(rk[0] >> 24)       ] & 0xff];
628238384Sjkim		rk[1] =
629238384Sjkim			Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
630238384Sjkim			Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
631238384Sjkim			Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
632238384Sjkim			Td3[Te2[(rk[1] >> 24)       ] & 0xff];
633238384Sjkim		rk[2] =
634238384Sjkim			Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
635238384Sjkim			Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
636238384Sjkim			Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
637238384Sjkim			Td3[Te2[(rk[2] >> 24)       ] & 0xff];
638238384Sjkim		rk[3] =
639238384Sjkim			Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
640238384Sjkim			Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
641238384Sjkim			Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
642238384Sjkim			Td3[Te2[(rk[3] >> 24)       ] & 0xff];
643238384Sjkim#endif
644238384Sjkim	}
645238384Sjkim	return 0;
646238384Sjkim}
647238384Sjkim
648238384Sjkim/*
649238384Sjkim * Encrypt a single block
650238384Sjkim * in and out can overlap
651238384Sjkim */
652238384Sjkimvoid AES_encrypt(const unsigned char *in, unsigned char *out,
653238384Sjkim		 const AES_KEY *key) {
654238384Sjkim
655238384Sjkim	const u32 *rk;
656238384Sjkim	u32 s0, s1, s2, s3, t[4];
657238384Sjkim	int r;
658238384Sjkim
659238384Sjkim	assert(in && out && key);
660238384Sjkim	rk = key->rd_key;
661238384Sjkim
662238384Sjkim	/*
663238384Sjkim	 * map byte array block to cipher state
664238384Sjkim	 * and add initial round key:
665238384Sjkim	 */
666238384Sjkim	s0 = GETU32(in     ) ^ rk[0];
667238384Sjkim	s1 = GETU32(in +  4) ^ rk[1];
668238384Sjkim	s2 = GETU32(in +  8) ^ rk[2];
669238384Sjkim	s3 = GETU32(in + 12) ^ rk[3];
670238384Sjkim
671238384Sjkim#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
672238384Sjkim	prefetch256(Te4);
673238384Sjkim
674238384Sjkim	t[0] =	Te4[(s0      ) & 0xff]       ^
675238384Sjkim		Te4[(s1 >>  8) & 0xff] <<  8 ^
676238384Sjkim		Te4[(s2 >> 16) & 0xff] << 16 ^
677238384Sjkim		Te4[(s3 >> 24)       ] << 24;
678238384Sjkim	t[1] =	Te4[(s1      ) & 0xff]       ^
679238384Sjkim		Te4[(s2 >>  8) & 0xff] <<  8 ^
680238384Sjkim		Te4[(s3 >> 16) & 0xff] << 16 ^
681238384Sjkim		Te4[(s0 >> 24)       ] << 24;
682238384Sjkim	t[2] =	Te4[(s2      ) & 0xff]       ^
683238384Sjkim		Te4[(s3 >>  8) & 0xff] <<  8 ^
684238384Sjkim		Te4[(s0 >> 16) & 0xff] << 16 ^
685238384Sjkim		Te4[(s1 >> 24)       ] << 24;
686238384Sjkim	t[3] =	Te4[(s3      ) & 0xff]       ^
687238384Sjkim		Te4[(s0 >>  8) & 0xff] <<  8 ^
688238384Sjkim		Te4[(s1 >> 16) & 0xff] << 16 ^
689238384Sjkim		Te4[(s2 >> 24)       ] << 24;
690238384Sjkim
691238384Sjkim	/* now do the linear transform using words */
692238384Sjkim	{	int i;
693238384Sjkim		u32 r0, r1, r2;
694238384Sjkim
695238384Sjkim		for (i = 0; i < 4; i++) {
696238384Sjkim			r0 = t[i];
697238384Sjkim			r1 = r0 & 0x80808080;
698238384Sjkim			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
699238384Sjkim				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
700238384Sjkim#if defined(ROTATE)
701238384Sjkim			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
702238384Sjkim				ROTATE(r0,16) ^ ROTATE(r0,8);
703238384Sjkim#else
704238384Sjkim			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
705238384Sjkim				(r0 << 16) ^ (r0 >> 16) ^
706238384Sjkim				(r0 << 8) ^ (r0 >> 24);
707238384Sjkim#endif
708238384Sjkim			t[i] ^= rk[4+i];
709238384Sjkim		}
710238384Sjkim	}
711238384Sjkim#else
712238384Sjkim	t[0] =	Te0[(s0      ) & 0xff] ^
713238384Sjkim		Te1[(s1 >>  8) & 0xff] ^
714238384Sjkim		Te2[(s2 >> 16) & 0xff] ^
715238384Sjkim		Te3[(s3 >> 24)       ] ^
716238384Sjkim		rk[4];
717238384Sjkim	t[1] =	Te0[(s1      ) & 0xff] ^
718238384Sjkim		Te1[(s2 >>  8) & 0xff] ^
719238384Sjkim		Te2[(s3 >> 16) & 0xff] ^
720238384Sjkim		Te3[(s0 >> 24)       ] ^
721238384Sjkim		rk[5];
722238384Sjkim	t[2] =	Te0[(s2      ) & 0xff] ^
723238384Sjkim		Te1[(s3 >>  8) & 0xff] ^
724238384Sjkim		Te2[(s0 >> 16) & 0xff] ^
725238384Sjkim		Te3[(s1 >> 24)       ] ^
726238384Sjkim		rk[6];
727238384Sjkim	t[3] =	Te0[(s3      ) & 0xff] ^
728238384Sjkim		Te1[(s0 >>  8) & 0xff] ^
729238384Sjkim		Te2[(s1 >> 16) & 0xff] ^
730238384Sjkim		Te3[(s2 >> 24)       ] ^
731238384Sjkim		rk[7];
732238384Sjkim#endif
733238384Sjkim	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
734238384Sjkim
735238384Sjkim    /*
736238384Sjkim     * Nr - 2 full rounds:
737238384Sjkim     */
738238384Sjkim    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
739238384Sjkim#if defined(AES_COMPACT_IN_INNER_ROUNDS)
740238384Sjkim	t[0] =	Te4[(s0      ) & 0xff]       ^
741238384Sjkim		Te4[(s1 >>  8) & 0xff] <<  8 ^
742238384Sjkim		Te4[(s2 >> 16) & 0xff] << 16 ^
743238384Sjkim		Te4[(s3 >> 24)       ] << 24;
744238384Sjkim	t[1] =	Te4[(s1      ) & 0xff]       ^
745238384Sjkim		Te4[(s2 >>  8) & 0xff] <<  8 ^
746238384Sjkim		Te4[(s3 >> 16) & 0xff] << 16 ^
747238384Sjkim		Te4[(s0 >> 24)       ] << 24;
748238384Sjkim	t[2] =	Te4[(s2      ) & 0xff]       ^
749238384Sjkim		Te4[(s3 >>  8) & 0xff] <<  8 ^
750238384Sjkim		Te4[(s0 >> 16) & 0xff] << 16 ^
751238384Sjkim		Te4[(s1 >> 24)       ] << 24;
752238384Sjkim	t[3] =	Te4[(s3      ) & 0xff]       ^
753238384Sjkim		Te4[(s0 >>  8) & 0xff] <<  8 ^
754238384Sjkim		Te4[(s1 >> 16) & 0xff] << 16 ^
755238384Sjkim		Te4[(s2 >> 24)       ] << 24;
756238384Sjkim
757238384Sjkim	/* now do the linear transform using words */
758238384Sjkim	{	int i;
759238384Sjkim		u32 r0, r1, r2;
760238384Sjkim
761238384Sjkim		for (i = 0; i < 4; i++) {
762238384Sjkim			r0 = t[i];
763238384Sjkim			r1 = r0 & 0x80808080;
764238384Sjkim			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
765238384Sjkim				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
766238384Sjkim#if defined(ROTATE)
767238384Sjkim			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
768238384Sjkim				ROTATE(r0,16) ^ ROTATE(r0,8);
769238384Sjkim#else
770238384Sjkim			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
771238384Sjkim				(r0 << 16) ^ (r0 >> 16) ^
772238384Sjkim				(r0 << 8) ^ (r0 >> 24);
773238384Sjkim#endif
774238384Sjkim			t[i] ^= rk[i];
775238384Sjkim		}
776238384Sjkim	}
777238384Sjkim#else
778238384Sjkim	t[0] =	Te0[(s0      ) & 0xff] ^
779238384Sjkim		Te1[(s1 >>  8) & 0xff] ^
780238384Sjkim		Te2[(s2 >> 16) & 0xff] ^
781238384Sjkim		Te3[(s3 >> 24)       ] ^
782238384Sjkim		rk[0];
783238384Sjkim	t[1] =	Te0[(s1      ) & 0xff] ^
784238384Sjkim		Te1[(s2 >>  8) & 0xff] ^
785238384Sjkim		Te2[(s3 >> 16) & 0xff] ^
786238384Sjkim		Te3[(s0 >> 24)       ] ^
787238384Sjkim		rk[1];
788238384Sjkim	t[2] =	Te0[(s2      ) & 0xff] ^
789238384Sjkim		Te1[(s3 >>  8) & 0xff] ^
790238384Sjkim		Te2[(s0 >> 16) & 0xff] ^
791238384Sjkim		Te3[(s1 >> 24)       ] ^
792238384Sjkim		rk[2];
793238384Sjkim	t[3] =	Te0[(s3      ) & 0xff] ^
794238384Sjkim		Te1[(s0 >>  8) & 0xff] ^
795238384Sjkim		Te2[(s1 >> 16) & 0xff] ^
796238384Sjkim		Te3[(s2 >> 24)       ] ^
797238384Sjkim		rk[3];
798238384Sjkim#endif
799238384Sjkim	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
800238384Sjkim    }
801238384Sjkim    /*
802238384Sjkim	 * apply last round and
803238384Sjkim	 * map cipher state to byte array block:
804238384Sjkim	 */
805238384Sjkim#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
806238384Sjkim	prefetch256(Te4);
807238384Sjkim
808238384Sjkim	*(u32*)(out+0) =
809238384Sjkim		Te4[(s0      ) & 0xff]       ^
810238384Sjkim		Te4[(s1 >>  8) & 0xff] <<  8 ^
811238384Sjkim		Te4[(s2 >> 16) & 0xff] << 16 ^
812238384Sjkim		Te4[(s3 >> 24)       ] << 24 ^
813238384Sjkim		rk[0];
814238384Sjkim	*(u32*)(out+4) =
815238384Sjkim		Te4[(s1      ) & 0xff]       ^
816238384Sjkim		Te4[(s2 >>  8) & 0xff] <<  8 ^
817238384Sjkim		Te4[(s3 >> 16) & 0xff] << 16 ^
818238384Sjkim		Te4[(s0 >> 24)       ] << 24 ^
819238384Sjkim		rk[1];
820238384Sjkim	*(u32*)(out+8) =
821238384Sjkim		Te4[(s2      ) & 0xff]       ^
822238384Sjkim		Te4[(s3 >>  8) & 0xff] <<  8 ^
823238384Sjkim		Te4[(s0 >> 16) & 0xff] << 16 ^
824238384Sjkim		Te4[(s1 >> 24)       ] << 24 ^
825238384Sjkim		rk[2];
826238384Sjkim	*(u32*)(out+12) =
827238384Sjkim		Te4[(s3      ) & 0xff]       ^
828238384Sjkim		Te4[(s0 >>  8) & 0xff] <<  8 ^
829238384Sjkim		Te4[(s1 >> 16) & 0xff] << 16 ^
830238384Sjkim		Te4[(s2 >> 24)       ] << 24 ^
831238384Sjkim		rk[3];
832238384Sjkim#else
833238384Sjkim	*(u32*)(out+0) =
834238384Sjkim		(Te2[(s0      ) & 0xff] & 0x000000ffU) ^
835238384Sjkim		(Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
836238384Sjkim		(Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
837238384Sjkim		(Te1[(s3 >> 24)       ] & 0xff000000U) ^
838238384Sjkim		rk[0];
839238384Sjkim	*(u32*)(out+4) =
840238384Sjkim		(Te2[(s1      ) & 0xff] & 0x000000ffU) ^
841238384Sjkim		(Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
842238384Sjkim		(Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
843238384Sjkim		(Te1[(s0 >> 24)       ] & 0xff000000U) ^
844238384Sjkim		rk[1];
845238384Sjkim	*(u32*)(out+8) =
846238384Sjkim		(Te2[(s2      ) & 0xff] & 0x000000ffU) ^
847238384Sjkim		(Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
848238384Sjkim		(Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
849238384Sjkim		(Te1[(s1 >> 24)       ] & 0xff000000U) ^
850238384Sjkim		rk[2];
851238384Sjkim	*(u32*)(out+12) =
852238384Sjkim		(Te2[(s3      ) & 0xff] & 0x000000ffU) ^
853238384Sjkim		(Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
854238384Sjkim		(Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
855238384Sjkim		(Te1[(s2 >> 24)       ] & 0xff000000U) ^
856238384Sjkim		rk[3];
857238384Sjkim#endif
858238384Sjkim}
859238384Sjkim
860238384Sjkim/*
861238384Sjkim * Decrypt a single block
862238384Sjkim * in and out can overlap
863238384Sjkim */
864238384Sjkimvoid AES_decrypt(const unsigned char *in, unsigned char *out,
865238384Sjkim		 const AES_KEY *key) {
866238384Sjkim
867238384Sjkim	const u32 *rk;
868238384Sjkim	u32 s0, s1, s2, s3, t[4];
869238384Sjkim	int r;
870238384Sjkim
871238384Sjkim	assert(in && out && key);
872238384Sjkim	rk = key->rd_key;
873238384Sjkim
874238384Sjkim	/*
875238384Sjkim	 * map byte array block to cipher state
876238384Sjkim	 * and add initial round key:
877238384Sjkim	 */
878238384Sjkim	s0 = GETU32(in     ) ^ rk[0];
879238384Sjkim	s1 = GETU32(in +  4) ^ rk[1];
880238384Sjkim	s2 = GETU32(in +  8) ^ rk[2];
881238384Sjkim	s3 = GETU32(in + 12) ^ rk[3];
882238384Sjkim
883238384Sjkim#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
884238384Sjkim	prefetch256(Td4);
885238384Sjkim
886238384Sjkim        t[0] =	Td4[(s0      ) & 0xff]       ^
887238384Sjkim		Td4[(s3 >>  8) & 0xff] <<  8 ^
888238384Sjkim		Td4[(s2 >> 16) & 0xff] << 16 ^
889238384Sjkim		Td4[(s1 >> 24)       ] << 24;
890238384Sjkim        t[1] =	Td4[(s1      ) & 0xff]       ^
891238384Sjkim		Td4[(s0 >>  8) & 0xff] <<  8 ^
892238384Sjkim		Td4[(s3 >> 16) & 0xff] << 16 ^
893238384Sjkim		Td4[(s2 >> 24)       ] << 24;
894238384Sjkim        t[2] =	Td4[(s2      ) & 0xff]       ^
895238384Sjkim		Td4[(s1 >>  8) & 0xff] <<  8 ^
896238384Sjkim		Td4[(s0 >> 16) & 0xff] << 16 ^
897238384Sjkim		Td4[(s3 >> 24)       ] << 24;
898238384Sjkim        t[3] =	Td4[(s3      ) & 0xff]       ^
899238384Sjkim		Td4[(s2 >>  8) & 0xff] <<  8 ^
900238384Sjkim		Td4[(s1 >> 16) & 0xff] << 16 ^
901238384Sjkim		Td4[(s0 >> 24)       ] << 24;
902238384Sjkim
903238384Sjkim	/* now do the linear transform using words */
904238384Sjkim	{	int i;
905238384Sjkim		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
906238384Sjkim
907238384Sjkim		for (i = 0; i < 4; i++) {
908238384Sjkim			tp1 = t[i];
909238384Sjkim			m = tp1 & 0x80808080;
910238384Sjkim			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
911238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
912238384Sjkim			m = tp2 & 0x80808080;
913238384Sjkim			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
914238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
915238384Sjkim			m = tp4 & 0x80808080;
916238384Sjkim			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
917238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
918238384Sjkim			tp9 = tp8 ^ tp1;
919238384Sjkim			tpb = tp9 ^ tp2;
920238384Sjkim			tpd = tp9 ^ tp4;
921238384Sjkim			tpe = tp8 ^ tp4 ^ tp2;
922238384Sjkim#if defined(ROTATE)
923238384Sjkim			t[i] = tpe ^ ROTATE(tpd,16) ^
924238384Sjkim				ROTATE(tp9,8) ^ ROTATE(tpb,24);
925238384Sjkim#else
926238384Sjkim			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
927238384Sjkim				(tp9 >> 24) ^ (tp9 << 8) ^
928238384Sjkim				(tpb >> 8) ^ (tpb << 24);
929238384Sjkim#endif
930238384Sjkim			t[i] ^= rk[4+i];
931238384Sjkim		}
932238384Sjkim	}
933238384Sjkim#else
934238384Sjkim	t[0] =	Td0[(s0      ) & 0xff] ^
935238384Sjkim		Td1[(s3 >>  8) & 0xff] ^
936238384Sjkim		Td2[(s2 >> 16) & 0xff] ^
937238384Sjkim		Td3[(s1 >> 24)       ] ^
938238384Sjkim		rk[4];
939238384Sjkim	t[1] =	Td0[(s1      ) & 0xff] ^
940238384Sjkim		Td1[(s0 >>  8) & 0xff] ^
941238384Sjkim		Td2[(s3 >> 16) & 0xff] ^
942238384Sjkim		Td3[(s2 >> 24)       ] ^
943238384Sjkim		rk[5];
944238384Sjkim	t[2] =	Td0[(s2      ) & 0xff] ^
945238384Sjkim		Td1[(s1 >>  8) & 0xff] ^
946238384Sjkim		Td2[(s0 >> 16) & 0xff] ^
947238384Sjkim		Td3[(s3 >> 24)       ] ^
948238384Sjkim		rk[6];
949238384Sjkim	t[3] =	Td0[(s3      ) & 0xff] ^
950238384Sjkim		Td1[(s2 >>  8) & 0xff] ^
951238384Sjkim		Td2[(s1 >> 16) & 0xff] ^
952238384Sjkim		Td3[(s0 >> 24)       ] ^
953238384Sjkim		rk[7];
954238384Sjkim#endif
955238384Sjkim	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
956238384Sjkim
957238384Sjkim    /*
958238384Sjkim     * Nr - 2 full rounds:
959238384Sjkim     */
960238384Sjkim    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
961238384Sjkim#if defined(AES_COMPACT_IN_INNER_ROUNDS)
962238384Sjkim        t[0] =	Td4[(s0      ) & 0xff]       ^
963238384Sjkim		Td4[(s3 >>  8) & 0xff] <<  8 ^
964238384Sjkim		Td4[(s2 >> 16) & 0xff] << 16 ^
965238384Sjkim		Td4[(s1 >> 24)       ] << 24;
966238384Sjkim        t[1] =	Td4[(s1      ) & 0xff]       ^
967238384Sjkim		Td4[(s0 >>  8) & 0xff] <<  8 ^
968238384Sjkim		Td4[(s3 >> 16) & 0xff] << 16 ^
969238384Sjkim		Td4[(s2 >> 24)       ] << 24;
970238384Sjkim        t[2] =	Td4[(s2      ) & 0xff]       ^
971238384Sjkim		Td4[(s1 >>  8) & 0xff] <<  8 ^
972238384Sjkim		Td4[(s0 >> 16) & 0xff] << 16 ^
973238384Sjkim		Td4[(s3 >> 24)       ] << 24;
974238384Sjkim        t[3] =	Td4[(s3      ) & 0xff]       ^
975238384Sjkim		Td4[(s2 >>  8) & 0xff] <<  8 ^
976238384Sjkim		Td4[(s1 >> 16) & 0xff] << 16 ^
977238384Sjkim		Td4[(s0 >> 24)       ] << 24;
978238384Sjkim
979238384Sjkim	/* now do the linear transform using words */
980238384Sjkim	{	int i;
981238384Sjkim		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
982238384Sjkim
983238384Sjkim		for (i = 0; i < 4; i++) {
984238384Sjkim			tp1 = t[i];
985238384Sjkim			m = tp1 & 0x80808080;
986238384Sjkim			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
987238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
988238384Sjkim			m = tp2 & 0x80808080;
989238384Sjkim			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
990238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
991238384Sjkim			m = tp4 & 0x80808080;
992238384Sjkim			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
993238384Sjkim				((m - (m >> 7)) & 0x1b1b1b1b);
994238384Sjkim			tp9 = tp8 ^ tp1;
995238384Sjkim			tpb = tp9 ^ tp2;
996238384Sjkim			tpd = tp9 ^ tp4;
997238384Sjkim			tpe = tp8 ^ tp4 ^ tp2;
998238384Sjkim#if defined(ROTATE)
999238384Sjkim			t[i] = tpe ^ ROTATE(tpd,16) ^
1000238384Sjkim				ROTATE(tp9,8) ^ ROTATE(tpb,24);
1001238384Sjkim#else
1002238384Sjkim			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1003238384Sjkim				(tp9 >> 24) ^ (tp9 << 8) ^
1004238384Sjkim				(tpb >> 8) ^ (tpb << 24);
1005238384Sjkim#endif
1006238384Sjkim			t[i] ^= rk[i];
1007238384Sjkim		}
1008238384Sjkim	}
1009238384Sjkim#else
1010238384Sjkim	t[0] =	Td0[(s0      ) & 0xff] ^
1011238384Sjkim		Td1[(s3 >>  8) & 0xff] ^
1012238384Sjkim		Td2[(s2 >> 16) & 0xff] ^
1013238384Sjkim		Td3[(s1 >> 24)       ] ^
1014238384Sjkim		rk[0];
1015238384Sjkim	t[1] =	Td0[(s1      ) & 0xff] ^
1016238384Sjkim		Td1[(s0 >>  8) & 0xff] ^
1017238384Sjkim		Td2[(s3 >> 16) & 0xff] ^
1018238384Sjkim		Td3[(s2 >> 24)       ] ^
1019238384Sjkim		rk[1];
1020238384Sjkim	t[2] =	Td0[(s2      ) & 0xff] ^
1021238384Sjkim		Td1[(s1 >>  8) & 0xff] ^
1022238384Sjkim		Td2[(s0 >> 16) & 0xff] ^
1023238384Sjkim		Td3[(s3 >> 24)       ] ^
1024238384Sjkim		rk[2];
1025238384Sjkim	t[3] =	Td0[(s3      ) & 0xff] ^
1026238384Sjkim		Td1[(s2 >>  8) & 0xff] ^
1027238384Sjkim		Td2[(s1 >> 16) & 0xff] ^
1028238384Sjkim		Td3[(s0 >> 24)       ] ^
1029238384Sjkim		rk[3];
1030238384Sjkim#endif
1031238384Sjkim	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1032238384Sjkim    }
1033238384Sjkim    /*
1034238384Sjkim	 * apply last round and
1035238384Sjkim	 * map cipher state to byte array block:
1036238384Sjkim	 */
1037238384Sjkim	prefetch256(Td4);
1038238384Sjkim
1039238384Sjkim	*(u32*)(out+0) =
1040238384Sjkim		(Td4[(s0      ) & 0xff])	^
1041238384Sjkim		(Td4[(s3 >>  8) & 0xff] <<  8) ^
1042238384Sjkim		(Td4[(s2 >> 16) & 0xff] << 16) ^
1043238384Sjkim		(Td4[(s1 >> 24)       ] << 24) ^
1044238384Sjkim		rk[0];
1045238384Sjkim	*(u32*)(out+4) =
1046238384Sjkim		(Td4[(s1      ) & 0xff])	 ^
1047238384Sjkim		(Td4[(s0 >>  8) & 0xff] <<  8) ^
1048238384Sjkim		(Td4[(s3 >> 16) & 0xff] << 16) ^
1049238384Sjkim		(Td4[(s2 >> 24)       ] << 24) ^
1050238384Sjkim		rk[1];
1051238384Sjkim	*(u32*)(out+8) =
1052238384Sjkim		(Td4[(s2      ) & 0xff])	 ^
1053238384Sjkim		(Td4[(s1 >>  8) & 0xff] <<  8) ^
1054238384Sjkim		(Td4[(s0 >> 16) & 0xff] << 16) ^
1055238384Sjkim		(Td4[(s3 >> 24)       ] << 24) ^
1056238384Sjkim		rk[2];
1057238384Sjkim	*(u32*)(out+12) =
1058238384Sjkim		(Td4[(s3      ) & 0xff])	 ^
1059238384Sjkim		(Td4[(s2 >>  8) & 0xff] <<  8) ^
1060238384Sjkim		(Td4[(s1 >> 16) & 0xff] << 16) ^
1061238384Sjkim		(Td4[(s0 >> 24)       ] << 24) ^
1062238384Sjkim		rk[3];
1063238384Sjkim}
1064