1/* crypto/aes/aes_core.c */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * This is experimental x86[_64] derivative. It assumes little-endian
30 * byte order and expects CPU to sustain unaligned memory references.
31 * It is used as playground for cache-time attack mitigations and
32 * serves as reference C implementation for x86[_64] assembler.
33 *
34 *                  <appro@fy.chalmers.se>
35 */
36
37
38#ifndef AES_DEBUG
39# ifndef NDEBUG
40#  define NDEBUG
41# endif
42#endif
43#include <assert.h>
44
45#include <stdlib.h>
46#include <openssl/aes.h>
47#include "aes_locl.h"
48
49/*
50 * These two parameters control which table, 256-byte or 2KB, is
51 * referenced in outer and respectively inner rounds.
52 */
53#define AES_COMPACT_IN_OUTER_ROUNDS
54#ifdef  AES_COMPACT_IN_OUTER_ROUNDS
55/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
56 * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
57 * by factor of ~2. */
58# undef  AES_COMPACT_IN_INNER_ROUNDS
59#endif
60
61#if 1
/*
 * Touch every cache line of a 256-byte table so that subsequent
 * data-dependent lookups hit the cache uniformly (cache-timing attack
 * mitigation).  The XOR accumulator plus the volatile store of 'ret'
 * keep the compiler from discarding the otherwise dead loads.
 */
static void prefetch256(const void *table)
{
    volatile unsigned long *t = (void *)table, ret;
    unsigned long sum;
    size_t i;                   /* size_t: matches sizeof-based bounds */

    /* 32 is common least cache-line size */
    for (sum = 0, i = 0; i < 256 / sizeof(t[0]); i += 32 / sizeof(t[0]))
        sum ^= t[i];

    ret = sum;                  /* volatile store defeats dead-code elimination */
    (void)ret;
}
73#else
74# define prefetch256(t)
75#endif
76
77#undef GETU32
78#define GETU32(p) (*((u32*)(p)))
79
80#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
81typedef unsigned __int64 u64;
82#define U64(C)  C##UI64
83#elif defined(__arch64__)
84typedef unsigned long u64;
85#define U64(C)  C##UL
86#else
87typedef unsigned long long u64;
88#define U64(C)  C##ULL
89#endif
90
91#undef ROTATE
92#if defined(_MSC_VER)
93# define ROTATE(a,n)    _lrotl(a,n)
94#elif defined(__ICC)
95# define ROTATE(a,n)    _rotl(a,n)
96#elif defined(__GNUC__) && __GNUC__>=2
97# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
98#   define ROTATE(a,n)  ({ register unsigned int ret;   \
99                asm (           \
100                "roll %1,%0"        \
101                : "=r"(ret)     \
102                : "I"(n), "0"(a)    \
103                : "cc");        \
104               ret;             \
105            })
106# endif
107#endif
108/*-
109Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
110Te0[x] = S [x].[02, 01, 01, 03];
111Te1[x] = S [x].[03, 02, 01, 01];
112Te2[x] = S [x].[01, 03, 02, 01];
113Te3[x] = S [x].[01, 01, 03, 02];
114*/
115#define Te0 (u32)((u64*)((u8*)Te+0))
116#define Te1 (u32)((u64*)((u8*)Te+3))
117#define Te2 (u32)((u64*)((u8*)Te+2))
118#define Te3 (u32)((u64*)((u8*)Te+1))
119/*-
120Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
121Td0[x] = Si[x].[0e, 09, 0d, 0b];
122Td1[x] = Si[x].[0b, 0e, 09, 0d];
123Td2[x] = Si[x].[0d, 0b, 0e, 09];
124Td3[x] = Si[x].[09, 0d, 0b, 0e];
125Td4[x] = Si[x].[01];
126*/
127#define Td0 (u32)((u64*)((u8*)Td+0))
128#define Td1 (u32)((u64*)((u8*)Td+3))
129#define Td2 (u32)((u64*)((u8*)Td+2))
130#define Td3 (u32)((u64*)((u8*)Td+1))
131
/*
 * Te: forward round table.  Each 64-bit entry packs two copies of the
 * 32-bit word S[x].[02,01,01,03] (see comment above), so the Te0..Te3
 * byte-offset views read four rotated variants of the same word from
 * one 2KB table (relies on little-endian, unaligned-tolerant loads).
 */
static const u64 Te[256] = {
    U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
    U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
    U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
    U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
    U64(0x5030306050303060), U64(0x0301010203010102),
    U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
    U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
    U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
    U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
    U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
    U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
    U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
    U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
    U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
    U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
    U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
    U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
    U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
    U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
    U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
    U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
    U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
    U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
    U64(0x5331316253313162), U64(0x3f15152a3f15152a),
    U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
    U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
    U64(0x2818183028181830), U64(0xa1969637a1969637),
    U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
    U64(0x0907070e0907070e), U64(0x3612122436121224),
    U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
    U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
    U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
    U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
    U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
    U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
    U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
    U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
    U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
    U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
    U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
    U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
    U64(0x0000000000000000), U64(0x2cededc12cededc1),
    U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
    U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
    U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
    U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
    U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
    U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
    U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
    U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
    U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
    U64(0x5533336655333366), U64(0x9485851194858511),
    U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
    U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
    U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
    U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
    U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
    U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
    U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
    U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
    U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
    U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
    U64(0x3010102030101020), U64(0x1affffe51affffe5),
    U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
    U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
    U64(0x3513132635131326), U64(0x2fececc32fececc3),
    U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
    U64(0xcc444488cc444488), U64(0x3917172e3917172e),
    U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
    U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
    U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
    U64(0x2b1919322b191932), U64(0x957373e6957373e6),
    U64(0xa06060c0a06060c0), U64(0x9881811998818119),
    U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
    U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
    U64(0xab90903bab90903b), U64(0x8388880b8388880b),
    U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
    U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
    U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
    U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
    U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
    U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
    U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
    U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
    U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
    U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
    U64(0xa8919139a8919139), U64(0xa4959531a4959531),
    U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
    U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
    U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
    U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
    U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
    U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
    U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
    U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
    U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
    U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
    U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
    U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
    U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
    U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
    U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
    U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
    U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
    U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
    U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
    U64(0xd8484890d8484890), U64(0x0503030605030306),
    U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
    U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
    U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
    U64(0x9186861791868617), U64(0x58c1c19958c1c199),
    U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
    U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
    U64(0xb398982bb398982b), U64(0x3311112233111122),
    U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
    U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
    U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
    U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
    U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
    U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
    U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
    U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
    U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
    U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
    U64(0xc3414182c3414182), U64(0xb0999929b0999929),
    U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
    U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
    U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
};
262
/*
 * Te4: the plain AES S-box (SubBytes) as single bytes.  Used by the
 * key schedule below and by the 256-byte "compact" rounds, which trade
 * speed for a smaller cache footprint (see prefetch256).
 */
static const u8 Te4[256] = {
    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
};
297
/*
 * Td: inverse round table.  Each 64-bit entry packs two copies of the
 * 32-bit word Si[x].[0e,09,0d,0b] (see comment above), so the Td0..Td3
 * byte-offset views read rotated variants of the same word.
 */
static const u64 Td[256] = {
    U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
    U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
    U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
    U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
    U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
    U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
    U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
    U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
    U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
    U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
    U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
    U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
    U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
    U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
    U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
    U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
    U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
    U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
    U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
    U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
    U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
    U64(0x6033519760335197), U64(0x457f5362457f5362),
    U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
    U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
    U64(0x5868487058684870), U64(0x19fd458f19fd458f),
    U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
    U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
    U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
    U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
    U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
    U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
    U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
    U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
    U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
    U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
    U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
    U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
    U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
    U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
    U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
    U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
    U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
    U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
    U64(0x6fd406046fd40604), U64(0xff155060ff155060),
    U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
    U64(0xcc434089cc434089), U64(0x779ed967779ed967),
    U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
    U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
    U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
    U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
    U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
    U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
    U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
    U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
    U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
    U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
    U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
    U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
    U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
    U64(0x694b775a694b775a), U64(0x161a121c161a121c),
    U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
    U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
    U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
    U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
    U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
    U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
    U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
    U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
    U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
    U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
    U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
    U64(0x4022971340229713), U64(0x2011c6842011c684),
    U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
    U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
    U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
    U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
    U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
    U64(0xfa489411fa489411), U64(0x2264e9472264e947),
    U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
    U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
    U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
    U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
    U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
    U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
    U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
    U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
    U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
    U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
    U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
    U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
    U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
    U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
    U64(0x097826cd097826cd), U64(0xf418596ef418596e),
    U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
    U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
    U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
    U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
    U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
    U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
    U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
    U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
    U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
    U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
    U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
    U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
    U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
    U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
    U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
    U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
    U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
    U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
    U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
    U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
    U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
    U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
    U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
    U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
    U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
    U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
    U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
    U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
    U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
    U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
    U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
    U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
    U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
    U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
    U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
};
/*
 * Td4: the inverse AES S-box (InvSubBytes, Si[x]) as single bytes —
 * the decryption-side counterpart of Te4 (cf. "Td4[x] = Si[x].[01]"
 * in the comment block above).
 */
static const u8 Td4[256] = {
    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
};
462
/*
 * Key-schedule round constants.  FIPS-197 writes these as
 * 0x01000000, 0x02000000, ... ; here the constant sits in the low byte
 * because round-key words are kept in little-endian byte order
 * (cf. the raw GETU32 load and the little-endian note at the top).
 */
static const u32 rcon[] = {
    0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
    0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
    0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
468
469/**
470 * Expand the cipher key into the encryption key schedule.
471 */
472int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
473                        AES_KEY *key)
474{
475
476    u32 *rk;
477    int i = 0;
478    u32 temp;
479
480    if (!userKey || !key)
481        return -1;
482    if (bits != 128 && bits != 192 && bits != 256)
483        return -2;
484
485    rk = key->rd_key;
486
487    if (bits==128)
488        key->rounds = 10;
489    else if (bits==192)
490        key->rounds = 12;
491    else
492        key->rounds = 14;
493
494    rk[0] = GETU32(userKey     );
495    rk[1] = GETU32(userKey +  4);
496    rk[2] = GETU32(userKey +  8);
497    rk[3] = GETU32(userKey + 12);
498    if (bits == 128) {
499        while (1) {
500            temp  = rk[3];
501            rk[4] = rk[0] ^
502                ((u32)Te4[(temp >>  8) & 0xff]      ) ^
503                ((u32)Te4[(temp >> 16) & 0xff] <<  8) ^
504                ((u32)Te4[(temp >> 24)       ] << 16) ^
505                ((u32)Te4[(temp      ) & 0xff] << 24) ^
506                rcon[i];
507            rk[5] = rk[1] ^ rk[4];
508            rk[6] = rk[2] ^ rk[5];
509            rk[7] = rk[3] ^ rk[6];
510            if (++i == 10) {
511                return 0;
512            }
513            rk += 4;
514        }
515    }
516    rk[4] = GETU32(userKey + 16);
517    rk[5] = GETU32(userKey + 20);
518    if (bits == 192) {
519        while (1) {
520            temp = rk[ 5];
521            rk[ 6] = rk[ 0] ^
522                ((u32)Te4[(temp >>  8) & 0xff]      ) ^
523                ((u32)Te4[(temp >> 16) & 0xff] <<  8) ^
524                ((u32)Te4[(temp >> 24)       ] << 16) ^
525                ((u32)Te4[(temp      ) & 0xff] << 24) ^
526                rcon[i];
527            rk[ 7] = rk[ 1] ^ rk[ 6];
528            rk[ 8] = rk[ 2] ^ rk[ 7];
529            rk[ 9] = rk[ 3] ^ rk[ 8];
530            if (++i == 8) {
531                return 0;
532            }
533            rk[10] = rk[ 4] ^ rk[ 9];
534            rk[11] = rk[ 5] ^ rk[10];
535            rk += 6;
536        }
537    }
538    rk[6] = GETU32(userKey + 24);
539    rk[7] = GETU32(userKey + 28);
540    if (bits == 256) {
541        while (1) {
542            temp = rk[ 7];
543            rk[ 8] = rk[ 0] ^
544                ((u32)Te4[(temp >>  8) & 0xff]      ) ^
545                ((u32)Te4[(temp >> 16) & 0xff] <<  8) ^
546                ((u32)Te4[(temp >> 24)       ] << 16) ^
547                ((u32)Te4[(temp      ) & 0xff] << 24) ^
548                rcon[i];
549            rk[ 9] = rk[ 1] ^ rk[ 8];
550            rk[10] = rk[ 2] ^ rk[ 9];
551            rk[11] = rk[ 3] ^ rk[10];
552            if (++i == 7) {
553                return 0;
554            }
555            temp = rk[11];
556            rk[12] = rk[ 4] ^
557                ((u32)Te4[(temp      ) & 0xff]      ) ^
558                ((u32)Te4[(temp >>  8) & 0xff] <<  8) ^
559                ((u32)Te4[(temp >> 16) & 0xff] << 16) ^
560                ((u32)Te4[(temp >> 24)       ] << 24);
561            rk[13] = rk[ 5] ^ rk[12];
562            rk[14] = rk[ 6] ^ rk[13];
563            rk[15] = rk[ 7] ^ rk[14];
564
565            rk += 8;
566            }
567    }
568    return 0;
569}
570
571/**
572 * Expand the cipher key into the decryption key schedule.
573 */
574int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
575                        AES_KEY *key)
576{
577
578    u32 *rk;
579    int i, j, status;
580    u32 temp;
581
582    /* first, start with an encryption schedule */
583    status = AES_set_encrypt_key(userKey, bits, key);
584    if (status < 0)
585        return status;
586
587    rk = key->rd_key;
588
589    /* invert the order of the round keys: */
590    for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
591        temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
592        temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
593        temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
594        temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
595    }
596    /* apply the inverse MixColumn transform to all round keys but the first and the last: */
597    for (i = 1; i < (key->rounds); i++) {
598        rk += 4;
599#if 1
600        for (j = 0; j < 4; j++) {
601            u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
602
603            tp1 = rk[j];
604            m = tp1 & 0x80808080;
605            tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
606                ((m - (m >> 7)) & 0x1b1b1b1b);
607            m = tp2 & 0x80808080;
608            tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
609                ((m - (m >> 7)) & 0x1b1b1b1b);
610            m = tp4 & 0x80808080;
611            tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
612                ((m - (m >> 7)) & 0x1b1b1b1b);
613            tp9 = tp8 ^ tp1;
614            tpb = tp9 ^ tp2;
615            tpd = tp9 ^ tp4;
616            tpe = tp8 ^ tp4 ^ tp2;
617#if defined(ROTATE)
618            rk[j] = tpe ^ ROTATE(tpd,16) ^
619                ROTATE(tp9,8) ^ ROTATE(tpb,24);
620#else
621            rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
622                (tp9 >> 24) ^ (tp9 << 8) ^
623                (tpb >> 8) ^ (tpb << 24);
624#endif
625        }
626#else
627        rk[0] =
628            Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
629            Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
630            Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
631            Td3[Te2[(rk[0] >> 24)       ] & 0xff];
632        rk[1] =
633            Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
634            Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
635            Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
636            Td3[Te2[(rk[1] >> 24)       ] & 0xff];
637        rk[2] =
638            Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
639            Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
640            Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
641            Td3[Te2[(rk[2] >> 24)       ] & 0xff];
642        rk[3] =
643            Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
644            Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
645            Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
646            Td3[Te2[(rk[3] >> 24)       ] & 0xff];
647#endif
648    }
649    return 0;
650}
651
652/*
653 * Encrypt a single block
654 * in and out can overlap
655 */
656void AES_encrypt(const unsigned char *in, unsigned char *out,
657                 const AES_KEY *key)
658{
659
660    const u32 *rk;
661    u32 s0, s1, s2, s3, t[4];
662    int r;
663
664    assert(in && out && key);
665    rk = key->rd_key;
666
667    /*
668     * map byte array block to cipher state
669     * and add initial round key:
670     */
671    s0 = GETU32(in     ) ^ rk[0];
672    s1 = GETU32(in +  4) ^ rk[1];
673    s2 = GETU32(in +  8) ^ rk[2];
674    s3 = GETU32(in + 12) ^ rk[3];
675
676#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
677    prefetch256(Te4);
678
679    t[0] = (u32)Te4[(s0      ) & 0xff]       ^
680           (u32)Te4[(s1 >>  8) & 0xff] <<  8 ^
681           (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
682           (u32)Te4[(s3 >> 24)       ] << 24;
683    t[1] = (u32)Te4[(s1      ) & 0xff]       ^
684           (u32)Te4[(s2 >>  8) & 0xff] <<  8 ^
685           (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
686           (u32)Te4[(s0 >> 24)       ] << 24;
687    t[2] = (u32)Te4[(s2      ) & 0xff]       ^
688           (u32)Te4[(s3 >>  8) & 0xff] <<  8 ^
689           (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
690           (u32)Te4[(s1 >> 24)       ] << 24;
691    t[3] = (u32)Te4[(s3      ) & 0xff]       ^
692           (u32)Te4[(s0 >>  8) & 0xff] <<  8 ^
693           (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
694           (u32)Te4[(s2 >> 24)       ] << 24;
695
696    /* now do the linear transform using words */
697    {   int i;
698        u32 r0, r1, r2;
699
700        for (i = 0; i < 4; i++) {
701            r0 = t[i];
702            r1 = r0 & 0x80808080;
703            r2 = ((r0 & 0x7f7f7f7f) << 1) ^
704                ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
705#if defined(ROTATE)
706            t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
707                ROTATE(r0,16) ^ ROTATE(r0,8);
708#else
709            t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
710                (r0 << 16) ^ (r0 >> 16) ^
711                (r0 << 8) ^ (r0 >> 24);
712#endif
713            t[i] ^= rk[4+i];
714        }
715    }
716#else
717    t[0] =  Te0[(s0      ) & 0xff] ^
718        Te1[(s1 >>  8) & 0xff] ^
719        Te2[(s2 >> 16) & 0xff] ^
720        Te3[(s3 >> 24)       ] ^
721        rk[4];
722    t[1] =  Te0[(s1      ) & 0xff] ^
723        Te1[(s2 >>  8) & 0xff] ^
724        Te2[(s3 >> 16) & 0xff] ^
725        Te3[(s0 >> 24)       ] ^
726        rk[5];
727    t[2] =  Te0[(s2      ) & 0xff] ^
728        Te1[(s3 >>  8) & 0xff] ^
729        Te2[(s0 >> 16) & 0xff] ^
730        Te3[(s1 >> 24)       ] ^
731        rk[6];
732    t[3] =  Te0[(s3      ) & 0xff] ^
733        Te1[(s0 >>  8) & 0xff] ^
734        Te2[(s1 >> 16) & 0xff] ^
735        Te3[(s2 >> 24)       ] ^
736        rk[7];
737#endif
738    s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
739
740    /*
741     * Nr - 2 full rounds:
742     */
743    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
744#if defined(AES_COMPACT_IN_INNER_ROUNDS)
745        t[0] = (u32)Te4[(s0      ) & 0xff]       ^
746               (u32)Te4[(s1 >>  8) & 0xff] <<  8 ^
747               (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
748               (u32)Te4[(s3 >> 24)       ] << 24;
749        t[1] = (u32)Te4[(s1      ) & 0xff]       ^
750               (u32)Te4[(s2 >>  8) & 0xff] <<  8 ^
751               (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
752               (u32)Te4[(s0 >> 24)       ] << 24;
753        t[2] = (u32)Te4[(s2      ) & 0xff]       ^
754               (u32)Te4[(s3 >>  8) & 0xff] <<  8 ^
755               (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
756               (u32)Te4[(s1 >> 24)       ] << 24;
757        t[3] = (u32)Te4[(s3      ) & 0xff]       ^
758               (u32)Te4[(s0 >>  8) & 0xff] <<  8 ^
759               (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
760               (u32)Te4[(s2 >> 24)       ] << 24;
761
762        /* now do the linear transform using words */
763        {
764            int i;
765            u32 r0, r1, r2;
766
767            for (i = 0; i < 4; i++) {
768                r0 = t[i];
769                r1 = r0 & 0x80808080;
770                r2 = ((r0 & 0x7f7f7f7f) << 1) ^
771                    ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
772#if defined(ROTATE)
773                t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
774                    ROTATE(r0,16) ^ ROTATE(r0,8);
775#else
776                t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
777                    (r0 << 16) ^ (r0 >> 16) ^
778                    (r0 << 8) ^ (r0 >> 24);
779#endif
780                t[i] ^= rk[i];
781            }
782        }
783#else
784        t[0] =  Te0[(s0      ) & 0xff] ^
785            Te1[(s1 >>  8) & 0xff] ^
786            Te2[(s2 >> 16) & 0xff] ^
787            Te3[(s3 >> 24)       ] ^
788            rk[0];
789        t[1] =  Te0[(s1      ) & 0xff] ^
790            Te1[(s2 >>  8) & 0xff] ^
791            Te2[(s3 >> 16) & 0xff] ^
792            Te3[(s0 >> 24)       ] ^
793            rk[1];
794        t[2] =  Te0[(s2      ) & 0xff] ^
795            Te1[(s3 >>  8) & 0xff] ^
796            Te2[(s0 >> 16) & 0xff] ^
797            Te3[(s1 >> 24)       ] ^
798            rk[2];
799        t[3] =  Te0[(s3      ) & 0xff] ^
800            Te1[(s0 >>  8) & 0xff] ^
801            Te2[(s1 >> 16) & 0xff] ^
802            Te3[(s2 >> 24)       ] ^
803            rk[3];
804#endif
805        s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
806    }
807    /*
808     * apply last round and
809     * map cipher state to byte array block:
810     */
811#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
812    prefetch256(Te4);
813
814    *(u32*)(out+0) =
815           (u32)Te4[(s0      ) & 0xff]       ^
816           (u32)Te4[(s1 >>  8) & 0xff] <<  8 ^
817           (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
818           (u32)Te4[(s3 >> 24)       ] << 24 ^
819        rk[0];
820    *(u32*)(out+4) =
821           (u32)Te4[(s1      ) & 0xff]       ^
822           (u32)Te4[(s2 >>  8) & 0xff] <<  8 ^
823           (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
824           (u32)Te4[(s0 >> 24)       ] << 24 ^
825        rk[1];
826    *(u32*)(out+8) =
827           (u32)Te4[(s2      ) & 0xff]       ^
828           (u32)Te4[(s3 >>  8) & 0xff] <<  8 ^
829           (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
830           (u32)Te4[(s1 >> 24)       ] << 24 ^
831        rk[2];
832    *(u32*)(out+12) =
833           (u32)Te4[(s3      ) & 0xff]       ^
834           (u32)Te4[(s0 >>  8) & 0xff] <<  8 ^
835           (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
836           (u32)Te4[(s2 >> 24)       ] << 24 ^
837        rk[3];
838#else
839    *(u32*)(out+0) =
840        (Te2[(s0      ) & 0xff] & 0x000000ffU) ^
841        (Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
842        (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
843        (Te1[(s3 >> 24)       ] & 0xff000000U) ^
844        rk[0];
845    *(u32*)(out+4) =
846        (Te2[(s1      ) & 0xff] & 0x000000ffU) ^
847        (Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
848        (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
849        (Te1[(s0 >> 24)       ] & 0xff000000U) ^
850        rk[1];
851    *(u32*)(out+8) =
852        (Te2[(s2      ) & 0xff] & 0x000000ffU) ^
853        (Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
854        (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
855        (Te1[(s1 >> 24)       ] & 0xff000000U) ^
856        rk[2];
857    *(u32*)(out+12) =
858        (Te2[(s3      ) & 0xff] & 0x000000ffU) ^
859        (Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
860        (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
861        (Te1[(s2 >> 24)       ] & 0xff000000U) ^
862        rk[3];
863#endif
864}
865
866/*
867 * Decrypt a single block
868 * in and out can overlap
869 */
870void AES_decrypt(const unsigned char *in, unsigned char *out,
871                 const AES_KEY *key)
872{
873
874    const u32 *rk;
875    u32 s0, s1, s2, s3, t[4];
876    int r;
877
878    assert(in && out && key);
879    rk = key->rd_key;
880
881    /*
882     * map byte array block to cipher state
883     * and add initial round key:
884     */
885    s0 = GETU32(in     ) ^ rk[0];
886    s1 = GETU32(in +  4) ^ rk[1];
887    s2 = GETU32(in +  8) ^ rk[2];
888    s3 = GETU32(in + 12) ^ rk[3];
889
890#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
891    prefetch256(Td4);
892
893    t[0] = (u32)Td4[(s0      ) & 0xff]       ^
894           (u32)Td4[(s3 >>  8) & 0xff] <<  8 ^
895           (u32)Td4[(s2 >> 16) & 0xff] << 16 ^
896           (u32)Td4[(s1 >> 24)       ] << 24;
897    t[1] = (u32)Td4[(s1      ) & 0xff]       ^
898           (u32)Td4[(s0 >>  8) & 0xff] <<  8 ^
899           (u32)Td4[(s3 >> 16) & 0xff] << 16 ^
900           (u32)Td4[(s2 >> 24)       ] << 24;
901    t[2] = (u32)Td4[(s2      ) & 0xff]       ^
902           (u32)Td4[(s1 >>  8) & 0xff] <<  8 ^
903           (u32)Td4[(s0 >> 16) & 0xff] << 16 ^
904           (u32)Td4[(s3 >> 24)       ] << 24;
905    t[3] = (u32)Td4[(s3      ) & 0xff]       ^
906           (u32)Td4[(s2 >>  8) & 0xff] <<  8 ^
907           (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
908           (u32)Td4[(s0 >> 24)       ] << 24;
909
910    /* now do the linear transform using words */
911    {
912        int i;
913        u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
914
915        for (i = 0; i < 4; i++) {
916            tp1 = t[i];
917            m = tp1 & 0x80808080;
918            tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
919                ((m - (m >> 7)) & 0x1b1b1b1b);
920            m = tp2 & 0x80808080;
921            tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
922                ((m - (m >> 7)) & 0x1b1b1b1b);
923            m = tp4 & 0x80808080;
924            tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
925                ((m - (m >> 7)) & 0x1b1b1b1b);
926            tp9 = tp8 ^ tp1;
927            tpb = tp9 ^ tp2;
928            tpd = tp9 ^ tp4;
929            tpe = tp8 ^ tp4 ^ tp2;
930#if defined(ROTATE)
931            t[i] = tpe ^ ROTATE(tpd,16) ^
932                ROTATE(tp9,8) ^ ROTATE(tpb,24);
933#else
934            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
935                (tp9 >> 24) ^ (tp9 << 8) ^
936                (tpb >> 8) ^ (tpb << 24);
937#endif
938            t[i] ^= rk[4+i];
939        }
940    }
941#else
942    t[0] =  Td0[(s0      ) & 0xff] ^
943        Td1[(s3 >>  8) & 0xff] ^
944        Td2[(s2 >> 16) & 0xff] ^
945        Td3[(s1 >> 24)       ] ^
946        rk[4];
947    t[1] =  Td0[(s1      ) & 0xff] ^
948        Td1[(s0 >>  8) & 0xff] ^
949        Td2[(s3 >> 16) & 0xff] ^
950        Td3[(s2 >> 24)       ] ^
951        rk[5];
952    t[2] =  Td0[(s2      ) & 0xff] ^
953        Td1[(s1 >>  8) & 0xff] ^
954        Td2[(s0 >> 16) & 0xff] ^
955        Td3[(s3 >> 24)       ] ^
956        rk[6];
957    t[3] =  Td0[(s3      ) & 0xff] ^
958        Td1[(s2 >>  8) & 0xff] ^
959        Td2[(s1 >> 16) & 0xff] ^
960        Td3[(s0 >> 24)       ] ^
961        rk[7];
962#endif
963    s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
964
965    /*
966     * Nr - 2 full rounds:
967     */
968    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
969#if defined(AES_COMPACT_IN_INNER_ROUNDS)
970        t[0] = (u32)Td4[(s0      ) & 0xff]       ^
971               (u32)Td4[(s3 >>  8) & 0xff] <<  8 ^
972               (u32)Td4[(s2 >> 16) & 0xff] << 16 ^
973               (u32)Td4[(s1 >> 24)       ] << 24;
974        t[1] = (u32)Td4[(s1      ) & 0xff]       ^
975               (u32)Td4[(s0 >>  8) & 0xff] <<  8 ^
976               (u32)Td4[(s3 >> 16) & 0xff] << 16 ^
977               (u32)Td4[(s2 >> 24)       ] << 24;
978        t[2] = (u32)Td4[(s2      ) & 0xff]       ^
979               (u32)Td4[(s1 >>  8) & 0xff] <<  8 ^
980               (u32)Td4[(s0 >> 16) & 0xff] << 16 ^
981               (u32)Td4[(s3 >> 24)       ] << 24;
982        t[3] = (u32)Td4[(s3      ) & 0xff]       ^
983               (u32)Td4[(s2 >>  8) & 0xff] <<  8 ^
984               (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
985               (u32)Td4[(s0 >> 24)       ] << 24;
986
987    /* now do the linear transform using words */
988    {
989        int i;
990        u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
991
992        for (i = 0; i < 4; i++) {
993            tp1 = t[i];
994            m = tp1 & 0x80808080;
995            tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
996                ((m - (m >> 7)) & 0x1b1b1b1b);
997            m = tp2 & 0x80808080;
998            tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
999                ((m - (m >> 7)) & 0x1b1b1b1b);
1000            m = tp4 & 0x80808080;
1001            tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
1002                ((m - (m >> 7)) & 0x1b1b1b1b);
1003            tp9 = tp8 ^ tp1;
1004            tpb = tp9 ^ tp2;
1005            tpd = tp9 ^ tp4;
1006            tpe = tp8 ^ tp4 ^ tp2;
1007#if defined(ROTATE)
1008            t[i] = tpe ^ ROTATE(tpd,16) ^
1009                ROTATE(tp9,8) ^ ROTATE(tpb,24);
1010#else
1011            t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1012                (tp9 >> 24) ^ (tp9 << 8) ^
1013                (tpb >> 8) ^ (tpb << 24);
1014#endif
1015            t[i] ^= rk[i];
1016        }
1017    }
1018#else
1019    t[0] =  Td0[(s0      ) & 0xff] ^
1020        Td1[(s3 >>  8) & 0xff] ^
1021        Td2[(s2 >> 16) & 0xff] ^
1022        Td3[(s1 >> 24)       ] ^
1023        rk[0];
1024    t[1] =  Td0[(s1      ) & 0xff] ^
1025        Td1[(s0 >>  8) & 0xff] ^
1026        Td2[(s3 >> 16) & 0xff] ^
1027        Td3[(s2 >> 24)       ] ^
1028        rk[1];
1029    t[2] =  Td0[(s2      ) & 0xff] ^
1030        Td1[(s1 >>  8) & 0xff] ^
1031        Td2[(s0 >> 16) & 0xff] ^
1032        Td3[(s3 >> 24)       ] ^
1033        rk[2];
1034    t[3] =  Td0[(s3      ) & 0xff] ^
1035        Td1[(s2 >>  8) & 0xff] ^
1036        Td2[(s1 >> 16) & 0xff] ^
1037        Td3[(s0 >> 24)       ] ^
1038        rk[3];
1039#endif
1040    s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1041    }
1042    /*
1043     * apply last round and
1044     * map cipher state to byte array block:
1045     */
1046    prefetch256(Td4);
1047
1048    *(u32*)(out+0) =
1049        ((u32)Td4[(s0      ) & 0xff])    ^
1050        ((u32)Td4[(s3 >>  8) & 0xff] <<  8) ^
1051        ((u32)Td4[(s2 >> 16) & 0xff] << 16) ^
1052        ((u32)Td4[(s1 >> 24)       ] << 24) ^
1053        rk[0];
1054    *(u32*)(out+4) =
1055        ((u32)Td4[(s1      ) & 0xff])     ^
1056        ((u32)Td4[(s0 >>  8) & 0xff] <<  8) ^
1057        ((u32)Td4[(s3 >> 16) & 0xff] << 16) ^
1058        ((u32)Td4[(s2 >> 24)       ] << 24) ^
1059        rk[1];
1060    *(u32*)(out+8) =
1061        ((u32)Td4[(s2      ) & 0xff])     ^
1062        ((u32)Td4[(s1 >>  8) & 0xff] <<  8) ^
1063        ((u32)Td4[(s0 >> 16) & 0xff] << 16) ^
1064        ((u32)Td4[(s3 >> 24)       ] << 24) ^
1065        rk[2];
1066    *(u32*)(out+12) =
1067        ((u32)Td4[(s3      ) & 0xff])     ^
1068        ((u32)Td4[(s2 >>  8) & 0xff] <<  8) ^
1069        ((u32)Td4[(s1 >> 16) & 0xff] << 16) ^
1070        ((u32)Td4[(s0 >> 24)       ] << 24) ^
1071        rk[3];
1072}
1073