/*
 * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifdef COMPILE_CRYPTO

// The Rijndael S-box and inverted S-box are embedded here for faster access.
//
// Note about the lookup tables (T1...T4 and T5...T8):
// The tables (boxes) combine precalculated transposition and mixing steps as
// an alternative to computing them at runtime.
// The tables are statically generated in the com/sun/crypto/provider/AESCrypt class.
// Only the first table reference is passed to the AES methods below. The other 3 tables
// used in encryption and decryption are derived at runtime by rotating the T1 result accordingly.
// This is a free operation on ARM thanks to its register-shifted-register EOR capability.
// The table reference is passed as the last argument in the parameter list.
// The table lookup method proves to perform better than a runtime Galois Field calculation,
// due to the lack of HW acceleration for the latter.

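// A minimal C sketch (comment only, nothing here is compiled) of how a single
// round column can be derived from T1 alone, assuming 32-bit state words
// s0..s3, a 256-entry 32-bit table t1[] and a right-rotate helper ror(x, n):
//
//   uint32_t column0 = t1[ s0 >> 24        ]
//                    ^ ror(t1[(s1 >> 16) & 0xff],  8)
//                    ^ ror(t1[(s2 >>  8) & 0xff], 16)
//                    ^ ror(t1[ s3        & 0xff], 24)
//                    ^ round_key[0];
//
// The rotations stand in for the T2..T4 (and T6..T8) tables, which is why only
// the first table reference needs to be passed in.
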
unsigned char * SBox;
unsigned char * SInvBox;

void aes_init() {

  const static unsigned char Si[256] =
    {
      0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38,
      0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
      0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
      0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
      0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D,
      0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
      0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2,
      0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
      0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
      0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
      0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA,
      0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
      0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A,
      0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
      0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
      0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
      0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA,
      0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
      0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85,
      0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
      0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
      0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
      0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20,
      0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
      0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31,
      0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
      0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
      0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
      0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0,
      0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
      0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26,
      0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D
    };

  static const unsigned char S[256] = {
      0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
      0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
      0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
      0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
      0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
      0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
      0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
      0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
      0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
      0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
      0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
      0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
      0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
      0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
      0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
      0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
      0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
      0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
      0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
      0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
      0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
      0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
      0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
      0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
      0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
      0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
      0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
      0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
      0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
      0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
      0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
      0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
  };

  SBox = (unsigned char*)S;
  SInvBox = (unsigned char*)Si;
}

address generate_aescrypt_encryptBlock() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");

  address start = __ pc();

  //    Register from = R0; // source byte array
  //    Register to = R1;   // destination byte array
  //    Register key = R2;  // expanded key array
  //    Register tbox = R3; // transposition box reference

  __ push (RegisterSet(R4, R12) | LR);
  __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
  __ sub(SP, SP, 32);

  // preserve the TBox reference
  __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
  __ str(R3, Address(SP, 16));

  // retrieve the key length; it determines the number of rounds
  // (a 44/52/60-word expanded key corresponds to 10/12/14 rounds)
  __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ ldr(R5, Address(R0));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R5, R5);
  __ eor(R5, R5, R10);
  __ ldr(R6, Address(R0, 4));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R6, R6);
  __ eor(R6, R6, R10);
  __ ldr(R7, Address(R0, 8));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R7, R7);
  __ eor(R7, R7, R10);
  __ ldr(R8, Address(R0, 12));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R8, R8);
  __ eor(R8, R8, R10);

  // Adjust the key length to account for the initial and last rounds; preserve the output buffer pointer in S7
  __ sub(R9, R9, 8);
  __ fmsr(S7, R1);

  // load the first transposition box (T1)
  __ ldr(R0, Address(SP, 16));

  __ mov(LR, R2);

  Label round;

  __ bind(round);

  // Utilize a transposition box lookup along with a subsequent shift and EOR with a round key.
  // The instruction ordering is rearranged to minimize read-after-write dependencies. This matters little
  // on an A15 target with register renaming, but performs ~10% better on an A9.
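  // For reference (comment only): each of the four instruction groups below computes one
  // output column as
  //   T1[w0 >> 24] ^ ror(T1[(w1 >> 16) & 0xff], 8) ^ ror(T1[(w2 >> 8) & 0xff], 16)
  //     ^ ror(T1[w3 & 0xff], 24) ^ round_key
  // where w0..w3 are the state words R5..R8 taken in a rotated order for each column.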
  __ mov(R12, AsmOperand(R5, lsr, 24));
  __ ubfx(R4, R6, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R7, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R8);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R10, R1, R12);

  __ mov(R12, AsmOperand(R6, lsr, 24));
  __ ubfx(R4, R7, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R8, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R5);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R11, R1, R12);

  __ mov(R12, AsmOperand(R7, lsr, 24));
  __ ubfx(R4, R8, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R5, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R6);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R3, R1, R12);
  __ str(R3, Address(SP, 0));

  __ mov(R12, AsmOperand(R8, lsr, 24));
  __ ubfx(R4, R5, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R6, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R7);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R8, R1, R12);

  // update round count
  __ subs(R9, R9, 4);

  __ mov(R5, R10);
  __ mov(R6, R11);
  __ ldr(R7, Address(SP, 0));

  __ b(round, gt);


  // last round - a special case, no MixColumns
  __ mov_slow(R10, (int)SBox);


  // output buffer pointer
  __ fmrs(R9, S7);

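  // Each of the four stanzas below produces one output word: SubBytes via the scalar
  // S-box, ShiftRows by picking bytes from the appropriate state words, AddRoundKey
  // with the final round key, then a byte swap back to memory order before the store.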
  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R5, lsr, 24));
  __ ubfx(R12, R6, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R7, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R8);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R6, lsr, 24));
  __ ubfx(R12, R7, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R8, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R5);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);

  __ str(R0, Address(R9, 4, post_indexed));
  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R7, lsr, 24));
  __ ubfx(R12, R8, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R5, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R6);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);

  __ str(R0, Address(R9, 4, post_indexed));
  __ ldr(R11, Address(LR));
  __ ldrb(R0, Address(R10, R8, lsr, 24));
  __ ubfx(R12, R5, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R6, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R7);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);

  __ str(R0, Address(R9));

  __ add(SP, SP, 32);
  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);

  __ pop(RegisterSet(R4, R12) | PC);
  return start;
}

address generate_aescrypt_decryptBlock() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");

  address start = __ pc();

  //    Register from = R0; // source byte array
  //    Register to = R1;   // destination byte array
  //    Register key = R2;  // expanded key array
  //    Register tbox = R3; // transposition box reference

  __ push (RegisterSet(R4, R12) | LR);
  __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
  __ sub(SP, SP, 32);

  // retrieve the key length
  __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // preserve the TBox reference
  __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
  __ str(R3, Address(SP, 16));


  // preserve the expanded key pointer
  __ fmsr(S8, R2);

  // the first 16 bytes of the expanded key are used in the last round, so skip them here
  __ add(LR, R2, 16);


  __ ldr(R5, Address(R0));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R5, R5);
  __ eor(R5, R5, R10);
  __ ldr(R6, Address(R0, 4));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R6, R6);
  __ eor(R6, R6, R10);
  __ ldr(R7, Address(R0, 8));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R7, R7);
  __ eor(R7, R7, R10);
  __ ldr(R8, Address(R0, 12));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R8, R8);
  __ eor(R8, R8, R10);


  // Adjust the key length to account for the initial and last rounds; preserve the output buffer pointer in S7
  __ sub(R9, R9, 8);
  __ fmsr(S7, R1);

  // load the transposition box (T5)
  __ ldr(R0, Address(SP, 16));

  Label round;

  __ bind(round);
  // each sub-block is treated similarly:

  // combine SubBytes|ShiftRows|MixColumns through a precalculated set of tables.
  // Utilize a transposition box lookup along with a subsequent shift and EOR with a round key.
  // The instruction ordering is rearranged to minimize read-after-write dependencies. This matters little
  // on an A15 target with register renaming, but performs ~10% better on an A9.
  __ mov(R12, AsmOperand(R5, lsr, 24));
  __ ubfx(R4, R8, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R7, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R6);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R10, R1, R12);

  __ mov(R12, AsmOperand(R6, lsr, 24));
  __ ubfx(R4, R5, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R8, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R7);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R11, R1, R12);

  __ mov(R12, AsmOperand(R7, lsr, 24));
  __ ubfx(R4, R6, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R5, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R8);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R3, R1, R12);
  __ str(R3, Address(SP, 0));

  __ mov(R12, AsmOperand(R8, lsr, 24));
  __ ubfx(R4, R7, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R6, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R5);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R8, R1, R12);

  // update round count
  __ subs(R9, R9, 4);

  __ mov(R5, R10);
  __ mov(R6, R11);
  __ ldr(R7, Address(SP, 0));

  __ b(round, gt);

  // last round - a special case, no MixColumns:

  // retrieve the expanded key pointer
  __ fmrs(LR, S8);

  __ mov_slow(R10, (int)SInvBox);

  // output buffer pointer
  __ fmrs(R9, S7);

  // process each sub-block in a similar manner:
  // 1. load the corresponding round key
  __ ldr(R11, Address(LR, 4, post_indexed));
  // 2. combine the SubBytes and ShiftRows stages
  __ ldrb(R0, Address(R10, R5, lsr, 24));
  __ ubfx(R12, R8, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R7, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R6);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R3, R3, AsmOperand(R0, lsl, 8));
  // 3. AddRoundKey stage
  __ eor(R0, R3, R11);
  // 4. convert the result to LE representation
  __ rev(R0, R0);
  // 5. store in the output buffer
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R6, lsr, 24));
  __ ubfx(R12, R5, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R8, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R7);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R7, lsr, 24));
  __ ubfx(R12, R6, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R5, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R8);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR));
  __ ldrb(R0, Address(R10, R8, lsr, 24));
  __ ubfx(R12, R7, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R6, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R5);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9));

  __ add(SP, SP, 32);
  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);
  __ pop(RegisterSet(R4, R12) | PC);

  return start;
}

address generate_cipherBlockChaining_encryptAESCrypt() {
  // R0 - plain
  // R1 - cipher
  // R2 - expanded key
  // R3 - Initialization Vector (IV)
  // [sp+0] - cipher len
  // [sp+4] - Transposition Box reference
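  //
  // For reference: the loop below implements the CBC recurrence
  //   C[0] = E_K(P[0] ^ IV),  C[i] = E_K(P[i] ^ C[i-1])
  // so each block must be encrypted strictly after the previous one.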

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

  address start = __ pc();

  __ push(RegisterSet(R4, R12) | LR);
  // load the cipher length (the first element on the original calling stack)
  __ ldr(R4, Address(SP, 40));

  __ sub(SP, SP, 32);

  // preserve some arguments
  __ mov(R5, R1);
  __ mov(R6, R2);

  // load IV
  __ ldmia(R3, RegisterSet(R9, R12), writeback);

  // preserve the original source buffer on the stack
  __ str(R0, Address(SP, 16));

  Label loop;
  __ bind(loop);
  __ ldmia(R0, RegisterSet(R0, R1) | RegisterSet(R7, R8));

  __ eor(R0, R0, R9);
  __ eor(R1, R1, R10);
  __ eor(R7, R7, R11);
  __ eor(R8, R8, R12);
  __ stmia(SP, RegisterSet(R0, R1) | RegisterSet(R7, R8));

  __ mov(R0, SP);
  __ mov(R1, R5);
  __ mov(R2, R6);
  __ ldr(R3, Address(SP, 40+32+4));

  // a near call is sufficient since the target is also in the stubs
  __ bl(StubRoutines::_aescrypt_encryptBlock);

  __ subs(R4, R4, 16);
  __ ldr(R0, Address(SP, 16), gt);
  __ ldmia(R5, RegisterSet(R9, R12), writeback);
  __ add(R0, R0, 16, gt);
  __ str(R0, Address(SP, 16), gt);
  __ b(loop, gt);

  __ add(SP, SP, 32);
  __ pop(RegisterSet(R4, R12) | LR);
  // return the cipher length (copied from the original argument)
  __ ldr(R0, Address(SP));
  __ bx(LR);

  return start;
}


// CBC decryption can benefit from parallel processing as the blocks can be
// decrypted separately from each other.
// NEON is utilized (if available) to perform parallel execution on 8 blocks at a time.
// Since the Transposition Box (tbox) is used, the parallel execution only applies to the
// initial round and the last round. It's not practical to use NEON for a table lookup
// larger than 128 bytes. It also appears to be faster to perform the tbox lookup
// sequentially than to execute the Galois Field calculation in parallel.

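// For reference: CBC decryption computes P[i] = D_K(C[i]) ^ C[i-1] (with C[-1] = IV),
// so every block depends only on already-available ciphertext and the blocks can be
// processed independently, unlike CBC encryption above.
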
address generate_cipherBlockChaining_decryptAESCrypt() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

  address start = __ pc();

  Label single_block_done, single_block, cbc_done;
  // R0 - cipher
  // R1 - plain
  // R2 - expanded key
  // R3 - Initialization Vector (IV)
  // [sp+0] - cipher len
  // [sp+4] - Transposition Box reference

  __ push(RegisterSet(R4, R12) | LR);

  // load the cipher len: it must be a multiple of 16
  __ ldr(R4, Address(SP, 40));

  if (VM_Version::has_simd()) {
    __ andrs(R4, R4, 0x7f);
  }

  // preserve register-based arguments
  __ mov(R7, R2);
  __ mov(R8, R3);

  if (VM_Version::has_simd()) {
    __ b(single_block_done, eq);
  }

  __ bind(single_block);
  // preserve args
  __ mov(R5, R0);
  __ mov(R6, R1);

  // reload arguments
  __ mov(R2, R7);
  __ ldr(R3, Address(SP, 40+4));

  // a near call is sufficient as the method is part of the StubGenerator
  __ bl((address)StubRoutines::_aescrypt_decryptBlock);

  // check the remaining cipher size (for individual block processing)
  __ subs(R4, R4, 16);
  if (VM_Version::has_simd()) {
    __ tst(R4, 0x7f);
  }

  // load IV (changes based on the CBC schedule)
  __ ldmia(R8, RegisterSet(R9, R12));

  // load the plaintext from the previous block processing
  __ ldmia(R6, RegisterSet(R0, R3));

  // perform the IV addition and save the plaintext for good now
  __ eor(R0, R0, R9);
  __ eor(R1, R1, R10);
  __ eor(R2, R2, R11);
  __ eor(R3, R3, R12);
  __ stmia(R6, RegisterSet(R0, R3));

  // adjust pointers for the next block processing
  __ mov(R8, R5);
  __ add(R0, R5, 16);
  __ add(R1, R6, 16);
  __ b(single_block, ne);

  __ bind(single_block_done);
  if (!VM_Version::has_simd()) {
    __ b(cbc_done);
  } else {
  // done with single blocks.
  // check if any 8-block chunks are available for parallel processing
  __ ldr(R4, Address(SP, 40));
  __ bics(R4, R4, 0x7f);
  __ b(cbc_done, eq);

  Label decrypt_8_blocks;
  int quad = 1;
  // Process 8 blocks in parallel
  __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
  __ sub(SP, SP, 40);

  // record the output buffer end address (used as a block counter)
  Address output_buffer_end(SP, 16);
  __ add(R5, R1, R4);
  __ str(R5, output_buffer_end);

  // preserve the key pointer
  Address rounds_key(SP, 28);
  __ str(R7, rounds_key);
  // in decryption the first 16 bytes of the expanded key are used in the last round
  __ add(LR, R7, 16);


  // Record the end of the key, which is used to detect the last round
  __ ldr(R3, Address(R7, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
  __ add(R9, R7, AsmOperand(R3, lsl, 2));

  // preserve IV
  Address iv(SP, 36);
  __ str(R8, iv);

  __ bind(decrypt_8_blocks);
  __ mov(R5, R1);

  // preserve the original source pointer
  Address original_src(SP, 32);
  __ str(R0, original_src);

  // Apply the initial AddRoundKey (XOR with the first round key) to 8 blocks at once,
  // using the output buffer as temporary storage, which also preloads it into the cache.

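  // Note: after this prologue, D0..D15 hold the 8 byte-reversed cipher blocks; the first
  // seven of them (D0..D13) are reused at the end of the loop as the CBC chaining values
  // for blocks 2..8 of the chunk.
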
  __ vld1(D18, LR, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vld1(D0, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D0, D0, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D0, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D2, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D2, D2, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D2, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D4, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D4, D4, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D4, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D6, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D6, D6, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D6, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D8, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D8, D8, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D8, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D10, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D10, D10, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D10, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D12, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D12, D12, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D12, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D14, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D14, D14, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ veor(D20, D14, D18, quad);
  __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);


  // Local frame map:
  // sp+12 - end of the current 8-block output chunk
  // sp+16 - output buffer end address (used as a block counter)
  // sp+20 - current output buffer pointer
  // sp+28 - key pointer
  // sp+32 - original source
  // sp+36 - IV


  // preserve the output buffer pointer
  Address block_current_output_buffer(SP, 20);
  __ str(R1, block_current_output_buffer);

  // individual rounds in block processing are executed sequentially.
  Label block_start;

  // record the end of the current 8-block chunk in the output buffer
  __ add(R0, R1, 128);
  __ str(R0, Address(SP, 12));

  __ bind(block_start);

  // load the transposition box reference (T5)
  // location of the reference (6th incoming argument, second slot on the stack):
  // 10 scalar registers on stack
  //  8 double-precision FP registers
  // 40 bytes frame size for local storage
  //  4 bytes offset to the original arguments list
  __ ldr(R0, Address(SP, 40+64+40+4));
  __ add(R0, R0, arrayOopDesc::base_offset_in_bytes(T_INT));

  // load the rounds key and compensate for the first and last rounds
  __ ldr(LR, rounds_key);
  __ add(LR, LR, 32);

  // load the block data from the output buffer (used as temporary storage above)
  __ ldr(R2, block_current_output_buffer);
  __ ldmia(R2, RegisterSet(R5, R8));

  Label round;
  __ bind(round);

  // Utilize a transposition box lookup along with a subsequent shift and EOR with a round key.
  // The instruction ordering is rearranged to minimize read-after-write dependencies. This matters little
  // on an A15 target with register renaming, but performs ~10% better on an A9.
  __ mov(R12, AsmOperand(R5, lsr, 24));
  __ ubfx(R4, R8, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R7, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R6);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R10, R1, R12);

  __ mov(R12, AsmOperand(R6, lsr, 24));
  __ ubfx(R4, R5, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R8, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R7);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R11, R1, R12);

  __ mov(R12, AsmOperand(R7, lsr, 24));
  __ ubfx(R4, R6, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R5, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R8);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R3, R1, R12);
  __ str(R3, Address(SP, 0));

  __ mov(R12, AsmOperand(R8, lsr, 24));
  __ ubfx(R4, R7, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R6, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R5);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R8, R1, R12);

  // see if we reached the key array end
  __ cmp(R9, LR);

  // load processed data
  __ mov(R5, R10);
  __ mov(R6, R11);
  __ ldr(R7, Address(SP, 0));

  __ b(round, gt);


  // The last round is special.
  // This round could be implemented with the vtbl instruction in NEON. However, vtbl is limited to a 32-byte
  // wide table (4 vectors), so it would take 8 lookup rounds to cover the 256-byte Si table. A scalar lookup,
  // on the other hand, is independent of the lookup table size and thus proves to be faster.
  __ ldr(LR, block_current_output_buffer);

  // end of the current 8-block output chunk (loop bound)
  __ ldr(R11, Address(SP, 12));

  __ mov_slow(R10, (int)SInvBox);
  __ ldrb(R0, Address(R10, R5, lsr, 24));
  __ ubfx(R12, R8, 16, 8);
  __ ldrb (R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R7, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb(R12, R6);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ str(R0, Address(LR, 4, post_indexed));

  __ ldrb(R0, Address(R10, R6, lsr, 24));
  __ ubfx(R12, R5, 16, 8);
  __ ldrb (R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R8, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb(R12, R7);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ str(R0, Address(LR, 4, post_indexed));


  __ ldrb(R0, Address(R10, R7, lsr, 24));
  __ ubfx(R12, R6, 16, 8);
  __ ldrb (R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R5, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb(R12, R8);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ str(R0, Address(LR, 4, post_indexed));


  __ ldrb(R0, Address(R10, R8, lsr, 24));
  __ ubfx(R12, R7, 16, 8);
  __ ldrb (R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R6, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb(R12, R5);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ str(R0, Address(LR, 4, post_indexed));


  // preserve current scratch buffer pointer
  __ cmp(R11, LR);
  __ str(LR, block_current_output_buffer);

  // go to the next block processing
  __ b(block_start, ne);



  // Perform the last-round AddRoundKey on all 8 blocks

  // load the key pointer; the last round is processed with key[0..3]
  __ ldr(LR, rounds_key);

  // retrieve the original output buffer pointer
  __ ldr(R1, block_current_output_buffer);
  __ sub(R1, R1, 128);
  __ mov(R5, R1);


  // retrieve the original cipher (source) pointer
  __ ldr(R0, original_src);

  // retrieve the IV pointer
  __ ldr(R6, iv);

  __ vld1(D20, R6, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ vrev(D20, D20, quad, 32, MacroAssembler::VELEM_SIZE_8);

  // perform last AddRoundKey and IV addition
  __ vld1(D18, Address(LR, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D20, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);


  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D0, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D2, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D4, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D6, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D8, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D10, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);

  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
  __ veor(D22, D22, D18, quad);
  __ veor(D22, D22, D12, quad);
  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);


  // check if we're done
  __ ldr(R4, output_buffer_end);
  __ cmp(R4, R1);
  __ add(R0, R0, 128-16);
  __ str(R0, iv);
  __ add(R0, R0, 16);

  __ b(decrypt_8_blocks, ne);

  __ add(SP, SP, 40);
  __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);
  }
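  // For each of the 8 blocks below: XOR with the first round key (D18:D19) and with the
  // chaining value (the byte-reversed IV in D20:D21 for the first block, the byte-reversed
  // previous cipher block from D0..D13 for the rest), then byte-swap back to memory order
  // and store the recovered plaintext.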

  __ bind(cbc_done);
  __ pop(RegisterSet(R4, R12) | LR);
  __ ldr(R0, Address(SP));
  __ bx(LR);

  return start;
}
#endif // COMPILE_CRYPTO
