1/*
2 * Copyright (c) 2000-2001,2011,2014 Apple Inc. All Rights Reserved.
3 *
4 * The contents of this file constitute Original Code as defined in and are
5 * subject to the Apple Public Source License Version 1.2 (the 'License').
6 * You may not use this file except in compliance with the License. Please obtain
7 * a copy of the License at http://www.apple.com/publicsource and read it before
8 * using this file.
9 *
10 * This Original Code and all software distributed under the License are
11 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
12 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
13 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
14 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
15 * specific language governing rights and limitations under the License.
16 */
17
18
19/*
20 *  vRijndael-alg-ref.c
21 *
22 *  Copyright (c) 2001,2011,2014 Apple Inc. All Rights Reserved.
23 *
24 */
25
26#include "rijndaelApi.h"
27#include "rijndael-alg-ref.h"
28#include "boxes-ref.h"
29#include <string.h>
30
31/* debugger seems to have trouble with this code... */
32#define VAES_DEBUG	1
33#if		VAES_DEBUG
34#include <stdio.h>
35#define vdprintf(s)		printf s
36#else
37#define vdprintf(s)
38#endif
39
40#define SC	((BC - 4) >> 1)
41
42#if defined(__ppc__) && defined(ALTIVEC_ENABLE)
43
/*
 * Overlay of the 4x8-byte Rijndael state/key matrix onto two AltiVec
 * registers: s[row][col] gives byte access, l[] gives 32-bit word access,
 * and v[0]/v[1] hold rows 0-1 and rows 2-3 respectively (16 bytes each).
 */
typedef union {
	unsigned char		s[4][8];
	unsigned long		l[8];
	vector unsigned char 	v[2];
} doubleVec;

/*
 * Four 32-bit values overlaid on a single AltiVec register; used to hold
 * the per-row shift counts in vShiftRow.
 */
typedef union {
	unsigned long		s[4];
	vector unsigned long	v;
} vecLong;
54
/*
 * ShiftRow displacements, indexed [SC][row][d]: SC selects the block size
 * (0: BC=4, 1: BC=6, 2: BC=8), row is the state row (row 0 never shifts),
 * and d is the direction (0 = encrypt, 1 = decrypt).  Values match the
 * Rijndael reference implementation's table.
 */
static word8 shifts[3][4][2] = {
 { { 0, 0 },
   { 1, 3 },
   { 2, 2 },
   { 3, 1 }
 },
 { { 0, 0 },
   { 1, 5 },
   { 2, 4 },
   { 3, 3 }
 },
 { { 0, 0 },
   { 1, 7 },
   { 3, 5 },
   { 4, 4 }
 }
};
72
/*
 * Expand the cipher key into the round-key array W.
 *
 * vk        : two 16-byte vectors holding the key in the 4-row layout
 *             produced by vMakeKey (rows 0-1 in vk[0], rows 2-3 in vk[1])
 * keyBits   : 128, 192 or 256
 * blockBits : 128, 192 or 256
 * W         : receives (ROUNDS+1) round keys of 4 x BC bytes each
 *
 * Returns 0 on success, -1 for a bad key size, -2 for a bad block size
 * (-3 is unreachable).
 */
int vRijndaelKeySched ( vector unsigned char vk[2], int keyBits, int blockBits,
                unsigned char W[MAXROUNDS+1][4][MAXBC])
{
	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	doubleVec tk;
	register  vector unsigned char v1, v2, mask;

	/* KC = key length in 32-bit words */
	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	/* BC = block length in 32-bit words */
	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	/* round count is driven by the larger of key and block size */
	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* work on a scalar-addressable copy of the key (union overlay) */
	tk.v[0] = vk[0];
	tk.v[1] = vk[1];

	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk.s[i][j];

	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
		/* calculate new values: column 0 gets S-box of the rotated
		 * last column, then the round constant */
		for(i = 0; i < 4; i++)
			tk.s[i][0] ^= *((word8 *)S + tk.s[(i+1)%4][KC-1]);
		tk.s[0][0] ^= rcon[rconpointer++];

		if (KC != 8) {
			/* xor bytes 1-7 of each row with previous byte */
			mask = (vector unsigned char) ( 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff );
			for ( i = 0; i < 2; i++ ) {
				/* rotate right one byte, mask off column 0 of each
				 * row, and xor in: the vector form of
				 * tk[i][j] ^= tk[i][j-1] for j = 1..KC-1 */
				v1 = vec_sld( tk.v[i], tk.v[i], 15 );
				v2 = vec_and( v1, mask );
				tk.v[i] = vec_xor( tk.v[i], v2 );
			}
		}
		else {
			/* xor bytes 1-3 of each row with previous byte */
			mask = (vector unsigned char) ( 0, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0, 0, 0, 0 );
			for ( i = 0; i < 2; i++ ) {
				v1 = vec_sld( tk.v[i], tk.v[i], 15 );
				v2 = vec_and( v1, mask );
				tk.v[i] = vec_xor( tk.v[i], v2 );
				/* NOTE(review): j is unused in the loop body below, so
				 * the same byte is XORed an even number of times (4),
				 * which cancels to a no-op; the scalar reference code
				 * applies the S-box once per row and indexes row
				 * (i+1)%4.  Verify against rijndael-alg-ref.c before
				 * relying on 256-bit keys in this path. */
				for(j = 0; j < 4; j++) tk.s[i][KC/2] ^= *((word8 *)S + tk.s[i][KC/2 - 1]);
				/* xor bytes 5-7 of each row with previous byte */
				mask = vec_sld( mask, mask, 4 );
				v2 = vec_and( v1, mask );
				tk.v[i] = vec_xor( tk.v[i], v2 );
				mask = vec_sld( mask, mask, 4 );
			}
		}
		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk.s[i][j];
	}
	return 0;
}
149
150
151void vMakeKey(BYTE *keyMaterial, keyInstance *key)
152{
153        register vector unsigned char v1, v2, v3, mask;
154        vector unsigned char	  vk[2];
155
156        /* load and align input */
157        v1 = vec_ld( 0, (vector unsigned char *) keyMaterial );
158        v2 = vec_ld( 16, (vector unsigned char *) keyMaterial );
159        if ( (long) keyMaterial & 0x0fL )
160        {	// this is required if keyMaterial is not on a 16-byte boundary
161                v3 = vec_ld( 32, (vector unsigned char *) keyMaterial );
162                mask = vec_lvsl( 0, keyMaterial );
163                v1 = vec_perm( v1, v2, mask );
164                v2 = vec_perm( v2, v3, mask );
165        }
166
167        /* parse input stream into rectangular array */
168        vk[0] = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
169        vk[1] = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );
170        vRijndaelKeySched (vk, key->keyLen, key->blockLen, key->keySched);
171        memset( (char *) vk, 0, 4 * MAXKC);
172}
173
174
/*	This routine does 16 simultaneous lookups in a 256-byte table.	*/
/*
 * v     : 16 index bytes (full 0..255 range)
 * table : the 256-byte table viewed as 16 AltiVec vectors
 *
 * vec_perm only honors the low 5 bits of each index, so the table is
 * probed in 32-byte windows: one vec_perm per window yields the result
 * each byte WOULD have if its top three index bits (0x80/0x40/0x20)
 * selected that window.  A three-level vec_sel tree keyed on those bits
 * then picks the correct candidate per byte.
 */
vector unsigned char rimskyKorsakov ( vector unsigned char v, vector unsigned char * table )
{
	register vector unsigned char	upperBits000, upperBits001, upperBits010, upperBits011,
                                        upperBits100, upperBits101, upperBits110, upperBits111,
                                        lookupBit00,  lookupBit01, lookupBit10, lookupBit11,
                                        lookupBit0, lookupBit1, lookup,
                                        maskForBit6, maskForBit7, maskForBit8, seven;
	register vector unsigned char	*tabeven, *tabodd;

	seven = vec_splat_u8 ( 7 );
	/* tabeven/tabodd walk the table in adjacent vector pairs (32 bytes) */
	tabeven = table++;
	tabodd = table;

//	Each variable contains the correct values for the corresponding bits 6, 7 and 8.
	upperBits000 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits001 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits010 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits011 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits100 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits101 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits110 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits111 = vec_perm ( *tabeven, *tabodd, v );

//	Here we extract all the correct values for bit 6.
	/* shift left 2 then arithmetic-shift right 7: every byte becomes
	 * 0x00 or 0xff depending on index bit 0x20 */
	maskForBit6  = vec_sl  ( v, vec_splat_u8 ( 2 ) );
	maskForBit6  = vec_sra ( maskForBit6, seven );
	lookupBit00 = vec_sel ( upperBits000, upperBits001, maskForBit6 );
	lookupBit01 = vec_sel ( upperBits010, upperBits011, maskForBit6 );
	lookupBit10 = vec_sel ( upperBits100, upperBits101, maskForBit6 );
	lookupBit11 = vec_sel ( upperBits110, upperBits111, maskForBit6 );

//	Then we get the correct values for bit 7.
	/* same trick for index bit 0x40 */
	maskForBit7  = vec_sl  ( v, vec_splat_u8 ( 1 ) );
	maskForBit7  = vec_sra ( maskForBit7, seven );
	lookupBit0 = vec_sel ( lookupBit00, lookupBit01, maskForBit7 );
	lookupBit1 = vec_sel ( lookupBit10, lookupBit11, maskForBit7 );

//	Finally, the entire correct result vector.
	/* index bit 0x80 selects between the two remaining candidates */
	maskForBit8 = vec_sra ( v, seven );

	lookup = vec_sel ( lookupBit0, lookupBit1, maskForBit8 );

    return lookup;
}
227
/*
 * Byte-wise GF(2^8) multiplication of two 16-byte vectors using the
 * log/antilog tables: result = Alogtable[(Logtable[a] + Logtable[b]) % 255],
 * with the result forced to 0 wherever a or b is 0 (log of 0 is undefined).
 */
vector unsigned char vmul(vector unsigned char a, vector unsigned char b)
{
	register vector unsigned char x, y, zero;
	register vector unsigned short xh, yh, zhi, zlo, two54, two55;

	zero = vec_splat_u8( 0 );
	/* build halfword constants 255 and 254: splat 0xffff, then merge with
	 * zero bytes to get 0x00ff per halfword */
	two55 = vec_splat_u16( -1 );
	two55 = (vector unsigned short) vec_mergeh( zero, (vector unsigned char) two55 );
	two54 = vec_sub( two55, vec_splat_u16( 1 ) );

	x = rimskyKorsakov( a, (vector unsigned char *)Logtable );	// Logtable[a]
	y = rimskyKorsakov( b, (vector unsigned char *)Logtable );	// Logtable[b]

	//	Convert upper 8 bytes to shorts for addition and modulo
	xh = (vector unsigned short) vec_mergeh( zero, x );
	yh = (vector unsigned short) vec_mergeh( zero, y );
	xh = vec_add( xh, yh );			// xh = Logtable[a] + Logtable[b]
	/* conditional subtract implements mod 255: if xh > 254 use xh - 255 */
	yh = vec_sub( xh, two55 );
	zhi = vec_sel( xh, yh, vec_cmpgt( xh, two54 ) );	// xh%255
	//	Convert lower 8 bytes to shorts for addition and modulo
	xh = (vector unsigned short) vec_mergel( zero, x );
	yh = (vector unsigned short) vec_mergel( zero, y );
	xh = vec_add( xh, yh );
	yh = vec_sub( xh, two55 );
	zlo = vec_sel( xh, yh, vec_cmpgt( xh, two54 ) );

	x = vec_pack( zhi, zlo );			// recombine into single byte vector
	x = rimskyKorsakov( x, (vector unsigned char *)Alogtable );		// Alogtable[x]
	x = vec_sel( x, zero, vec_cmpeq( a, zero ) );	// check a = 0
	x = vec_sel( x, zero, vec_cmpeq( b, zero ) );	// check b = 0
	return x;
}
261
262void vKeyAddition(vector unsigned char v[2], vector unsigned char rk[2])
263{
264	v[0] = vec_xor( v[0], rk[0] );		// first vector contains rows 0 and 1
265	v[1] = vec_xor( v[1], rk[1] );		// second vector contains rows 2 and 3
266}
267
268
/*
 * Rijndael ShiftRow: rotate each state row left by its per-row amount.
 *
 * v  : state; v[0] holds rows 0-1, v[1] holds rows 2-3 (8 bytes per row)
 * d  : direction selector into the shifts table (0 = encrypt, 1 = decrypt)
 * BC : block length in 32-bit words (4, 6 or 8 valid row bytes)
 */
void vShiftRow(vector unsigned char v[2], word8 d, word8 BC)
{
	vecLong			sh;
	register vector unsigned char mask, mask1, t;
	register vector bool char c;
	register int	i, j;

	sh.s[0] = 0;
	for (i = 1; i < 4; i++)
		sh.s[i] = shifts[SC][i][d] % BC;	//	contains the number of elements to shift each row

	// each vector contains two BC-byte long rows
	j = 0;
	for ( i = 0; i < 2; i++ ) {
		/* The shift COUNT is cast to a pointer purely so vec_lvsl uses
		 * its low-order bits as an offset: the result is the byte
		 * sequence (n, n+1, n+2, ...), i.e. a ready-made rotate-by-n
		 * permute mask. */
		mask = vec_lvsl( 0, (int *) sh.s[j++]);		//	mask for even row
		mask1 = vec_lvsl( 0, (int *) sh.s[j++]);	//	mask for odd row
		if (BC == 4) {
			mask = vec_sld( mask, mask1, 8 );		//	combined rotation mask for both rows
			/* indices mod 4 via AND since 4 is a power of two */
			mask = vec_and( mask, vec_splat_u8( 3 ) );
		} else if (BC == 6) {
			mask = vec_sld( mask, mask, 8 );
			mask = vec_sld( mask, mask1, 8 );		//	combined rotation mask for both rows
			/* indices mod 6 via conditional subtract (no cheap AND form) */
			t = vec_sub( mask, vec_splat_u8( 6 ) );
			c = vec_cmpgt( mask, vec_splat_u8( 5 ) );
			mask = vec_sel( mask, t, c );
		} else {
			mask = vec_sld( mask, mask1, 8 );		//	combined rotation mask for both rows
			mask = vec_and( mask, vec_splat_u8( 7 ) );
		}
		/* add 8 to the upper half of the mask so the odd row permutes
		 * from bytes 8-15 of the vector */
		mask1 = vec_sld( vec_splat_u8( 0 ), vec_splat_u8( 8 ), 8 );
		mask = vec_add( mask, mask1 );
		v[i] = vec_perm( v[i], v[i], mask );		//	rotate each row as required
	}
}
303
304void vSubstitution( vector unsigned char v[2], vector unsigned char box[16] )
305{
306	v[0] = rimskyKorsakov( v[0], box );		// first vector contains rows 0 and 1
307	v[1] = rimskyKorsakov( v[1], box );		// second vector contains rows 2 and 3
308}
309
/*
 * Rijndael MixColumn: per column j,
 *   a'[i][j] = mul(2,a[i][j]) ^ mul(3,a[i+1][j]) ^ a[i+2][j] ^ a[i+3][j]
 * (row indices mod 4).  Row rotation is done with vec_sld across the two
 * state vectors; the GF(2^8) products come from vmul.
 */
void vMixColumn(vector unsigned char v[2])
{
	//	vector 0 contains row 0 in bytes 0-7 and row 1 in bytes 8-f
	//	vector 1 contains row 2 in bytes 0-7 and row 3 in bytes 8-f

	register vector unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
	register vector unsigned char two, three;

	two = vec_splat_u8( 2 );
	three = vec_splat_u8( 3 );

	/* aK/bK hold the state with rows rotated down by K:
	 * aK = rows (K, K+1), bK = rows (K+2, K+3), indices mod 4 */
	a1 = vec_sld( v[0], v[1], 8 );		// equivalent to a[i+1] % 4
	b1 = vec_sld( v[1], v[0], 8 );
	a2 = vec_sld( a1, b1, 8 );		// equivalent to a[i+2] % 4
	b2 = vec_sld( b1, a1, 8 );
	a3 = vec_sld( a2, b2, 8 );		// equivalent to a[i+3] % 4
	b3 = vec_sld( b2, a2, 8 );

	//	Calculations for rows 0 and 1
	a0 = vmul( two, v[0] );				// mul(2,a[i][j])
	a0 = vec_xor( a0, vmul( three, a1 ) );		// ^ mul(3,a[(i + 1) % 4][j])
	a0 = vec_xor( a0, a2 );				// ^ a[(i + 2) % 4][j]
	v[0]  = vec_xor( a0, a3 );			// ^ a[(i + 3) % 4][j]

	//	Calculations for rows 2 and 3
	b0 = vmul( two, v[1] );
	b0 = vec_xor( b0, vmul( three, b1 ) );
	b0 = vec_xor( b0, b2 );
	v[1] = vec_xor( b0, b3 );
}
340
341void vInvMixColumn(vector unsigned char v[2])
342{
343	//	vector 0 contains row 0 in bytes 0-7 and row 1 in bytes 8-f
344	//	vector 1 contains row 2 in bytes 0-7 and row 3 in bytes 8-f
345
346	register vector unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
347	register vector unsigned char nine, eleven, thirteen, fourteen;;
348
349	nine = vec_splat_u8( 0x9 );
350	eleven = vec_splat_u8( 0xb );
351	thirteen = vec_splat_u8( 0xd );
352	fourteen = vec_splat_u8( 0xe );
353
354	a1 = vec_sld( v[0], v[1], 8 );			// equivalent to a[i+1] % 4
355	b1 = vec_sld( v[1], v[0], 8 );
356	a2 = vec_sld( a1, b1, 8 );			// equivalent to a[i+2] % 4
357	b2 = vec_sld( b1, a1, 8 );
358	a3 = vec_sld( a2, b2, 8 );			// equivalent to a[i+3] % 4
359	b3 = vec_sld( b2, a2, 8 );
360
361	//	Calculations for rows 0 and 1
362	a0 = vmul( fourteen, v[0] );				// mul(0xe,a[i][j])
363	a0 = vec_xor( a0, vmul( eleven, a1 ) );		// ^ mul(0xb,a[(i + 1) % 4][j])
364	a0 = vec_xor( a0, vmul( thirteen, a2 ) );	// ^ mul(0xd,a[(i + 2) % 4][j])
365	v[0]  = vec_xor( a0, vmul( nine, a3 ) );	// ^ mul(0x9,a[(i + 3) % 4][j])
366
367	//	Calculations for rows 2 and 3
368	b0 = vmul( fourteen, v[1] );
369	b0 = vec_xor( b0, vmul( eleven, b1 ) );
370	b0 = vec_xor( b0, vmul( thirteen, b2 ) );
371	v[1]  = vec_xor( b0, vmul( nine, b3 ) );
372}
373
/*
 * Encrypt one block in place.
 *
 * a  : state (rows 0-1 in a[0], rows 2-3 in a[1]); replaced by ciphertext
 * rk : round keys from vRijndaelKeySched, same two-vector layout
 *
 * Returns 0 on success, -2 for a bad block size (-3 is unreachable).
 */
int vRijndaelEncrypt (vector unsigned char a[2], int keyBits, int blockBits, vector unsigned char rk[MAXROUNDS+1][2])
{
	/* Encryption of one block.
	 */
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* initial key whitening, then ROUNDS-1 full rounds, then the final
	 * round without MixColumn */
        vKeyAddition( a, rk[0] );
        for(r = 1; r < ROUNDS; r++) {
                vSubstitution( a, (vector unsigned char *)S);
                vShiftRow( a, 0, BC);
                vMixColumn( a );
                vKeyAddition( a, rk[r] );
        }
        vSubstitution( a, (vector unsigned char *)S);
        vShiftRow( a, 0, BC);
        vKeyAddition( a, rk[ROUNDS] );

	return 0;
}
407
/*
 * Decrypt one block in place (inverse of vRijndaelEncrypt).
 *
 * a  : state (rows 0-1 in a[0], rows 2-3 in a[1]); replaced by plaintext
 * rk : round keys from vRijndaelKeySched, applied in reverse order
 *
 * Returns 0 on success, -2 for a bad block size (-3 is unreachable).
 */
int vRijndaelDecrypt (vector unsigned char a[2], int keyBits, int blockBits, vector unsigned char rk[MAXROUNDS+1][2])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* undo the final round first (Si = inverse S-box, d=1 selects the
	 * inverse row shifts), then ROUNDS-1 inverse rounds, then strip the
	 * initial whitening key */
        vKeyAddition( a, rk[ROUNDS] );
        vSubstitution( a, (vector unsigned char *)Si);
        vShiftRow( a, 1, BC);
        for(r = ROUNDS-1; r > 0; r--) {
                vKeyAddition( a, rk[r] );
                vInvMixColumn( a );
                vSubstitution( a, (vector unsigned char *)Si);
                vShiftRow( a, 1, BC);
        }
        vKeyAddition( a, rk[0] );

	return 0;
}
439
440#if 0
441/* Murley's code, to be deleted */
/*
 * DEAD CODE: this function sits inside "#if 0 / Murley's code, to be
 * deleted" and is never compiled.  Kept as-is; documented only for the
 * record.  ECB/CBC block encrypt with unaligned load/store handling.
 * NOTE(review): inputLen is accepted but never used.
 */
void vBlockEncrypt(cipherInstance *cipher, keyInstance *key, BYTE *input, int inputLen, BYTE *outBuffer)
{
        register vector unsigned char v1, v2, v3, v4, mask;
        register vector bool char cmp;

        /* load and align input */
        v1 = vec_ld( 0, (vector unsigned char *) input );
        v2 = vec_ld( 16, (vector unsigned char *) input );
        if ( (long) input & 0x0fL )
        {	// this is required if input is not on a 16-byte boundary
                v3 = vec_ld( 32, (vector unsigned char *) input );
                mask = vec_lvsl( 0, input );
                v1 = vec_perm( v1, v2, mask );
                v2 = vec_perm( v2, v3, mask );
        }

        /* parse input stream into rectangular array */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );

        /* store into cipher structure; CBC xors in the previous ciphertext */
        if (cipher->mode == MODE_CBC) {
                v3 = vec_xor( v3, *((vector unsigned char *) cipher->chainBlock ) );
                v4 = vec_xor( v4, *((vector unsigned char *) cipher->chainBlock + 1 ) );
        }
        vec_st( v3, 0, (vector unsigned char *) cipher->chainBlock );
        vec_st( v4, 16, (vector unsigned char *) cipher->chainBlock );

        vRijndaelEncrypt((vector unsigned char *) cipher->chainBlock, key->keyLen, cipher->blockLen, (vector unsigned char *) key->keySched);

        v1 = vec_ld( 0, (vector unsigned char *) cipher->chainBlock );
        v2 = vec_ld( 16, (vector unsigned char *) cipher->chainBlock );

        /* parse rectangular array into output ciphertext bytes */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 10, 18, 26,  3, 11, 19, 27 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31 ) );

        if ( (long) outBuffer & 0x0fL )
        {
                /* store output data into a non-aligned buffer
                 * (read-modify-write of the partial head and tail vectors) */
                mask = vec_lvsr( 0, outBuffer );
                cmp = vec_cmpgt( mask, vec_splat_u8( 0x0f ) );
                v1 = vec_perm( v3, v3, mask );
                v2 = vec_perm( v4, v4, mask );
                v3 = vec_ld( 0, (vector unsigned char *) outBuffer );
                v4 = vec_sel( v3, v1, cmp );
                vec_st( v4, 0, (vector unsigned char *) outBuffer );
                v1 = vec_sel( v1, v2, cmp );
                vec_st( v1, 16, (vector unsigned char *) outBuffer );
                v3 = vec_ld( 32, (vector unsigned char *) outBuffer );
                v2 = vec_sel( v2, v3, cmp );
                vec_st( v2, 32, (vector unsigned char *) outBuffer );
        } else {
                // store output data into an aligned buffer
                vec_st( v3, 0, (vector unsigned char *) outBuffer );
                vec_st( v4, 16, (vector unsigned char *) outBuffer );
        }
        return;
}
501
/*
 * DEAD CODE: inside the same "#if 0" region as vBlockEncrypt; never
 * compiled.  ECB/CBC block decrypt with unaligned load/store handling.
 * NOTE(review): inputLen is accepted but never used.
 */
void vBlockDecrypt(cipherInstance *cipher, keyInstance *key, BYTE *input, int inputLen, BYTE *outBuffer)
{
        // for vector machines
        register vector unsigned char v1, v2, v3, v4, mask;
        register vector bool char cmp;
        vector unsigned  char	block[2], cblock[2];

        /* load and align input */
        v1 = vec_ld( 0, (vector unsigned char *) input );
        v2 = vec_ld( 16, (vector unsigned char *) input );
        if ( (long) input & 0x0fL )
        {	// this is required if input is not on a 16-byte boundary
                v3 = vec_ld( 32, (vector unsigned char *) input );
                mask = vec_lvsl( 0, input );
                v1 = vec_perm( v1, v2, mask );
                v2 = vec_perm( v2, v3, mask );
        }

        /* parse input stream into rectangular array */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );
        block[0] = v3;
        block[1] = v4;

        /* save a copy of incoming ciphertext for later chain */
        if (cipher->mode == MODE_CBC) {
                cblock[0] = v3;
                cblock[1] = v4;
        }

        vRijndaelDecrypt ((vector unsigned char *) block, key->keyLen, cipher->blockLen, (vector unsigned char *) key->keySched);

        v1 = block[0];
        v2 = block[1];

        /* exor with last ciphertext */
        if (cipher->mode == MODE_CBC) {
                v1 = vec_xor( v1, *((vector unsigned char *) cipher->chainBlock) );
                v2 = vec_xor( v2, *((vector unsigned char *) cipher->chainBlock + 1) );
                vec_st( cblock[0], 0, (vector unsigned char *) cipher->chainBlock );
                vec_st( cblock[1], 16, (vector unsigned char *) cipher->chainBlock );
        }

        /* parse rectangular array into output ciphertext bytes */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 10, 18, 26,  3, 11, 19, 27 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31 ) );

        if ( (long) outBuffer & 0x0fL )
        {	/* store output data into a non-aligned buffer */
                mask = vec_lvsr( 0, outBuffer );
                cmp = vec_cmpgt( mask, vec_splat_u8( 0x0f ) );
                v1 = vec_perm( v3, v3, mask );
                v2 = vec_perm( v4, v4, mask );
                v3 = vec_ld( 0, (vector unsigned char *) outBuffer );
                v4 = vec_sel( v3, v1, cmp );
                vec_st( v4, 0, (vector unsigned char *) outBuffer );
                v1 = vec_sel( v1, v2, cmp );
                vec_st( v1, 16, (vector unsigned char *) outBuffer );
                v3 = vec_ld( 32, (vector unsigned char *) outBuffer );
                v2 = vec_sel( v2, v3, cmp );
                vec_st( v2, 32, (vector unsigned char *) outBuffer );
        } else {
                // store output data into an aligned buffer
                vec_st( v3, 0, (vector unsigned char *) outBuffer );
                vec_st( v4, 16, (vector unsigned char *) outBuffer );
        }
}
569#endif	/* Murley's code, to be deleted */
570
571/*
572 * dmitch addenda 4/11/2001: 128-bit only encrypt/decrypt with no CBC
573 */
574void vBlockEncrypt128(
575	keyInstance *key,
576	BYTE *input,
577	BYTE *outBuffer)
578{
579	vector unsigned char block[2];
580	register vector unsigned char v1, v2;
581
582	if ( (long) input & 0x0fL ) {
583		BYTE	localBuf[16];
584		vdprintf(("vBlockEncrypt128: unaligned input\n"));
585		/* manually re-align - the compiler is supposed to 16-byte align this for us */
586		if((unsigned)localBuf & 0xf) {
587			vdprintf(("vBlockEncrypt128: unaligned localBuf!\n"));
588		}
589		memmove(localBuf, input, 16);
590		v1 = vec_ld(0, (vector unsigned char *)localBuf);
591	}
592	else {
593		vdprintf(("vBlockEncrypt128: aligned input\n"));
594		v1 = vec_ld( 0, (vector unsigned char *) input );
595	}
596
597	/* parse input stream into rectangular array */
598	/* FIXME - do we need to zero v2 (or something)? */
599	block[0] = vec_perm(v1, v2,
600		(vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,
601		5,  9, 13, 17, 21, 25, 29 ) );
602	block[1] = vec_perm( v1, v2,
603		(vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,
604		7, 11, 15, 19, 23, 27, 31 ) );
605
606	vRijndaelEncrypt(block, key->keyLen, 128, (vector unsigned char *) key->keySched);
607
608	/* parse rectangular array into output ciphertext bytes */
609	v1 = vec_perm(block[0], block[1],
610		(vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2,
611		10, 18, 26,  3, 11, 19, 27 ) );
612	v2 = vec_perm(block[0], block[1],
613		(vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6,
614		14, 22, 30,  7, 15, 23, 31 ) );
615
616	if ( (long) outBuffer & 0x0fL )
617	{
618		/* store output data into a non-aligned buffer */
619		BYTE	localBuf[16];
620		vec_st(v1, 0, (vector unsigned char *) localBuf );
621		memmove(outBuffer, localBuf, 16);
622	} else {
623		/* store output data into an aligned buffer */
624		vec_st( v1, 0, (vector unsigned char *) outBuffer );
625	}
626	return;
627}
628
629void vBlockDecrypt128(
630	keyInstance *key,
631	BYTE *input,
632	BYTE *outBuffer)
633{
634	vector unsigned char block[2];
635	register vector unsigned char v1, v2;
636
637	if ( (long) input & 0x0fL ) {
638		/* manually re-align - the compiler is supposed to 16-byte align this for us */
639		BYTE	localBuf[16];
640		vdprintf(("vBlockDecrypt128: unaligned input\n"));
641		if((unsigned)localBuf & 0xf) {
642			vdprintf(("vBlockDecrypt128: unaligned localBuf!\n"));
643		}
644		memmove(localBuf, input, 16);
645		v1 = vec_ld(0, (vector unsigned char *)localBuf);
646	}
647	else {
648		vdprintf(("vBlockDecrypt128: aligned input\n"));
649		v1 = vec_ld( 0, (vector unsigned char *) input );
650	}
651
652	/* parse input stream into rectangular array */
653	/* FIXME - do we need to zero v2 (or something)? */
654	block[0] = vec_perm(v1, v2,
655		(vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,
656		5,  9, 13, 17, 21, 25, 29 ) );
657	block[1] = vec_perm( v1, v2,
658		(vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,
659		7, 11, 15, 19, 23, 27, 31 ) );
660
661	vRijndaelDecrypt(block, key->keyLen, 128, (vector unsigned char *) key->keySched);
662
663	/* parse rectangular array into output ciphertext bytes */
664	v1 = vec_perm(block[0], block[1],
665		(vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2,
666		10, 18, 26,  3, 11, 19, 27 ) );
667	v2 = vec_perm(block[0], block[1],
668		(vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6,
669		14, 22, 30,  7, 15, 23, 31 ) );
670
671	if ( (long) outBuffer & 0x0fL ) {
672		/* store output data into a non-aligned buffer */
673		BYTE	localBuf[16];
674		vec_st(v1, 0, (vector unsigned char *) localBuf );
675		memmove(outBuffer, localBuf, 16);
676	} else {
677		/* store output data into an aligned buffer */
678		vec_st( v1, 0, (vector unsigned char *) outBuffer );
679	}
680	return;
681}
682
683#endif	/* defined(__ppc__) && defined(ALTIVEC_ENABLE) */
684