1/*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
3 *
4 * The contents of this file constitute Original Code as defined in and are
5 * subject to the Apple Public Source License Version 1.2 (the 'License').
6 * You may not use this file except in compliance with the License. Please obtain
7 * a copy of the License at http://www.apple.com/publicsource and read it before
8 * using this file.
9 *
10 * This Original Code and all software distributed under the License are
11 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
12 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
13 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
14 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
15 * specific language governing rights and limitations under the License.
16 */
17
18
19/*
20 *  vRijndael-alg-ref.c
21 *
22 *  Created by Robert A. Murley on Mon Jan 22 2001.
23 *  Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
24 *
25 */
26
27#include "rijndaelApi.h"
28#include "rijndael-alg-ref.h"
29#include "boxes-ref.h"
30#include <string.h>
31
32/* debugger seems to have trouble with this code... */
33#define VAES_DEBUG	1
34#if		VAES_DEBUG
35#include <stdio.h>
36#define vdprintf(s)		printf s
37#else
38#define vdprintf(s)
39#endif
40
41#define SC	((BC - 4) >> 1)
42
43#if defined(__ppc__) && defined(ALTIVEC_ENABLE)
44
/* 32 bytes of state/key material, addressable three ways: as a 4x8 byte
 * matrix s[row][col], as eight longs, or as two AltiVec vectors with
 * v[0] = rows 0-1 and v[1] = rows 2-3 (8 bytes per row). */
typedef union {
	unsigned char		s[4][8];
	unsigned long		l[8];
	vector unsigned char 	v[2];
} doubleVec;
50
/* Four 32-bit scalars overlaid on one AltiVec vector; used by vShiftRow
 * to stage the per-row rotation counts. */
typedef union {
	unsigned long		s[4];
	vector unsigned long	v;
} vecLong;
55
/*
 * ShiftRow displacement table, indexed as shifts[SC][row][d]:
 *   SC  = (BC-4)>>1 selects the block width (0: BC=4, 1: BC=6, 2: BC=8)
 *   row = state row 0-3 (row 0 is never rotated)
 *   d   = 0 for the encrypt-direction shift, 1 for its decrypt complement
 */
static word8 shifts[3][4][2] = {
 { { 0, 0 },
   { 1, 3 },
   { 2, 2 },
   { 3, 1 }
 },
 { { 0, 0 },
   { 1, 5 },
   { 2, 4 },
   { 3, 3 }
 },
 { { 0, 0 },
   { 1, 7 },
   { 3, 5 },
   { 4, 4 }
 }
};
73
/*
 * Expand a user key into the round-key array W.
 *
 * vk:        key as a 4xKC byte matrix; rows 0-1 in vk[0], rows 2-3 in vk[1]
 * keyBits:   key size in bits (128/192/256) -> KC = 4/6/8 columns
 * blockBits: block size in bits (128/192/256) -> BC = 4/6/8 columns
 * W:         receives (ROUNDS+1) round keys of 4xBC bytes each
 * Returns 0 on success, -1 for bad keyBits, -2 for bad blockBits.
 */
int vRijndaelKeySched ( vector unsigned char vk[2], int keyBits, int blockBits,
                unsigned char W[MAXROUNDS+1][4][MAXBC])
{
	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	doubleVec tk;	/* scratch key copy, addressable per byte or per vector */
	register  vector unsigned char v1, v2, mask;

	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	/* round count is driven by the larger of key and block size */
	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	tk.v[0] = vk[0];
	tk.v[1] = vk[1];

	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk.s[i][j];

	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
		/* calculate new values: column 0 absorbs the S-box of the last
		 * column (rotated down one row) plus the round constant */
		for(i = 0; i < 4; i++)
			tk.s[i][0] ^= *((word8 *)S + tk.s[(i+1)%4][KC-1]);
		tk.s[0][0] ^= rcon[rconpointer++];

		if (KC != 8) {
			/* xor bytes 1-7 of each row with previous byte.
			 * NOTE(review): vec_sld(x,x,15) gives every lane the OLD
			 * previous column, so this single simultaneous XOR folds in
			 * only one predecessor; the scalar reference XORs columns
			 * sequentially (each with the already-updated predecessor,
			 * i.e. a running prefix XOR).  Suspected divergence from
			 * rijndael-alg-ref.c -- verify against the scalar code. */
			mask = (vector unsigned char) ( 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff );
			for ( i = 0; i < 2; i++ ) {
				v1 = vec_sld( tk.v[i], tk.v[i], 15 );
				v2 = vec_and( v1, mask );
				tk.v[i] = vec_xor( tk.v[i], v2 );
			}
		}
		else {
			/* xor bytes 1-3 of each row with previous byte */
			mask = (vector unsigned char) ( 0, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0, 0, 0, 0 );
			for ( i = 0; i < 2; i++ ) {
				v1 = vec_sld( tk.v[i], tk.v[i], 15 );
				v2 = vec_and( v1, mask );
				tk.v[i] = vec_xor( tk.v[i], v2 );
				/* NOTE(review): the statement below XORs an unchanging
				 * S-box value into the same byte four times, which cancels
				 * to a no-op; the scalar reference applies the S-box once
				 * to column KC/2 of each of the four state rows (and i
				 * here indexes vectors 0-1, not rows 0-3).  Suspected bug
				 * in the 256-bit key schedule -- confirm against
				 * rijndael-alg-ref.c before relying on this path. */
				for(j = 0; j < 4; j++) tk.s[i][KC/2] ^= *((word8 *)S + tk.s[i][KC/2 - 1]);
				/* xor bytes 5-7 of each row with previous byte */
				mask = vec_sld( mask, mask, 4 );
				v2 = vec_and( v1, mask );
				tk.v[i] = vec_xor( tk.v[i], v2 );
				mask = vec_sld( mask, mask, 4 );
			}
		}
		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk.s[i][j];
	}
	return 0;
}
150
151
152void vMakeKey(BYTE *keyMaterial, keyInstance *key)
153{
154        register vector unsigned char v1, v2, v3, mask;
155        vector unsigned char	  vk[2];
156
157        /* load and align input */
158        v1 = vec_ld( 0, (vector unsigned char *) keyMaterial );
159        v2 = vec_ld( 16, (vector unsigned char *) keyMaterial );
160        if ( (long) keyMaterial & 0x0fL )
161        {	// this is required if keyMaterial is not on a 16-byte boundary
162                v3 = vec_ld( 32, (vector unsigned char *) keyMaterial );
163                mask = vec_lvsl( 0, keyMaterial );
164                v1 = vec_perm( v1, v2, mask );
165                v2 = vec_perm( v2, v3, mask );
166        }
167
168        /* parse input stream into rectangular array */
169        vk[0] = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
170        vk[1] = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );
171        vRijndaelKeySched (vk, key->keyLen, key->blockLen, key->keySched);
172        memset( (char *) vk, 0, 4 * MAXKC);
173}
174
175
/*	This routine does 16 simultaneous lookups in a 256-byte table.
 *
 *	v:      16 index bytes (one lookup per lane)
 *	table:  the 256-byte table viewed as 16 AltiVec vectors
 *	returns table[v[i]] in lane i, for all 16 lanes.
 *
 *	vec_perm can only select among 32 source bytes (it uses the low 5
 *	bits of each index), so the table is walked as 8 consecutive
 *	(even, odd) vector pairs, giving 8 candidate results.  The top
 *	three index bits (bits 6-8, numbering the LSB as bit 1) are then
 *	broadcast into 0x00/0xFF select masks and a binary tree of vec_sel
 *	merges picks the correct candidate per lane.
 */
vector unsigned char rimskyKorsakov ( vector unsigned char v, vector unsigned char * table )
{
	register vector unsigned char	upperBits000, upperBits001, upperBits010, upperBits011,
                                        upperBits100, upperBits101, upperBits110, upperBits111,
                                        lookupBit00,  lookupBit01, lookupBit10, lookupBit11,
                                        lookupBit0, lookupBit1, lookup,
                                        maskForBit6, maskForBit7, maskForBit8, seven;
	register vector unsigned char	*tabeven, *tabodd;

	seven = vec_splat_u8 ( 7 );
	tabeven = table++;	/* table vectors 0,2,4,... */
	tabodd = table;		/* table vectors 1,3,5,... */

//	Each variable contains the correct values for the corresponding bits 6, 7 and 8.
	upperBits000 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits001 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits010 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits011 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits100 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits101 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits110 = vec_perm ( *tabeven, *tabodd, v );
	tabeven += 2; tabodd += 2;
	upperBits111 = vec_perm ( *tabeven, *tabodd, v );

//	Here we extract all the correct values for bit 6.
//	(v << 2) >> 7 arithmetic broadcasts index bit 6 (value 0x20) to 0x00/0xFF.
	maskForBit6  = vec_sl  ( v, vec_splat_u8 ( 2 ) );
	maskForBit6  = vec_sra ( maskForBit6, seven );
	lookupBit00 = vec_sel ( upperBits000, upperBits001, maskForBit6 );
	lookupBit01 = vec_sel ( upperBits010, upperBits011, maskForBit6 );
	lookupBit10 = vec_sel ( upperBits100, upperBits101, maskForBit6 );
	lookupBit11 = vec_sel ( upperBits110, upperBits111, maskForBit6 );

//	Then we get the correct values for bit 7 (value 0x40).
	maskForBit7  = vec_sl  ( v, vec_splat_u8 ( 1 ) );
	maskForBit7  = vec_sra ( maskForBit7, seven );
	lookupBit0 = vec_sel ( lookupBit00, lookupBit01, maskForBit7 );
	lookupBit1 = vec_sel ( lookupBit10, lookupBit11, maskForBit7 );

//	Finally, the entire correct result vector (bit 8 is the sign bit).
	maskForBit8 = vec_sra ( v, seven );

	lookup = vec_sel ( lookupBit0, lookupBit1, maskForBit8 );

    return lookup;
}
228
/*
 * Lane-wise GF(2^8) multiplication of two byte vectors, computed as
 * Alogtable[(Logtable[a] + Logtable[b]) % 255], with the result forced
 * to zero in any lane where a or b is zero.
 */
vector unsigned char vmul(vector unsigned char a, vector unsigned char b)
{
	register vector unsigned char x, y, zero;
	register vector unsigned short xh, yh, zhi, zlo, two54, two55;

	zero = vec_splat_u8( 0 );
	/* build 16-bit lane constants 255 (two55) and 254 (two54):
	 * merging zero bytes with 0xFF bytes yields 0x00FF shorts */
	two55 = vec_splat_u16( -1 );
	two55 = (vector unsigned short) vec_mergeh( zero, (vector unsigned char) two55 );
	two54 = vec_sub( two55, vec_splat_u16( 1 ) );

	x = rimskyKorsakov( a, (vector unsigned char *)Logtable );	// Logtable[a]
	y = rimskyKorsakov( b, (vector unsigned char *)Logtable );	// Logtable[b]

	//	Convert upper 8 bytes to shorts for addition and modulo
	xh = (vector unsigned short) vec_mergeh( zero, x );
	yh = (vector unsigned short) vec_mergeh( zero, y );
	xh = vec_add( xh, yh );			// xh = Logtable[a] + Logtable[b]
	yh = vec_sub( xh, two55 );
	zhi = vec_sel( xh, yh, vec_cmpgt( xh, two54 ) );	// xh%255 (sum <= 508, so one conditional subtract suffices)

	//	Convert lower 8 bytes to shorts for addition and modulo
	xh = (vector unsigned short) vec_mergel( zero, x );
	yh = (vector unsigned short) vec_mergel( zero, y );
	xh = vec_add( xh, yh );
	yh = vec_sub( xh, two55 );
	zlo = vec_sel( xh, yh, vec_cmpgt( xh, two54 ) );

	x = vec_pack( zhi, zlo );			// recombine into single byte vector
	x = rimskyKorsakov( x, (vector unsigned char *)Alogtable );		// Alogtable[x]
	x = vec_sel( x, zero, vec_cmpeq( a, zero ) );	// force 0 where a = 0
	x = vec_sel( x, zero, vec_cmpeq( b, zero ) );	// force 0 where b = 0
	return x;
}
262
263void vKeyAddition(vector unsigned char v[2], vector unsigned char rk[2])
264{
265	v[0] = vec_xor( v[0], rk[0] );		// first vector contains rows 0 and 1
266	v[1] = vec_xor( v[1], rk[1] );		// second vector contains rows 2 and 3
267}
268
269
/*
 * Cyclically rotate state rows 1-3 left by their row-dependent amounts.
 * v:  state; v[0] holds rows 0-1, v[1] rows 2-3 (8 bytes per row)
 * d:  0 selects the encrypt-direction shifts, 1 the decrypt shifts
 * BC: number of active columns per row (4, 6 or 8)
 */
void vShiftRow(vector unsigned char v[2], word8 d, word8 BC)
{
	vecLong			sh;
	register vector unsigned char mask, mask1, t;
	register vector bool char c;
	register int	i, j;

	sh.s[0] = 0;	/* row 0 never shifts */
	for (i = 1; i < 4; i++)
		sh.s[i] = shifts[SC][i][d] % BC;	//	contains the number of elements to shift each row

	// each vector contains two BC-byte long rows
	j = 0;
	for ( i = 0; i < 2; i++ ) {
		/* vec_lvsl only inspects the low four bits of the "address", so
		 * casting the shift count s to a pointer yields the permute
		 * pattern ( s, s+1, ..., s+15 ), i.e. rotate-left-by-s bytes */
		mask = vec_lvsl( 0, (int *) sh.s[j++]);		//	mask for even row
		mask1 = vec_lvsl( 0, (int *) sh.s[j++]);	//	mask for odd row
		if (BC == 4) {
			mask = vec_sld( mask, mask1, 8 );		//	combined rotation mask for both rows
			mask = vec_and( mask, vec_splat_u8( 3 ) );	// wrap indices mod 4
		} else if (BC == 6) {
			mask = vec_sld( mask, mask, 8 );
			mask = vec_sld( mask, mask1, 8 );		//	combined rotation mask for both rows
			/* wrap indices mod 6: subtract 6 wherever the index > 5 */
			t = vec_sub( mask, vec_splat_u8( 6 ) );
			c = vec_cmpgt( mask, vec_splat_u8( 5 ) );
			mask = vec_sel( mask, t, c );
		} else {
			mask = vec_sld( mask, mask1, 8 );		//	combined rotation mask for both rows
			mask = vec_and( mask, vec_splat_u8( 7 ) );	// wrap indices mod 8
		}
		/* add 8 to the high half so the odd row indexes bytes 8-15 */
		mask1 = vec_sld( vec_splat_u8( 0 ), vec_splat_u8( 8 ), 8 );
		mask = vec_add( mask, mask1 );
		v[i] = vec_perm( v[i], v[i], mask );		//	rotate each row as required
	}
}
304
305void vSubstitution( vector unsigned char v[2], vector unsigned char box[16] )
306{
307	v[0] = rimskyKorsakov( v[0], box );		// first vector contains rows 0 and 1
308	v[1] = rimskyKorsakov( v[1], box );		// second vector contains rows 2 and 3
309}
310
311void vMixColumn(vector unsigned char v[2])
312{
313	//	vector 0 contains row 0 in bytes 0-7 and row 1 in bytes 8-f
314	//	vector 1 contains row 2 in bytes 0-7 and row 3 in bytes 8-f
315
316	register vector unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
317	register vector unsigned char two, three;
318
319	two = vec_splat_u8( 2 );
320	three = vec_splat_u8( 3 );
321
322	a1 = vec_sld( v[0], v[1], 8 );		// equivalent to a[i+1] % 4
323	b1 = vec_sld( v[1], v[0], 8 );
324	a2 = vec_sld( a1, b1, 8 );		// equivalent to a[i+2] % 4
325	b2 = vec_sld( b1, a1, 8 );
326	a3 = vec_sld( a2, b2, 8 );		// equivalent to a[i+3] % 4
327	b3 = vec_sld( b2, a2, 8 );
328
329	//	Calculations for rows 0 and 1
330	a0 = vmul( two, v[0] );				// mul(2,a[i][j])
331	a0 = vec_xor( a0, vmul( three, a1 ) );		// ^ mul(3,a[(i + 1) % 4][j])
332	a0 = vec_xor( a0, a2 );				// ^ a[(i + 2) % 4][j]
333	v[0]  = vec_xor( a0, a3 );			// ^ a[(i + 3) % 4][j]
334
335	//	Calculations for rows 2 and 3
336	b0 = vmul( two, v[1] );
337	b0 = vec_xor( b0, vmul( three, b1 ) );
338	b0 = vec_xor( b0, b2 );
339	v[1] = vec_xor( b0, b3 );
340}
341
342void vInvMixColumn(vector unsigned char v[2])
343{
344	//	vector 0 contains row 0 in bytes 0-7 and row 1 in bytes 8-f
345	//	vector 1 contains row 2 in bytes 0-7 and row 3 in bytes 8-f
346
347	register vector unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
348	register vector unsigned char nine, eleven, thirteen, fourteen;;
349
350	nine = vec_splat_u8( 0x9 );
351	eleven = vec_splat_u8( 0xb );
352	thirteen = vec_splat_u8( 0xd );
353	fourteen = vec_splat_u8( 0xe );
354
355	a1 = vec_sld( v[0], v[1], 8 );			// equivalent to a[i+1] % 4
356	b1 = vec_sld( v[1], v[0], 8 );
357	a2 = vec_sld( a1, b1, 8 );			// equivalent to a[i+2] % 4
358	b2 = vec_sld( b1, a1, 8 );
359	a3 = vec_sld( a2, b2, 8 );			// equivalent to a[i+3] % 4
360	b3 = vec_sld( b2, a2, 8 );
361
362	//	Calculations for rows 0 and 1
363	a0 = vmul( fourteen, v[0] );				// mul(0xe,a[i][j])
364	a0 = vec_xor( a0, vmul( eleven, a1 ) );		// ^ mul(0xb,a[(i + 1) % 4][j])
365	a0 = vec_xor( a0, vmul( thirteen, a2 ) );	// ^ mul(0xd,a[(i + 2) % 4][j])
366	v[0]  = vec_xor( a0, vmul( nine, a3 ) );	// ^ mul(0x9,a[(i + 3) % 4][j])
367
368	//	Calculations for rows 2 and 3
369	b0 = vmul( fourteen, v[1] );
370	b0 = vec_xor( b0, vmul( eleven, b1 ) );
371	b0 = vec_xor( b0, vmul( thirteen, b2 ) );
372	v[1]  = vec_xor( b0, vmul( nine, b3 ) );
373}
374
375int vRijndaelEncrypt (vector unsigned char a[2], int keyBits, int blockBits, vector unsigned char rk[MAXROUNDS+1][2])
376{
377	/* Encryption of one block.
378	 */
379	int r, BC, ROUNDS;
380
381	switch (blockBits) {
382	case 128: BC = 4; break;
383	case 192: BC = 6; break;
384	case 256: BC = 8; break;
385	default : return (-2);
386	}
387
388	switch (keyBits >= blockBits ? keyBits : blockBits) {
389	case 128: ROUNDS = 10; break;
390	case 192: ROUNDS = 12; break;
391	case 256: ROUNDS = 14; break;
392	default : return (-3); /* this cannot happen */
393	}
394
395        vKeyAddition( a, rk[0] );
396        for(r = 1; r < ROUNDS; r++) {
397                vSubstitution( a, (vector unsigned char *)S);
398                vShiftRow( a, 0, BC);
399                vMixColumn( a );
400                vKeyAddition( a, rk[r] );
401        }
402        vSubstitution( a, (vector unsigned char *)S);
403        vShiftRow( a, 0, BC);
404        vKeyAddition( a, rk[ROUNDS] );
405
406	return 0;
407}
408
409int vRijndaelDecrypt (vector unsigned char a[2], int keyBits, int blockBits, vector unsigned char rk[MAXROUNDS+1][2])
410{
411	int r, BC, ROUNDS;
412
413	switch (blockBits) {
414	case 128: BC = 4; break;
415	case 192: BC = 6; break;
416	case 256: BC = 8; break;
417	default : return (-2);
418	}
419
420	switch (keyBits >= blockBits ? keyBits : blockBits) {
421	case 128: ROUNDS = 10; break;
422	case 192: ROUNDS = 12; break;
423	case 256: ROUNDS = 14; break;
424	default : return (-3); /* this cannot happen */
425	}
426
427        vKeyAddition( a, rk[ROUNDS] );
428        vSubstitution( a, (vector unsigned char *)Si);
429        vShiftRow( a, 1, BC);
430        for(r = ROUNDS-1; r > 0; r--) {
431                vKeyAddition( a, rk[r] );
432                vInvMixColumn( a );
433                vSubstitution( a, (vector unsigned char *)Si);
434                vShiftRow( a, 1, BC);
435        }
436        vKeyAddition( a, rk[0] );
437
438	return 0;
439}
440
441#if 0
442/* Murley's code, to be deleted */
/*
 * NOTE(review): this function sits inside the '#if 0' block above
 * ("Murley's code, to be deleted") and is never compiled; kept verbatim
 * for reference only.
 * Encrypts one block from `input` into `outBuffer`, XORing with
 * cipher->chainBlock first when cipher->mode == MODE_CBC, and leaves the
 * new chaining value in cipher->chainBlock.  `inputLen` is unused.
 */
void vBlockEncrypt(cipherInstance *cipher, keyInstance *key, BYTE *input, int inputLen, BYTE *outBuffer)
{
        register vector unsigned char v1, v2, v3, v4, mask;
        register vector bool char cmp;

        /* load and align input */
        v1 = vec_ld( 0, (vector unsigned char *) input );
        v2 = vec_ld( 16, (vector unsigned char *) input );
        if ( (long) input & 0x0fL )
        {	// this is required if input is not on a 16-byte boundary
                v3 = vec_ld( 32, (vector unsigned char *) input );
                mask = vec_lvsl( 0, input );
                v1 = vec_perm( v1, v2, mask );
                v2 = vec_perm( v2, v3, mask );
        }

        /* parse input stream into rectangular array */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );

        /* store into cipher structure */
        if (cipher->mode == MODE_CBC) {
                v3 = vec_xor( v3, *((vector unsigned char *) cipher->chainBlock ) );
                v4 = vec_xor( v4, *((vector unsigned char *) cipher->chainBlock + 1 ) );
        }
        vec_st( v3, 0, (vector unsigned char *) cipher->chainBlock );
        vec_st( v4, 16, (vector unsigned char *) cipher->chainBlock );

        vRijndaelEncrypt((vector unsigned char *) cipher->chainBlock, key->keyLen, cipher->blockLen, (vector unsigned char *) key->keySched);

        v1 = vec_ld( 0, (vector unsigned char *) cipher->chainBlock );
        v2 = vec_ld( 16, (vector unsigned char *) cipher->chainBlock );

        /* parse rectangular array into output ciphertext bytes */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 10, 18, 26,  3, 11, 19, 27 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31 ) );

        if ( (long) outBuffer & 0x0fL )
        {
                /* store output data into a non-aligned buffer */
                mask = vec_lvsr( 0, outBuffer );
                cmp = vec_cmpgt( mask, vec_splat_u8( 0x0f ) );
                v1 = vec_perm( v3, v3, mask );
                v2 = vec_perm( v4, v4, mask );
                v3 = vec_ld( 0, (vector unsigned char *) outBuffer );
                v4 = vec_sel( v3, v1, cmp );
                vec_st( v4, 0, (vector unsigned char *) outBuffer );
                v1 = vec_sel( v1, v2, cmp );
                vec_st( v1, 16, (vector unsigned char *) outBuffer );
                v3 = vec_ld( 32, (vector unsigned char *) outBuffer );
                v2 = vec_sel( v2, v3, cmp );
                vec_st( v2, 32, (vector unsigned char *) outBuffer );
        } else {
                // store output data into an aligned buffer
                vec_st( v3, 0, (vector unsigned char *) outBuffer );
                vec_st( v4, 16, (vector unsigned char *) outBuffer );
        }
        return;
}
502
/*
 * NOTE(review): this function sits inside the '#if 0' block above
 * ("Murley's code, to be deleted") and is never compiled; kept verbatim
 * for reference only.
 * Decrypts one block from `input` into `outBuffer`; in MODE_CBC it XORs
 * the plaintext with cipher->chainBlock and then stores the incoming
 * ciphertext as the new chain value.  `inputLen` is unused.
 */
void vBlockDecrypt(cipherInstance *cipher, keyInstance *key, BYTE *input, int inputLen, BYTE *outBuffer)
{
        // for vector machines
        register vector unsigned char v1, v2, v3, v4, mask;
        register vector bool char cmp;
        vector unsigned  char	block[2], cblock[2];

        /* load and align input */
        v1 = vec_ld( 0, (vector unsigned char *) input );
        v2 = vec_ld( 16, (vector unsigned char *) input );
        if ( (long) input & 0x0fL )
        {	// this is required if input is not on a 16-byte boundary
                v3 = vec_ld( 32, (vector unsigned char *) input );
                mask = vec_lvsl( 0, input );
                v1 = vec_perm( v1, v2, mask );
                v2 = vec_perm( v2, v3, mask );
        }

        /* parse input stream into rectangular array */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,  5,  9, 13, 17, 21, 25, 29 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,  7, 11, 15, 19, 23, 27, 31 ) );
        block[0] = v3;
        block[1] = v4;

        /* save a copy of incoming ciphertext for later chain */
        if (cipher->mode == MODE_CBC) {
                cblock[0] = v3;
                cblock[1] = v4;
        }

        vRijndaelDecrypt ((vector unsigned char *) block, key->keyLen, cipher->blockLen, (vector unsigned char *) key->keySched);

        v1 = block[0];
        v2 = block[1];

        /* exor with last ciphertext */
        if (cipher->mode == MODE_CBC) {
                v1 = vec_xor( v1, *((vector unsigned char *) cipher->chainBlock) );
                v2 = vec_xor( v2, *((vector unsigned char *) cipher->chainBlock + 1) );
                vec_st( cblock[0], 0, (vector unsigned char *) cipher->chainBlock );
                vec_st( cblock[1], 16, (vector unsigned char *) cipher->chainBlock );
        }

        /* parse rectangular array into output ciphertext bytes */
        v3 = vec_perm( v1, v2, (vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2, 10, 18, 26,  3, 11, 19, 27 ) );
        v4 = vec_perm( v1, v2, (vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31 ) );

        if ( (long) outBuffer & 0x0fL )
        {	/* store output data into a non-aligned buffer */
                mask = vec_lvsr( 0, outBuffer );
                cmp = vec_cmpgt( mask, vec_splat_u8( 0x0f ) );
                v1 = vec_perm( v3, v3, mask );
                v2 = vec_perm( v4, v4, mask );
                v3 = vec_ld( 0, (vector unsigned char *) outBuffer );
                v4 = vec_sel( v3, v1, cmp );
                vec_st( v4, 0, (vector unsigned char *) outBuffer );
                v1 = vec_sel( v1, v2, cmp );
                vec_st( v1, 16, (vector unsigned char *) outBuffer );
                v3 = vec_ld( 32, (vector unsigned char *) outBuffer );
                v2 = vec_sel( v2, v3, cmp );
                vec_st( v2, 32, (vector unsigned char *) outBuffer );
        } else {
                // store output data into an aligned buffer
                vec_st( v3, 0, (vector unsigned char *) outBuffer );
                vec_st( v4, 16, (vector unsigned char *) outBuffer );
        }
}
570#endif	/* Murley's code, to be deleted */
571
572/*
573 * dmitch addenda 4/11/2001: 128-bit only encrypt/decrypt with no CBC
574 */
575void vBlockEncrypt128(
576	keyInstance *key,
577	BYTE *input,
578	BYTE *outBuffer)
579{
580	vector unsigned char block[2];
581	register vector unsigned char v1, v2;
582
583	if ( (long) input & 0x0fL ) {
584		BYTE	localBuf[16];
585		vdprintf(("vBlockEncrypt128: unaligned input\n"));
586		/* manually re-align - the compiler is supposed to 16-byte align this for us */
587		if((unsigned)localBuf & 0xf) {
588			vdprintf(("vBlockEncrypt128: unaligned localBuf!\n"));
589		}
590		memmove(localBuf, input, 16);
591		v1 = vec_ld(0, (vector unsigned char *)localBuf);
592	}
593	else {
594		vdprintf(("vBlockEncrypt128: aligned input\n"));
595		v1 = vec_ld( 0, (vector unsigned char *) input );
596	}
597
598	/* parse input stream into rectangular array */
599	/* FIXME - do we need to zero v2 (or something)? */
600	block[0] = vec_perm(v1, v2,
601		(vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,
602		5,  9, 13, 17, 21, 25, 29 ) );
603	block[1] = vec_perm( v1, v2,
604		(vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,
605		7, 11, 15, 19, 23, 27, 31 ) );
606
607	vRijndaelEncrypt(block, key->keyLen, 128, (vector unsigned char *) key->keySched);
608
609	/* parse rectangular array into output ciphertext bytes */
610	v1 = vec_perm(block[0], block[1],
611		(vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2,
612		10, 18, 26,  3, 11, 19, 27 ) );
613	v2 = vec_perm(block[0], block[1],
614		(vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6,
615		14, 22, 30,  7, 15, 23, 31 ) );
616
617	if ( (long) outBuffer & 0x0fL )
618	{
619		/* store output data into a non-aligned buffer */
620		BYTE	localBuf[16];
621		vec_st(v1, 0, (vector unsigned char *) localBuf );
622		memmove(outBuffer, localBuf, 16);
623	} else {
624		/* store output data into an aligned buffer */
625		vec_st( v1, 0, (vector unsigned char *) outBuffer );
626	}
627	return;
628}
629
630void vBlockDecrypt128(
631	keyInstance *key,
632	BYTE *input,
633	BYTE *outBuffer)
634{
635	vector unsigned char block[2];
636	register vector unsigned char v1, v2;
637
638	if ( (long) input & 0x0fL ) {
639		/* manually re-align - the compiler is supposed to 16-byte align this for us */
640		BYTE	localBuf[16];
641		vdprintf(("vBlockDecrypt128: unaligned input\n"));
642		if((unsigned)localBuf & 0xf) {
643			vdprintf(("vBlockDecrypt128: unaligned localBuf!\n"));
644		}
645		memmove(localBuf, input, 16);
646		v1 = vec_ld(0, (vector unsigned char *)localBuf);
647	}
648	else {
649		vdprintf(("vBlockDecrypt128: aligned input\n"));
650		v1 = vec_ld( 0, (vector unsigned char *) input );
651	}
652
653	/* parse input stream into rectangular array */
654	/* FIXME - do we need to zero v2 (or something)? */
655	block[0] = vec_perm(v1, v2,
656		(vector unsigned char) ( 0,  4,  8, 12, 16, 20, 24, 28,  1,
657		5,  9, 13, 17, 21, 25, 29 ) );
658	block[1] = vec_perm( v1, v2,
659		(vector unsigned char) ( 2,  6, 10, 14, 18, 22, 26, 30,  3,
660		7, 11, 15, 19, 23, 27, 31 ) );
661
662	vRijndaelDecrypt(block, key->keyLen, 128, (vector unsigned char *) key->keySched);
663
664	/* parse rectangular array into output ciphertext bytes */
665	v1 = vec_perm(block[0], block[1],
666		(vector unsigned char) ( 0,  8, 16, 24,  1,  9, 17, 25,  2,
667		10, 18, 26,  3, 11, 19, 27 ) );
668	v2 = vec_perm(block[0], block[1],
669		(vector unsigned char) ( 4, 12, 20, 28,  5, 13, 21, 29,  6,
670		14, 22, 30,  7, 15, 23, 31 ) );
671
672	if ( (long) outBuffer & 0x0fL ) {
673		/* store output data into a non-aligned buffer */
674		BYTE	localBuf[16];
675		vec_st(v1, 0, (vector unsigned char *) localBuf );
676		memmove(outBuffer, localBuf, 16);
677	} else {
678		/* store output data into an aligned buffer */
679		vec_st( v1, 0, (vector unsigned char *) outBuffer );
680	}
681	return;
682}
683
684#endif	/* defined(__ppc__) && defined(ALTIVEC_ENABLE) */
685