/* e_padlock.c revision 279264 */
1/*
2 * Support for VIA PadLock Advanced Cryptography Engine (ACE)
3 * Written by Michal Ludvig <michal@logix.cz>
4 *            http://www.logix.cz/michal
5 *
6 * Big thanks to Andy Polyakov for a help with optimization,
7 * assembler fixes, port to MS Windows and a lot of other
8 * valuable work on this engine!
9 */
10
11/* ====================================================================
12 * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 *
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in
23 *    the documentation and/or other materials provided with the
24 *    distribution.
25 *
26 * 3. All advertising materials mentioning features or use of this
27 *    software must display the following acknowledgment:
28 *    "This product includes software developed by the OpenSSL Project
29 *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
30 *
31 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
32 *    endorse or promote products derived from this software without
33 *    prior written permission. For written permission, please contact
34 *    licensing@OpenSSL.org.
35 *
36 * 5. Products derived from this software may not be called "OpenSSL"
37 *    nor may "OpenSSL" appear in their names without prior written
38 *    permission of the OpenSSL Project.
39 *
40 * 6. Redistributions of any form whatsoever must retain the following
41 *    acknowledgment:
42 *    "This product includes software developed by the OpenSSL Project
43 *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
46 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
49 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
50 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
52 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
54 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
55 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
56 * OF THE POSSIBILITY OF SUCH DAMAGE.
57 * ====================================================================
58 *
59 * This product includes cryptographic software written by Eric Young
60 * (eay@cryptsoft.com).  This product includes software written by Tim
61 * Hudson (tjh@cryptsoft.com).
62 *
63 */
64
65
66#include <stdio.h>
67#include <string.h>
68
69#include <openssl/opensslconf.h>
70#include <openssl/crypto.h>
71#include <openssl/dso.h>
72#include <openssl/engine.h>
73#include <openssl/evp.h>
74#ifndef OPENSSL_NO_AES
75#include <openssl/aes.h>
76#endif
77#include <openssl/rand.h>
78#include <openssl/err.h>
79
80#ifndef OPENSSL_NO_HW
81#ifndef OPENSSL_NO_HW_PADLOCK
82
83/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
84#if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
85#  ifndef OPENSSL_NO_DYNAMIC_ENGINE
86#    define DYNAMIC_ENGINE
87#  endif
88#elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
89#  ifdef ENGINE_DYNAMIC_SUPPORT
90#    define DYNAMIC_ENGINE
91#  endif
92#else
93#  error "Only OpenSSL >= 0.9.7 is supported"
94#endif
95
96/* VIA PadLock AES is available *ONLY* on some x86 CPUs.
97   Not only that it doesn't exist elsewhere, but it
98   even can't be compiled on other platforms!
99
100   In addition, because of the heavy use of inline assembler,
101   compiler choice is limited to GCC and Microsoft C. */
102#undef COMPILE_HW_PADLOCK
103#if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
104# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
105     (defined(_MSC_VER) && defined(_M_IX86))
106#  define COMPILE_HW_PADLOCK
107# endif
108#endif
109
110#ifdef OPENSSL_NO_DYNAMIC_ENGINE
111#ifdef COMPILE_HW_PADLOCK
112static ENGINE *ENGINE_padlock (void);
113#endif
114
/* Register the PadLock engine with OpenSSL's global ENGINE list.
 * On builds/CPUs without PadLock support this is a no-op. */
void ENGINE_load_padlock (void)
{
#ifdef COMPILE_HW_PADLOCK
	ENGINE *e = ENGINE_padlock ();

	if (e == NULL)
		return;
	ENGINE_add (e);
	/* ENGINE_add takes its own reference, so drop ours. */
	ENGINE_free (e);
	/* Registration errors are non-fatal here. */
	ERR_clear_error ();
#endif
}
126
127#endif
128
129#ifdef COMPILE_HW_PADLOCK
130/* We do these includes here to avoid header problems on platforms that
131   do not have the VIA padlock anyway... */
132#include <stdlib.h>
133#ifdef _WIN32
134# include <malloc.h>
135# ifndef alloca
136#  define alloca _alloca
137# endif
138#elif defined(__GNUC__)
139# ifndef alloca
140#  define alloca(s) __builtin_alloca(s)
141# endif
142#endif
143
144/* Function for ENGINE detection and control */
145static int padlock_available(void);
146static int padlock_init(ENGINE *e);
147
148/* RNG Stuff */
149static RAND_METHOD padlock_rand;
150
151/* Cipher Stuff */
152#ifndef OPENSSL_NO_AES
153static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid);
154#endif
155
156/* Engine names */
/* Engine names */
static const char *padlock_id = "padlock";
static char padlock_name[100];	/* built at bind time from detected features */

/* Available features (set by padlock_available()) */
static int padlock_use_ace = 0;	/* Advanced Cryptography Engine */
static int padlock_use_rng = 0;	/* Random Number Generator */
#ifndef OPENSSL_NO_AES
/* Whether xcrypt buffers must be 16-byte aligned; kept at 1 until
   CPUs that lift the requirement exist (see padlock_aes_cipher). */
static int padlock_aes_align_required = 1;
#endif
166
167/* ===== Engine "management" functions ===== */
168
169/* Prepare the ENGINE structure for registration */
/*
 * Probe the CPU and populate ENGINE 'e' with the PadLock id/name and
 * the callbacks for whichever units (ACE ciphers, RNG) are present.
 * Returns 1 on success, 0 if any ENGINE_set_* call fails.
 */
static int
padlock_bind_helper(ENGINE *e)
{
	/* Check available features */
	padlock_available();

#if 1	/* disable RNG for now, see commentary in vicinity of RNG code */
	padlock_use_rng=0;
#endif

	/* Generate a nice engine name with available features */
	BIO_snprintf(padlock_name, sizeof(padlock_name),
		"VIA PadLock (%s, %s)",
		 padlock_use_rng ? "RNG" : "no-RNG",
		 padlock_use_ace ? "ACE" : "no-ACE");

	/* Register everything or return with an error */
	if (!ENGINE_set_id(e, padlock_id) ||
	    !ENGINE_set_name(e, padlock_name) ||

	    !ENGINE_set_init_function(e, padlock_init) ||
#ifndef OPENSSL_NO_AES
	    /* Ciphers are offered only when the ACE unit was detected. */
	    (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
#endif
	    (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
		return 0;
	}

	/* Everything looks good */
	return 1;
}
201
202#ifdef OPENSSL_NO_DYNAMIC_ENGINE
203
204/* Constructor */
205static ENGINE *
206ENGINE_padlock(void)
207{
208	ENGINE *eng = ENGINE_new();
209
210	if (!eng) {
211		return NULL;
212	}
213
214	if (!padlock_bind_helper(eng)) {
215		ENGINE_free(eng);
216		return NULL;
217	}
218
219	return eng;
220}
221
222#endif
223
224/* Check availability of the engine */
225static int
226padlock_init(ENGINE *e)
227{
228	return (padlock_use_rng || padlock_use_ace);
229}
230
231/* This stuff is needed if this ENGINE is being compiled into a self-contained
232 * shared-library.
233 */
234#ifdef DYNAMIC_ENGINE
235static int
236padlock_bind_fn(ENGINE *e, const char *id)
237{
238	if (id && (strcmp(id, padlock_id) != 0)) {
239		return 0;
240	}
241
242	if (!padlock_bind_helper(e))  {
243		return 0;
244	}
245
246	return 1;
247}
248
/* Emit the standard dynamic-engine entry points (version check and
 * bind function) expected by OpenSSL's dynamic ENGINE loader. */
IMPLEMENT_DYNAMIC_CHECK_FN()
IMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_fn)
251#endif /* DYNAMIC_ENGINE */
252
253/* ===== Here comes the "real" engine ===== */
254
255#ifndef OPENSSL_NO_AES
256/* Some AES-related constants */
257#define AES_BLOCK_SIZE		16
258#define AES_KEY_SIZE_128	16
259#define AES_KEY_SIZE_192	24
260#define AES_KEY_SIZE_256	32
261
262/* Here we store the status information relevant to the
263   current context. */
264/* BIG FAT WARNING:
265 * 	Inline assembler in PADLOCK_XCRYPT_ASM()
266 * 	depends on the order of items in this structure.
267 * 	Don't blindly modify, reorder, etc!
268 */
struct padlock_cipher_data
{
	unsigned char iv[AES_BLOCK_SIZE];	/* Initialization vector */
	union {	unsigned int pad[4];
		struct {
			int rounds:4;	/* AES rounds: 10/12/14 (see padlock_aes_init_key) */
			int dgst:1;	/* n/a in C3 */
			int align:1;	/* n/a in C3 */
			int ciphr:1;	/* n/a in C3 */
			unsigned int keygen:1;	/* 1 = schedule supplied in 'ks', 0 = hw expands */
			int interm:1;
			unsigned int encdec:1;	/* 1 = decrypt, 0 = encrypt */
			int ksize:2;	/* key size code: 0/1/2 = 128/192/256 bits */
		} b;
	} cword;		/* Control word */
	AES_KEY ks;		/* Encryption key */
};
286
/*
 * Tracks which cipher context last loaded its key into the CPU;
 * consulted by padlock_verify_context() to decide whether a key
 * reload must be forced.
 *
 * Essentially this variable belongs in thread local storage.
 * Having this variable global on the other hand can only cause
 * few bogus key reloads [if any at all on single-CPU system],
 * so we accept the penalty...
 */
static volatile struct padlock_cipher_data *padlock_saved_context;
294#endif
295
296/*
297 * =======================================================
298 * Inline assembler section(s).
299 * =======================================================
300 * Order of arguments is chosen to facilitate Windows port
301 * using __fastcall calling convention. If you wish to add
302 * more routines, keep in mind that first __fastcall
303 * argument is passed in %ecx and second - in %edx.
304 * =======================================================
305 */
306#if defined(__GNUC__) && __GNUC__>=2
307/*
308 * As for excessive "push %ebx"/"pop %ebx" found all over.
309 * When generating position-independent code GCC won't let
310 * us use "b" in assembler templates nor even respect "ebx"
311 * in "clobber description." Therefore the trouble...
312 */
313
314/* Helper function - check if a CPUID instruction
315   is available on this CPU */
/*
 * Return 1 if the CPUID instruction is available, 0 otherwise.
 * Detection works by attempting to toggle bit #21 (the ID flag)
 * of EFLAGS: the bit is writable only on CPUs that have CPUID.
 */
static int
padlock_insn_cpuid_available(void)
{
	int result = -1;

	/* We're checking if the bit #21 of EFLAGS
	   can be toggled. If yes = CPUID is available. */
	asm volatile (
		"pushf\n"
		"popl %%eax\n"
		"xorl $0x200000, %%eax\n"	/* flip the ID bit */
		"movl %%eax, %%ecx\n"
		"andl $0x200000, %%ecx\n"	/* ecx = value we tried to set */
		"pushl %%eax\n"
		"popf\n"			/* write EFLAGS back */
		"pushf\n"
		"popl %%eax\n"			/* re-read EFLAGS */
		"andl $0x200000, %%eax\n"	/* eax = value that actually stuck */
		"xorl %%eax, %%ecx\n"		/* 0 iff the toggle took effect */
		"movl %%ecx, %0\n"
		: "=r" (result) : : "eax", "ecx");

	return (result == 0);
}
340
341/* Load supported features of the CPU to see if
342   the PadLock is available. */
/*
 * Probe for PadLock via CPUID: verify the CPU is a Centaur/VIA part
 * ("CentaurHauls"), then read the Centaur Extended Feature Flags
 * (leaf 0xC0000001) and set padlock_use_ace / padlock_use_rng for
 * units that are both present and enabled.  Returns non-zero when at
 * least one unit is usable.
 */
static int
padlock_available(void)
{
	char vendor_string[16];
	unsigned int eax, edx;

	/* First check if the CPUID instruction is available at all... */
	if (! padlock_insn_cpuid_available())
		return 0;

	/* Are we running on the Centaur (VIA) CPU? */
	eax = 0x00000000;
	vendor_string[12] = 0;
	/* %ebx is saved/restored by hand because PIC code reserves it;
	   the 12-byte vendor id comes back in ebx:edx:ecx. */
	asm volatile (
		"pushl	%%ebx\n"
		"cpuid\n"
		"movl	%%ebx,(%%edi)\n"
		"movl	%%edx,4(%%edi)\n"
		"movl	%%ecx,8(%%edi)\n"
		"popl	%%ebx"
		: "+a"(eax) : "D"(vendor_string) : "ecx", "edx");
	if (strcmp(vendor_string, "CentaurHauls") != 0)
		return 0;

	/* Check for Centaur Extended Feature Flags presence */
	eax = 0xC0000000;
	asm volatile ("pushl %%ebx; cpuid; popl	%%ebx"
		: "+a"(eax) : : "ecx", "edx");
	if (eax < 0xC0000001)
		return 0;

	/* Read the Centaur Extended Feature Flags */
	eax = 0xC0000001;
	asm volatile ("pushl %%ebx; cpuid; popl %%ebx"
		: "+a"(eax), "=d"(edx) : : "ecx");

	/* Fill up some flags: each unit has an "exists" bit and an
	   "enabled" bit and both must be set (ACE: bits 6-7, RNG: bits 2-3). */
	padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
	padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));

	return padlock_use_ace + padlock_use_rng;
}
385
386#ifndef OPENSSL_NO_AES
387#ifndef AES_ASM
388/* Our own htonl()/ntohl() */
/* Byte-swap every 32-bit word of the expanded key schedule in place.
   OpenSSL's C key-setup routines store the schedule byte-swapped
   relative to what the PadLock engine consumes (see the call site in
   padlock_aes_init_key). */
static inline void
padlock_bswapl(AES_KEY *ks)
{
	size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
	unsigned int *key = ks->rd_key;

	while (i--) {
		asm volatile ("bswapl %0" : "+r"(*key));
		key++;
	}
}
400#endif
401#endif
402
403/* Force key reload from memory to the CPU microcode.
404   Loading EFLAGS from the stack clears EFLAGS[30]
405   which does the trick. */
static inline void
padlock_reload_key(void)
{
	/* Rewriting EFLAGS from the stack clears EFLAGS[30], which
	   forces the xcrypt instructions to reload the key next time. */
	asm volatile ("pushfl; popfl");
}
411
412#ifndef OPENSSL_NO_AES
413/*
414 * This is heuristic key context tracing. At first one
415 * believes that one should use atomic swap instructions,
416 * but it's not actually necessary. Point is that if
417 * padlock_saved_context was changed by another thread
418 * after we've read it and before we compare it with cdata,
419 * our key *shall* be reloaded upon thread context switch
420 * and we are therefore set in either case...
421 */
/*
 * Make sure the CPU will use 'cdata' for the next xcrypt: if a key
 * reload is already pending (EFLAGS[30] clear) or the last-used
 * context is this very one, nothing needs doing; otherwise EFLAGS is
 * rewritten via popfl to force a reload.  Finally 'cdata' is recorded
 * as the current context.
 */
static inline void
padlock_verify_context(struct padlock_cipher_data *cdata)
{
	asm volatile (
	"pushfl\n"
"	btl	$30,(%%esp)\n"		/* reload already pending? */
"	jnc	1f\n"
"	cmpl	%2,%1\n"		/* same context as last time? */
"	je	1f\n"
"	popfl\n"			/* clear EFLAGS[30] -> force reload */
"	subl	$4,%%esp\n"		/* re-balance stack for label 1 */
"1:	addl	$4,%%esp\n"		/* drop the saved EFLAGS */
"	movl	%2,%0"			/* padlock_saved_context = cdata */
	:"+m"(padlock_saved_context)
	: "r"(padlock_saved_context), "r"(cdata) : "cc");
}
438
/* Template for padlock_xcrypt_* modes */
/* BIG FAT WARNING:
 * 	The offsets used with 'leal' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
/* Each generated function runs 'cnt' 16-byte blocks from 'inp' to
 * 'out' using the key/control data in 'cdata'.  Register setup for
 * the xcrypt opcode: %eax = cdata (IV at offset 0), %edx = control
 * word (offset 16), %ebx = key schedule (offset 32), %ecx = block
 * count, %esi/%edi = source/destination.  The pointer xcrypt leaves
 * in %eax is returned so callers can pick up the final chaining
 * value. */
#define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)	\
static inline void *name(size_t cnt,		\
	struct padlock_cipher_data *cdata,	\
	void *out, const void *inp) 		\
{	void *iv; 				\
	asm volatile ( "pushl	%%ebx\n"	\
		"	leal	16(%0),%%edx\n"	\
		"	leal	32(%0),%%ebx\n"	\
			rep_xcrypt "\n"		\
		"	popl	%%ebx"		\
		: "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
		: "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
		: "edx", "cc", "memory");	\
	return iv;				\
}

/* Generate all functions with appropriate opcodes */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")	/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")	/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")	/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")	/* rep xcryptofb */
466#endif
467
/* The RNG call itself */
/* Execute the PadLock 'xstore' instruction: store random data at
 * 'addr' with behaviour selected by edx_in, and return whatever the
 * instruction leaves in %eax. */
static inline unsigned int
padlock_xstore(void *addr, unsigned int edx_in)
{
	unsigned int eax_out;

	asm volatile (".byte 0x0f,0xa7,0xc0"	/* xstore */
	    : "=a"(eax_out),"=m"(*(unsigned *)addr)
	    : "D"(addr), "d" (edx_in)
	    );

	return eax_out;
}
481
482/* Why not inline 'rep movsd'? I failed to find information on what
483 * value in Direction Flag one can expect and consequently have to
484 * apply "better-safe-than-sorry" approach and assume "undefined."
485 * I could explicitly clear it and restore the original value upon
486 * return from padlock_aes_cipher, but it's presumably too much
487 * trouble for too little gain...
488 *
489 * In case you wonder 'rep xcrypt*' instructions above are *not*
490 * affected by the Direction Flag and pointers advance toward
491 * larger addresses unconditionally.
492 */
/* Copy n bytes, rounded DOWN to a whole number of 'long' words
 * (matching the MSC variant's (n)&~3U), using word-sized moves.
 * Returns dst.  See the commentary above for why 'rep movsd' is not
 * inlined directly.
 *
 * NOTE: the previous do { } while (--n) loop underflowed the
 * unsigned word counter and overran both buffers whenever
 * n < sizeof(long) (n/sizeof(*d) == 0).  while (n--) copies the same
 * number of words for larger n and safely copies nothing otherwise. */
static inline unsigned char *
padlock_memcpy(void *dst,const void *src,size_t n)
{
	long       *d=dst;
	const long *s=src;

	n /= sizeof(*d);
	while (n--)
		*d++ = *s++;

	return dst;
}
504
505#elif defined(_MSC_VER)
506/*
507 * Unlike GCC these are real functions. In order to minimize impact
508 * on performance we adhere to __fastcall calling convention in
509 * order to get two first arguments passed through %ecx and %edx.
510 * Which kind of suits very well, as instructions in question use
511 * both %ecx and %edx as input:-)
512 */
/* Emit the 3-byte 'rep xcrypt*' opcode, with the mode selected by
 * the final opcode byte. */
#define REP_XCRYPT(code)		\
	_asm _emit 0xf3			\
	_asm _emit 0x0f _asm _emit 0xa7	\
	_asm _emit code

/* BIG FAT WARNING:
 * 	The offsets used with 'lea' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
/* Generated functions take (cnt, cdata, outp, inp); with __fastcall
 * cnt arrives in ecx (the xcrypt repeat count) and cdata in edx.
 * eax is pointed at cdata (whose first member is the IV), edx at the
 * control word (+16) and ebx at the key schedule (+32) before the
 * emitted opcode runs.  The pointer xcrypt leaves in eax is the
 * implicit return value. */
#define PADLOCK_XCRYPT_ASM(name,code)	\
static void * __fastcall 		\
	name (size_t cnt, void *cdata,	\
	void *outp, const void *inp)	\
{	_asm	mov	eax,edx		\
	_asm	lea	edx,[eax+16]	\
	_asm	lea	ebx,[eax+32]	\
	_asm	mov	edi,outp	\
	_asm	mov	esi,inp		\
	REP_XCRYPT(code)		\
}

PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)	/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)	/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)	/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)	/* rep xcryptofb */
539
/* Execute the 'xstore' RNG instruction: edi <- output buffer (first
 * __fastcall argument, arrives in ecx); the second argument 'code'
 * is already in edx where the instruction reads it.  Whatever xstore
 * leaves in eax becomes the (implicit) return value. */
static int __fastcall
padlock_xstore(void *outp,unsigned int code)
{	_asm	mov	edi,ecx
	_asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}
545
/* Force key reload: rewriting EFLAGS from the stack clears
 * EFLAGS[30] (cf. the GCC padlock_reload_key above). */
static void __fastcall
padlock_reload_key(void)
{	_asm pushfd _asm popfd		}
549
/* MSC variant of padlock_verify_context(): ecx holds 'cdata'
 * (__fastcall).  If no key reload is pending (EFLAGS[30] set) and
 * the context differs from the last one used, popfd clears
 * EFLAGS[30] to force a reload; finally the current context is
 * recorded in padlock_saved_context. */
static void __fastcall
padlock_verify_context(void *cdata)
{	_asm	{
		pushfd
		bt	DWORD PTR[esp],30
		jnc	skip
		cmp	ecx,padlock_saved_context
		je	skip
		popfd
		sub	esp,4
	skip:	add	esp,4
		mov	padlock_saved_context,ecx
		}
}
564
/* MSC variant of padlock_available().  Same probing as the GCC
 * version above: verify CPUID works (EFLAGS ID bit toggles), match
 * the "CentaurHauls" vendor id, then read extended feature leaf
 * 0xC0000001 and latch padlock_use_ace / padlock_use_rng when both
 * the "exists" and "enabled" bits of a unit are set.  Returns (in
 * eax) the number of usable PadLock units. */
static int
padlock_available(void)
{	_asm	{
		pushfd
		pop	eax
		mov	ecx,eax
		xor	eax,1<<21
		push	eax
		popfd
		pushfd
		pop	eax
		xor	eax,ecx
		bt	eax,21
		jnc	noluck
		mov	eax,0
		cpuid
		xor	eax,eax
		cmp	ebx,'tneC'
		jne	noluck
		cmp	edx,'Hrua'
		jne	noluck
		cmp	ecx,'slua'
		jne	noluck
		mov	eax,0xC0000000
		cpuid
		mov	edx,eax
		xor	eax,eax
		cmp	edx,0xC0000001
		jb	noluck
		mov	eax,0xC0000001
		cpuid
		xor	eax,eax
		bt	edx,6
		jnc	skip_a
		bt	edx,7
		jnc	skip_a
		mov	padlock_use_ace,1
		inc	eax
	skip_a:	bt	edx,2
		jnc	skip_r
		bt	edx,3
		jnc	skip_r
		mov	padlock_use_rng,1
		inc	eax
	skip_r:
	noluck:
		}
}
613
/* Byte-swap, in place, the 60 dwords of an expanded AES key (the
 * size of AES_KEY.rd_key).  EFLAGS is saved/restored around the
 * 'cld' needed for the upward lodsd/stosd walk. */
static void __fastcall
padlock_bswapl(void *key)
{	_asm	{
		pushfd
		cld
		mov	esi,ecx
		mov	edi,ecx
		mov	ecx,60
	up:	lodsd
		bswap	eax
		stosd
		loop	up
		popfd
		}
}
629
630/* MS actually specifies status of Direction Flag and compiler even
631 * manages to compile following as 'rep movsd' all by itself...
632 */
633#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
634#endif
635
636/* ===== AES encryption/decryption ===== */
637#ifndef OPENSSL_NO_AES
638
/* Some OpenSSL versions define the CFB/OFB NIDs only with an explicit
   feedback-width suffix (e.g. NID_aes_128_cfb128).  Alias the
   suffixed names to the short ones so the code below compiles with
   either set. */
#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
#define NID_aes_128_cfb	NID_aes_128_cfb128
#endif

#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
#define NID_aes_128_ofb	NID_aes_128_ofb128
#endif

#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
#define NID_aes_192_cfb	NID_aes_192_cfb128
#endif

#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
#define NID_aes_192_ofb	NID_aes_192_ofb128
#endif

#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
#define NID_aes_256_cfb	NID_aes_256_cfb128
#endif

#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
#define NID_aes_256_ofb	NID_aes_256_ofb128
#endif
662
/* List of supported ciphers (returned by padlock_ciphers() when the
   caller asks for the whole list). */
static int padlock_cipher_nids[] = {
	NID_aes_128_ecb,
	NID_aes_128_cbc,
	NID_aes_128_cfb,
	NID_aes_128_ofb,

	NID_aes_192_ecb,
	NID_aes_192_cbc,
	NID_aes_192_cfb,
	NID_aes_192_ofb,

	NID_aes_256_ecb,
	NID_aes_256_cbc,
	NID_aes_256_cfb,
	NID_aes_256_ofb,
};
/* Number of entries in padlock_cipher_nids[]. */
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
				      sizeof(padlock_cipher_nids[0]));
682
683/* Function prototypes ... */
684static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
685				const unsigned char *iv, int enc);
686static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
687			      const unsigned char *in, size_t nbytes);
688
/* Round 'ptr' up to the next 16-byte boundary. */
#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +		\
	( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )	)
/* The EVP layer allocates ctx->cipher_data with 16 bytes of slack
   (see DECLARE_AES_EVP); this picks the aligned region inside it. */
#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
	NEAREST_ALIGNED(ctx->cipher_data))

/* EVP block sizes: 16 for the true block modes, 1 for the
   byte-oriented CFB/OFB modes. */
#define EVP_CIPHER_block_size_ECB	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_CBC	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_OFB	1
#define EVP_CIPHER_block_size_CFB	1
698
/* Declaring so many ciphers by hand would be a pain.
   Instead introduce a bit of preprocessor magic :-) */
/* Each descriptor declares ctx_size as sizeof(struct
 * padlock_cipher_data) + 16 so ALIGNED_CIPHER_DATA() can carve a
 * 16-byte-aligned control structure out of the allocation. */
#define	DECLARE_AES_EVP(ksize,lmode,umode)	\
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {	\
	NID_aes_##ksize##_##lmode,		\
	EVP_CIPHER_block_size_##umode,	\
	AES_KEY_SIZE_##ksize,		\
	AES_BLOCK_SIZE,			\
	0 | EVP_CIPH_##umode##_MODE,	\
	padlock_aes_init_key,		\
	padlock_aes_cipher,		\
	NULL,				\
	sizeof(struct padlock_cipher_data) + 16,	\
	EVP_CIPHER_set_asn1_iv,		\
	EVP_CIPHER_get_asn1_iv,		\
	NULL,				\
	NULL				\
}

/* Instantiate descriptors for all 3 key sizes x 4 modes. */
DECLARE_AES_EVP(128,ecb,ECB);
DECLARE_AES_EVP(128,cbc,CBC);
DECLARE_AES_EVP(128,cfb,CFB);
DECLARE_AES_EVP(128,ofb,OFB);

DECLARE_AES_EVP(192,ecb,ECB);
DECLARE_AES_EVP(192,cbc,CBC);
DECLARE_AES_EVP(192,cfb,CFB);
DECLARE_AES_EVP(192,ofb,OFB);

DECLARE_AES_EVP(256,ecb,ECB);
DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);
732
733static int
734padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)
735{
736	/* No specific cipher => return a list of supported nids ... */
737	if (!cipher) {
738		*nids = padlock_cipher_nids;
739		return padlock_cipher_nids_num;
740	}
741
742	/* ... or the requested "cipher" otherwise */
743	switch (nid) {
744	  case NID_aes_128_ecb:
745	    *cipher = &padlock_aes_128_ecb;
746	    break;
747	  case NID_aes_128_cbc:
748	    *cipher = &padlock_aes_128_cbc;
749	    break;
750	  case NID_aes_128_cfb:
751	    *cipher = &padlock_aes_128_cfb;
752	    break;
753	  case NID_aes_128_ofb:
754	    *cipher = &padlock_aes_128_ofb;
755	    break;
756
757	  case NID_aes_192_ecb:
758	    *cipher = &padlock_aes_192_ecb;
759	    break;
760	  case NID_aes_192_cbc:
761	    *cipher = &padlock_aes_192_cbc;
762	    break;
763	  case NID_aes_192_cfb:
764	    *cipher = &padlock_aes_192_cfb;
765	    break;
766	  case NID_aes_192_ofb:
767	    *cipher = &padlock_aes_192_ofb;
768	    break;
769
770	  case NID_aes_256_ecb:
771	    *cipher = &padlock_aes_256_ecb;
772	    break;
773	  case NID_aes_256_cbc:
774	    *cipher = &padlock_aes_256_cbc;
775	    break;
776	  case NID_aes_256_cfb:
777	    *cipher = &padlock_aes_256_cfb;
778	    break;
779	  case NID_aes_256_ofb:
780	    *cipher = &padlock_aes_256_ofb;
781	    break;
782
783	  default:
784	    /* Sorry, we don't support this NID */
785	    *cipher = NULL;
786	    return 0;
787	}
788
789	return 1;
790}
791
/* Prepare the encryption key for PadLock usage */
/*
 * EVP "init key" callback.  Fills the 16-byte-aligned
 * padlock_cipher_data inside ctx->cipher_data: builds the control
 * word and stores/expands the key.  128-bit keys are expanded by the
 * hardware itself (keygen=0); 192/256-bit keys get a software key
 * schedule (hardware errata on Stepping 8 cores, see below).
 * Returns 1 on success, 0 on error (NULL key or unsupported length).
 */
static int
padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
		      const unsigned char *iv, int enc)
{
	struct padlock_cipher_data *cdata;
	int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;	/* in bits */

	if (key==NULL) return 0;	/* ERROR */

	cdata = ALIGNED_CIPHER_DATA(ctx);
	memset(cdata, 0, sizeof(struct padlock_cipher_data));

	/* Prepare Control word. */
	if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
		/* OFB always drives the engine in encrypt direction. */
		cdata->cword.b.encdec = 0;
	else
		cdata->cword.b.encdec = (ctx->encrypt == 0);
	cdata->cword.b.rounds = 10 + (key_len - 128) / 32;	/* 10/12/14 */
	cdata->cword.b.ksize = (key_len - 128) / 64;	/* 0/1/2 = 128/192/256 */

	switch(key_len) {
		case 128:
			/* PadLock can generate an extended key for
			   AES128 in hardware */
			memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
			cdata->cword.b.keygen = 0;
			break;

		case 192:
		case 256:
			/* Generate an extended AES key in software.
			   Needed for AES192/AES256 */
			/* Well, the above applies to Stepping 8 CPUs
			   and is listed as hardware errata. They most
			   likely will fix it at some point and then
			   a check for stepping would be due here. */
			/* CFB/OFB and encryption use the encryption
			   schedule; only block-mode decryption needs
			   the decryption schedule. */
			if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
			    EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE ||
			    enc)
				AES_set_encrypt_key(key, key_len, &cdata->ks);
			else
				AES_set_decrypt_key(key, key_len, &cdata->ks);
#ifndef AES_ASM
			/* OpenSSL C functions use byte-swapped extended key. */
			padlock_bswapl(&cdata->ks);
#endif
			cdata->cword.b.keygen = 1;
			break;

		default:
			/* ERROR */
			return 0;
	}

	/*
	 * This is done to cover for cases when user reuses the
	 * context for new key. The catch is that if we don't do
	 * this, padlock_aes_cipher might proceed with old key...
	 */
	padlock_reload_key ();

	return 1;
}
856
857/*
858 * Simplified version of padlock_aes_cipher() used when
859 * 1) both input and output buffers are at aligned addresses.
860 * or when
861 * 2) running on a newer CPU that doesn't require aligned buffers.
862 */
/*
 * One-shot path: bind the context to the CPU, run the whole buffer
 * through the appropriate 'rep xcrypt*' primitive and update ctx->iv
 * with the resulting chaining value.  'nbytes' is a whole number of
 * AES blocks here (the caller checks before taking this path).
 * Returns 1 on success, 0 on unknown mode.
 */
static int
padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
		const unsigned char *in_arg, size_t nbytes)
{
	struct padlock_cipher_data *cdata;
	void  *iv;

	cdata = ALIGNED_CIPHER_DATA(ctx);
	padlock_verify_context(cdata);	/* make sure the CPU uses our key */

	switch (EVP_CIPHER_CTX_mode(ctx)) {
	case EVP_CIPH_ECB_MODE:
		padlock_xcrypt_ecb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		break;

	case EVP_CIPH_CBC_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		/* returned pointer gives the final chaining block */
		iv = padlock_xcrypt_cbc(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_CFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		iv = padlock_xcrypt_cfb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_OFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		/* OFB updates the IV within cdata itself */
		padlock_xcrypt_ofb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
		break;

	default:
		return 0;
	}

	/* Scrub the IV copy held in the control structure. */
	memset(cdata->iv, 0, AES_BLOCK_SIZE);

	return 1;
}
904
905#ifndef  PADLOCK_CHUNK
906# define PADLOCK_CHUNK	512	/* Must be a power of 2 larger than 16 */
907#endif
908#if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
909# error "insane PADLOCK_CHUNK..."
910#endif
911
912/* Re-align the arguments to 16-Bytes boundaries and run the
913   encryption function itself. This function is not AES-specific. */
914static int
915padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
916		   const unsigned char *in_arg, size_t nbytes)
917{
918	struct padlock_cipher_data *cdata;
919	const  void *inp;
920	unsigned char  *out;
921	void  *iv;
922	int    inp_misaligned, out_misaligned, realign_in_loop;
923	size_t chunk, allocated=0;
924
925	/* ctx->num is maintained in byte-oriented modes,
926	   such as CFB and OFB... */
927	if ((chunk = ctx->num)) { /* borrow chunk variable */
928		unsigned char *ivp=ctx->iv;
929
930		switch (EVP_CIPHER_CTX_mode(ctx)) {
931		case EVP_CIPH_CFB_MODE:
932			if (chunk >= AES_BLOCK_SIZE)
933				return 0; /* bogus value */
934
935			if (ctx->encrypt)
936				while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
937					ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
938					chunk++, nbytes--;
939				}
940			else	while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
941					unsigned char c = *(in_arg++);
942					*(out_arg++) = c ^ ivp[chunk];
943					ivp[chunk++] = c, nbytes--;
944				}
945
946			ctx->num = chunk%AES_BLOCK_SIZE;
947			break;
948		case EVP_CIPH_OFB_MODE:
949			if (chunk >= AES_BLOCK_SIZE)
950				return 0; /* bogus value */
951
952			while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
953				*(out_arg++) = *(in_arg++) ^ ivp[chunk];
954				chunk++, nbytes--;
955			}
956
957			ctx->num = chunk%AES_BLOCK_SIZE;
958			break;
959		}
960	}
961
962	if (nbytes == 0)
963		return 1;
964#if 0
965	if (nbytes % AES_BLOCK_SIZE)
966		return 0; /* are we expected to do tail processing? */
967#else
968	/* nbytes is always multiple of AES_BLOCK_SIZE in ECB and CBC
969	   modes and arbitrary value in byte-oriented modes, such as
970	   CFB and OFB... */
971#endif
972
973	/* VIA promises CPUs that won't require alignment in the future.
974	   For now padlock_aes_align_required is initialized to 1 and
975	   the condition is never met... */
976	/* C7 core is capable to manage unaligned input in non-ECB[!]
977	   mode, but performance penalties appear to be approximately
978	   same as for software alignment below or ~3x. They promise to
979	   improve it in the future, but for now we can just as well
980	   pretend that it can only handle aligned input... */
981	if (!padlock_aes_align_required && (nbytes%AES_BLOCK_SIZE)==0)
982		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
983
984	inp_misaligned = (((size_t)in_arg) & 0x0F);
985	out_misaligned = (((size_t)out_arg) & 0x0F);
986
987	/* Note that even if output is aligned and input not,
988	 * I still prefer to loop instead of copy the whole
989	 * input and then encrypt in one stroke. This is done
990	 * in order to improve L1 cache utilization... */
991	realign_in_loop = out_misaligned|inp_misaligned;
992
993	if (!realign_in_loop && (nbytes%AES_BLOCK_SIZE)==0)
994		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
995
996	/* this takes one "if" out of the loops */
997	chunk  = nbytes;
998	chunk %= PADLOCK_CHUNK;
999	if (chunk==0) chunk = PADLOCK_CHUNK;
1000
1001	if (out_misaligned) {
		/* optimize for small input */
1003		allocated = (chunk<nbytes?PADLOCK_CHUNK:nbytes);
1004		out = alloca(0x10 + allocated);
1005		out = NEAREST_ALIGNED(out);
1006	}
1007	else
1008		out = out_arg;
1009
1010	cdata = ALIGNED_CIPHER_DATA(ctx);
1011	padlock_verify_context(cdata);
1012
1013	switch (EVP_CIPHER_CTX_mode(ctx)) {
1014	case EVP_CIPH_ECB_MODE:
1015		do	{
1016			if (inp_misaligned)
1017				inp = padlock_memcpy(out, in_arg, chunk);
1018			else
1019				inp = in_arg;
1020			in_arg += chunk;
1021
1022			padlock_xcrypt_ecb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
1023
1024			if (out_misaligned)
1025				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1026			else
1027				out     = out_arg+=chunk;
1028
1029			nbytes -= chunk;
1030			chunk   = PADLOCK_CHUNK;
1031		} while (nbytes);
1032		break;
1033
1034	case EVP_CIPH_CBC_MODE:
1035		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1036		goto cbc_shortcut;
1037		do	{
1038			if (iv != cdata->iv)
1039				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
1040			chunk = PADLOCK_CHUNK;
1041		cbc_shortcut: /* optimize for small input */
1042			if (inp_misaligned)
1043				inp = padlock_memcpy(out, in_arg, chunk);
1044			else
1045				inp = in_arg;
1046			in_arg += chunk;
1047
1048			iv = padlock_xcrypt_cbc(chunk/AES_BLOCK_SIZE, cdata, out, inp);
1049
1050			if (out_misaligned)
1051				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1052			else
1053				out     = out_arg+=chunk;
1054
1055		} while (nbytes -= chunk);
1056		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
1057		break;
1058
	case EVP_CIPH_CFB_MODE:
		/* load the current IV; remember where it lives in `iv` */
		memcpy (iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		/* the hardware handles whole blocks only; the sub-block
		 * remainder is done in software after the loop */
		chunk &= ~(AES_BLOCK_SIZE-1);
		if (chunk)	goto cfb_shortcut;
		else		goto cfb_skiploop;
		do	{
			if (iv != cdata->iv)
				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
			chunk = PADLOCK_CHUNK;
	cfb_shortcut: /* optimize for small input */
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			/* returns a pointer to the IV for the next chunk */
			iv = padlock_xcrypt_cfb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
		} while (nbytes >= AES_BLOCK_SIZE);

		cfb_skiploop:
		/* software CFB for the trailing partial block, if any */
		if (nbytes) {
			unsigned char *ivp = cdata->iv;

			if (iv != ivp) {
				memcpy(ivp, iv, AES_BLOCK_SIZE);
				iv = ivp;
			}
			/* record leftover byte count in the EVP context */
			ctx->num = nbytes;
			if (cdata->cword.b.encdec) {
				/* CFB generates the keystream by ENcrypting
				 * the IV even when decrypting, so clear the
				 * hardware decrypt bit around the ECB call */
				cdata->cword.b.encdec=0;
				padlock_reload_key();
				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
				cdata->cword.b.encdec=1;
				padlock_reload_key();
				while(nbytes) {
					unsigned char c = *(in_arg++);
					*(out_arg++) = c ^ *ivp;
					/* decrypt: ciphertext feeds the IV */
					*(ivp++) = c, nbytes--;
				}
			}
			else {	padlock_reload_key();
				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
				padlock_reload_key();
				while (nbytes) {
					/* encrypt: output feeds the IV */
					*ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
					ivp++, nbytes--;
				}
			}
		}

		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;
1118
	case EVP_CIPH_OFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		/* hardware handles whole blocks; remainder done below */
		chunk &= ~(AES_BLOCK_SIZE-1);
		if (chunk) do	{
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			/* OFB leaves the running IV in cdata->iv */
			padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
			chunk   = PADLOCK_CHUNK;
		} while (nbytes >= AES_BLOCK_SIZE);

		/* software OFB for the trailing partial block, if any */
		if (nbytes) {
			unsigned char *ivp = cdata->iv;

			/* record leftover byte count in the EVP context */
			ctx->num = nbytes;
			padlock_reload_key();	/* empirically found */
			/* next keystream block = ECB-encrypt of the IV */
			padlock_xcrypt_ecb(1,cdata,ivp,ivp);
			padlock_reload_key();	/* empirically found */
			while (nbytes) {
				*(out_arg++) = *(in_arg++) ^ *ivp;
				ivp++, nbytes--;
			}
		}

		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
		break;
1155
	default:
		/* unsupported block mode */
		return 0;
	}

	/* Clean the realign buffer if it was used */
	if (out_misaligned) {
		/* volatile keeps the compiler from eliding the scrub of
		 * plaintext left in the stack bounce buffer */
		volatile unsigned long *p=(void *)out;
		size_t   n = allocated/sizeof(*p);
		while (n--) *p++=0;
	}

	/* wipe the IV copy held in the PadLock data block
	 * NOTE(review): plain memset may legally be optimized away
	 * here - confirm, or use an explicit/volatile scrub */
	memset(cdata->iv, 0, AES_BLOCK_SIZE);

	return 1;
}
1171
1172#endif /* OPENSSL_NO_AES */
1173
1174/* ===== Random Number Generator ===== */
1175/*
1176 * This code is not engaged. The reason is that it does not comply
1177 * with recommendations for VIA RNG usage for secure applications
1178 * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
1179 * provide meaningful error control...
1180 */
/*
 * RAND_METHOD "bytes" callback: fill `output` with `count` bytes
 * taken from the PadLock hardware RNG via the XSTORE instruction.
 *
 * Returns 1 on success, 0 if the RNG is disabled, any self-test
 * bit is raised, or the unit stores an unexpected byte count.
 */
static int
padlock_rand_bytes(unsigned char *output, int count)
{
	unsigned int status, word;

	/* Fast path: store whole 8-byte words straight into the
	 * caller's buffer. */
	while (count >= 8) {
		status = padlock_xstore(output, 0);
		if (!(status & (1 << 6)))
			return 0;	/* RNG disabled */
		/* bits 10..14 cover DC bias, Raw Bits and String Filter */
		if (status & (0x1F << 10))
			return 0;
		switch (status & 0x1F) {	/* bytes actually stored */
		case 0:
			continue;	/* no data yet, retry */
		case 8:
			output += 8;
			count  -= 8;
			break;
		default:
			return 0;	/* fatal failure */
		}
	}

	/* Tail: pull one byte at a time through a bounce word. */
	while (count > 0) {
		status = padlock_xstore(&word, 3);
		if (!(status & (1 << 6)))
			return 0;	/* RNG disabled */
		/* bits 10..14 cover DC bias, Raw Bits and String Filter */
		if (status & (0x1F << 10))
			return 0;
		switch (status & 0x1F) {
		case 0:
			continue;	/* no data yet, retry */
		case 1:
			*output++ = (unsigned char)word;
			count--;
			break;
		default:
			return 0;	/* fatal failure */
		}
	}

	/* Scrub the last random word; the volatile access keeps the
	 * store from being optimized away. */
	*(volatile unsigned int *)&word = 0;

	return 1;
}
1212
/*
 * RAND_METHOD "status" callback.  The hardware generator needs no
 * seeding, so unconditionally report the RNG as ready.
 */
static int
padlock_rand_status(void)
{
	return 1;
}
1219
/* RAND_METHOD table registering the PadLock RNG with OpenSSL.
 * seed/cleanup/add are not implemented (NULL), and pseudorand
 * reuses the same hardware routine as true random bytes. */
static RAND_METHOD padlock_rand = {
	NULL,			/* seed - not implemented */
	padlock_rand_bytes,	/* bytes */
	NULL,			/* cleanup - nothing to release */
	NULL,			/* add - entropy mixing not supported */
	padlock_rand_bytes,	/* pseudorand - same HW generator */
	padlock_rand_status,	/* rand status */
};
1229
#else  /* !COMPILE_HW_PADLOCK */
#ifndef OPENSSL_NO_DYNAMIC_ENGINE
/* Stub for builds where the PadLock code cannot be compiled: a
 * dynamic-engine entry point must still exist, but it returns 0
 * (failure) so the engine is never bound. */
OPENSSL_EXPORT
int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns);
OPENSSL_EXPORT
int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns) { return 0; }
IMPLEMENT_DYNAMIC_CHECK_FN()
#endif
#endif /* COMPILE_HW_PADLOCK */
1239
1240#endif /* !OPENSSL_NO_HW_PADLOCK */
1241#endif /* !OPENSSL_NO_HW */
1242