/*
 * Support for VIA PadLock Advanced Cryptography Engine (ACE)
 * Written by Michal Ludvig <michal@logix.cz>
 *            http://www.logix.cz/michal
 *
 * Big thanks to Andy Polyakov for his help with optimization,
 * assembler fixes, the port to MS Windows and a lot of other
 * valuable work on this engine!
 */

/* ====================================================================
 * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    licensing@OpenSSL.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 * This product includes cryptographic software written by Eric Young
 * (eay@cryptsoft.com).  This product includes software written by Tim
 * Hudson (tjh@cryptsoft.com).
 *
 */


#include <stdio.h>
#include <string.h>

#include <openssl/opensslconf.h>
#include <openssl/crypto.h>
#include <openssl/dso.h>
#include <openssl/engine.h>
#include <openssl/evp.h>
#ifndef OPENSSL_NO_AES
#include <openssl/aes.h>
#endif
#include <openssl/rand.h>
#include <openssl/err.h>

#ifndef OPENSSL_NO_HW
#ifndef OPENSSL_NO_HW_PADLOCK

/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
#if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
#  ifndef OPENSSL_NO_DYNAMIC_ENGINE
#    define DYNAMIC_ENGINE
#  endif
#elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
#  ifdef ENGINE_DYNAMIC_SUPPORT
#    define DYNAMIC_ENGINE
#  endif
#else
#  error "Only OpenSSL >= 0.9.7 is supported"
#endif

/* VIA PadLock AES is available *ONLY* on some x86 CPUs.
   Not only is it unavailable elsewhere, it cannot even be
   compiled on other platforms!

   In addition, because of the heavy use of inline assembler,
   compiler choice is limited to GCC and Microsoft C. */
#undef COMPILE_HW_PADLOCK
#if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
     (defined(_MSC_VER) && defined(_M_IX86))
#  define COMPILE_HW_PADLOCK
static ENGINE *ENGINE_padlock (void);
# endif
#endif

#ifdef OPENSSL_NO_DYNAMIC_ENGINE

void ENGINE_load_padlock (void)
{
/* On non-x86 CPUs it just returns. */
#ifdef COMPILE_HW_PADLOCK
	ENGINE *toadd = ENGINE_padlock ();
	if (!toadd) return;
	ENGINE_add (toadd);
	ENGINE_free (toadd);
	ERR_clear_error ();
#endif
}

#endif
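
/*
 * Illustrative only -- a minimal sketch (not part of this engine's
 * logic) of how an application built with the static engine might
 * select PadLock through the standard ENGINE API; error handling
 * is omitted:
 */
#if 0
	ENGINE_load_padlock();
	ENGINE *e = ENGINE_by_id("padlock");
	if (e && ENGINE_init(e)) {
		ENGINE_set_default(e, ENGINE_METHOD_ALL);
		ENGINE_finish(e);
	}
	if (e) ENGINE_free(e);
#endif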

#ifdef COMPILE_HW_PADLOCK
/* We do these includes here to avoid header problems on platforms that
   do not have the VIA PadLock anyway... */
#include <stdlib.h>
#ifdef _WIN32
# include <malloc.h>
# ifndef alloca
#  define alloca _alloca
# endif
#elif defined(__GNUC__)
# ifndef alloca
#  define alloca(s) __builtin_alloca(s)
# endif
#endif

/* Functions for ENGINE detection and control */
static int padlock_available(void);
static int padlock_init(ENGINE *e);

/* RNG Stuff */
static RAND_METHOD padlock_rand;

/* Cipher Stuff */
#ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid);
#endif

/* Engine names */
static const char *padlock_id = "padlock";
static char padlock_name[100];

/* Available features */
static int padlock_use_ace = 0;	/* Advanced Cryptography Engine */
static int padlock_use_rng = 0;	/* Random Number Generator */
#ifndef OPENSSL_NO_AES
static int padlock_aes_align_required = 1;
#endif

/* ===== Engine "management" functions ===== */

/* Prepare the ENGINE structure for registration */
static int
padlock_bind_helper(ENGINE *e)
{
	/* Check available features */
	padlock_available();

#if 1	/* disable RNG for now, see the commentary near the RNG code */
	padlock_use_rng=0;
#endif

	/* Generate a nice engine name with the available features */
	BIO_snprintf(padlock_name, sizeof(padlock_name),
		"VIA PadLock (%s, %s)",
		 padlock_use_rng ? "RNG" : "no-RNG",
		 padlock_use_ace ? "ACE" : "no-ACE");

	/* Register everything or return with an error */
	if (!ENGINE_set_id(e, padlock_id) ||
	    !ENGINE_set_name(e, padlock_name) ||

	    !ENGINE_set_init_function(e, padlock_init) ||
#ifndef OPENSSL_NO_AES
	    (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
#endif
	    (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
		return 0;
	}

	/* Everything looks good */
	return 1;
}

/* Constructor */
static ENGINE *
ENGINE_padlock(void)
{
	ENGINE *eng = ENGINE_new();

	if (!eng) {
		return NULL;
	}

	if (!padlock_bind_helper(eng)) {
		ENGINE_free(eng);
		return NULL;
	}

	return eng;
}

/* Check availability of the engine */
static int
padlock_init(ENGINE *e)
{
	return (padlock_use_rng || padlock_use_ace);
}

/* This stuff is needed if this ENGINE is being compiled into a
 * self-contained shared library.
 */
#ifdef DYNAMIC_ENGINE
static int
padlock_bind_fn(ENGINE *e, const char *id)
{
	if (id && (strcmp(id, padlock_id) != 0)) {
		return 0;
	}

	if (!padlock_bind_helper(e))  {
		return 0;
	}

	return 1;
}

IMPLEMENT_DYNAMIC_CHECK_FN()
IMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_fn)
#endif /* DYNAMIC_ENGINE */

/* ===== Here comes the "real" engine ===== */

#ifndef OPENSSL_NO_AES
/* Some AES-related constants */
#define AES_BLOCK_SIZE		16
#define AES_KEY_SIZE_128	16
#define AES_KEY_SIZE_192	24
#define AES_KEY_SIZE_256	32

/* Here we store the status information relevant to the
   current context. */
/* BIG FAT WARNING:
 * 	Inline assembler in PADLOCK_XCRYPT_ASM()
 * 	depends on the order of items in this structure.
 * 	Don't blindly modify, reorder, etc!
 */
struct padlock_cipher_data
{
	unsigned char iv[AES_BLOCK_SIZE];	/* Initialization vector */
	union {	unsigned int pad[4];
		struct {
			int rounds:4;
			int dgst:1;	/* n/a in C3 */
			int align:1;	/* n/a in C3 */
			int ciphr:1;	/* n/a in C3 */
			unsigned int keygen:1;
			int interm:1;
			unsigned int encdec:1;
			int ksize:2;
		} b;
	} cword;		/* Control word */
	AES_KEY ks;		/* Encryption key */
};
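
/*
 * For illustration (derived from the "leal 16(%0)"/"leal 32(%0)"
 * offsets in PADLOCK_XCRYPT_ASM below), the layout the inline
 * assembler depends on, assuming the compiler inserts no padding:
 *
 *	offset  0: iv[16]	the xcrypt IV block
 *	offset 16: cword	the control word
 *	offset 32: ks		the extended (round) key schedule
 */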

/*
 * Essentially this variable belongs in thread local storage.
 * On the other hand, having it global can at worst cause a few
 * bogus key reloads [if any at all on a single-CPU system],
 * so we accept the penalty...
 */
static volatile struct padlock_cipher_data *padlock_saved_context;
#endif

/*
 * =======================================================
 * Inline assembler section(s).
 * =======================================================
 * The order of arguments is chosen to facilitate the Windows
 * port using the __fastcall calling convention. If you wish
 * to add more routines, keep in mind that the first __fastcall
 * argument is passed in %ecx and the second in %edx.
 * =======================================================
 */
#if defined(__GNUC__) && __GNUC__>=2
/*
 * As for the excessive "push %ebx"/"pop %ebx" found all over:
 * when generating position-independent code GCC won't let
 * us use "b" in assembler templates nor even respect "ebx"
 * in the "clobber description." Hence the trouble...
 */

/* Helper function - check if the CPUID instruction
   is available on this CPU */
static int
padlock_insn_cpuid_available(void)
{
	int result = -1;

	/* We're checking whether bit #21 of EFLAGS
	   can be toggled. If it can, CPUID is available. */
	asm volatile (
		"pushf\n"
		"popl %%eax\n"
		"xorl $0x200000, %%eax\n"
		"movl %%eax, %%ecx\n"
		"andl $0x200000, %%ecx\n"
		"pushl %%eax\n"
		"popf\n"
		"pushf\n"
		"popl %%eax\n"
		"andl $0x200000, %%eax\n"
		"xorl %%eax, %%ecx\n"
		"movl %%ecx, %0\n"
		: "=r" (result) : : "eax", "ecx");

	return (result == 0);
}

/* Load the supported features of the CPU to see if
   PadLock is available. */
static int
padlock_available(void)
{
	char vendor_string[16];
	unsigned int eax, edx;

	/* First check if the CPUID instruction is available at all... */
	if (! padlock_insn_cpuid_available())
		return 0;

	/* Are we running on a Centaur (VIA) CPU? */
	eax = 0x00000000;
	vendor_string[12] = 0;
	asm volatile (
		"pushl	%%ebx\n"
		"cpuid\n"
		"movl	%%ebx,(%%edi)\n"
		"movl	%%edx,4(%%edi)\n"
		"movl	%%ecx,8(%%edi)\n"
		"popl	%%ebx"
		: "+a"(eax) : "D"(vendor_string) : "ecx", "edx");
	if (strcmp(vendor_string, "CentaurHauls") != 0)
		return 0;

	/* Check for Centaur Extended Feature Flags presence */
	eax = 0xC0000000;
	asm volatile ("pushl %%ebx; cpuid; popl	%%ebx"
		: "+a"(eax) : : "ecx", "edx");
	if (eax < 0xC0000001)
		return 0;

	/* Read the Centaur Extended Feature Flags */
	eax = 0xC0000001;
	asm volatile ("pushl %%ebx; cpuid; popl %%ebx"
		: "+a"(eax), "=d"(edx) : : "ecx");

	/* Fill up some flags */
	padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
	padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));

	return padlock_use_ace + padlock_use_rng;
}
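
/*
 * For reference (matching the masks above): in the 0xC0000001
 * extended-feature word, bit 6 means ACE present and bit 7 ACE
 * enabled, while bit 2 means RNG present and bit 3 RNG enabled.
 * Both bits of a pair must be set, hence the (0x3<<6) and
 * (0x3<<2) tests.
 */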

#ifndef OPENSSL_NO_AES
/* Our own htonl()/ntohl() */
static inline void
padlock_bswapl(AES_KEY *ks)
{
	size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
	unsigned int *key = ks->rd_key;

	while (i--) {
		asm volatile ("bswapl %0" : "+r"(*key));
		key++;
	}
}
#endif
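
/* Illustration only: bswapl reverses the byte order of each 32-bit
   round-key word in place, e.g. 0x11223344 becomes 0x44332211 --
   the same transformation htonl()/ntohl() would perform. */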

/* Force a key reload from memory to the CPU microcode.
   Loading EFLAGS from the stack clears EFLAGS[30],
   which does the trick. */
static inline void
padlock_reload_key(void)
{
	asm volatile ("pushfl; popfl");
}

#ifndef OPENSSL_NO_AES
/*
 * This is heuristic key-context tracing. At first glance one might
 * reach for atomic swap instructions, but they are not actually
 * necessary. The point is that if padlock_saved_context was changed
 * by another thread after we've read it and before we compare it
 * with cdata, our key *shall* be reloaded upon thread context switch
 * anyway, so we are covered in either case...
 */
static inline void
padlock_verify_context(struct padlock_cipher_data *cdata)
{
	asm volatile (
	"pushfl\n"
"	btl	$30,(%%esp)\n"
"	jnc	1f\n"
"	cmpl	%2,%1\n"
"	je	1f\n"
"	popfl\n"
"	subl	$4,%%esp\n"
"1:	addl	$4,%%esp\n"
"	movl	%2,%0"
	:"+m"(padlock_saved_context)
	: "r"(padlock_saved_context), "r"(cdata) : "cc");
}

/* Template for padlock_xcrypt_* modes */
/* BIG FAT WARNING:
 * 	The offsets used with 'leal' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
#define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)	\
static inline void *name(size_t cnt,		\
	struct padlock_cipher_data *cdata,	\
	void *out, const void *inp) 		\
{	void *iv; 				\
	asm volatile ( "pushl	%%ebx\n"	\
		"	leal	16(%0),%%edx\n"	\
		"	leal	32(%0),%%ebx\n"	\
			rep_xcrypt "\n"		\
		"	popl	%%ebx"		\
		: "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
		: "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
		: "edx", "cc", "memory");	\
	return iv;				\
}

/* Generate all functions with appropriate opcodes */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")	/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")	/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")	/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")	/* rep xcryptofb */
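
/*
 * Usage sketch (illustrative only): each generated routine processes
 * whole 16-byte blocks and returns the IV pointer the hardware leaves
 * in %eax, e.g.
 *
 *	iv = padlock_xcrypt_cbc(nbytes/AES_BLOCK_SIZE, cdata, out, inp);
 *
 * which is exactly how the mode handlers further below invoke them.
 */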
#endif

/* The RNG call itself */
static inline unsigned int
padlock_xstore(void *addr, unsigned int edx_in)
{
	unsigned int eax_out;

	asm volatile (".byte 0x0f,0xa7,0xc0"	/* xstore */
	    : "=a"(eax_out),"=m"(*(unsigned *)addr)
	    : "D"(addr), "d" (edx_in)
	    );

	return eax_out;
}

/* Why not inline 'rep movsd'? I failed to find information on what
 * value of the Direction Flag one can expect and consequently have to
 * take the "better-safe-than-sorry" approach and assume it is
 * "undefined." I could explicitly clear it and restore the original
 * value upon return from padlock_aes_cipher, but that's presumably
 * too much trouble for too little gain...
 *
 * In case you wonder: the 'rep xcrypt*' instructions above are *not*
 * affected by the Direction Flag and the pointers advance toward
 * larger addresses unconditionally.
 */
static inline unsigned char *
padlock_memcpy(void *dst,const void *src,size_t n)
{
	long       *d=dst;
	const long *s=src;

	n /= sizeof(*d);
	do { *d++ = *s++; } while (--n);

	return dst;
}
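
/* Note (for illustration): padlock_memcpy copies in long-sized units
   and its do/while body runs at least once, so callers must pass a
   byte count that is a non-zero multiple of sizeof(long) -- which the
   block-aligned, chunked callers below always do. */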

#elif defined(_MSC_VER)
/*
 * Unlike the GCC variants these are real functions. To minimize the
 * impact on performance we adhere to the __fastcall calling
 * convention, so that the first two arguments are passed in %ecx and
 * %edx. That suits very well, as the instructions in question use
 * both %ecx and %edx as input:-)
 */
#define REP_XCRYPT(code)		\
	_asm _emit 0xf3			\
	_asm _emit 0x0f _asm _emit 0xa7	\
	_asm _emit code

/* BIG FAT WARNING:
 * 	The offsets used with 'lea' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
#define PADLOCK_XCRYPT_ASM(name,code)	\
static void * __fastcall 		\
	name (size_t cnt, void *cdata,	\
	void *outp, const void *inp)	\
{	_asm	mov	eax,edx		\
	_asm	lea	edx,[eax+16]	\
	_asm	lea	ebx,[eax+32]	\
	_asm	mov	edi,outp	\
	_asm	mov	esi,inp		\
	REP_XCRYPT(code)		\
}

PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)

static int __fastcall
padlock_xstore(void *outp,unsigned int code)
{	_asm	mov	edi,ecx
	_asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}

static void __fastcall
padlock_reload_key(void)
{	_asm pushfd _asm popfd		}

static void __fastcall
padlock_verify_context(void *cdata)
{	_asm	{
		pushfd
		bt	DWORD PTR[esp],30
		jnc	skip
		cmp	ecx,padlock_saved_context
		je	skip
		popfd
		sub	esp,4
	skip:	add	esp,4
		mov	padlock_saved_context,ecx
		}
}

static int
padlock_available(void)
{	_asm	{
		pushfd
		pop	eax
		mov	ecx,eax
		xor	eax,1<<21
		push	eax
		popfd
		pushfd
		pop	eax
		xor	eax,ecx
		bt	eax,21
		jnc	noluck
		mov	eax,0
		cpuid
		xor	eax,eax
		cmp	ebx,'tneC'
		jne	noluck
		cmp	edx,'Hrua'
		jne	noluck
		cmp	ecx,'slua'
		jne	noluck
		mov	eax,0xC0000000
		cpuid
		mov	edx,eax
		xor	eax,eax
		cmp	edx,0xC0000001
		jb	noluck
		mov	eax,0xC0000001
		cpuid
		xor	eax,eax
		bt	edx,6
		jnc	skip_a
		bt	edx,7
		jnc	skip_a
		mov	padlock_use_ace,1
		inc	eax
	skip_a:	bt	edx,2
		jnc	skip_r
		bt	edx,3
		jnc	skip_r
		mov	padlock_use_rng,1
		inc	eax
	skip_r:
	noluck:
		}
}

static void __fastcall
padlock_bswapl(void *key)
{	_asm	{
		pushfd
		cld
		mov	esi,ecx
		mov	edi,ecx
		mov	ecx,60
	up:	lodsd
		bswap	eax
		stosd
		loop	up
		popfd
		}
}

/* MS actually specifies the status of the Direction Flag, and the
 * compiler even manages to compile the following as 'rep movsd'
 * all by itself...
 */
#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
#endif

/* ===== AES encryption/decryption ===== */
#ifndef OPENSSL_NO_AES

#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
#define NID_aes_128_cfb	NID_aes_128_cfb128
#endif

#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
#define NID_aes_128_ofb	NID_aes_128_ofb128
#endif

#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
#define NID_aes_192_cfb	NID_aes_192_cfb128
#endif

#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
#define NID_aes_192_ofb	NID_aes_192_ofb128
#endif

#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
#define NID_aes_256_cfb	NID_aes_256_cfb128
#endif

#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
#define NID_aes_256_ofb	NID_aes_256_ofb128
#endif

/* List of supported ciphers. */
static int padlock_cipher_nids[] = {
	NID_aes_128_ecb,
	NID_aes_128_cbc,
	NID_aes_128_cfb,
	NID_aes_128_ofb,

	NID_aes_192_ecb,
	NID_aes_192_cbc,
	NID_aes_192_cfb,
	NID_aes_192_ofb,

	NID_aes_256_ecb,
	NID_aes_256_cbc,
	NID_aes_256_cfb,
	NID_aes_256_ofb,
};
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
				      sizeof(padlock_cipher_nids[0]));

/* Function prototypes ... */
static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
				const unsigned char *iv, int enc);
static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
			      const unsigned char *in, size_t nbytes);

#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +		\
	( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )	)
#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
	NEAREST_ALIGNED(ctx->cipher_data))
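
/* Worked example (illustrative): for a pointer such as 0x1003,
   NEAREST_ALIGNED adds (0x10 - 0x03) & 0x0F = 0x0D, yielding 0x1010;
   an already aligned pointer gets 0 added. This is why cipher_data
   is allocated 16 bytes larger than the structure it holds. */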

#define EVP_CIPHER_block_size_ECB	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_CBC	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_OFB	1
#define EVP_CIPHER_block_size_CFB	1

/* Declaring so many ciphers by hand would be a pain.
   Instead we introduce a bit of preprocessor magic :-) */
#define	DECLARE_AES_EVP(ksize,lmode,umode)	\
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {	\
	NID_aes_##ksize##_##lmode,		\
	EVP_CIPHER_block_size_##umode,	\
	AES_KEY_SIZE_##ksize,		\
	AES_BLOCK_SIZE,			\
	0 | EVP_CIPH_##umode##_MODE,	\
	padlock_aes_init_key,		\
	padlock_aes_cipher,		\
	NULL,				\
	sizeof(struct padlock_cipher_data) + 16,	\
	EVP_CIPHER_set_asn1_iv,		\
	EVP_CIPHER_get_asn1_iv,		\
	NULL,				\
	NULL				\
}
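
/* For illustration, DECLARE_AES_EVP(128,cbc,CBC) expands to a static
   EVP_CIPHER named padlock_aes_128_cbc with nid NID_aes_128_cbc,
   block size 16, key length 16, IV length 16, flags EVP_CIPH_CBC_MODE
   and a cipher_data size of sizeof(struct padlock_cipher_data) + 16 --
   the 16 spare bytes being what ALIGNED_CIPHER_DATA() relies on. */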

DECLARE_AES_EVP(128,ecb,ECB);
DECLARE_AES_EVP(128,cbc,CBC);
DECLARE_AES_EVP(128,cfb,CFB);
DECLARE_AES_EVP(128,ofb,OFB);

DECLARE_AES_EVP(192,ecb,ECB);
DECLARE_AES_EVP(192,cbc,CBC);
DECLARE_AES_EVP(192,cfb,CFB);
DECLARE_AES_EVP(192,ofb,OFB);

DECLARE_AES_EVP(256,ecb,ECB);
DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);

static int
padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)
{
	/* No specific cipher => return a list of supported nids ... */
	if (!cipher) {
		*nids = padlock_cipher_nids;
		return padlock_cipher_nids_num;
	}

	/* ... or the requested "cipher" otherwise */
	switch (nid) {
	  case NID_aes_128_ecb:
	    *cipher = &padlock_aes_128_ecb;
	    break;
	  case NID_aes_128_cbc:
	    *cipher = &padlock_aes_128_cbc;
	    break;
	  case NID_aes_128_cfb:
	    *cipher = &padlock_aes_128_cfb;
	    break;
	  case NID_aes_128_ofb:
	    *cipher = &padlock_aes_128_ofb;
	    break;

	  case NID_aes_192_ecb:
	    *cipher = &padlock_aes_192_ecb;
	    break;
	  case NID_aes_192_cbc:
	    *cipher = &padlock_aes_192_cbc;
	    break;
	  case NID_aes_192_cfb:
	    *cipher = &padlock_aes_192_cfb;
	    break;
	  case NID_aes_192_ofb:
	    *cipher = &padlock_aes_192_ofb;
	    break;

	  case NID_aes_256_ecb:
	    *cipher = &padlock_aes_256_ecb;
	    break;
	  case NID_aes_256_cbc:
	    *cipher = &padlock_aes_256_cbc;
	    break;
	  case NID_aes_256_cfb:
	    *cipher = &padlock_aes_256_cfb;
	    break;
	  case NID_aes_256_ofb:
	    *cipher = &padlock_aes_256_ofb;
	    break;

	  default:
	    /* Sorry, we don't support this NID */
	    *cipher = NULL;
	    return 0;
	}

	return 1;
}

/* Prepare the encryption key for PadLock usage */
static int
padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
		      const unsigned char *iv, int enc)
{
	struct padlock_cipher_data *cdata;
	int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;

	if (key==NULL) return 0;	/* ERROR */

	cdata = ALIGNED_CIPHER_DATA(ctx);
	memset(cdata, 0, sizeof(struct padlock_cipher_data));

	/* Prepare the control word. */
	if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
		cdata->cword.b.encdec = 0;
	else
		cdata->cword.b.encdec = (ctx->encrypt == 0);
	cdata->cword.b.rounds = 10 + (key_len - 128) / 32;
	cdata->cword.b.ksize = (key_len - 128) / 64;
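
	/* Worked example (illustrative): a 256-bit key gives
	   rounds = 10 + (256-128)/32 = 14 and ksize = 2, while a
	   128-bit key gives rounds = 10 and ksize = 0. */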

	switch(key_len) {
		case 128:
			/* PadLock can generate an extended key for
			   AES128 in hardware */
			memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
			cdata->cword.b.keygen = 0;
			break;

		case 192:
		case 256:
			/* Generate an extended AES key in software.
			   Needed for AES192/AES256 */
			/* Strictly speaking, the above applies to Stepping 8
			   CPUs and is listed as a hardware erratum. It will
			   most likely be fixed at some point, and then a
			   check for the stepping would be due here. */
			if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
			    EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE ||
			    enc)
				AES_set_encrypt_key(key, key_len, &cdata->ks);
			else
				AES_set_decrypt_key(key, key_len, &cdata->ks);
#ifndef AES_ASM
			/* OpenSSL C functions use a byte-swapped extended key. */
			padlock_bswapl(&cdata->ks);
#endif
			cdata->cword.b.keygen = 1;
			break;

		default:
			/* ERROR */
			return 0;
	}

	/*
	 * This is done to cover cases where the user reuses the
	 * context with a new key. The catch is that if we don't do
	 * this, padlock_aes_cipher might proceed with the old key...
	 */
	padlock_reload_key ();

	return 1;
}

/*
 * Simplified version of padlock_aes_cipher() used when
 * 1) both input and output buffers are at aligned addresses, or when
 * 2) running on a newer CPU that doesn't require aligned buffers.
 */
static int
padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
		const unsigned char *in_arg, size_t nbytes)
{
	struct padlock_cipher_data *cdata;
	void  *iv;

	cdata = ALIGNED_CIPHER_DATA(ctx);
	padlock_verify_context(cdata);

	switch (EVP_CIPHER_CTX_mode(ctx)) {
	case EVP_CIPH_ECB_MODE:
		padlock_xcrypt_ecb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		break;

	case EVP_CIPH_CBC_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		iv = padlock_xcrypt_cbc(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_CFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		iv = padlock_xcrypt_cfb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_OFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		padlock_xcrypt_ofb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
		break;

	default:
		return 0;
	}

	memset(cdata->iv, 0, AES_BLOCK_SIZE);

	return 1;
}

#ifndef  PADLOCK_CHUNK
# define PADLOCK_CHUNK	512	/* Must be a power of 2 no smaller than 16 */
#endif
#if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
# error "insane PADLOCK_CHUNK..."
#endif
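
/* Aside (illustrative): the guard above uses the classic power-of-two
   test -- x & (x-1) is zero exactly when x has at most one bit set --
   so any PADLOCK_CHUNK that is a power of two and at least 16 passes. */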

/* Re-align the arguments to 16-byte boundaries and run the
   encryption function itself. This function is not AES-specific. */
static int
padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
		   const unsigned char *in_arg, size_t nbytes)
{
	struct padlock_cipher_data *cdata;
	const  void *inp;
	unsigned char  *out;
	void  *iv;
	int    inp_misaligned, out_misaligned, realign_in_loop;
	size_t chunk, allocated=0;

	/* ctx->num is maintained in byte-oriented modes,
	   such as CFB and OFB... */
	if ((chunk = ctx->num)) { /* borrow the chunk variable */
		unsigned char *ivp=ctx->iv;

		switch (EVP_CIPHER_CTX_mode(ctx)) {
		case EVP_CIPH_CFB_MODE:
			if (chunk >= AES_BLOCK_SIZE)
				return 0; /* bogus value */

			if (ctx->encrypt)
				while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
					ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
					chunk++, nbytes--;
				}
			else	while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
					unsigned char c = *(in_arg++);
					*(out_arg++) = c ^ ivp[chunk];
					ivp[chunk++] = c, nbytes--;
				}

			ctx->num = chunk%AES_BLOCK_SIZE;
			break;
		case EVP_CIPH_OFB_MODE:
			if (chunk >= AES_BLOCK_SIZE)
				return 0; /* bogus value */

			while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
				*(out_arg++) = *(in_arg++) ^ ivp[chunk];
				chunk++, nbytes--;
			}

			ctx->num = chunk%AES_BLOCK_SIZE;
			break;
		}
	}

	if (nbytes == 0)
		return 1;
#if 0
	if (nbytes % AES_BLOCK_SIZE)
		return 0; /* are we expected to do tail processing? */
#else
	/* nbytes is always a multiple of AES_BLOCK_SIZE in ECB and CBC
	   modes and an arbitrary value in byte-oriented modes, such as
	   CFB and OFB... */
#endif

	/* VIA promises CPUs that won't require alignment in the future.
	   For now padlock_aes_align_required is initialized to 1 and
	   the condition is never met... */
	/* The C7 core is capable of managing unaligned input in non-ECB[!]
	   modes, but the performance penalties appear to be approximately
	   the same as for the software alignment below, i.e. ~3x. They
	   promise to improve it in the future, but for now we can just as
	   well pretend that it can only handle aligned input... */
	if (!padlock_aes_align_required && (nbytes%AES_BLOCK_SIZE)==0)
		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

	inp_misaligned = (((size_t)in_arg) & 0x0F);
	out_misaligned = (((size_t)out_arg) & 0x0F);

	/* Note that even if output is aligned and input is not,
	 * I still prefer to loop instead of copying the whole
	 * input and then encrypting in one stroke. This is done
	 * in order to improve L1 cache utilization... */
	realign_in_loop = out_misaligned|inp_misaligned;

	if (!realign_in_loop && (nbytes%AES_BLOCK_SIZE)==0)
		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

	/* this takes one "if" out of the loops */
	chunk  = nbytes;
	chunk %= PADLOCK_CHUNK;
	if (chunk==0) chunk = PADLOCK_CHUNK;

	if (out_misaligned) {
		/* optimize for small input */
		allocated = (chunk<nbytes?PADLOCK_CHUNK:nbytes);
		out = alloca(0x10 + allocated);
		out = NEAREST_ALIGNED(out);
	}
	else
		out = out_arg;

	cdata = ALIGNED_CIPHER_DATA(ctx);
	padlock_verify_context(cdata);

	switch (EVP_CIPHER_CTX_mode(ctx)) {
	case EVP_CIPH_ECB_MODE:
		do	{
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			padlock_xcrypt_ecb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
			chunk   = PADLOCK_CHUNK;
		} while (nbytes);
		break;

	case EVP_CIPH_CBC_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		goto cbc_shortcut;
		do	{
			if (iv != cdata->iv)
				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
			chunk = PADLOCK_CHUNK;
		cbc_shortcut: /* optimize for small input */
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			iv = padlock_xcrypt_cbc(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

		} while (nbytes -= chunk);
		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_CFB_MODE:
		memcpy (iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		chunk &= ~(AES_BLOCK_SIZE-1);
		if (chunk)	goto cfb_shortcut;
		else		goto cfb_skiploop;
		do	{
			if (iv != cdata->iv)
				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
			chunk = PADLOCK_CHUNK;
		cfb_shortcut: /* optimize for small input */
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			iv = padlock_xcrypt_cfb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
		} while (nbytes >= AES_BLOCK_SIZE);

		cfb_skiploop:
		if (nbytes) {
			unsigned char *ivp = cdata->iv;

			if (iv != ivp) {
				memcpy(ivp, iv, AES_BLOCK_SIZE);
				iv = ivp;
			}
			ctx->num = nbytes;
			if (cdata->cword.b.encdec) {
				cdata->cword.b.encdec=0;
				padlock_reload_key();
				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
				cdata->cword.b.encdec=1;
				padlock_reload_key();
				while(nbytes) {
					unsigned char c = *(in_arg++);
					*(out_arg++) = c ^ *ivp;
					*(ivp++) = c, nbytes--;
				}
			}
			else {	padlock_reload_key();
				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
				padlock_reload_key();
				while (nbytes) {
					*ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
					ivp++, nbytes--;
				}
			}
		}

		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_OFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		chunk &= ~(AES_BLOCK_SIZE-1);
		if (chunk) do	{
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
			chunk   = PADLOCK_CHUNK;
		} while (nbytes >= AES_BLOCK_SIZE);

		if (nbytes) {
			unsigned char *ivp = cdata->iv;

			ctx->num = nbytes;
			padlock_reload_key();	/* empirically found */
			padlock_xcrypt_ecb(1,cdata,ivp,ivp);
			padlock_reload_key();	/* empirically found */
			while (nbytes) {
				*(out_arg++) = *(in_arg++) ^ *ivp;
				ivp++, nbytes--;
			}
		}

		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
		break;

	default:
		return 0;
	}

	/* Clean the realign buffer if it was used */
	if (out_misaligned) {
		volatile unsigned long *p=(void *)out;
		size_t   n = allocated/sizeof(*p);
		while (n--) *p++=0;
	}

	memset(cdata->iv, 0, AES_BLOCK_SIZE);

	return 1;
}

#endif /* OPENSSL_NO_AES */

/* ===== Random Number Generator ===== */
/*
 * This code is not engaged. The reason is that it does not comply
 * with the recommendations for VIA RNG usage in secure applications
 * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
 * provide meaningful error control...
 */
/* Wrapper that provides an interface between the API and
   the raw PadLock RNG */
static int
padlock_rand_bytes(unsigned char *output, int count)
{
	unsigned int eax, buf;

	while (count >= 8) {
		eax = padlock_xstore(output, 0);
		if (!(eax&(1<<6)))	return 0; /* RNG disabled */
		/* this ---vv--- covers DC bias, Raw Bits and String Filter */
		if (eax&(0x1F<<10))	return 0;
		if ((eax&0x1F)==0)	continue; /* no data, retry... */
		if ((eax&0x1F)!=8)	return 0; /* fatal failure...  */
		output += 8;
		count  -= 8;
	}
	while (count > 0) {
		eax = padlock_xstore(&buf, 3);
		if (!(eax&(1<<6)))	return 0; /* RNG disabled */
		/* this ---vv--- covers DC bias, Raw Bits and String Filter */
		if (eax&(0x1F<<10))	return 0;
		if ((eax&0x1F)==0)	continue; /* no data, retry... */
		if ((eax&0x1F)!=1)	return 0; /* fatal failure...  */
		*output++ = (unsigned char)buf;
		count--;
	}
	*(volatile unsigned int *)&buf=0;

	return 1;
}
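
/* For reference (an illustrative reading of the masks above): in the
   XSTORE status word returned in EAX, bits 0-4 give the number of
   bytes actually stored, bit 6 is set only while the RNG is enabled,
   and bits 10-14 flag the DC-bias, raw-bits and string-filter
   failure conditions. */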

/* Dummy but necessary function */
static int
padlock_rand_status(void)
{
	return 1;
}

/* Prepare the structure for registration */
static RAND_METHOD padlock_rand = {
	NULL,			/* seed */
	padlock_rand_bytes,	/* bytes */
	NULL,			/* cleanup */
	NULL,			/* add */
	padlock_rand_bytes,	/* pseudorand */
	padlock_rand_status,	/* rand status */
};

#else  /* !COMPILE_HW_PADLOCK */
#ifndef OPENSSL_NO_DYNAMIC_ENGINE
OPENSSL_EXPORT
int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns);
OPENSSL_EXPORT
int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns) { return 0; }
IMPLEMENT_DYNAMIC_CHECK_FN()
#endif
#endif /* COMPILE_HW_PADLOCK */

#endif /* !OPENSSL_NO_HW_PADLOCK */
#endif /* !OPENSSL_NO_HW */