1/*
2 * Support for VIA PadLock Advanced Cryptography Engine (ACE)
3 * Written by Michal Ludvig <michal@logix.cz>
4 *            http://www.logix.cz/michal
5 *
6 * Big thanks to Andy Polyakov for a help with optimization,
7 * assembler fixes, port to MS Windows and a lot of other
8 * valuable work on this engine!
9 */
10
11/* ====================================================================
12 * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 *
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in
23 *    the documentation and/or other materials provided with the
24 *    distribution.
25 *
26 * 3. All advertising materials mentioning features or use of this
27 *    software must display the following acknowledgment:
28 *    "This product includes software developed by the OpenSSL Project
29 *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
30 *
31 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
32 *    endorse or promote products derived from this software without
33 *    prior written permission. For written permission, please contact
34 *    licensing@OpenSSL.org.
35 *
36 * 5. Products derived from this software may not be called "OpenSSL"
37 *    nor may "OpenSSL" appear in their names without prior written
38 *    permission of the OpenSSL Project.
39 *
40 * 6. Redistributions of any form whatsoever must retain the following
41 *    acknowledgment:
42 *    "This product includes software developed by the OpenSSL Project
43 *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
46 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
49 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
50 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
52 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
54 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
55 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
56 * OF THE POSSIBILITY OF SUCH DAMAGE.
57 * ====================================================================
58 *
59 * This product includes cryptographic software written by Eric Young
60 * (eay@cryptsoft.com).  This product includes software written by Tim
61 * Hudson (tjh@cryptsoft.com).
62 *
63 */
64
65
66#include <stdio.h>
67#include <string.h>
68
69#include <openssl/opensslconf.h>
70#include <openssl/crypto.h>
71#include <openssl/dso.h>
72#include <openssl/engine.h>
73#include <openssl/evp.h>
74#ifndef OPENSSL_NO_AES
75#include <openssl/aes.h>
76#endif
77#include <openssl/rand.h>
78#include <openssl/err.h>
79
80#ifndef OPENSSL_NO_HW
81#ifndef OPENSSL_NO_HW_PADLOCK
82
/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-)
 *
 * DYNAMIC_ENGINE ends up defined when:
 *   - OpenSSL >= 0.9.8 and OPENSSL_NO_DYNAMIC_ENGINE is not defined, or
 *   - OpenSSL 0.9.7.x  and ENGINE_DYNAMIC_SUPPORT is defined.
 * Anything older than 0.9.7 is rejected at compile time.
 * DYNAMIC_ENGINE later gates the bind/check functions needed when the
 * engine is built as a self-contained shared library. */
#if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
#  ifndef OPENSSL_NO_DYNAMIC_ENGINE
#    define DYNAMIC_ENGINE
#  endif
#elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
#  ifdef ENGINE_DYNAMIC_SUPPORT
#    define DYNAMIC_ENGINE
#  endif
#else
#  error "Only OpenSSL >= 0.9.7 is supported"
#endif
95
/* VIA PadLock AES is available *ONLY* on some x86 CPUs.
   Not only does it not exist elsewhere, the code below
   cannot even be compiled on other platforms!

   In addition, because of the heavy use of inline assembler,
   compiler choice is limited to GCC and Microsoft C.

   COMPILE_HW_PADLOCK is therefore defined only for 32-bit x86 builds
   with one of those two compilers, and only when neither I386_ONLY
   nor OPENSSL_NO_INLINE_ASM is defined. */
#undef COMPILE_HW_PADLOCK
#if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
     (defined(_MSC_VER) && defined(_M_IX86))
#  define COMPILE_HW_PADLOCK
/* Constructor for the engine object; defined further below. */
static ENGINE *ENGINE_padlock (void);
# endif
#endif
110
/* Public entry point: construct the PadLock engine and add it to the
 * ENGINE list.  The local reference is released after ENGINE_add() and
 * the error queue is cleared.  If construction fails nothing is added. */
void ENGINE_load_padlock (void)
{
#ifdef COMPILE_HW_PADLOCK
	ENGINE *engine = ENGINE_padlock ();

	if (engine != NULL) {
		ENGINE_add (engine);
		ENGINE_free (engine);
		ERR_clear_error ();
	}
#endif
	/* Without COMPILE_HW_PADLOCK (e.g. non-x86 CPUs) this is a no-op. */
}
122
123#ifdef COMPILE_HW_PADLOCK
/* We do these includes here to avoid header problems on platforms that
   do not have the VIA padlock anyway... */
#ifdef _MSC_VER
# include <malloc.h>
/* MSVC spells alloca() with a leading underscore. */
# define alloca _alloca
#else
# include <stdlib.h>
#endif

/* Function for ENGINE detection and control */
static int padlock_available(void);	/* CPU probe; sets the padlock_use_* flags */
static int padlock_init(ENGINE *e);	/* ENGINE init callback */

/* RNG Stuff */
static RAND_METHOD padlock_rand;	/* tentative definition; initialised near the end of the file */

/* Cipher Stuff */
#ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid);
#endif

/* Engine names */
static const char *padlock_id = "padlock";
static char padlock_name[100];	/* filled in by padlock_bind_helper() */

/* Available features */
static int padlock_use_ace = 0;	/* Advanced Cryptography Engine */
static int padlock_use_rng = 0;	/* Random Number Generator */
#ifndef OPENSSL_NO_AES
/* Non-zero: buffers handed to the xcrypt instructions must be 16-byte
   aligned.  Nothing in this file ever clears it. */
static int padlock_aes_align_required = 1;
#endif
155
156/* ===== Engine "management" functions ===== */
157
158/* Prepare the ENGINE structure for registration */
159static int
160padlock_bind_helper(ENGINE *e)
161{
162	/* Check available features */
163	padlock_available();
164
165#if 1	/* disable RNG for now, see commentary in vicinity of RNG code */
166	padlock_use_rng=0;
167#endif
168
169	/* Generate a nice engine name with available features */
170	BIO_snprintf(padlock_name, sizeof(padlock_name),
171		"VIA PadLock (%s, %s)",
172		 padlock_use_rng ? "RNG" : "no-RNG",
173		 padlock_use_ace ? "ACE" : "no-ACE");
174
175	/* Register everything or return with an error */
176	if (!ENGINE_set_id(e, padlock_id) ||
177	    !ENGINE_set_name(e, padlock_name) ||
178
179	    !ENGINE_set_init_function(e, padlock_init) ||
180#ifndef OPENSSL_NO_AES
181	    (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
182#endif
183	    (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
184		return 0;
185	}
186
187	/* Everything looks good */
188	return 1;
189}
190
191/* Constructor */
192static ENGINE *
193ENGINE_padlock(void)
194{
195	ENGINE *eng = ENGINE_new();
196
197	if (!eng) {
198		return NULL;
199	}
200
201	if (!padlock_bind_helper(eng)) {
202		ENGINE_free(eng);
203		return NULL;
204	}
205
206	return eng;
207}
208
209/* Check availability of the engine */
210static int
211padlock_init(ENGINE *e)
212{
213	return (padlock_use_rng || padlock_use_ace);
214}
215
/* Everything in this section is only needed when the ENGINE is built
 * as a self-contained shared library.
 */
#ifdef DYNAMIC_ENGINE
/* Bind callback for the dynamic loader.  A NULL id matches any engine;
 * otherwise the id must equal padlock_id.  Returns 1 when the engine
 * was bound successfully, 0 otherwise. */
static int
padlock_bind_fn(ENGINE *e, const char *id)
{
	int id_matches = (id == NULL) || (strcmp(id, padlock_id) == 0);

	return (id_matches && padlock_bind_helper(e)) ? 1 : 0;
}

IMPLEMENT_DYNAMIC_CHECK_FN ();
IMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_fn);
#endif /* DYNAMIC_ENGINE */
237
238/* ===== Here comes the "real" engine ===== */
239
240#ifndef OPENSSL_NO_AES
/* Some AES-related constants (block and key sizes, in bytes) */
#define AES_BLOCK_SIZE		16
#define AES_KEY_SIZE_128	16
#define AES_KEY_SIZE_192	24
#define AES_KEY_SIZE_256	32

/* Here we store the status information relevant to the
   current context. */
/* BIG FAT WARNING:
 * 	Inline assembler in PADLOCK_XCRYPT_ASM()
 * 	depends on the order of items in this structure.
 * 	Don't blindly modify, reorder, etc!
 *
 * 	The assembler addresses the control word at offset 16 and the
 * 	key at offset 32, i.e. it relies on iv[] and cword each being
 * 	16 bytes (cword.pad assumes a 4-byte unsigned int).
 */
struct padlock_cipher_data
{
	unsigned char iv[AES_BLOCK_SIZE];	/* Initialization vector */
	union {	unsigned int pad[4];	/* pads the control word to 4 ints */
		struct {
			int rounds:4;	/* set to 10 + (key bits - 128)/32 */
			int dgst:1;	/* n/a in C3 */
			int align:1;	/* n/a in C3 */
			int ciphr:1;	/* n/a in C3 */
			unsigned int keygen:1;	/* 0: 128-bit key, expanded by hardware;
						   1: key schedule prepared in software */
			int interm:1;
			unsigned int encdec:1;	/* 0: encrypt, 1: decrypt */
			int ksize:2;	/* set to (key bits - 128)/64 */
		} b;
	} cword;		/* Control word */
	AES_KEY ks;		/* Encryption key */
};
271
/*
 * Essentially this variable belongs in thread local storage.
 * Having this variable global on the other hand can only cause
 * few bogus key reloads [if any at all on single-CPU system],
 * so we accept the penalty...
 *
 * It holds the cipher context most recently passed to
 * padlock_verify_context().
 */
static volatile struct padlock_cipher_data *padlock_saved_context;
279#endif
280
281/*
282 * =======================================================
283 * Inline assembler section(s).
284 * =======================================================
285 * Order of arguments is chosen to facilitate Windows port
286 * using __fastcall calling convention. If you wish to add
287 * more routines, keep in mind that first __fastcall
288 * argument is passed in %ecx and second - in %edx.
289 * =======================================================
290 */
291#if defined(__GNUC__) && __GNUC__>=2
292/*
293 * As for excessive "push %ebx"/"pop %ebx" found all over.
294 * When generating position-independent code GCC won't let
295 * us use "b" in assembler templates nor even respect "ebx"
296 * in "clobber description." Therefore the trouble...
297 */
298
/* Helper function - check if a CPUID instruction
   is available on this CPU.
   Returns non-zero if it is, 0 otherwise. */
static int
padlock_insn_cpuid_available(void)
{
	int result = -1;

	/* We're checking if the bit #21 of EFLAGS
	   can be toggled. If yes = CPUID is available. */
	asm volatile (
		"pushf\n"			/* eax = EFLAGS */
		"popl %%eax\n"
		"xorl $0x200000, %%eax\n"	/* flip bit 21 */
		"movl %%eax, %%ecx\n"		/* ecx = the bit-21 value we ask for */
		"andl $0x200000, %%ecx\n"
		"pushl %%eax\n"			/* try to load the modified EFLAGS */
		"popf\n"
		"pushf\n"			/* read EFLAGS back */
		"popl %%eax\n"
		"andl $0x200000, %%eax\n"	/* eax = the bit-21 value we got */
		"xorl %%eax, %%ecx\n"		/* 0 when requested == actual */
		"movl %%ecx, %0\n"
		: "=r" (result) : : "eax", "ecx");

	/* result == 0 means the toggle stuck */
	return (result == 0);
}
325
326/* Load supported features of the CPU to see if
327   the PadLock is available. */
328static int
329padlock_available(void)
330{
331	char vendor_string[16];
332	unsigned int eax, edx;
333
334	/* First check if the CPUID instruction is available at all... */
335	if (! padlock_insn_cpuid_available())
336		return 0;
337
338	/* Are we running on the Centaur (VIA) CPU? */
339	eax = 0x00000000;
340	vendor_string[12] = 0;
341	asm volatile (
342		"pushl	%%ebx\n"
343		"cpuid\n"
344		"movl	%%ebx,(%%edi)\n"
345		"movl	%%edx,4(%%edi)\n"
346		"movl	%%ecx,8(%%edi)\n"
347		"popl	%%ebx"
348		: "+a"(eax) : "D"(vendor_string) : "ecx", "edx");
349	if (strcmp(vendor_string, "CentaurHauls") != 0)
350		return 0;
351
352	/* Check for Centaur Extended Feature Flags presence */
353	eax = 0xC0000000;
354	asm volatile ("pushl %%ebx; cpuid; popl	%%ebx"
355		: "+a"(eax) : : "ecx", "edx");
356	if (eax < 0xC0000001)
357		return 0;
358
359	/* Read the Centaur Extended Feature Flags */
360	eax = 0xC0000001;
361	asm volatile ("pushl %%ebx; cpuid; popl %%ebx"
362		: "+a"(eax), "=d"(edx) : : "ecx");
363
364	/* Fill up some flags */
365	padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
366	padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));
367
368	return padlock_use_ace + padlock_use_rng;
369}
370
371#ifndef OPENSSL_NO_AES
372/* Our own htonl()/ntohl() */
373static inline void
374padlock_bswapl(AES_KEY *ks)
375{
376	size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
377	unsigned int *key = ks->rd_key;
378
379	while (i--) {
380		asm volatile ("bswapl %0" : "+r"(*key));
381		key++;
382	}
383}
384#endif
385
/* Force key reload from memory to the CPU microcode.
   Loading EFLAGS from the stack clears EFLAGS[30]
   which does the trick.  The flags are pushed and immediately
   popped again. */
static inline void
padlock_reload_key(void)
{
	asm volatile ("pushfl; popfl");
}
394
395#ifndef OPENSSL_NO_AES
/*
 * This is heuristic key context tracing. At first one
 * believes that one should use atomic swap instructions,
 * but it's not actually necessary. Point is that if
 * padlock_saved_context was changed by another thread
 * after we've read it and before we compare it with cdata,
 * our key *shall* be reloaded upon thread context switch
 * and we are therefore set in either case...
 *
 * In short: when EFLAGS[30] is set and cdata differs from the
 * context remembered in padlock_saved_context, EFLAGS is reloaded
 * from the stack (which forces a key reload, see
 * padlock_reload_key()).  In every case cdata is then stored in
 * padlock_saved_context.
 */
static inline void
padlock_verify_context(struct padlock_cipher_data *cdata)
{
	asm volatile (
	"pushfl\n"			/* keep a copy of EFLAGS on the stack */
"	btl	$30,(%%esp)\n"		/* CF = EFLAGS[30] */
"	jnc	1f\n"			/* bit clear: skip the reload */
"	cmpl	%2,%1\n"		/* same context as last time? */
"	je	1f\n"
"	popfl\n"			/* different context: reload EFLAGS */
"	subl	$4,%%esp\n"		/* compensate for the addl below */
"1:	addl	$4,%%esp\n"		/* drop the pushed EFLAGS copy */
"	movl	%2,%0"			/* remember cdata */
	:"+m"(padlock_saved_context)
	: "r"(padlock_saved_context), "r"(cdata) : "cc");
}
421
/* Template for padlock_xcrypt_* modes */
/* BIG FAT WARNING:
 * 	The offsets used with 'leal' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
/* Register set-up performed by the template:
 *	%eax = cdata on entry (address of the IV); its value after
 *	       the instruction is returned as 'iv'
 *	%ecx = cnt (callers pass a number of AES blocks)
 *	%edx = cdata+16, the control word
 *	%ebx = cdata+32, the key; saved and restored by hand
 *	%edi = out, %esi = inp
 * 'rep_xcrypt' is the raw opcode string of the instruction to emit.
 */
#define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)	\
static inline void *name(size_t cnt,		\
	struct padlock_cipher_data *cdata,	\
	void *out, const void *inp) 		\
{	void *iv; 				\
	asm volatile ( "pushl	%%ebx\n"	\
		"	leal	16(%0),%%edx\n"	\
		"	leal	32(%0),%%ebx\n"	\
			rep_xcrypt "\n"		\
		"	popl	%%ebx"		\
		: "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
		: "0"(cdata), "1"(cnt), "2"(out), "3"(inp), "m"(*cdata)  \
		: "edx", "cc", "memory");	\
	return iv;				\
}

/* Generate all functions with appropriate opcodes */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")	/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")	/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")	/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")	/* rep xcryptofb */
449#endif
450
/* The RNG call itself.
   Executes 'xstore' with %edi = addr and %edx = edx_in and returns
   the status word the instruction leaves in %eax.
   NOTE(review): the "=m" operand only describes one unsigned int at
   addr, while padlock_rand_bytes() advances its output by 8 bytes per
   call with edx_in == 0 - confirm that the constraint covers enough. */
static inline unsigned int
padlock_xstore(void *addr, unsigned int edx_in)
{
	unsigned int eax_out;

	asm volatile (".byte 0x0f,0xa7,0xc0"	/* xstore */
	    : "=a"(eax_out),"=m"(*(unsigned *)addr)
	    : "D"(addr), "d" (edx_in)
	    );

	return eax_out;
}
464
465/* Why not inline 'rep movsd'? I failed to find information on what
466 * value in Direction Flag one can expect and consequently have to
467 * apply "better-safe-than-sorry" approach and assume "undefined."
468 * I could explicitly clear it and restore the original value upon
469 * return from padlock_aes_cipher, but it's presumably too much
470 * trouble for too little gain...
471 *
472 * In case you wonder 'rep xcrypt*' instructions above are *not*
473 * affected by the Direction Flag and pointers advance toward
474 * larger addresses unconditionally.
475 */
/* Word-wise copy of n bytes from src to dst, n rounded down to a
 * multiple of sizeof(long).  The regions must not overlap.
 * Returns dst.
 *
 * The count is tested before each word is copied: a request shorter
 * than one long (including n == 0) copies nothing instead of wrapping
 * the counter and running off the end of both buffers. */
static inline unsigned char *
padlock_memcpy(void *dst,const void *src,size_t n)
{
	long       *d=dst;
	const long *s=src;

	for (n /= sizeof(*d); n != 0; n--)
		*d++ = *s++;

	return dst;
}
487
488#elif defined(_MSC_VER)
/*
 * Unlike GCC these are real functions. In order to minimize impact
 * on performance we adhere to __fastcall calling convention in
 * order to get two first arguments passed through %ecx and %edx.
 * Which kind of suits very well, as instructions in question use
 * both %ecx and %edx as input:-)
 */
/* Emit 'rep xcrypt*': 0xf3 0x0f 0xa7 followed by the mode byte 'code'. */
#define REP_XCRYPT(code)		\
	_asm _emit 0xf3			\
	_asm _emit 0x0f _asm _emit 0xa7	\
	_asm _emit code

/* BIG FAT WARNING:
 * 	The offsets used with 'lea' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
/* Register set-up: cnt arrives in ecx and cdata in edx (__fastcall);
 * eax = cdata, edx = cdata+16 (control word), ebx = cdata+32 (key),
 * edi = outp, esi = inp.  There is no return statement; the caller
 * receives whatever the instruction leaves in eax. */
#define PADLOCK_XCRYPT_ASM(name,code)	\
static void * __fastcall 		\
	name (size_t cnt, void *cdata,	\
	void *outp, const void *inp)	\
{	_asm	mov	eax,edx		\
	_asm	lea	edx,[eax+16]	\
	_asm	lea	ebx,[eax+32]	\
	_asm	mov	edi,outp	\
	_asm	mov	esi,inp		\
	REP_XCRYPT(code)		\
}

/* One function per mode; the second argument is the mode opcode byte. */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)
522
/* The RNG call itself (MSVC flavour).  __fastcall delivers outp in ecx
 * and code in edx; outp is moved to edi and 'xstore' (0x0f 0xa7 0xc0)
 * is emitted.  There is no return statement: the result is whatever
 * the instruction leaves in eax. */
static int __fastcall
padlock_xstore(void *outp,unsigned int code)
{	_asm	mov	edi,ecx
	_asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}
528
/* Push EFLAGS and pop it straight back (MSVC counterpart of the GCC
 * padlock_reload_key(), which does this to force a key reload). */
static void __fastcall
padlock_reload_key(void)
{	_asm pushfd _asm popfd		}
532
/* Heuristic key context tracing (MSVC flavour); cdata arrives in ecx.
 * When EFLAGS bit 30 is set and cdata differs from
 * padlock_saved_context, EFLAGS is reloaded from the stack via popfd;
 * 'sub esp,4' then compensates for the 'add esp,4' that drops the
 * pushed copy.  In every case cdata is stored in
 * padlock_saved_context. */
static void __fastcall
padlock_verify_context(void *cdata)
{	_asm	{
		pushfd
		bt	DWORD PTR[esp],30
		jnc	skip
		cmp	ecx,padlock_saved_context
		je	skip
		popfd
		sub	esp,4
	skip:	add	esp,4
		mov	padlock_saved_context,ecx
		}
}
547
/* Load supported features of the CPU to see if the PadLock is
 * available (MSVC flavour).
 *
 * Steps: try to toggle EFLAGS bit 21 (CPUID availability), compare the
 * CPUID leaf 0 vendor registers against three 4-character constants,
 * require the highest Centaur extended leaf to be at least 0xC0000001,
 * then test the feature bits of leaf 0xC0000001 in edx: bits 6 and 7
 * both set -> padlock_use_ace = 1, bits 2 and 3 both set ->
 * padlock_use_rng = 1.
 *
 * There is no return statement; the result is the value left in eax,
 * which is incremented once per detected feature after being zeroed.
 */
static int
padlock_available(void)
{	_asm	{
		pushfd
		pop	eax
		mov	ecx,eax
		xor	eax,1<<21
		push	eax
		popfd
		pushfd
		pop	eax
		xor	eax,ecx
		bt	eax,21
		jnc	noluck
		mov	eax,0
		cpuid
		xor	eax,eax
		cmp	ebx,'tneC'
		jne	noluck
		cmp	edx,'Hrua'
		jne	noluck
		cmp	ecx,'slua'
		jne	noluck
		mov	eax,0xC0000000
		cpuid
		mov	edx,eax
		xor	eax,eax
		cmp	edx,0xC0000001
		jb	noluck
		mov	eax,0xC0000001
		cpuid
		xor	eax,eax
		bt	edx,6
		jnc	skip_a
		bt	edx,7
		jnc	skip_a
		mov	padlock_use_ace,1
		inc	eax
	skip_a:	bt	edx,2
		jnc	skip_r
		bt	edx,3
		jnc	skip_r
		mov	padlock_use_rng,1
		inc	eax
	skip_r:
	noluck:
		}
}
596
/* Byte-swap, in place, 60 consecutive 32-bit words starting at key
 * (key arrives in ecx).  EFLAGS is saved and restored around the loop
 * because the direction flag is cleared with cld.
 * NOTE(review): the word count 60 is hard-coded, whereas the GCC
 * variant derives it from sizeof(ks->rd_key) - confirm they agree. */
static void __fastcall
padlock_bswapl(void *key)
{	_asm	{
		pushfd
		cld
		mov	esi,ecx
		mov	edi,ecx
		mov	ecx,60
	up:	lodsd
		bswap	eax
		stosd
		loop	up
		popfd
		}
}
612
/* MS actually specifies status of Direction Flag and compiler even
 * manages to compile following as 'rep movsd' all by itself...
 *
 * The byte count is rounded down to a multiple of 4 before the copy;
 * the result is the destination pointer as unsigned char *.
 */
#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
617#endif
618
619/* ===== AES encryption/decryption ===== */
620#ifndef OPENSSL_NO_AES
621
/* Compatibility aliases: when the headers only provide the
 * NID_aes_*_cfb128 / NID_aes_*_ofb128 spellings, map the shorter
 * NID_aes_*_cfb / NID_aes_*_ofb names used below onto them. */
#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
#define NID_aes_128_cfb	NID_aes_128_cfb128
#endif

#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
#define NID_aes_128_ofb	NID_aes_128_ofb128
#endif

#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
#define NID_aes_192_cfb	NID_aes_192_cfb128
#endif

#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
#define NID_aes_192_ofb	NID_aes_192_ofb128
#endif

#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
#define NID_aes_256_cfb	NID_aes_256_cfb128
#endif

#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
#define NID_aes_256_ofb	NID_aes_256_ofb128
#endif
645
/* List of supported ciphers: AES with 128/192/256-bit keys in ECB,
   CBC, CFB and OFB modes.  padlock_ciphers() hands this array out
   when it is asked for the list of NIDs. */
static int padlock_cipher_nids[] = {
	NID_aes_128_ecb,
	NID_aes_128_cbc,
	NID_aes_128_cfb,
	NID_aes_128_ofb,

	NID_aes_192_ecb,
	NID_aes_192_cbc,
	NID_aes_192_cfb,
	NID_aes_192_ofb,

	NID_aes_256_ecb,
	NID_aes_256_cbc,
	NID_aes_256_cfb,
	NID_aes_256_ofb,
};
/* Number of entries in padlock_cipher_nids[] */
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
				      sizeof(padlock_cipher_nids[0]));
665
/* Function prototypes for the two callbacks stored in every
   EVP_CIPHER declared below: key set-up and the cipher routine. */
static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
				const unsigned char *iv, int enc);
static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
			      const unsigned char *in, size_t nbytes);
671
/* Round ptr up to the next 16-byte boundary; an already aligned
   pointer is returned unchanged.  The result is an unsigned char *. */
#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +		\
	( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )	)
/* The 16-byte aligned padlock_cipher_data living inside
   ctx->cipher_data.  The argument is parenthesised so that any
   pointer expression (not just a plain identifier) expands correctly. */
#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
	NEAREST_ALIGNED((ctx)->cipher_data))
676
/* Block size advertised per mode: a full AES block for ECB and CBC,
   1 for the byte-oriented OFB and CFB modes. */
#define EVP_CIPHER_block_size_ECB	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_CBC	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_OFB	1
#define EVP_CIPHER_block_size_CFB	1

/* Declaring so many ciphers by hand would be a pain.
   Instead introduce a bit of preprocessor magic :-) */
/* DECLARE_AES_EVP(ksize,lmode,umode) defines the static EVP_CIPHER
   padlock_aes_<ksize>_<lmode>.  'lmode' is the lower-case mode name
   used in the NID and the object name, 'umode' the upper-case one
   used for the block size and the mode flag.  The context size is
   sizeof(struct padlock_cipher_data) plus 16 spare bytes, which
   leaves room for ALIGNED_CIPHER_DATA() to round the pointer up to
   a 16-byte boundary. */
#define	DECLARE_AES_EVP(ksize,lmode,umode)	\
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {	\
	NID_aes_##ksize##_##lmode,		\
	EVP_CIPHER_block_size_##umode,	\
	AES_KEY_SIZE_##ksize,		\
	AES_BLOCK_SIZE,			\
	0 | EVP_CIPH_##umode##_MODE,	\
	padlock_aes_init_key,		\
	padlock_aes_cipher,		\
	NULL,				\
	sizeof(struct padlock_cipher_data) + 16,	\
	EVP_CIPHER_set_asn1_iv,		\
	EVP_CIPHER_get_asn1_iv,		\
	NULL,				\
	NULL				\
}

DECLARE_AES_EVP(128,ecb,ECB);
DECLARE_AES_EVP(128,cbc,CBC);
DECLARE_AES_EVP(128,cfb,CFB);
DECLARE_AES_EVP(128,ofb,OFB);

DECLARE_AES_EVP(192,ecb,ECB);
DECLARE_AES_EVP(192,cbc,CBC);
DECLARE_AES_EVP(192,cfb,CFB);
DECLARE_AES_EVP(192,ofb,OFB);

DECLARE_AES_EVP(256,ecb,ECB);
DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);
715
716static int
717padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)
718{
719	/* No specific cipher => return a list of supported nids ... */
720	if (!cipher) {
721		*nids = padlock_cipher_nids;
722		return padlock_cipher_nids_num;
723	}
724
725	/* ... or the requested "cipher" otherwise */
726	switch (nid) {
727	  case NID_aes_128_ecb:
728	    *cipher = &padlock_aes_128_ecb;
729	    break;
730	  case NID_aes_128_cbc:
731	    *cipher = &padlock_aes_128_cbc;
732	    break;
733	  case NID_aes_128_cfb:
734	    *cipher = &padlock_aes_128_cfb;
735	    break;
736	  case NID_aes_128_ofb:
737	    *cipher = &padlock_aes_128_ofb;
738	    break;
739
740	  case NID_aes_192_ecb:
741	    *cipher = &padlock_aes_192_ecb;
742	    break;
743	  case NID_aes_192_cbc:
744	    *cipher = &padlock_aes_192_cbc;
745	    break;
746	  case NID_aes_192_cfb:
747	    *cipher = &padlock_aes_192_cfb;
748	    break;
749	  case NID_aes_192_ofb:
750	    *cipher = &padlock_aes_192_ofb;
751	    break;
752
753	  case NID_aes_256_ecb:
754	    *cipher = &padlock_aes_256_ecb;
755	    break;
756	  case NID_aes_256_cbc:
757	    *cipher = &padlock_aes_256_cbc;
758	    break;
759	  case NID_aes_256_cfb:
760	    *cipher = &padlock_aes_256_cfb;
761	    break;
762	  case NID_aes_256_ofb:
763	    *cipher = &padlock_aes_256_ofb;
764	    break;
765
766	  default:
767	    /* Sorry, we don't support this NID */
768	    *cipher = NULL;
769	    return 0;
770	}
771
772	return 1;
773}
774
775/* Prepare the encryption key for PadLock usage */
776static int
777padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
778		      const unsigned char *iv, int enc)
779{
780	struct padlock_cipher_data *cdata;
781	int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;
782
783	if (key==NULL) return 0;	/* ERROR */
784
785	cdata = ALIGNED_CIPHER_DATA(ctx);
786	memset(cdata, 0, sizeof(struct padlock_cipher_data));
787
788	/* Prepare Control word. */
789	if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
790		cdata->cword.b.encdec = 0;
791	else
792		cdata->cword.b.encdec = (ctx->encrypt == 0);
793	cdata->cword.b.rounds = 10 + (key_len - 128) / 32;
794	cdata->cword.b.ksize = (key_len - 128) / 64;
795
796	switch(key_len) {
797		case 128:
798			/* PadLock can generate an extended key for
799			   AES128 in hardware */
800			memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
801			cdata->cword.b.keygen = 0;
802			break;
803
804		case 192:
805		case 256:
806			/* Generate an extended AES key in software.
807			   Needed for AES192/AES256 */
808			/* Well, the above applies to Stepping 8 CPUs
809			   and is listed as hardware errata. They most
810			   likely will fix it at some point and then
811			   a check for stepping would be due here. */
812			if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
813			    EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE ||
814			    enc)
815				AES_set_encrypt_key(key, key_len, &cdata->ks);
816			else
817				AES_set_decrypt_key(key, key_len, &cdata->ks);
818#ifndef AES_ASM
819			/* OpenSSL C functions use byte-swapped extended key. */
820			padlock_bswapl(&cdata->ks);
821#endif
822			cdata->cword.b.keygen = 1;
823			break;
824
825		default:
826			/* ERROR */
827			return 0;
828	}
829
830	/*
831	 * This is done to cover for cases when user reuses the
832	 * context for new key. The catch is that if we don't do
833	 * this, padlock_eas_cipher might proceed with old key...
834	 */
835	padlock_reload_key ();
836
837	return 1;
838}
839
840/*
841 * Simplified version of padlock_aes_cipher() used when
842 * 1) both input and output buffers are at aligned addresses.
843 * or when
844 * 2) running on a newer CPU that doesn't require aligned buffers.
845 */
846static int
847padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
848		const unsigned char *in_arg, size_t nbytes)
849{
850	struct padlock_cipher_data *cdata;
851	void  *iv;
852
853	cdata = ALIGNED_CIPHER_DATA(ctx);
854	padlock_verify_context(cdata);
855
856	switch (EVP_CIPHER_CTX_mode(ctx)) {
857	case EVP_CIPH_ECB_MODE:
858		padlock_xcrypt_ecb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
859		break;
860
861	case EVP_CIPH_CBC_MODE:
862		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
863		iv = padlock_xcrypt_cbc(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
864		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
865		break;
866
867	case EVP_CIPH_CFB_MODE:
868		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
869		iv = padlock_xcrypt_cfb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
870		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
871		break;
872
873	case EVP_CIPH_OFB_MODE:
874		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
875		padlock_xcrypt_ofb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
876		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
877		break;
878
879	default:
880		return 0;
881	}
882
883	memset(cdata->iv, 0, AES_BLOCK_SIZE);
884
885	return 1;
886}
887
/* Number of bytes pushed through the xcrypt instructions per loop
   iteration when padlock_aes_cipher() has to realign buffers.  May be
   overridden at build time; the check below rejects anything that is
   smaller than 16 or not a power of 2. */
#ifndef  PADLOCK_CHUNK
# define PADLOCK_CHUNK	512	/* Must be a power of 2, at least 16 */
#endif
#if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
# error "insane PADLOCK_CHUNK..."
#endif
894
895/* Re-align the arguments to 16-Bytes boundaries and run the
896   encryption function itself. This function is not AES-specific. */
897static int
898padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
899		   const unsigned char *in_arg, size_t nbytes)
900{
901	struct padlock_cipher_data *cdata;
902	const  void *inp;
903	unsigned char  *out;
904	void  *iv;
905	int    inp_misaligned, out_misaligned, realign_in_loop;
906	size_t chunk, allocated=0;
907
908	/* ctx->num is maintained in byte-oriented modes,
909	   such as CFB and OFB... */
910	if ((chunk = ctx->num)) { /* borrow chunk variable */
911		unsigned char *ivp=ctx->iv;
912
913		switch (EVP_CIPHER_CTX_mode(ctx)) {
914		case EVP_CIPH_CFB_MODE:
915			if (chunk >= AES_BLOCK_SIZE)
916				return 0; /* bogus value */
917
918			if (ctx->encrypt)
919				while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
920					ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
921					chunk++, nbytes--;
922				}
923			else	while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
924					unsigned char c = *(in_arg++);
925					*(out_arg++) = c ^ ivp[chunk];
926					ivp[chunk++] = c, nbytes--;
927				}
928
929			ctx->num = chunk%AES_BLOCK_SIZE;
930			break;
931		case EVP_CIPH_OFB_MODE:
932			if (chunk >= AES_BLOCK_SIZE)
933				return 0; /* bogus value */
934
935			while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
936				*(out_arg++) = *(in_arg++) ^ ivp[chunk];
937				chunk++, nbytes--;
938			}
939
940			ctx->num = chunk%AES_BLOCK_SIZE;
941			break;
942		}
943	}
944
945	if (nbytes == 0)
946		return 1;
947#if 0
948	if (nbytes % AES_BLOCK_SIZE)
949		return 0; /* are we expected to do tail processing? */
950#else
951	/* nbytes is always multiple of AES_BLOCK_SIZE in ECB and CBC
952	   modes and arbitrary value in byte-oriented modes, such as
953	   CFB and OFB... */
954#endif
955
956	/* VIA promises CPUs that won't require alignment in the future.
957	   For now padlock_aes_align_required is initialized to 1 and
958	   the condition is never met... */
959	/* C7 core is capable to manage unaligned input in non-ECB[!]
960	   mode, but performance penalties appear to be approximately
961	   same as for software alignment below or ~3x. They promise to
962	   improve it in the future, but for now we can just as well
963	   pretend that it can only handle aligned input... */
964	if (!padlock_aes_align_required && (nbytes%AES_BLOCK_SIZE)==0)
965		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
966
967	inp_misaligned = (((size_t)in_arg) & 0x0F);
968	out_misaligned = (((size_t)out_arg) & 0x0F);
969
970	/* Note that even if output is aligned and input not,
971	 * I still prefer to loop instead of copy the whole
972	 * input and then encrypt in one stroke. This is done
973	 * in order to improve L1 cache utilization... */
974	realign_in_loop = out_misaligned|inp_misaligned;
975
976	if (!realign_in_loop && (nbytes%AES_BLOCK_SIZE)==0)
977		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
978
979	/* this takes one "if" out of the loops */
980	chunk  = nbytes;
981	chunk %= PADLOCK_CHUNK;
982	if (chunk==0) chunk = PADLOCK_CHUNK;
983
984	if (out_misaligned) {
985		/* optmize for small input */
986		allocated = (chunk<nbytes?PADLOCK_CHUNK:nbytes);
987		out = alloca(0x10 + allocated);
988		out = NEAREST_ALIGNED(out);
989	}
990	else
991		out = out_arg;
992
993	cdata = ALIGNED_CIPHER_DATA(ctx);
994	padlock_verify_context(cdata);
995
996	switch (EVP_CIPHER_CTX_mode(ctx)) {
997	case EVP_CIPH_ECB_MODE:
998		do	{
999			if (inp_misaligned)
1000				inp = padlock_memcpy(out, in_arg, chunk);
1001			else
1002				inp = in_arg;
1003			in_arg += chunk;
1004
1005			padlock_xcrypt_ecb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
1006
1007			if (out_misaligned)
1008				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1009			else
1010				out     = out_arg+=chunk;
1011
1012			nbytes -= chunk;
1013			chunk   = PADLOCK_CHUNK;
1014		} while (nbytes);
1015		break;
1016
1017	case EVP_CIPH_CBC_MODE:
1018		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1019		goto cbc_shortcut;
1020		do	{
1021			if (iv != cdata->iv)
1022				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
1023			chunk = PADLOCK_CHUNK;
1024		cbc_shortcut: /* optimize for small input */
1025			if (inp_misaligned)
1026				inp = padlock_memcpy(out, in_arg, chunk);
1027			else
1028				inp = in_arg;
1029			in_arg += chunk;
1030
1031			iv = padlock_xcrypt_cbc(chunk/AES_BLOCK_SIZE, cdata, out, inp);
1032
1033			if (out_misaligned)
1034				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1035			else
1036				out     = out_arg+=chunk;
1037
1038		} while (nbytes -= chunk);
1039		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
1040		break;
1041
1042	case EVP_CIPH_CFB_MODE:
1043		memcpy (iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1044		chunk &= ~(AES_BLOCK_SIZE-1);
1045		if (chunk)	goto cfb_shortcut;
1046		else		goto cfb_skiploop;
1047		do	{
1048			if (iv != cdata->iv)
1049				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
1050			chunk = PADLOCK_CHUNK;
1051		cfb_shortcut: /* optimize for small input */
1052			if (inp_misaligned)
1053				inp = padlock_memcpy(out, in_arg, chunk);
1054			else
1055				inp = in_arg;
1056			in_arg += chunk;
1057
1058			iv = padlock_xcrypt_cfb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
1059
1060			if (out_misaligned)
1061				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1062			else
1063				out     = out_arg+=chunk;
1064
1065			nbytes -= chunk;
1066		} while (nbytes >= AES_BLOCK_SIZE);
1067
1068		cfb_skiploop:
1069		if (nbytes) {
1070			unsigned char *ivp = cdata->iv;
1071
1072			if (iv != ivp) {
1073				memcpy(ivp, iv, AES_BLOCK_SIZE);
1074				iv = ivp;
1075			}
1076			ctx->num = nbytes;
1077			if (cdata->cword.b.encdec) {
1078				cdata->cword.b.encdec=0;
1079				padlock_reload_key();
1080				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
1081				cdata->cword.b.encdec=1;
1082				padlock_reload_key();
1083				while(nbytes) {
1084					unsigned char c = *(in_arg++);
1085					*(out_arg++) = c ^ *ivp;
1086					*(ivp++) = c, nbytes--;
1087				}
1088			}
1089			else {	padlock_reload_key();
1090				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
1091				padlock_reload_key();
1092				while (nbytes) {
1093					*ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
1094					ivp++, nbytes--;
1095				}
1096			}
1097		}
1098
1099		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
1100		break;
1101
1102	case EVP_CIPH_OFB_MODE:
1103		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
1104		chunk &= ~(AES_BLOCK_SIZE-1);
1105		if (chunk) do	{
1106			if (inp_misaligned)
1107				inp = padlock_memcpy(out, in_arg, chunk);
1108			else
1109				inp = in_arg;
1110			in_arg += chunk;
1111
1112			padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
1113
1114			if (out_misaligned)
1115				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
1116			else
1117				out     = out_arg+=chunk;
1118
1119			nbytes -= chunk;
1120			chunk   = PADLOCK_CHUNK;
1121		} while (nbytes >= AES_BLOCK_SIZE);
1122
1123		if (nbytes) {
1124			unsigned char *ivp = cdata->iv;
1125
1126			ctx->num = nbytes;
1127			padlock_reload_key();	/* empirically found */
1128			padlock_xcrypt_ecb(1,cdata,ivp,ivp);
1129			padlock_reload_key();	/* empirically found */
1130			while (nbytes) {
1131				*(out_arg++) = *(in_arg++) ^ *ivp;
1132				ivp++, nbytes--;
1133			}
1134		}
1135
1136		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
1137		break;
1138
1139	default:
1140		return 0;
1141	}
1142
1143	/* Clean the realign buffer if it was used */
1144	if (out_misaligned) {
1145		volatile unsigned long *p=(void *)out;
1146		size_t   n = allocated/sizeof(*p);
1147		while (n--) *p++=0;
1148	}
1149
1150	memset(cdata->iv, 0, AES_BLOCK_SIZE);
1151
1152	return 1;
1153}
1154
1155#endif /* OPENSSL_NO_AES */
1156
1157/* ===== Random Number Generator ===== */
1158/*
1159 * This code is not engaged. The reason is that it does not comply
1160 * with recommendations for VIA RNG usage for secure applications
1161 * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
1162 * provide meaningful error control...
1163 */
/* Classify an xstore status word.
   Returns -1 on a fatal condition, 0 when no data was delivered
   (caller should retry) and 1 when exactly 'expected' bytes were
   stored. */
static int
padlock_rng_classify(unsigned int status, unsigned int expected)
{
	unsigned int stored = status & 0x1F;

	if (!(status&(1<<6)))
		return -1;	/* RNG disabled */
	/* this ---vv--- covers DC bias, Raw Bits and String Filter */
	if (status&(0x1F<<10))
		return -1;
	if (stored == 0)
		return 0;	/* no data, retry... */

	return (stored == expected) ? 1 : -1;	/* else fatal failure... */
}

/* Wrapper that provides an interface between the API and
   the raw PadLock RNG.
   Fills 'output' with 'count' bytes: 8 at a time directly into the
   caller's buffer while at least 8 remain, then one byte at a time
   through a scratch word.  Returns 1 on success, 0 on failure. */
static int
padlock_rand_bytes(unsigned char *output, int count)
{
	unsigned int scratch;
	int verdict;

	while (count >= 8) {
		verdict = padlock_rng_classify(padlock_xstore(output, 0), 8);
		if (verdict < 0)
			return 0;
		if (verdict == 0)
			continue;
		output += 8;
		count  -= 8;
	}

	while (count > 0) {
		verdict = padlock_rng_classify(padlock_xstore(&scratch, 3), 1);
		if (verdict < 0)
			return 0;
		if (verdict == 0)
			continue;
		*output++ = (unsigned char)scratch;
		count--;
	}

	/* Wipe the scratch word; the volatile access keeps the store */
	*(volatile unsigned int *)&scratch=0;

	return 1;
}
1195
/* Dummy but necessary function: status callback for the RAND_METHOD
   below.  Unconditionally returns 1. */
static int
padlock_rand_status(void)
{
	return 1;
}
1202
/* Prepare structure for registration.
   Only the bytes, pseudorand and status slots are populated; both
   byte-producing slots share padlock_rand_bytes().
   padlock_bind_helper() installs this method only when
   padlock_use_rng is set. */
static RAND_METHOD padlock_rand = {
	NULL,			/* seed */
	padlock_rand_bytes,	/* bytes */
	NULL,			/* cleanup */
	NULL,			/* add */
	padlock_rand_bytes,	/* pseudorand */
	padlock_rand_status,	/* rand status */
};
1212
1213#endif /* COMPILE_HW_PADLOCK */
1214
1215#endif /* !OPENSSL_NO_HW_PADLOCK */
1216#endif /* !OPENSSL_NO_HW */
1217