1238384Sjkim/*
2238384Sjkim * Support for VIA PadLock Advanced Cryptography Engine (ACE)
3238384Sjkim * Written by Michal Ludvig <michal@logix.cz>
4238384Sjkim *            http://www.logix.cz/michal
5238384Sjkim *
6238384Sjkim * Big thanks to Andy Polyakov for a help with optimization,
7238384Sjkim * assembler fixes, port to MS Windows and a lot of other
8238384Sjkim * valuable work on this engine!
9238384Sjkim */
10238384Sjkim
11238384Sjkim/* ====================================================================
12238384Sjkim * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
13238384Sjkim *
14238384Sjkim * Redistribution and use in source and binary forms, with or without
15238384Sjkim * modification, are permitted provided that the following conditions
16238384Sjkim * are met:
17238384Sjkim *
18238384Sjkim * 1. Redistributions of source code must retain the above copyright
19238384Sjkim *    notice, this list of conditions and the following disclaimer.
20238384Sjkim *
21238384Sjkim * 2. Redistributions in binary form must reproduce the above copyright
22238384Sjkim *    notice, this list of conditions and the following disclaimer in
23238384Sjkim *    the documentation and/or other materials provided with the
24238384Sjkim *    distribution.
25238384Sjkim *
26238384Sjkim * 3. All advertising materials mentioning features or use of this
27238384Sjkim *    software must display the following acknowledgment:
28238384Sjkim *    "This product includes software developed by the OpenSSL Project
29238384Sjkim *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
30238384Sjkim *
31238384Sjkim * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
32238384Sjkim *    endorse or promote products derived from this software without
33238384Sjkim *    prior written permission. For written permission, please contact
34238384Sjkim *    licensing@OpenSSL.org.
35238384Sjkim *
36238384Sjkim * 5. Products derived from this software may not be called "OpenSSL"
37238384Sjkim *    nor may "OpenSSL" appear in their names without prior written
38238384Sjkim *    permission of the OpenSSL Project.
39238384Sjkim *
40238384Sjkim * 6. Redistributions of any form whatsoever must retain the following
41238384Sjkim *    acknowledgment:
42238384Sjkim *    "This product includes software developed by the OpenSSL Project
43238384Sjkim *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
44238384Sjkim *
45238384Sjkim * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
46238384Sjkim * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47238384Sjkim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48238384Sjkim * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
49238384Sjkim * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
50238384Sjkim * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51238384Sjkim * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
52238384Sjkim * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53238384Sjkim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
54238384Sjkim * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
55238384Sjkim * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
56238384Sjkim * OF THE POSSIBILITY OF SUCH DAMAGE.
57238384Sjkim * ====================================================================
58238384Sjkim *
59238384Sjkim * This product includes cryptographic software written by Eric Young
60238384Sjkim * (eay@cryptsoft.com).  This product includes software written by Tim
61238384Sjkim * Hudson (tjh@cryptsoft.com).
62238384Sjkim *
63238384Sjkim */
64238384Sjkim
65238384Sjkim
66238384Sjkim#include <stdio.h>
67238384Sjkim#include <string.h>
68238384Sjkim
69238384Sjkim#include <openssl/opensslconf.h>
70238384Sjkim#include <openssl/crypto.h>
71238384Sjkim#include <openssl/dso.h>
72238384Sjkim#include <openssl/engine.h>
73238384Sjkim#include <openssl/evp.h>
74238384Sjkim#ifndef OPENSSL_NO_AES
75238384Sjkim#include <openssl/aes.h>
76238384Sjkim#endif
77238384Sjkim#include <openssl/rand.h>
78238384Sjkim#include <openssl/err.h>
79238384Sjkim
80238384Sjkim#ifndef OPENSSL_NO_HW
81238384Sjkim#ifndef OPENSSL_NO_HW_PADLOCK
82238384Sjkim
83238384Sjkim/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
84238384Sjkim#if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
85238384Sjkim#  ifndef OPENSSL_NO_DYNAMIC_ENGINE
86238384Sjkim#    define DYNAMIC_ENGINE
87238384Sjkim#  endif
88238384Sjkim#elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
89238384Sjkim#  ifdef ENGINE_DYNAMIC_SUPPORT
90238384Sjkim#    define DYNAMIC_ENGINE
91238384Sjkim#  endif
92238384Sjkim#else
93238384Sjkim#  error "Only OpenSSL >= 0.9.7 is supported"
94238384Sjkim#endif
95238384Sjkim
96238384Sjkim/* VIA PadLock AES is available *ONLY* on some x86 CPUs.
97238384Sjkim   Not only that it doesn't exist elsewhere, but it
98238384Sjkim   even can't be compiled on other platforms!
99238384Sjkim
100238384Sjkim   In addition, because of the heavy use of inline assembler,
101238384Sjkim   compiler choice is limited to GCC and Microsoft C. */
102238384Sjkim#undef COMPILE_HW_PADLOCK
103238384Sjkim#if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
104238384Sjkim# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
105238384Sjkim     (defined(_MSC_VER) && defined(_M_IX86))
106238384Sjkim#  define COMPILE_HW_PADLOCK
107238384Sjkim# endif
108238384Sjkim#endif
109238384Sjkim
110238384Sjkim#ifdef OPENSSL_NO_DYNAMIC_ENGINE
111238384Sjkim#ifdef COMPILE_HW_PADLOCK
112238384Sjkimstatic ENGINE *ENGINE_padlock (void);
113238384Sjkim#endif
114238384Sjkim
/* Public entry point: construct the PadLock engine, hand it to the
 * ENGINE list, and drop our own reference.  Errors (e.g. no PadLock
 * hardware) are swallowed — the engine simply is not registered. */
void ENGINE_load_padlock (void)
{
/* On non-x86 CPUs it just returns. */
#ifdef COMPILE_HW_PADLOCK
	ENGINE *e = ENGINE_padlock ();

	if (e != NULL) {
		ENGINE_add (e);
		ENGINE_free (e);	/* ENGINE_add took its own reference */
		ERR_clear_error ();
	}
#endif
}
126238384Sjkim
127238384Sjkim#endif
128238384Sjkim
129238384Sjkim#ifdef COMPILE_HW_PADLOCK
130238384Sjkim/* We do these includes here to avoid header problems on platforms that
131238384Sjkim   do not have the VIA padlock anyway... */
132238384Sjkim#include <stdlib.h>
133238384Sjkim#ifdef _WIN32
134238384Sjkim# include <malloc.h>
135238384Sjkim# ifndef alloca
136238384Sjkim#  define alloca _alloca
137238384Sjkim# endif
138238384Sjkim#elif defined(__GNUC__)
139238384Sjkim# ifndef alloca
140238384Sjkim#  define alloca(s) __builtin_alloca(s)
141238384Sjkim# endif
142238384Sjkim#endif
143238384Sjkim
/* Function for ENGINE detection and control */
static int padlock_available(void);	/* CPUID probe; sets the feature flags below */
static int padlock_init(ENGINE *e);

/* RNG Stuff */
static RAND_METHOD padlock_rand;

/* Cipher Stuff */
#ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid);
#endif

/* Engine names */
static const char *padlock_id = "padlock";
static char padlock_name[100];	/* composed at bind time from detected features */

/* Available features, filled in by padlock_available() */
static int padlock_use_ace = 0;	/* Advanced Cryptography Engine */
static int padlock_use_rng = 0;	/* Random Number Generator */
#ifndef OPENSSL_NO_AES
static int padlock_aes_align_required = 1;
#endif
166238384Sjkim
167238384Sjkim/* ===== Engine "management" functions ===== */
168238384Sjkim
169238384Sjkim/* Prepare the ENGINE structure for registration */
170238384Sjkimstatic int
171238384Sjkimpadlock_bind_helper(ENGINE *e)
172238384Sjkim{
173238384Sjkim	/* Check available features */
174238384Sjkim	padlock_available();
175238384Sjkim
176238384Sjkim#if 1	/* disable RNG for now, see commentary in vicinity of RNG code */
177238384Sjkim	padlock_use_rng=0;
178238384Sjkim#endif
179238384Sjkim
180238384Sjkim	/* Generate a nice engine name with available features */
181238384Sjkim	BIO_snprintf(padlock_name, sizeof(padlock_name),
182238384Sjkim		"VIA PadLock (%s, %s)",
183238384Sjkim		 padlock_use_rng ? "RNG" : "no-RNG",
184238384Sjkim		 padlock_use_ace ? "ACE" : "no-ACE");
185238384Sjkim
186238384Sjkim	/* Register everything or return with an error */
187238384Sjkim	if (!ENGINE_set_id(e, padlock_id) ||
188238384Sjkim	    !ENGINE_set_name(e, padlock_name) ||
189238384Sjkim
190238384Sjkim	    !ENGINE_set_init_function(e, padlock_init) ||
191238384Sjkim#ifndef OPENSSL_NO_AES
192238384Sjkim	    (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
193238384Sjkim#endif
194238384Sjkim	    (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
195238384Sjkim		return 0;
196238384Sjkim	}
197238384Sjkim
198238384Sjkim	/* Everything looks good */
199238384Sjkim	return 1;
200238384Sjkim}
201238384Sjkim
202238384Sjkim#ifdef OPENSSL_NO_DYNAMIC_ENGINE
203238384Sjkim
204238384Sjkim/* Constructor */
205238384Sjkimstatic ENGINE *
206238384SjkimENGINE_padlock(void)
207238384Sjkim{
208238384Sjkim	ENGINE *eng = ENGINE_new();
209238384Sjkim
210238384Sjkim	if (!eng) {
211238384Sjkim		return NULL;
212238384Sjkim	}
213238384Sjkim
214238384Sjkim	if (!padlock_bind_helper(eng)) {
215238384Sjkim		ENGINE_free(eng);
216238384Sjkim		return NULL;
217238384Sjkim	}
218238384Sjkim
219238384Sjkim	return eng;
220238384Sjkim}
221238384Sjkim
222238384Sjkim#endif
223238384Sjkim
224238384Sjkim/* Check availability of the engine */
225238384Sjkimstatic int
226238384Sjkimpadlock_init(ENGINE *e)
227238384Sjkim{
228238384Sjkim	return (padlock_use_rng || padlock_use_ace);
229238384Sjkim}
230238384Sjkim
231238384Sjkim/* This stuff is needed if this ENGINE is being compiled into a self-contained
232238384Sjkim * shared-library.
233238384Sjkim */
234238384Sjkim#ifdef DYNAMIC_ENGINE
235238384Sjkimstatic int
236238384Sjkimpadlock_bind_fn(ENGINE *e, const char *id)
237238384Sjkim{
238238384Sjkim	if (id && (strcmp(id, padlock_id) != 0)) {
239238384Sjkim		return 0;
240238384Sjkim	}
241238384Sjkim
242238384Sjkim	if (!padlock_bind_helper(e))  {
243238384Sjkim		return 0;
244238384Sjkim	}
245238384Sjkim
246238384Sjkim	return 1;
247238384Sjkim}
248238384Sjkim
249238384SjkimIMPLEMENT_DYNAMIC_CHECK_FN()
250238384SjkimIMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_fn)
251238384Sjkim#endif /* DYNAMIC_ENGINE */
252238384Sjkim
253238384Sjkim/* ===== Here comes the "real" engine ===== */
254238384Sjkim
255238384Sjkim#ifndef OPENSSL_NO_AES
256238384Sjkim/* Some AES-related constants */
257238384Sjkim#define AES_BLOCK_SIZE		16
258238384Sjkim#define AES_KEY_SIZE_128	16
259238384Sjkim#define AES_KEY_SIZE_192	24
260238384Sjkim#define AES_KEY_SIZE_256	32
261238384Sjkim
262238384Sjkim/* Here we store the status information relevant to the
263238384Sjkim   current context. */
264238384Sjkim/* BIG FAT WARNING:
265238384Sjkim * 	Inline assembler in PADLOCK_XCRYPT_ASM()
266238384Sjkim * 	depends on the order of items in this structure.
267238384Sjkim * 	Don't blindly modify, reorder, etc!
268238384Sjkim */
struct padlock_cipher_data
{
	unsigned char iv[AES_BLOCK_SIZE];	/* Initialization vector */
	union {	unsigned int pad[4];
		struct {
			int rounds:4;	/* AES rounds: 10/12/14 for 128/192/256-bit keys */
			int dgst:1;	/* n/a in C3 */
			int align:1;	/* n/a in C3 */
			int ciphr:1;	/* n/a in C3 */
			unsigned int keygen:1;	/* 1 = key schedule supplied in ks below */
			int interm:1;
			unsigned int encdec:1;	/* 1 = decrypt, 0 = encrypt */
			int ksize:2;	/* (key bits - 128) / 64 */
		} b;
	} cword;		/* Control word */
	AES_KEY ks;		/* Encryption key */
};

/*
 * Essentially this variable belongs in thread local storage.
 * Having this variable global on the other hand can only cause
 * few bogus key reloads [if any at all on single-CPU system],
 * so we accept the penalty...
 */
static volatile struct padlock_cipher_data *padlock_saved_context;
294238384Sjkim#endif
295238384Sjkim
296238384Sjkim/*
297238384Sjkim * =======================================================
298238384Sjkim * Inline assembler section(s).
299238384Sjkim * =======================================================
300238384Sjkim * Order of arguments is chosen to facilitate Windows port
301238384Sjkim * using __fastcall calling convention. If you wish to add
302238384Sjkim * more routines, keep in mind that first __fastcall
303238384Sjkim * argument is passed in %ecx and second - in %edx.
304238384Sjkim * =======================================================
305238384Sjkim */
306238384Sjkim#if defined(__GNUC__) && __GNUC__>=2
307238384Sjkim/*
308238384Sjkim * As for excessive "push %ebx"/"pop %ebx" found all over.
309238384Sjkim * When generating position-independent code GCC won't let
310238384Sjkim * us use "b" in assembler templates nor even respect "ebx"
311238384Sjkim * in "clobber description." Therefore the trouble...
312238384Sjkim */
313238384Sjkim
314238384Sjkim/* Helper function - check if a CPUID instruction
315238384Sjkim   is available on this CPU */
static int
padlock_insn_cpuid_available(void)
{
	int result = -1;	/* defensive initial value */

	/* We're checking if the bit #21 of EFLAGS
	   can be toggled. If yes = CPUID is available. */
	asm volatile (
		"pushf\n"
		"popl %%eax\n"			/* eax = current EFLAGS */
		"xorl $0x200000, %%eax\n"	/* flip the ID bit (#21) */
		"movl %%eax, %%ecx\n"
		"andl $0x200000, %%ecx\n"	/* ecx = flipped ID bit */
		"pushl %%eax\n"
		"popf\n"			/* try to write it back */
		"pushf\n"
		"popl %%eax\n"			/* re-read EFLAGS */
		"andl $0x200000, %%eax\n"
		"xorl %%eax, %%ecx\n"		/* 0 iff the flip stuck */
		"movl %%ecx, %0\n"
		: "=r" (result) : : "eax", "ecx");

	return (result == 0);	/* toggled OK => CPUID supported */
}
340238384Sjkim
341238384Sjkim/* Load supported features of the CPU to see if
342238384Sjkim   the PadLock is available. */
static int
padlock_available(void)
{
	char vendor_string[16];
	unsigned int eax, edx;

	/* First check if the CPUID instruction is available at all... */
	if (! padlock_insn_cpuid_available())
		return 0;

	/* Are we running on the Centaur (VIA) CPU? */
	eax = 0x00000000;		/* CPUID leaf 0: vendor string */
	vendor_string[12] = 0;		/* terminate the 12 bytes stored below */
	asm volatile (
		"pushl	%%ebx\n"	/* %ebx is reserved under PIC, see above */
		"cpuid\n"
		"movl	%%ebx,(%%edi)\n"
		"movl	%%edx,4(%%edi)\n"
		"movl	%%ecx,8(%%edi)\n"
		"popl	%%ebx"
		: "+a"(eax) : "D"(vendor_string) : "ecx", "edx");
	if (strcmp(vendor_string, "CentaurHauls") != 0)
		return 0;

	/* Check for Centaur Extended Feature Flags presence */
	eax = 0xC0000000;
	asm volatile ("pushl %%ebx; cpuid; popl	%%ebx"
		: "+a"(eax) : : "ecx", "edx");
	if (eax < 0xC0000001)
		return 0;

	/* Read the Centaur Extended Feature Flags */
	eax = 0xC0000001;
	asm volatile ("pushl %%ebx; cpuid; popl %%ebx"
		: "+a"(eax), "=d"(edx) : : "ecx");

	/* Fill up some flags: both bits of each pair must be set
	   (ACE: bits 6+7, RNG: bits 2+3; presumably "present" and
	   "enabled" -- see VIA docs). */
	padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
	padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));

	/* Number of usable PadLock units (0, 1 or 2). */
	return padlock_use_ace + padlock_use_rng;
}
385238384Sjkim
386238384Sjkim#ifndef OPENSSL_NO_AES
387279264Sdelphij#ifndef AES_ASM
388238384Sjkim/* Our own htonl()/ntohl() */
static inline void
padlock_bswapl(AES_KEY *ks)
{
	size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);	/* word count */
	unsigned int *key = ks->rd_key;

	/* Byte-swap every 32-bit word of the expanded key in place. */
	while (i--) {
		asm volatile ("bswapl %0" : "+r"(*key));
		key++;
	}
}
400238384Sjkim#endif
401279264Sdelphij#endif
402238384Sjkim
403238384Sjkim/* Force key reload from memory to the CPU microcode.
404238384Sjkim   Loading EFLAGS from the stack clears EFLAGS[30]
405238384Sjkim   which does the trick. */
static inline void
padlock_reload_key(void)
{
	/* Any EFLAGS load clears EFLAGS[30], which forces the CPU
	   to re-read the key material from memory (see above). */
	asm volatile ("pushfl; popfl");
}
411238384Sjkim
412238384Sjkim#ifndef OPENSSL_NO_AES
413238384Sjkim/*
414238384Sjkim * This is heuristic key context tracing. At first one
415238384Sjkim * believes that one should use atomic swap instructions,
416238384Sjkim * but it's not actually necessary. Point is that if
417238384Sjkim * padlock_saved_context was changed by another thread
418238384Sjkim * after we've read it and before we compare it with cdata,
419238384Sjkim * our key *shall* be reloaded upon thread context switch
420238384Sjkim * and we are therefore set in either case...
421238384Sjkim */
static inline void
padlock_verify_context(struct padlock_cipher_data *cdata)
{
	/* If EFLAGS[30] is still set (no reload pending) and the saved
	 * context differs from cdata, pop the pushed EFLAGS image —
	 * loading EFLAGS clears bit 30 and forces a key reload.  In
	 * every case record cdata as the current context. */
	asm volatile (
	"pushfl\n"
"	btl	$30,(%%esp)\n"
"	jnc	1f\n"
"	cmpl	%2,%1\n"
"	je	1f\n"
"	popfl\n"
"	subl	$4,%%esp\n"
"1:	addl	$4,%%esp\n"
"	movl	%2,%0"
	:"+m"(padlock_saved_context)
	: "r"(padlock_saved_context), "r"(cdata) : "cc");
}
438238384Sjkim
439238384Sjkim/* Template for padlock_xcrypt_* modes */
440238384Sjkim/* BIG FAT WARNING:
441238384Sjkim * 	The offsets used with 'leal' instructions
442238384Sjkim * 	describe items of the 'padlock_cipher_data'
443238384Sjkim * 	structure.
444238384Sjkim */
/* Register setup for 'rep xcrypt*': %eax -> cdata (whose first member
 * is the IV), %edx -> control word (offset 16), %ebx -> key schedule
 * (offset 32), %ecx = block count, %esi = input, %edi = output.  The
 * value left in %eax is returned as 'iv'. */
#define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)	\
static inline void *name(size_t cnt,		\
	struct padlock_cipher_data *cdata,	\
	void *out, const void *inp) 		\
{	void *iv; 				\
	asm volatile ( "pushl	%%ebx\n"	\
		"	leal	16(%0),%%edx\n"	\
		"	leal	32(%0),%%ebx\n"	\
			rep_xcrypt "\n"		\
		"	popl	%%ebx"		\
		: "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
		: "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
		: "edx", "cc", "memory");	\
	return iv;				\
}

/* Generate all functions with appropriate opcodes */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")	/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")	/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")	/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")	/* rep xcryptofb */
466238384Sjkim#endif
467238384Sjkim
468238384Sjkim/* The RNG call itself */
static inline unsigned int
padlock_xstore(void *addr, unsigned int edx_in)
{
	unsigned int eax_out;

	/* Store hardware random bytes at 'addr' (%edi).  'edx_in' is
	 * the instruction's %edx control input (presumably the quality
	 * divisor — confirm against VIA docs); the instruction's %eax
	 * status word is returned. */
	asm volatile (".byte 0x0f,0xa7,0xc0"	/* xstore */
	    : "=a"(eax_out),"=m"(*(unsigned *)addr)
	    : "D"(addr), "d" (edx_in)
	    );

	return eax_out;
}
481238384Sjkim
482238384Sjkim/* Why not inline 'rep movsd'? I failed to find information on what
483238384Sjkim * value in Direction Flag one can expect and consequently have to
484238384Sjkim * apply "better-safe-than-sorry" approach and assume "undefined."
485238384Sjkim * I could explicitly clear it and restore the original value upon
486238384Sjkim * return from padlock_aes_cipher, but it's presumably too much
487238384Sjkim * trouble for too little gain...
488238384Sjkim *
489238384Sjkim * In case you wonder 'rep xcrypt*' instructions above are *not*
490238384Sjkim * affected by the Direction Flag and pointers advance toward
491238384Sjkim * larger addresses unconditionally.
492238384Sjkim */
/* Word-wise copy of n bytes from src to dst; returns dst.
 * n is expected to be a multiple of sizeof(long) — the engine only
 * calls this with AES-block-sized counts — any remainder bytes are
 * not copied.  Both pointers must be suitably aligned for 'long'. */
static inline unsigned char *
padlock_memcpy(void *dst,const void *src,size_t n)
{
	long       *d=dst;
	const long *s=src;

	/* Pre-tested loop: the previous do/while executed its body once
	 * even for n < sizeof(long), underflowing the counter and
	 * overrunning both buffers when called with a zero count. */
	n /= sizeof(*d);
	while (n--)
		*d++ = *s++;

	return dst;
}
504238384Sjkim
505238384Sjkim#elif defined(_MSC_VER)
506238384Sjkim/*
507238384Sjkim * Unlike GCC these are real functions. In order to minimize impact
508238384Sjkim * on performance we adhere to __fastcall calling convention in
509238384Sjkim * order to get two first arguments passed through %ecx and %edx.
510238384Sjkim * Which kind of suits very well, as instructions in question use
511238384Sjkim * both %ecx and %edx as input:-)
512238384Sjkim */
/* Emit REP + the two-byte 0F A7 /code PadLock "xcrypt" opcode by hand,
 * since MSVC's inline assembler has no mnemonic for it. */
#define REP_XCRYPT(code)		\
	_asm _emit 0xf3			\
	_asm _emit 0x0f _asm _emit 0xa7	\
	_asm _emit code

/* BIG FAT WARNING:
 * 	The offsets used with 'lea' instructions
 * 	describe items of the 'padlock_cipher_data'
 * 	structure.
 */
/* __fastcall delivers cnt in ecx and cdata in edx — exactly the
 * registers the xcrypt instructions consume (see the note above). */
#define PADLOCK_XCRYPT_ASM(name,code)	\
static void * __fastcall 		\
	name (size_t cnt, void *cdata,	\
	void *outp, const void *inp)	\
{	_asm	mov	eax,edx		\
	_asm	lea	edx,[eax+16]	\
	_asm	lea	ebx,[eax+32]	\
	_asm	mov	edi,outp	\
	_asm	mov	esi,inp		\
	REP_XCRYPT(code)		\
}

/* One function per supported chaining mode. */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)
539238384Sjkim
/* xstore: store hardware random bytes at outp (edi), with 'code' as
 * the %edx control input; the instruction's eax status is returned. */
static int __fastcall
padlock_xstore(void *outp,unsigned int code)
{	_asm	mov	edi,ecx
	_asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}

/* Loading EFLAGS from the stack clears EFLAGS[30],
 * forcing the CPU to re-read the key from memory. */
static void __fastcall
padlock_reload_key(void)
{	_asm pushfd _asm popfd		}
549238384Sjkim
/* MSVC twin of the GCC padlock_verify_context() above: heuristic
 * key-context tracking — see the commentary near the GCC version. */
static void __fastcall
padlock_verify_context(void *cdata)
{	_asm	{
		pushfd
		/* If EFLAGS[30] is set and the saved context differs from
		   cdata (ecx), pop the flags image to clear EFLAGS[30]
		   and force a key reload; always record cdata. */
		bt	DWORD PTR[esp],30
		jnc	skip
		cmp	ecx,padlock_saved_context
		je	skip
		popfd
		sub	esp,4
	skip:	add	esp,4
		mov	padlock_saved_context,ecx
		}
}
564238384Sjkim
/* MSVC twin of the GCC padlock_available(): check CPUID support,
 * verify the "CentaurHauls" vendor string, then test the Centaur
 * Extended Feature Flags (leaf 0xC0000001): ACE = bits 6+7,
 * RNG = bits 2+3.  Sets padlock_use_ace/padlock_use_rng and
 * returns the number of detected units (in eax). */
static int
padlock_available(void)
{	_asm	{
		/* Can EFLAGS bit 21 be toggled? If not, no CPUID. */
		pushfd
		pop	eax
		mov	ecx,eax
		xor	eax,1<<21
		push	eax
		popfd
		pushfd
		pop	eax
		xor	eax,ecx
		bt	eax,21
		jnc	noluck
		/* Leaf 0: vendor string must be "CentaurHauls". */
		mov	eax,0
		cpuid
		xor	eax,eax
		cmp	ebx,'tneC'
		jne	noluck
		cmp	edx,'Hrua'
		jne	noluck
		cmp	ecx,'slua'
		jne	noluck
		/* Extended leaves must reach 0xC0000001. */
		mov	eax,0xC0000000
		cpuid
		mov	edx,eax
		xor	eax,eax
		cmp	edx,0xC0000001
		jb	noluck
		/* Read the extended feature flags. */
		mov	eax,0xC0000001
		cpuid
		xor	eax,eax
		bt	edx,6
		jnc	skip_a
		bt	edx,7
		jnc	skip_a
		mov	padlock_use_ace,1
		inc	eax
	skip_a:	bt	edx,2
		jnc	skip_r
		bt	edx,3
		jnc	skip_r
		mov	padlock_use_rng,1
		inc	eax
	skip_r:
	noluck:
		}
}
613238384Sjkim
/* MSVC twin of the GCC padlock_bswapl(): byte-swap the 60 dwords of
 * the expanded AES key in place (key in ecx via __fastcall). */
static void __fastcall
padlock_bswapl(void *key)
{	_asm	{
		pushfd
		/* cld clobbers DF, so EFLAGS is saved/restored around it */
		cld
		mov	esi,ecx
		mov	edi,ecx
		mov	ecx,60
	up:	lodsd
		bswap	eax
		stosd
		loop	up
		popfd
		}
}
629238384Sjkim
630238384Sjkim/* MS actually specifies status of Direction Flag and compiler even
631238384Sjkim * manages to compile following as 'rep movsd' all by itself...
632238384Sjkim */
633238384Sjkim#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
634238384Sjkim#endif
635238384Sjkim
636238384Sjkim/* ===== AES encryption/decryption ===== */
637238384Sjkim#ifndef OPENSSL_NO_AES
638238384Sjkim
639238384Sjkim#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
640238384Sjkim#define NID_aes_128_cfb	NID_aes_128_cfb128
641238384Sjkim#endif
642238384Sjkim
643238384Sjkim#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
644238384Sjkim#define NID_aes_128_ofb	NID_aes_128_ofb128
645238384Sjkim#endif
646238384Sjkim
647238384Sjkim#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
648238384Sjkim#define NID_aes_192_cfb	NID_aes_192_cfb128
649238384Sjkim#endif
650238384Sjkim
651238384Sjkim#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
652238384Sjkim#define NID_aes_192_ofb	NID_aes_192_ofb128
653238384Sjkim#endif
654238384Sjkim
655238384Sjkim#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
656238384Sjkim#define NID_aes_256_cfb	NID_aes_256_cfb128
657238384Sjkim#endif
658238384Sjkim
659238384Sjkim#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
660238384Sjkim#define NID_aes_256_ofb	NID_aes_256_ofb128
661238384Sjkim#endif
662238384Sjkim
663238384Sjkim/* List of supported ciphers. */
/* Keep this table in sync with the switch in padlock_ciphers(). */
static int padlock_cipher_nids[] = {
	NID_aes_128_ecb,
	NID_aes_128_cbc,
	NID_aes_128_cfb,
	NID_aes_128_ofb,

	NID_aes_192_ecb,
	NID_aes_192_cbc,
	NID_aes_192_cfb,
	NID_aes_192_ofb,

	NID_aes_256_ecb,
	NID_aes_256_cbc,
	NID_aes_256_cfb,
	NID_aes_256_ofb,
};
/* Number of entries in the table above. */
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
				      sizeof(padlock_cipher_nids[0]));
682238384Sjkim
683238384Sjkim/* Function prototypes ... */
684238384Sjkimstatic int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
685238384Sjkim				const unsigned char *iv, int enc);
686238384Sjkimstatic int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
687238384Sjkim			      const unsigned char *in, size_t nbytes);
688238384Sjkim
/* Round 'ptr' up to the next 16-byte boundary (no-op if aligned). */
#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +		\
	( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )	)
/* cipher_data is allocated with 16 bytes of slack (ctx_size below is
 * sizeof(struct padlock_cipher_data) + 16), so the aligned view fits. */
#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
	NEAREST_ALIGNED(ctx->cipher_data))

/* EVP block sizes per mode; CFB/OFB are presented as stream ciphers. */
#define EVP_CIPHER_block_size_ECB	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_CBC	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_OFB	1
#define EVP_CIPHER_block_size_CFB	1

/* Declaring so many ciphers by hand would be a pain.
   Instead introduce a bit of preprocessor magic :-) */
#define	DECLARE_AES_EVP(ksize,lmode,umode)	\
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {	\
	NID_aes_##ksize##_##lmode,		\
	EVP_CIPHER_block_size_##umode,	\
	AES_KEY_SIZE_##ksize,		\
	AES_BLOCK_SIZE,			\
	0 | EVP_CIPH_##umode##_MODE,	\
	padlock_aes_init_key,		\
	padlock_aes_cipher,		\
	NULL,				\
	sizeof(struct padlock_cipher_data) + 16,	\
	EVP_CIPHER_set_asn1_iv,		\
	EVP_CIPHER_get_asn1_iv,		\
	NULL,				\
	NULL				\
}

DECLARE_AES_EVP(128,ecb,ECB);
DECLARE_AES_EVP(128,cbc,CBC);
DECLARE_AES_EVP(128,cfb,CFB);
DECLARE_AES_EVP(128,ofb,OFB);

DECLARE_AES_EVP(192,ecb,ECB);
DECLARE_AES_EVP(192,cbc,CBC);
DECLARE_AES_EVP(192,cfb,CFB);
DECLARE_AES_EVP(192,ofb,OFB);

DECLARE_AES_EVP(256,ecb,ECB);
DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);
732238384Sjkim
733238384Sjkimstatic int
734238384Sjkimpadlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)
735238384Sjkim{
736238384Sjkim	/* No specific cipher => return a list of supported nids ... */
737238384Sjkim	if (!cipher) {
738238384Sjkim		*nids = padlock_cipher_nids;
739238384Sjkim		return padlock_cipher_nids_num;
740238384Sjkim	}
741238384Sjkim
742238384Sjkim	/* ... or the requested "cipher" otherwise */
743238384Sjkim	switch (nid) {
744238384Sjkim	  case NID_aes_128_ecb:
745238384Sjkim	    *cipher = &padlock_aes_128_ecb;
746238384Sjkim	    break;
747238384Sjkim	  case NID_aes_128_cbc:
748238384Sjkim	    *cipher = &padlock_aes_128_cbc;
749238384Sjkim	    break;
750238384Sjkim	  case NID_aes_128_cfb:
751238384Sjkim	    *cipher = &padlock_aes_128_cfb;
752238384Sjkim	    break;
753238384Sjkim	  case NID_aes_128_ofb:
754238384Sjkim	    *cipher = &padlock_aes_128_ofb;
755238384Sjkim	    break;
756238384Sjkim
757238384Sjkim	  case NID_aes_192_ecb:
758238384Sjkim	    *cipher = &padlock_aes_192_ecb;
759238384Sjkim	    break;
760238384Sjkim	  case NID_aes_192_cbc:
761238384Sjkim	    *cipher = &padlock_aes_192_cbc;
762238384Sjkim	    break;
763238384Sjkim	  case NID_aes_192_cfb:
764238384Sjkim	    *cipher = &padlock_aes_192_cfb;
765238384Sjkim	    break;
766238384Sjkim	  case NID_aes_192_ofb:
767238384Sjkim	    *cipher = &padlock_aes_192_ofb;
768238384Sjkim	    break;
769238384Sjkim
770238384Sjkim	  case NID_aes_256_ecb:
771238384Sjkim	    *cipher = &padlock_aes_256_ecb;
772238384Sjkim	    break;
773238384Sjkim	  case NID_aes_256_cbc:
774238384Sjkim	    *cipher = &padlock_aes_256_cbc;
775238384Sjkim	    break;
776238384Sjkim	  case NID_aes_256_cfb:
777238384Sjkim	    *cipher = &padlock_aes_256_cfb;
778238384Sjkim	    break;
779238384Sjkim	  case NID_aes_256_ofb:
780238384Sjkim	    *cipher = &padlock_aes_256_ofb;
781238384Sjkim	    break;
782238384Sjkim
783238384Sjkim	  default:
784238384Sjkim	    /* Sorry, we don't support this NID */
785238384Sjkim	    *cipher = NULL;
786238384Sjkim	    return 0;
787238384Sjkim	}
788238384Sjkim
789238384Sjkim	return 1;
790238384Sjkim}
791238384Sjkim
/* Prepare the encryption key for PadLock usage.
 *
 * ctx - cipher context; its 16-byte-aligned private area receives the
 *       expanded key and the PadLock control word.
 * key - raw AES key; must not be NULL.
 * iv  - unused here; the IV is read from ctx at cipher time.
 * enc - non-zero when the context is being set up for encryption.
 *
 * Returns 1 on success, 0 on error (NULL key or unsupported key size).
 */
static int
padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
		      const unsigned char *iv, int enc)
{
	struct padlock_cipher_data *cdata;
	int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;	/* bytes -> bits */

	if (key==NULL) return 0;	/* ERROR */

	cdata = ALIGNED_CIPHER_DATA(ctx);
	memset(cdata, 0, sizeof(struct padlock_cipher_data));

	/* Prepare Control word. */
	/* OFB produces its keystream by running the cipher forward,
	   so the engine direction is always "encrypt" in that mode. */
	if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
		cdata->cword.b.encdec = 0;
	else
		cdata->cword.b.encdec = (ctx->encrypt == 0);
	/* 10/12/14 rounds and ksize 0/1/2 for 128/192/256-bit keys. */
	cdata->cword.b.rounds = 10 + (key_len - 128) / 32;
	cdata->cword.b.ksize = (key_len - 128) / 64;

	switch(key_len) {
		case 128:
			/* PadLock can generate an extended key for
			   AES128 in hardware */
			memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
			cdata->cword.b.keygen = 0;
			break;

		case 192:
		case 256:
			/* Generate an extended AES key in software.
			   Needed for AES192/AES256 */
			/* Well, the above applies to Stepping 8 CPUs
			   and is listed as hardware errata. They most
			   likely will fix it at some point and then
			   a check for stepping would be due here. */
			/* CFB and OFB only ever need the encryption
			   schedule (keystream is produced by encrypting). */
			if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
			    EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE ||
			    enc)
				AES_set_encrypt_key(key, key_len, &cdata->ks);
			else
				AES_set_decrypt_key(key, key_len, &cdata->ks);
#ifndef AES_ASM
			/* OpenSSL C functions use byte-swapped extended key. */
			padlock_bswapl(&cdata->ks);
#endif
			cdata->cword.b.keygen = 1;
			break;

		default:
			/* ERROR */
			return 0;
	}

	/*
	 * This is done to cover for cases when user reuses the
	 * context for new key. The catch is that if we don't do
	 * this, padlock_aes_cipher might proceed with old key...
	 */
	padlock_reload_key ();

	return 1;
}
856238384Sjkim
857238384Sjkim/*
858238384Sjkim * Simplified version of padlock_aes_cipher() used when
859238384Sjkim * 1) both input and output buffers are at aligned addresses.
860238384Sjkim * or when
861238384Sjkim * 2) running on a newer CPU that doesn't require aligned buffers.
862238384Sjkim */
863238384Sjkimstatic int
864238384Sjkimpadlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
865238384Sjkim		const unsigned char *in_arg, size_t nbytes)
866238384Sjkim{
867238384Sjkim	struct padlock_cipher_data *cdata;
868238384Sjkim	void  *iv;
869238384Sjkim
870238384Sjkim	cdata = ALIGNED_CIPHER_DATA(ctx);
871238384Sjkim	padlock_verify_context(cdata);
872238384Sjkim
873238384Sjkim	switch (EVP_CIPHER_CTX_mode(ctx)) {
874238384Sjkim	case EVP_CIPH_ECB_MODE:
875238384Sjkim		padlock_xcrypt_ecb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
876238384Sjkim		break;
877238384Sjkim
878238384Sjkim	case EVP_CIPH_CBC_MODE:
879238384Sjkim		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
880238384Sjkim		iv = padlock_xcrypt_cbc(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
881238384Sjkim		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
882238384Sjkim		break;
883238384Sjkim
884238384Sjkim	case EVP_CIPH_CFB_MODE:
885238384Sjkim		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
886238384Sjkim		iv = padlock_xcrypt_cfb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
887238384Sjkim		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
888238384Sjkim		break;
889238384Sjkim
890238384Sjkim	case EVP_CIPH_OFB_MODE:
891238384Sjkim		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
892238384Sjkim		padlock_xcrypt_ofb(nbytes/AES_BLOCK_SIZE, cdata, out_arg, in_arg);
893238384Sjkim		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
894238384Sjkim		break;
895238384Sjkim
896238384Sjkim	default:
897238384Sjkim		return 0;
898238384Sjkim	}
899238384Sjkim
900238384Sjkim	memset(cdata->iv, 0, AES_BLOCK_SIZE);
901238384Sjkim
902238384Sjkim	return 1;
903238384Sjkim}
904238384Sjkim
#ifndef  PADLOCK_CHUNK
# define PADLOCK_CHUNK	512	/* Must be a power of 2 larger than 16 */
#endif
/* Sanity check: the re-alignment code below relies on the chunk being
   a power of two and at least one AES block (16 bytes). */
#if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
# error "insane PADLOCK_CHUNK..."
#endif
911238384Sjkim
/* Re-align the arguments to 16-Bytes boundaries and run the
   encryption function itself. This function is not AES-specific.
   Handles arbitrary nbytes in the byte-oriented modes (CFB/OFB) and
   block-multiple nbytes in ECB/CBC; returns 1 on success, 0 on error. */
static int
padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
		   const unsigned char *in_arg, size_t nbytes)
{
	struct padlock_cipher_data *cdata;
	const  void *inp;
	unsigned char  *out;
	void  *iv;
	int    inp_misaligned, out_misaligned, realign_in_loop;
	size_t chunk, allocated=0;

	/* ctx->num is maintained in byte-oriented modes,
	   such as CFB and OFB... */
	/* First consume any partial keystream block left over from the
	   previous call, one byte at a time, before using the hardware. */
	if ((chunk = ctx->num)) { /* borrow chunk variable */
		unsigned char *ivp=ctx->iv;

		switch (EVP_CIPHER_CTX_mode(ctx)) {
		case EVP_CIPH_CFB_MODE:
			if (chunk >= AES_BLOCK_SIZE)
				return 0; /* bogus value */

			if (ctx->encrypt)
				while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
					ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
					chunk++, nbytes--;
				}
			else	while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
					unsigned char c = *(in_arg++);
					*(out_arg++) = c ^ ivp[chunk];
					/* decrypt feeds ciphertext back into the IV */
					ivp[chunk++] = c, nbytes--;
				}

			ctx->num = chunk%AES_BLOCK_SIZE;
			break;
		case EVP_CIPH_OFB_MODE:
			if (chunk >= AES_BLOCK_SIZE)
				return 0; /* bogus value */

			while (chunk<AES_BLOCK_SIZE && nbytes!=0) {
				*(out_arg++) = *(in_arg++) ^ ivp[chunk];
				chunk++, nbytes--;
			}

			ctx->num = chunk%AES_BLOCK_SIZE;
			break;
		}
	}

	if (nbytes == 0)
		return 1;
#if 0
	if (nbytes % AES_BLOCK_SIZE)
		return 0; /* are we expected to do tail processing? */
#else
	/* nbytes is always multiple of AES_BLOCK_SIZE in ECB and CBC
	   modes and arbitrary value in byte-oriented modes, such as
	   CFB and OFB... */
#endif

	/* VIA promises CPUs that won't require alignment in the future.
	   For now padlock_aes_align_required is initialized to 1 and
	   the condition is never met... */
	/* C7 core is capable to manage unaligned input in non-ECB[!]
	   mode, but performance penalties appear to be approximately
	   same as for software alignment below or ~3x. They promise to
	   improve it in the future, but for now we can just as well
	   pretend that it can only handle aligned input... */
	if (!padlock_aes_align_required && (nbytes%AES_BLOCK_SIZE)==0)
		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

	/* Hardware requires 16-byte alignment of both buffers. */
	inp_misaligned = (((size_t)in_arg) & 0x0F);
	out_misaligned = (((size_t)out_arg) & 0x0F);

	/* Note that even if output is aligned and input not,
	 * I still prefer to loop instead of copy the whole
	 * input and then encrypt in one stroke. This is done
	 * in order to improve L1 cache utilization... */
	realign_in_loop = out_misaligned|inp_misaligned;

	if (!realign_in_loop && (nbytes%AES_BLOCK_SIZE)==0)
		return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

	/* this takes one "if" out of the loops */
	/* First iteration handles the (possibly short) remainder so all
	   subsequent iterations are exactly PADLOCK_CHUNK bytes. */
	chunk  = nbytes;
	chunk %= PADLOCK_CHUNK;
	if (chunk==0) chunk = PADLOCK_CHUNK;

	if (out_misaligned) {
		/* optimize for small input: the stack bounce buffer only
		   needs to hold one chunk, or less if that covers it all */
		allocated = (chunk<nbytes?PADLOCK_CHUNK:nbytes);
		out = alloca(0x10 + allocated);
		out = NEAREST_ALIGNED(out);
	}
	else
		out = out_arg;

	cdata = ALIGNED_CIPHER_DATA(ctx);
	padlock_verify_context(cdata);

	switch (EVP_CIPHER_CTX_mode(ctx)) {
	case EVP_CIPH_ECB_MODE:
		do	{
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			padlock_xcrypt_ecb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
			chunk   = PADLOCK_CHUNK;
		} while (nbytes);
		break;

	case EVP_CIPH_CBC_MODE:
		/* The goto enters the loop mid-body so the first (short)
		   chunk skips the IV-propagation step at the loop top. */
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		goto cbc_shortcut;
		do	{
			if (iv != cdata->iv)
				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
			chunk = PADLOCK_CHUNK;
		cbc_shortcut: /* optimize for small input */
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			iv = padlock_xcrypt_cbc(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

		} while (nbytes -= chunk);
		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_CFB_MODE:
		/* Whole blocks go through the hardware; a sub-block tail
		   is handled byte-wise after the loop. */
		memcpy (iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		chunk &= ~(AES_BLOCK_SIZE-1);
		if (chunk)	goto cfb_shortcut;
		else		goto cfb_skiploop;
		do	{
			if (iv != cdata->iv)
				memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
			chunk = PADLOCK_CHUNK;
		cfb_shortcut: /* optimize for small input */
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			iv = padlock_xcrypt_cfb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
		} while (nbytes >= AES_BLOCK_SIZE);

		cfb_skiploop:
		if (nbytes) {
			unsigned char *ivp = cdata->iv;

			if (iv != ivp) {
				memcpy(ivp, iv, AES_BLOCK_SIZE);
				iv = ivp;
			}
			/* Remember how much keystream the next call may reuse. */
			ctx->num = nbytes;
			if (cdata->cword.b.encdec) {
				/* Keystream is always produced by *encrypting*
				   the IV, so temporarily flip the direction bit. */
				cdata->cword.b.encdec=0;
				padlock_reload_key();
				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
				cdata->cword.b.encdec=1;
				padlock_reload_key();
				while(nbytes) {
					unsigned char c = *(in_arg++);
					*(out_arg++) = c ^ *ivp;
					*(ivp++) = c, nbytes--;
				}
			}
			else {	padlock_reload_key();
				padlock_xcrypt_ecb(1,cdata,ivp,ivp);
				padlock_reload_key();
				while (nbytes) {
					*ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
					ivp++, nbytes--;
				}
			}
		}

		memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
		break;

	case EVP_CIPH_OFB_MODE:
		memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
		chunk &= ~(AES_BLOCK_SIZE-1);
		if (chunk) do	{
			if (inp_misaligned)
				inp = padlock_memcpy(out, in_arg, chunk);
			else
				inp = in_arg;
			in_arg += chunk;

			padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp);

			if (out_misaligned)
				out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
			else
				out     = out_arg+=chunk;

			nbytes -= chunk;
			chunk   = PADLOCK_CHUNK;
		} while (nbytes >= AES_BLOCK_SIZE);

		/* Sub-block tail: generate one keystream block in place
		   and XOR byte-wise, remembering the leftover in ctx->num. */
		if (nbytes) {
			unsigned char *ivp = cdata->iv;

			ctx->num = nbytes;
			padlock_reload_key();	/* empirically found */
			padlock_xcrypt_ecb(1,cdata,ivp,ivp);
			padlock_reload_key();	/* empirically found */
			while (nbytes) {
				*(out_arg++) = *(in_arg++) ^ *ivp;
				ivp++, nbytes--;
			}
		}

		memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
		break;

	default:
		return 0;
	}

	/* Clean the realign buffer if it was used */
	if (out_misaligned) {
		volatile unsigned long *p=(void *)out;
		size_t   n = allocated/sizeof(*p);
		while (n--) *p++=0;
	}

	memset(cdata->iv, 0, AES_BLOCK_SIZE);

	return 1;
}
1171238384Sjkim
1172238384Sjkim#endif /* OPENSSL_NO_AES */
1173238384Sjkim
1174238384Sjkim/* ===== Random Number Generator ===== */
1175238384Sjkim/*
1176238384Sjkim * This code is not engaged. The reason is that it does not comply
1177238384Sjkim * with recommendations for VIA RNG usage for secure applications
1178238384Sjkim * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
1179238384Sjkim * provide meaningful error control...
1180238384Sjkim */
/* Wrapper that provides an interface between the API and
   the raw PadLock RNG */
static int
padlock_rand_bytes(unsigned char *output, int count)
{
	unsigned int status, rand_buf;

	/* Pull eight bytes per XSTORE while we can. */
	while (count >= 8) {
		status = padlock_xstore(output, 0);
		if ((status & (1<<6)) == 0)
			return 0;	/* RNG disabled */
		/* these bits cover DC bias, Raw Bits and String Filter */
		if (status & (0x1F<<10))
			return 0;
		if ((status & 0x1F) == 0)
			continue;	/* no data yet, retry... */
		if ((status & 0x1F) != 8)
			return 0;	/* fatal failure...  */
		output += 8;
		count  -= 8;
	}
	/* Deliver the remainder one byte at a time. */
	while (count > 0) {
		status = padlock_xstore(&rand_buf, 3);
		if ((status & (1<<6)) == 0)
			return 0;	/* RNG disabled */
		/* these bits cover DC bias, Raw Bits and String Filter */
		if (status & (0x1F<<10))
			return 0;
		if ((status & 0x1F) == 0)
			continue;	/* no data yet, retry... */
		if ((status & 0x1F) != 1)
			return 0;	/* fatal failure...  */
		*output++ = (unsigned char)rand_buf;
		count--;
	}
	/* wipe the stack copy of random data; volatile defeats elision */
	*(volatile unsigned int *)&rand_buf = 0;

	return 1;
}
1212238384Sjkim
/* Dummy but necessary function */
static int
padlock_rand_status(void)
{
	/* No health state is tracked here; always report "ready". */
	return 1;
}
1219238384Sjkim
/* Prepare structure for registration.
   Note: the same routine backs both "bytes" and "pseudorand" —
   PadLock output makes no strong/pseudo distinction here. */
static RAND_METHOD padlock_rand = {
	NULL,			/* seed */
	padlock_rand_bytes,	/* bytes */
	NULL,			/* cleanup */
	NULL,			/* add */
	padlock_rand_bytes,	/* pseudorand */
	padlock_rand_status,	/* rand status */
};
1229238384Sjkim
#else  /* !COMPILE_HW_PADLOCK */
#ifndef OPENSSL_NO_DYNAMIC_ENGINE
/* On targets where the PadLock code cannot be compiled, still export a
   bind_engine() stub so the dynamic-engine loader links cleanly; it
   unconditionally reports failure. */
OPENSSL_EXPORT
int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns);
OPENSSL_EXPORT
int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns) { return 0; }
IMPLEMENT_DYNAMIC_CHECK_FN()
#endif
#endif /* COMPILE_HW_PADLOCK */

#endif /* !OPENSSL_NO_HW_PADLOCK */
#endif /* !OPENSSL_NO_HW */
1242