/* eng_padlock.c, revision 296465 */
/*-
 * Support for VIA PadLock Advanced Cryptography Engine (ACE)
 * Written by Michal Ludvig <michal@logix.cz>
 *            http://www.logix.cz/michal
 *
 * Big thanks to Andy Polyakov for help with optimization,
 * assembler fixes, the port to MS Windows and a lot of other
 * valuable work on this engine!
 */

/* ====================================================================
 * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    licensing@OpenSSL.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 * This product includes cryptographic software written by Eric Young
 * (eay@cryptsoft.com).  This product includes software written by Tim
 * Hudson (tjh@cryptsoft.com).
 *
 */

#include <stdio.h>
#include <string.h>

#include <openssl/opensslconf.h>
#include <openssl/crypto.h>
#include <openssl/dso.h>
#include <openssl/engine.h>
#include <openssl/evp.h>
#ifndef OPENSSL_NO_AES
# include <openssl/aes.h>
#endif
#include <openssl/rand.h>
#include <openssl/err.h>

#ifndef OPENSSL_NO_HW
# ifndef OPENSSL_NO_HW_PADLOCK

/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
#  if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
#   ifndef OPENSSL_NO_DYNAMIC_ENGINE
#    define DYNAMIC_ENGINE
#   endif
#  elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
#   ifdef ENGINE_DYNAMIC_SUPPORT
#    define DYNAMIC_ENGINE
#   endif
#  else
#   error "Only OpenSSL >= 0.9.7 is supported"
#  endif

/*
 * VIA PadLock AES is available *ONLY* on some x86 CPUs. Not only does it
 * not exist elsewhere, it can't even be compiled on other platforms!
 *
 * In addition, because of the heavy use of inline assembler, compiler choice
 * is limited to GCC and Microsoft C.
 */
#  undef COMPILE_HW_PADLOCK
#  if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
#   if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
     (defined(_MSC_VER) && defined(_M_IX86))
#    define COMPILE_HW_PADLOCK
static ENGINE *ENGINE_padlock(void);
#   endif
#  endif

void ENGINE_load_padlock(void)
{
/* On non-x86 CPUs it just returns. */
#  ifdef COMPILE_HW_PADLOCK
    ENGINE *toadd = ENGINE_padlock();
    if (!toadd)
        return;
    ENGINE_add(toadd);
    ENGINE_free(toadd);
    ERR_clear_error();
#  endif
}
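
/*
 * For illustration only (not part of the original source): a minimal,
 * hedged sketch of how an application might engage this engine. It assumes
 * a static OpenSSL build where ENGINE_load_padlock() is linked in directly;
 * with a dynamic build one would go through ENGINE_by_id("dynamic")
 * instead. The function name is hypothetical.
 */
#if 0
# include <openssl/engine.h>

static int use_padlock_example(void)
{
    ENGINE *e;

    ENGINE_load_padlock();          /* register the engine shown above */
    e = ENGINE_by_id("padlock");    /* look it up by its "padlock" id */
    if (e == NULL)
        return 0;                   /* no PadLock engine available */
    if (!ENGINE_init(e)) {          /* grab a functional reference */
        ENGINE_free(e);
        return 0;
    }
    /* Route cipher operations through PadLock where possible. */
    ENGINE_set_default(e, ENGINE_METHOD_CIPHERS);
    ENGINE_finish(e);
    ENGINE_free(e);
    return 1;
}
#endif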

#  ifdef COMPILE_HW_PADLOCK
/*
 * We do these includes here to avoid header problems on platforms that do
 * not have the VIA padlock anyway...
 */
#   ifdef _MSC_VER
#    include <malloc.h>
#    define alloca _alloca
#   elif defined(NETWARE_CLIB) && defined(__GNUC__)
void *alloca(size_t);
#    define alloca(s) __builtin_alloca(s)
#   else
#    include <stdlib.h>
#   endif

/* Functions for ENGINE detection and control */
static int padlock_available(void);
static int padlock_init(ENGINE *e);

/* RNG Stuff */
static RAND_METHOD padlock_rand;

/* Cipher Stuff */
#   ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher,
                           const int **nids, int nid);
#   endif

/* Engine names */
static const char *padlock_id = "padlock";
static char padlock_name[100];

/* Available features */
static int padlock_use_ace = 0; /* Advanced Cryptography Engine */
static int padlock_use_rng = 0; /* Random Number Generator */
#   ifndef OPENSSL_NO_AES
static int padlock_aes_align_required = 1;
#   endif

/* ===== Engine "management" functions ===== */

/* Prepare the ENGINE structure for registration */
static int padlock_bind_helper(ENGINE *e)
{
    /* Check available features */
    padlock_available();

#   if 1                        /* disable RNG for now, see commentary in
                                 * vicinity of RNG code */
    padlock_use_rng = 0;
#   endif

    /* Generate a nice engine name with available features */
    BIO_snprintf(padlock_name, sizeof(padlock_name),
                 "VIA PadLock (%s, %s)",
                 padlock_use_rng ? "RNG" : "no-RNG",
                 padlock_use_ace ? "ACE" : "no-ACE");

    /* Register everything or return with an error */
    if (!ENGINE_set_id(e, padlock_id) ||
        !ENGINE_set_name(e, padlock_name) ||
        !ENGINE_set_init_function(e, padlock_init) ||
#   ifndef OPENSSL_NO_AES
        (padlock_use_ace && !ENGINE_set_ciphers(e, padlock_ciphers)) ||
#   endif
        (padlock_use_rng && !ENGINE_set_RAND(e, &padlock_rand))) {
        return 0;
    }

    /* Everything looks good */
    return 1;
}

/* Constructor */
static ENGINE *ENGINE_padlock(void)
{
    ENGINE *eng = ENGINE_new();

    if (!eng) {
        return NULL;
    }

    if (!padlock_bind_helper(eng)) {
        ENGINE_free(eng);
        return NULL;
    }

    return eng;
}

/* Check availability of the engine */
static int padlock_init(ENGINE *e)
{
    return (padlock_use_rng || padlock_use_ace);
}

/*
 * This stuff is needed if this ENGINE is being compiled into a
 * self-contained shared-library.
 */
#   ifdef DYNAMIC_ENGINE
static int padlock_bind_fn(ENGINE *e, const char *id)
{
    if (id && (strcmp(id, padlock_id) != 0)) {
        return 0;
    }

    if (!padlock_bind_helper(e)) {
        return 0;
    }

    return 1;
}

IMPLEMENT_DYNAMIC_CHECK_FN()
IMPLEMENT_DYNAMIC_BIND_FN(padlock_bind_fn)
#   endif                       /* DYNAMIC_ENGINE */

/* ===== Here comes the "real" engine ===== */
#   ifndef OPENSSL_NO_AES
/* Some AES-related constants */
#    define AES_BLOCK_SIZE          16
#    define AES_KEY_SIZE_128        16
#    define AES_KEY_SIZE_192        24
#    define AES_KEY_SIZE_256        32

/*
 * Here we store the status information relevant to the current context.
 */
/*
 * BIG FAT WARNING: Inline assembler in PADLOCK_XCRYPT_ASM() depends on
 * the order of items in this structure.  Don't blindly modify, reorder,
 * etc!
 */
struct padlock_cipher_data {
    unsigned char iv[AES_BLOCK_SIZE]; /* Initialization vector */
    union {
        unsigned int pad[4];
        struct {
            int rounds:4;
            int dgst:1;         /* n/a in C3 */
            int align:1;        /* n/a in C3 */
            int ciphr:1;        /* n/a in C3 */
            unsigned int keygen:1;
            int interm:1;
            unsigned int encdec:1;
            int ksize:2;
        } b;
    } cword;                    /* Control word */
    AES_KEY ks;                 /* Encryption key */
};
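
/*
 * For illustration only (not in the original): the inline assembler below
 * addresses cword at offset 16 and ks at offset 32 within
 * padlock_cipher_data, so a compile-time check along these lines could
 * catch an accidental reordering. A sketch using the classic
 * negative-array-size trick; the typedef names are hypothetical.
 */
#if 0
# include <stddef.h>
typedef char padlock_cword_off_check[offsetof(struct padlock_cipher_data,
                                              cword) == 16 ? 1 : -1];
typedef char padlock_ks_off_check[offsetof(struct padlock_cipher_data,
                                           ks) == 32 ? 1 : -1];
#endif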

/*
 * Essentially this variable belongs in thread local storage.
 * Having this variable global, on the other hand, can only cause a
 * few bogus key reloads [if any at all on a single-CPU system],
 * so we accept the penalty...
 */
static volatile struct padlock_cipher_data *padlock_saved_context;
#   endif

/*-
 * =======================================================
 * Inline assembler section(s).
 * =======================================================
 * Order of arguments is chosen to facilitate Windows port
 * using __fastcall calling convention. If you wish to add
 * more routines, keep in mind that the first __fastcall
 * argument is passed in %ecx and the second in %edx.
 * =======================================================
 */
#   if defined(__GNUC__) && __GNUC__>=2
/*
 * As for the excessive "push %ebx"/"pop %ebx" found all over: when
 * generating position-independent code GCC won't let us use "b" in
 * assembler templates, nor even respect "ebx" in the clobber list.
 * Hence the trouble...
 */

/*
 * Helper function - check if a CPUID instruction is available on this CPU
 */
static int padlock_insn_cpuid_available(void)
{
    int result = -1;

    /*
     * We're checking if bit #21 of EFLAGS can be toggled. If yes,
     * CPUID is available.
     */
    asm volatile ("pushf\n"
                  "popl %%eax\n"
                  "xorl $0x200000, %%eax\n"
                  "movl %%eax, %%ecx\n"
                  "andl $0x200000, %%ecx\n"
                  "pushl %%eax\n"
                  "popf\n"
                  "pushf\n"
                  "popl %%eax\n"
                  "andl $0x200000, %%eax\n"
                  "xorl %%eax, %%ecx\n"
                  "movl %%ecx, %0\n":"=r" (result)::"eax", "ecx");

    return (result == 0);
}

/*
 * Load supported features of the CPU to see if PadLock is available.
 */
static int padlock_available(void)
{
    char vendor_string[16];
    unsigned int eax, edx;

    /* First check if the CPUID instruction is available at all... */
    if (!padlock_insn_cpuid_available())
        return 0;

    /* Are we running on a Centaur (VIA) CPU? */
    eax = 0x00000000;
    vendor_string[12] = 0;
    asm volatile ("pushl  %%ebx\n"
                  "cpuid\n"
                  "movl   %%ebx,(%%edi)\n"
                  "movl   %%edx,4(%%edi)\n"
                  "movl   %%ecx,8(%%edi)\n"
                  "popl   %%ebx":"+a" (eax):"D"(vendor_string):"ecx", "edx");
    if (strcmp(vendor_string, "CentaurHauls") != 0)
        return 0;

    /* Check for Centaur Extended Feature Flags presence */
    eax = 0xC0000000;
    asm volatile ("pushl %%ebx; cpuid; popl %%ebx":"+a" (eax)::"ecx", "edx");
    if (eax < 0xC0000001)
        return 0;

    /* Read the Centaur Extended Feature Flags */
    eax = 0xC0000001;
    asm volatile ("pushl %%ebx; cpuid; popl %%ebx":"+a" (eax),
                  "=d"(edx)::"ecx");

    /* Fill up some flags */
    padlock_use_ace = ((edx & (0x3 << 6)) == (0x3 << 6));
    padlock_use_rng = ((edx & (0x3 << 2)) == (0x3 << 2));

    return padlock_use_ace + padlock_use_rng;
}
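
/*
 * For illustration only (not in the original): on GCC 4.3+ the same probe
 * can be sketched with the __cpuid() macro from <cpuid.h>, which handles
 * the PIC %ebx save/restore itself. A hedged sketch, assuming the
 * CPUID-availability and vendor checks above have already passed; the
 * function name is hypothetical.
 */
#if 0
# include <cpuid.h>
static int padlock_available_cpuid_h(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* Maximum supported Centaur extended leaf */
    __cpuid(0xC0000000, eax, ebx, ecx, edx);
    if (eax < 0xC0000001)
        return 0;
    /* Centaur Extended Feature Flags: bits 7:6 = ACE, bits 3:2 = RNG */
    __cpuid(0xC0000001, eax, ebx, ecx, edx);
    padlock_use_ace = ((edx & (0x3 << 6)) == (0x3 << 6));
    padlock_use_rng = ((edx & (0x3 << 2)) == (0x3 << 2));
    return padlock_use_ace + padlock_use_rng;
}
#endif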

#    ifndef OPENSSL_NO_AES
/* Our own htonl()/ntohl() */
static inline void padlock_bswapl(AES_KEY *ks)
{
    size_t i = sizeof(ks->rd_key) / sizeof(ks->rd_key[0]);
    unsigned int *key = ks->rd_key;

    while (i--) {
        asm volatile ("bswapl %0":"+r" (*key));
        key++;
    }
}
#    endif

/*
 * Force key reload from memory to the CPU microcode. Loading EFLAGS from the
 * stack clears EFLAGS[30] which does the trick.
 */
static inline void padlock_reload_key(void)
{
    asm volatile ("pushfl; popfl");
}

#    ifndef OPENSSL_NO_AES
/*
 * This is heuristic key context tracing. At first glance it seems
 * one should use atomic swap instructions, but that's not actually
 * necessary. The point is that if padlock_saved_context was changed
 * by another thread after we've read it and before we compare it with
 * cdata, our key *shall* be reloaded upon thread context switch and
 * we are therefore set in either case...
 */
static inline void padlock_verify_context(struct padlock_cipher_data *cdata)
{
    asm volatile ("pushfl\n"
                  "       btl     $30,(%%esp)\n"
                  "       jnc     1f\n"
                  "       cmpl    %2,%1\n"
                  "       je      1f\n"
                  "       popfl\n"
                  "       subl    $4,%%esp\n"
                  "1:     addl    $4,%%esp\n"
                  "       movl    %2,%0":"+m" (padlock_saved_context)
                  :"r"(padlock_saved_context), "r"(cdata):"cc");
}
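
/*
 * For illustration only (not in the original): in rough C-like pseudocode
 * the assembler above amounts to
 *
 *     if (EFLAGS[30] is set && padlock_saved_context != cdata)
 *         popfl;                  // writing EFLAGS clears EFLAGS[30],
 *                                 // forcing the CPU to reload the key
 *     padlock_saved_context = cdata;
 *
 * The stack-pointer juggling merely keeps the pushed EFLAGS copy balanced
 * on both paths.
 */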

/* Template for padlock_xcrypt_* modes */
/*
 * BIG FAT WARNING: The offsets used with 'leal' instructions describe items
 * of the 'padlock_cipher_data' structure.
 */
#     define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)     \
static inline void *name(size_t cnt,            \
        struct padlock_cipher_data *cdata,      \
        void *out, const void *inp)             \
{       void *iv;                               \
        asm volatile ( "pushl   %%ebx\n"        \
                "       leal    16(%0),%%edx\n" \
                "       leal    32(%0),%%ebx\n" \
                        rep_xcrypt "\n"         \
                "       popl    %%ebx"          \
                : "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
                : "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
                : "edx", "cc", "memory");       \
        return iv;                              \
}

/* Generate all functions with appropriate opcodes */
/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")
/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")
/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")
/* rep xcryptofb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")
#    endif

/* The RNG call itself */
static inline unsigned int padlock_xstore(void *addr, unsigned int edx_in)
{
    unsigned int eax_out;

    asm volatile (".byte 0x0f,0xa7,0xc0" /* xstore */
                  :"=a" (eax_out), "=m"(*(unsigned *)addr)
                  :"D"(addr), "d"(edx_in)
        );

    return eax_out;
}

/*
 * Why not inline 'rep movsd'? I failed to find information on what value
 * one can expect in the Direction Flag and consequently have to apply the
 * "better-safe-than-sorry" approach and assume "undefined." I could
 * explicitly clear it and restore the original value upon return from
 * padlock_aes_cipher, but it's presumably too much trouble for too little
 * gain... In case you wonder, the 'rep xcrypt*' instructions above are
 * *not* affected by the Direction Flag and advance pointers toward larger
 * addresses unconditionally.
 */
static inline unsigned char *padlock_memcpy(void *dst, const void *src,
                                            size_t n)
{
    long *d = dst;
    const long *s = src;

    n /= sizeof(*d);
    do {
        *d++ = *s++;
    } while (--n);

    return dst;
}

#   elif defined(_MSC_VER)
/*
 * Unlike GCC these are real functions. To minimize the impact on
 * performance we adhere to the __fastcall calling convention so that
 * the first two arguments are passed in %ecx and %edx. This suits
 * very well, as the instructions in question use both %ecx and %edx
 * as input :-)
 */
#    define REP_XCRYPT(code)                \
        _asm _emit 0xf3                 \
        _asm _emit 0x0f _asm _emit 0xa7 \
        _asm _emit code

/*
 * BIG FAT WARNING: The offsets used with 'lea' instructions describe items
 * of the 'padlock_cipher_data' structure.
 */
#    define PADLOCK_XCRYPT_ASM(name,code)   \
static void * __fastcall                \
        name (size_t cnt, void *cdata,  \
        void *outp, const void *inp)    \
{       _asm    mov     eax,edx         \
        _asm    lea     edx,[eax+16]    \
        _asm    lea     ebx,[eax+32]    \
        _asm    mov     edi,outp        \
        _asm    mov     esi,inp         \
        REP_XCRYPT(code)                \
}

PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)

static int __fastcall padlock_xstore(void *outp, unsigned int code)
{
    _asm    mov edi,ecx
    _asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}

static void __fastcall padlock_reload_key(void)
{
    _asm pushfd
    _asm popfd
}

static void __fastcall padlock_verify_context(void *cdata)
{
    _asm    {
        pushfd
        bt  DWORD PTR[esp],30
        jnc skip
        cmp ecx,padlock_saved_context
        je  skip
        popfd
        sub esp,4
    skip:   add esp,4
        mov padlock_saved_context,ecx
    }
}

static int
padlock_available(void)
{
    _asm    {
        pushfd
        pop eax
        mov ecx,eax
        xor eax,1<<21
        push    eax
        popfd
        pushfd
        pop eax
        xor eax,ecx
        bt  eax,21
        jnc noluck
        mov eax,0
        cpuid
        xor eax,eax
        cmp ebx,'tneC'
        jne noluck
        cmp edx,'Hrua'
        jne noluck
        cmp ecx,'slua'
        jne noluck
        mov eax,0xC0000000
        cpuid
        mov edx,eax
        xor eax,eax
        cmp edx,0xC0000001
        jb  noluck
        mov eax,0xC0000001
        cpuid
        xor eax,eax
        bt  edx,6
        jnc skip_a
        bt  edx,7
        jnc skip_a
        mov padlock_use_ace,1
        inc eax
    skip_a: bt  edx,2
        jnc skip_r
        bt  edx,3
        jnc skip_r
        mov padlock_use_rng,1
        inc eax
    skip_r:
    noluck:
    }
}

static void __fastcall padlock_bswapl(void *key)
{
    _asm    {
        pushfd
        cld
        mov esi,ecx
        mov edi,ecx
        mov ecx,60
    up: lodsd
        bswap   eax
        stosd
        loop    up
        popfd
    }
}

/*
 * MS actually specifies the status of the Direction Flag and the compiler
 * even manages to compile the following as 'rep movsd' all by itself...
 */
#    define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
#   endif

/* ===== AES encryption/decryption ===== */
#   ifndef OPENSSL_NO_AES
#    if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
#     define NID_aes_128_cfb NID_aes_128_cfb128
#    endif
#    if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
#     define NID_aes_128_ofb NID_aes_128_ofb128
#    endif
#    if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
#     define NID_aes_192_cfb NID_aes_192_cfb128
#    endif
#    if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
#     define NID_aes_192_ofb NID_aes_192_ofb128
#    endif
#    if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
#     define NID_aes_256_cfb NID_aes_256_cfb128
#    endif
#    if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
#     define NID_aes_256_ofb NID_aes_256_ofb128
#    endif

/*
 * List of supported ciphers.
 */
static int padlock_cipher_nids[] = {
    NID_aes_128_ecb,
    NID_aes_128_cbc,
    NID_aes_128_cfb,
    NID_aes_128_ofb,

    NID_aes_192_ecb,
    NID_aes_192_cbc,
    NID_aes_192_cfb,
    NID_aes_192_ofb,

    NID_aes_256_ecb,
    NID_aes_256_cbc,
    NID_aes_256_cfb,
    NID_aes_256_ofb,
};

static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids) /
                                      sizeof(padlock_cipher_nids[0]));

/* Function prototypes ... */
static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                                const unsigned char *iv, int enc);
static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                              const unsigned char *in, size_t nbytes);

#    define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +         \
        ( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )      )
#    define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
        NEAREST_ALIGNED(ctx->cipher_data))
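
/*
 * For illustration only (not in the original): NEAREST_ALIGNED rounds a
 * pointer up to the next 16-byte boundary. E.g. for ptr = 0x1007 the low
 * nibble is 7, so (0x10 - 7) & 0x0F = 9 is added, yielding 0x1010; an
 * already-aligned ptr = 0x1010 has (0x10 - 0) & 0x0F = 0 added and stays
 * put. The "+ 16" in the cipher_data size below pays for this worst-case
 * 15-byte slack.
 */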

#    define EVP_CIPHER_block_size_ECB       AES_BLOCK_SIZE
#    define EVP_CIPHER_block_size_CBC       AES_BLOCK_SIZE
#    define EVP_CIPHER_block_size_OFB       1
#    define EVP_CIPHER_block_size_CFB       1

/*
 * Declaring so many ciphers by hand would be a pain. Instead introduce a bit
 * of preprocessor magic :-)
 */
#    define DECLARE_AES_EVP(ksize,lmode,umode)      \
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {       \
        NID_aes_##ksize##_##lmode,              \
        EVP_CIPHER_block_size_##umode,  \
        AES_KEY_SIZE_##ksize,           \
        AES_BLOCK_SIZE,                 \
        0 | EVP_CIPH_##umode##_MODE,    \
        padlock_aes_init_key,           \
        padlock_aes_cipher,             \
        NULL,                           \
        sizeof(struct padlock_cipher_data) + 16,        \
        EVP_CIPHER_set_asn1_iv,         \
        EVP_CIPHER_get_asn1_iv,         \
        NULL,                           \
        NULL                            \
}

DECLARE_AES_EVP(128, ecb, ECB);
DECLARE_AES_EVP(128, cbc, CBC);
DECLARE_AES_EVP(128, cfb, CFB);
DECLARE_AES_EVP(128, ofb, OFB);

DECLARE_AES_EVP(192, ecb, ECB);
DECLARE_AES_EVP(192, cbc, CBC);
DECLARE_AES_EVP(192, cfb, CFB);
DECLARE_AES_EVP(192, ofb, OFB);

DECLARE_AES_EVP(256, ecb, ECB);
DECLARE_AES_EVP(256, cbc, CBC);
DECLARE_AES_EVP(256, cfb, CFB);
DECLARE_AES_EVP(256, ofb, OFB);
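
/*
 * For reference (a sketch, not in the original): DECLARE_AES_EVP(128, cbc,
 * CBC) expands to roughly
 *
 *     static const EVP_CIPHER padlock_aes_128_cbc = {
 *         NID_aes_128_cbc, 16, 16, 16,
 *         0 | EVP_CIPH_CBC_MODE,
 *         padlock_aes_init_key, padlock_aes_cipher, NULL,
 *         sizeof(struct padlock_cipher_data) + 16,
 *         EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, NULL, NULL
 *     };
 *
 * i.e. block size 16, key length 16 and IV length 16 for AES-128-CBC.
 */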

static int
padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids,
                int nid)
{
    /* No specific cipher => return a list of supported nids ... */
    if (!cipher) {
        *nids = padlock_cipher_nids;
        return padlock_cipher_nids_num;
    }

    /* ... or the requested "cipher" otherwise */
    switch (nid) {
    case NID_aes_128_ecb:
        *cipher = &padlock_aes_128_ecb;
        break;
    case NID_aes_128_cbc:
        *cipher = &padlock_aes_128_cbc;
        break;
    case NID_aes_128_cfb:
        *cipher = &padlock_aes_128_cfb;
        break;
    case NID_aes_128_ofb:
        *cipher = &padlock_aes_128_ofb;
        break;

    case NID_aes_192_ecb:
        *cipher = &padlock_aes_192_ecb;
        break;
    case NID_aes_192_cbc:
        *cipher = &padlock_aes_192_cbc;
        break;
    case NID_aes_192_cfb:
        *cipher = &padlock_aes_192_cfb;
        break;
    case NID_aes_192_ofb:
        *cipher = &padlock_aes_192_ofb;
        break;

    case NID_aes_256_ecb:
        *cipher = &padlock_aes_256_ecb;
        break;
    case NID_aes_256_cbc:
        *cipher = &padlock_aes_256_cbc;
        break;
    case NID_aes_256_cfb:
        *cipher = &padlock_aes_256_cfb;
        break;
    case NID_aes_256_ofb:
        *cipher = &padlock_aes_256_ofb;
        break;

    default:
        /* Sorry, we don't support this NID */
        *cipher = NULL;
        return 0;
    }

    return 1;
}

/* Prepare the encryption key for PadLock usage */
static int
padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                     const unsigned char *iv, int enc)
{
    struct padlock_cipher_data *cdata;
    int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;

    if (key == NULL)
        return 0;               /* ERROR */

    cdata = ALIGNED_CIPHER_DATA(ctx);
    memset(cdata, 0, sizeof(struct padlock_cipher_data));

    /* Prepare Control word. */
    if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
        cdata->cword.b.encdec = 0;
    else
        cdata->cword.b.encdec = (ctx->encrypt == 0);
    cdata->cword.b.rounds = 10 + (key_len - 128) / 32;
    cdata->cword.b.ksize = (key_len - 128) / 64;

    switch (key_len) {
    case 128:
        /*
         * PadLock can generate an extended key for AES128 in hardware
         */
        memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
        cdata->cword.b.keygen = 0;
        break;

    case 192:
    case 256:
        /*
         * Generate an extended AES key in software. Needed for AES192/AES256
         */
        /*
         * Well, the above applies to Stepping 8 CPUs and is listed as
         * hardware errata. They most likely will fix it at some point and
         * then a check for stepping would be due here.
         */
        if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
            EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE || enc)
            AES_set_encrypt_key(key, key_len, &cdata->ks);
        else
            AES_set_decrypt_key(key, key_len, &cdata->ks);
#    ifndef AES_ASM
        /*
         * OpenSSL C functions use a byte-swapped extended key.
         */
        padlock_bswapl(&cdata->ks);
#    endif
        cdata->cword.b.keygen = 1;
        break;

    default:
        /* ERROR */
        return 0;
    }

    /*
     * This is done to cover cases when the user reuses the context
     * for a new key. The catch is that if we don't do this,
     * padlock_aes_cipher might proceed with the old key...
     */
    padlock_reload_key();

    return 1;
}

/*-
 * Simplified version of padlock_aes_cipher() used when
 * 1) both input and output buffers are at aligned addresses,
 * or when
 * 2) running on a newer CPU that doesn't require aligned buffers.
 */
static int
padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
                              const unsigned char *in_arg, size_t nbytes)
{
    struct padlock_cipher_data *cdata;
    void *iv;

    cdata = ALIGNED_CIPHER_DATA(ctx);
    padlock_verify_context(cdata);

    switch (EVP_CIPHER_CTX_mode(ctx)) {
    case EVP_CIPH_ECB_MODE:
        padlock_xcrypt_ecb(nbytes / AES_BLOCK_SIZE, cdata, out_arg, in_arg);
        break;

    case EVP_CIPH_CBC_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        iv = padlock_xcrypt_cbc(nbytes / AES_BLOCK_SIZE, cdata, out_arg,
                                in_arg);
        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_CFB_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        iv = padlock_xcrypt_cfb(nbytes / AES_BLOCK_SIZE, cdata, out_arg,
                                in_arg);
        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_OFB_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        padlock_xcrypt_ofb(nbytes / AES_BLOCK_SIZE, cdata, out_arg, in_arg);
        memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
        break;

    default:
        return 0;
    }

    memset(cdata->iv, 0, AES_BLOCK_SIZE);

    return 1;
}

#    ifndef  PADLOCK_CHUNK
#     define PADLOCK_CHUNK  512 /* Must be a power of 2 larger than 16 */
#    endif
#    if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
#     error "insane PADLOCK_CHUNK..."
#    endif

/*
 * Re-align the arguments to 16-byte boundaries and run the encryption
 * function itself. This function is not AES-specific.
 */
static int
padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
                   const unsigned char *in_arg, size_t nbytes)
{
    struct padlock_cipher_data *cdata;
    const void *inp;
    unsigned char *out;
    void *iv;
    int inp_misaligned, out_misaligned, realign_in_loop;
    size_t chunk, allocated = 0;

    /*
     * ctx->num is maintained in byte-oriented modes, such as CFB and OFB...
     */
    if ((chunk = ctx->num)) {   /* borrow chunk variable */
        unsigned char *ivp = ctx->iv;

        switch (EVP_CIPHER_CTX_mode(ctx)) {
        case EVP_CIPH_CFB_MODE:
            if (chunk >= AES_BLOCK_SIZE)
                return 0;       /* bogus value */

            if (ctx->encrypt)
                while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
                    ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
                    chunk++, nbytes--;
                }
            else
                while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
                    unsigned char c = *(in_arg++);
                    *(out_arg++) = c ^ ivp[chunk];
                    ivp[chunk++] = c, nbytes--;
                }

            ctx->num = chunk % AES_BLOCK_SIZE;
            break;
        case EVP_CIPH_OFB_MODE:
            if (chunk >= AES_BLOCK_SIZE)
                return 0;       /* bogus value */

            while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
                *(out_arg++) = *(in_arg++) ^ ivp[chunk];
                chunk++, nbytes--;
            }

            ctx->num = chunk % AES_BLOCK_SIZE;
            break;
        }
    }

    if (nbytes == 0)
        return 1;
#    if 0
    if (nbytes % AES_BLOCK_SIZE)
        return 0;               /* are we expected to do tail processing? */
#    else
    /*
     * nbytes is always a multiple of AES_BLOCK_SIZE in ECB and CBC modes
     * and an arbitrary value in byte-oriented modes, such as CFB and OFB...
     */
#    endif

    /*
     * VIA promises CPUs that won't require alignment in the future. For now
     * padlock_aes_align_required is initialized to 1 and the condition is
     * never met...
     */
    /*
     * The C7 core is capable of managing unaligned input in non-ECB[!]
     * mode, but the performance penalties appear to be approximately the
     * same as for the software alignment below, i.e. ~3x. They promise to
     * improve it in the future, but for now we can just as well pretend
     * that it can only handle aligned input...
     */
    if (!padlock_aes_align_required && (nbytes % AES_BLOCK_SIZE) == 0)
        return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

    inp_misaligned = (((size_t)in_arg) & 0x0F);
    out_misaligned = (((size_t)out_arg) & 0x0F);

    /*
     * Note that even if output is aligned and input is not, I still prefer
     * to loop instead of copying the whole input and then encrypting in one
     * stroke. This is done in order to improve L1 cache utilization...
     */
    realign_in_loop = out_misaligned | inp_misaligned;

    if (!realign_in_loop && (nbytes % AES_BLOCK_SIZE) == 0)
        return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

    /* this takes one "if" out of the loops */
    chunk = nbytes;
    chunk %= PADLOCK_CHUNK;
    if (chunk == 0)
        chunk = PADLOCK_CHUNK;

    if (out_misaligned) {
        /* optimize for small input */
        allocated = (chunk < nbytes ? PADLOCK_CHUNK : nbytes);
        out = alloca(0x10 + allocated);
        out = NEAREST_ALIGNED(out);
    } else
        out = out_arg;

    cdata = ALIGNED_CIPHER_DATA(ctx);
    padlock_verify_context(cdata);

    switch (EVP_CIPHER_CTX_mode(ctx)) {
    case EVP_CIPH_ECB_MODE:
        do {
            if (inp_misaligned)
                inp = padlock_memcpy(out, in_arg, chunk);
            else
                inp = in_arg;
            in_arg += chunk;

            padlock_xcrypt_ecb(chunk / AES_BLOCK_SIZE, cdata, out, inp);

            if (out_misaligned)
                out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
            else
                out = out_arg += chunk;

            nbytes -= chunk;
            chunk = PADLOCK_CHUNK;
        } while (nbytes);
        break;

    case EVP_CIPH_CBC_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        goto cbc_shortcut;
        do {
            if (iv != cdata->iv)
                memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
            chunk = PADLOCK_CHUNK;
 cbc_shortcut:                 /* optimize for small input */
            if (inp_misaligned)
                inp = padlock_memcpy(out, in_arg, chunk);
            else
                inp = in_arg;
            in_arg += chunk;

            iv = padlock_xcrypt_cbc(chunk / AES_BLOCK_SIZE, cdata, out, inp);

            if (out_misaligned)
                out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
            else
                out = out_arg += chunk;

        } while (nbytes -= chunk);
        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_CFB_MODE:
        memcpy(iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        chunk &= ~(AES_BLOCK_SIZE - 1);
        if (chunk)
            goto cfb_shortcut;
        else
            goto cfb_skiploop;
        do {
            if (iv != cdata->iv)
                memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
            chunk = PADLOCK_CHUNK;
 cfb_shortcut:                 /* optimize for small input */
            if (inp_misaligned)
                inp = padlock_memcpy(out, in_arg, chunk);
            else
                inp = in_arg;
            in_arg += chunk;

            iv = padlock_xcrypt_cfb(chunk / AES_BLOCK_SIZE, cdata, out, inp);

            if (out_misaligned)
                out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
            else
                out = out_arg += chunk;

            nbytes -= chunk;
        } while (nbytes >= AES_BLOCK_SIZE);

 cfb_skiploop:
        if (nbytes) {
            unsigned char *ivp = cdata->iv;

            if (iv != ivp) {
                memcpy(ivp, iv, AES_BLOCK_SIZE);
                iv = ivp;
            }
            ctx->num = nbytes;
            if (cdata->cword.b.encdec) {
                cdata->cword.b.encdec = 0;
                padlock_reload_key();
                padlock_xcrypt_ecb(1, cdata, ivp, ivp);
                cdata->cword.b.encdec = 1;
                padlock_reload_key();
                while (nbytes) {
                    unsigned char c = *(in_arg++);
                    *(out_arg++) = c ^ *ivp;
                    *(ivp++) = c, nbytes--;
                }
            } else {
                padlock_reload_key();
                padlock_xcrypt_ecb(1, cdata, ivp, ivp);
                padlock_reload_key();
                while (nbytes) {
                    *ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
                    ivp++, nbytes--;
                }
            }
        }

        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_OFB_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        chunk &= ~(AES_BLOCK_SIZE - 1);
        if (chunk)
            do {
                if (inp_misaligned)
                    inp = padlock_memcpy(out, in_arg, chunk);
                else
                    inp = in_arg;
                in_arg += chunk;

                padlock_xcrypt_ofb(chunk / AES_BLOCK_SIZE, cdata, out, inp);

                if (out_misaligned)
                    out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
                else
                    out = out_arg += chunk;

                nbytes -= chunk;
                chunk = PADLOCK_CHUNK;
            } while (nbytes >= AES_BLOCK_SIZE);

        if (nbytes) {
            unsigned char *ivp = cdata->iv;

            ctx->num = nbytes;
            padlock_reload_key(); /* empirically found */
            padlock_xcrypt_ecb(1, cdata, ivp, ivp);
            padlock_reload_key(); /* empirically found */
            while (nbytes) {
                *(out_arg++) = *(in_arg++) ^ *ivp;
                ivp++, nbytes--;
            }
        }

        memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
        break;

    default:
        return 0;
    }

    /* Clean the realign buffer if it was used */
    if (out_misaligned) {
        volatile unsigned long *p = (void *)out;
        size_t n = allocated / sizeof(*p);
        while (n--)
            *p++ = 0;
    }

    memset(cdata->iv, 0, AES_BLOCK_SIZE);

    return 1;
}

#   endif                       /* OPENSSL_NO_AES */

/* ===== Random Number Generator ===== */
/*
 * This code is not engaged. The reason is that it does not comply
 * with recommendations for VIA RNG usage for secure applications
 * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
 * provide meaningful error control...
 */
/*
 * Wrapper that provides an interface between the API and the raw PadLock
 * RNG
 */
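/*
 * A hedged note (not in the original), inferred from the checks below: the
 * xstore status returned in EAX appears to report the number of bytes
 * actually stored in bits 4:0, the "RNG enabled" flag in bit 6, and the
 * DC-bias/Raw-Bits/String-Filter status in bits 14:10.
 */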
static int padlock_rand_bytes(unsigned char *output, int count)
{
    unsigned int eax, buf;

    while (count >= 8) {
        eax = padlock_xstore(output, 0);
        if (!(eax & (1 << 6)))
            return 0;           /* RNG disabled */
        /* this ---vv--- covers DC bias, Raw Bits and String Filter */
        if (eax & (0x1F << 10))
            return 0;
        if ((eax & 0x1F) == 0)
            continue;           /* no data, retry... */
        if ((eax & 0x1F) != 8)
            return 0;           /* fatal failure...  */
        output += 8;
        count -= 8;
    }
    while (count > 0) {
        eax = padlock_xstore(&buf, 3);
        if (!(eax & (1 << 6)))
            return 0;           /* RNG disabled */
        /* this ---vv--- covers DC bias, Raw Bits and String Filter */
        if (eax & (0x1F << 10))
            return 0;
        if ((eax & 0x1F) == 0)
            continue;           /* no data, retry... */
        if ((eax & 0x1F) != 1)
            return 0;           /* fatal failure...  */
        *output++ = (unsigned char)buf;
        count--;
    }
    *(volatile unsigned int *)&buf = 0;

    return 1;
}

/* Dummy but necessary function */
static int padlock_rand_status(void)
{
    return 1;
}

/* Prepare structure for registration */
static RAND_METHOD padlock_rand = {
    NULL,                       /* seed */
    padlock_rand_bytes,         /* bytes */
    NULL,                       /* cleanup */
    NULL,                       /* add */
    padlock_rand_bytes,         /* pseudorand */
    padlock_rand_status,        /* rand status */
};

#  endif                        /* COMPILE_HW_PADLOCK */

# endif                         /* !OPENSSL_NO_HW_PADLOCK */
#endif                          /* !OPENSSL_NO_HW */