e_padlock.c revision 296341
1/*-
2 * Support for VIA PadLock Advanced Cryptography Engine (ACE)
3 * Written by Michal Ludvig <michal@logix.cz>
4 *            http://www.logix.cz/michal
5 *
6 * Big thanks to Andy Polyakov for a help with optimization,
7 * assembler fixes, port to MS Windows and a lot of other
8 * valuable work on this engine!
9 */
10
11/* ====================================================================
12 * Copyright (c) 1999-2001 The OpenSSL Project.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 *
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in
23 *    the documentation and/or other materials provided with the
24 *    distribution.
25 *
26 * 3. All advertising materials mentioning features or use of this
27 *    software must display the following acknowledgment:
28 *    "This product includes software developed by the OpenSSL Project
29 *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
30 *
31 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
32 *    endorse or promote products derived from this software without
33 *    prior written permission. For written permission, please contact
34 *    licensing@OpenSSL.org.
35 *
36 * 5. Products derived from this software may not be called "OpenSSL"
37 *    nor may "OpenSSL" appear in their names without prior written
38 *    permission of the OpenSSL Project.
39 *
40 * 6. Redistributions of any form whatsoever must retain the following
41 *    acknowledgment:
42 *    "This product includes software developed by the OpenSSL Project
43 *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
46 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
49 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
50 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
52 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
54 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
55 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
56 * OF THE POSSIBILITY OF SUCH DAMAGE.
57 * ====================================================================
58 *
59 * This product includes cryptographic software written by Eric Young
60 * (eay@cryptsoft.com).  This product includes software written by Tim
61 * Hudson (tjh@cryptsoft.com).
62 *
63 */
64
65#include <stdio.h>
66#include <string.h>
67
68#include <openssl/opensslconf.h>
69#include <openssl/crypto.h>
70#include <openssl/dso.h>
71#include <openssl/engine.h>
72#include <openssl/evp.h>
73#ifndef OPENSSL_NO_AES
74# include <openssl/aes.h>
75#endif
76#include <openssl/rand.h>
77#include <openssl/err.h>
78
79#ifndef OPENSSL_NO_HW
80# ifndef OPENSSL_NO_HW_PADLOCK
81
82/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
83#  if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
84#   ifndef OPENSSL_NO_DYNAMIC_ENGINE
85#    define DYNAMIC_ENGINE
86#   endif
87#  elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
88#   ifdef ENGINE_DYNAMIC_SUPPORT
89#    define DYNAMIC_ENGINE
90#   endif
91#  else
92#   error "Only OpenSSL >= 0.9.7 is supported"
93#  endif
94
95/*
96 * VIA PadLock AES is available *ONLY* on some x86 CPUs. Not only that it
97 * doesn't exist elsewhere, but it even can't be compiled on other platforms!
98 *
99 * In addition, because of the heavy use of inline assembler, compiler choice
100 * is limited to GCC and Microsoft C.
101 */
102#  undef COMPILE_HW_PADLOCK
103#  if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
104#   if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
105     (defined(_MSC_VER) && defined(_M_IX86))
106#    define COMPILE_HW_PADLOCK
107#   endif
108#  endif
109
110#  ifdef OPENSSL_NO_DYNAMIC_ENGINE
111#   ifdef COMPILE_HW_PADLOCK
112static ENGINE *ENGINE_padlock(void);
113#   endif
114
/*
 * Register the PadLock engine on the global ENGINE list.  On platforms
 * where the PadLock code cannot be compiled this is a no-op.
 */
void ENGINE_load_padlock(void)
{
#   ifdef COMPILE_HW_PADLOCK
    ENGINE *eng = ENGINE_padlock();

    if (eng != NULL) {
        ENGINE_add(eng);
        /*
         * ENGINE_add took its own reference; drop ours and discard any
         * error the registration may have queued.
         */
        ENGINE_free(eng);
        ERR_clear_error();
    }
#   endif
}
127
128#  endif
129
130#  ifdef COMPILE_HW_PADLOCK
131/*
132 * We do these includes here to avoid header problems on platforms that do
133 * not have the VIA padlock anyway...
134 */
135#   include <stdlib.h>
136#   ifdef _WIN32
137#    include <malloc.h>
138#    ifndef alloca
139#     define alloca _alloca
140#    endif
141#   elif defined(__GNUC__)
142#    ifndef alloca
143#     define alloca(s) __builtin_alloca(s)
144#    endif
145#   endif
146
147/* Function for ENGINE detection and control */
148static int padlock_available(void);
149static int padlock_init(ENGINE *e);
150
151/* RNG Stuff */
152static RAND_METHOD padlock_rand;
153
154/* Cipher Stuff */
155#   ifndef OPENSSL_NO_AES
156static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher,
157                           const int **nids, int nid);
158#   endif
159
160/* Engine names */
161static const char *padlock_id = "padlock";
162static char padlock_name[100];
163
164/* Available features */
165static int padlock_use_ace = 0; /* Advanced Cryptography Engine */
166static int padlock_use_rng = 0; /* Random Number Generator */
167#   ifndef OPENSSL_NO_AES
168static int padlock_aes_align_required = 1;
169#   endif
170
171/* ===== Engine "management" functions ===== */
172
173/* Prepare the ENGINE structure for registration */
/*
 * Prepare the ENGINE structure for registration: probe the CPU for
 * PadLock features, compose a descriptive engine name, and install the
 * engine callbacks.  Returns 1 on success, 0 if any ENGINE_set_* call
 * fails.
 */
static int padlock_bind_helper(ENGINE *e)
{
    /* Check available features (sets padlock_use_ace/padlock_use_rng) */
    padlock_available();

#   if 1                        /* disable RNG for now, see commentary in
                                 * vicinity of RNG code */
    padlock_use_rng = 0;
#   endif

    /* Generate a nice engine name with available features */
    BIO_snprintf(padlock_name, sizeof(padlock_name),
                 "VIA PadLock (%s, %s)",
                 padlock_use_rng ? "RNG" : "no-RNG",
                 padlock_use_ace ? "ACE" : "no-ACE");

    /*
     * Register everything or return with an error.  The cipher and RAND
     * callbacks are only installed when the corresponding unit was
     * actually detected.
     */
    if (!ENGINE_set_id(e, padlock_id) ||
        !ENGINE_set_name(e, padlock_name) ||
        !ENGINE_set_init_function(e, padlock_init) ||
#   ifndef OPENSSL_NO_AES
        (padlock_use_ace && !ENGINE_set_ciphers(e, padlock_ciphers)) ||
#   endif
        (padlock_use_rng && !ENGINE_set_RAND(e, &padlock_rand))) {
        return 0;
    }

    /* Everything looks good */
    return 1;
}
204
205#   ifdef OPENSSL_NO_DYNAMIC_ENGINE
206
207/* Constructor */
208static ENGINE *ENGINE_padlock(void)
209{
210    ENGINE *eng = ENGINE_new();
211
212    if (!eng) {
213        return NULL;
214    }
215
216    if (!padlock_bind_helper(eng)) {
217        ENGINE_free(eng);
218        return NULL;
219    }
220
221    return eng;
222}
223
224#   endif
225
226/* Check availability of the engine */
227static int padlock_init(ENGINE *e)
228{
229    return (padlock_use_rng || padlock_use_ace);
230}
231
232/*
233 * This stuff is needed if this ENGINE is being compiled into a
234 * self-contained shared-library.
235 */
236#   ifdef DYNAMIC_ENGINE
237static int padlock_bind_fn(ENGINE *e, const char *id)
238{
239    if (id && (strcmp(id, padlock_id) != 0)) {
240        return 0;
241    }
242
243    if (!padlock_bind_helper(e)) {
244        return 0;
245    }
246
247    return 1;
248}
249
250IMPLEMENT_DYNAMIC_CHECK_FN()
251    IMPLEMENT_DYNAMIC_BIND_FN(padlock_bind_fn)
252#   endif                       /* DYNAMIC_ENGINE */
253/* ===== Here comes the "real" engine ===== */
254#   ifndef OPENSSL_NO_AES
255/* Some AES-related constants */
256#    define AES_BLOCK_SIZE          16
257#    define AES_KEY_SIZE_128        16
258#    define AES_KEY_SIZE_192        24
259#    define AES_KEY_SIZE_256        32
260    /*
261     * Here we store the status information relevant to the current context.
262     */
263    /*
264     * BIG FAT WARNING: Inline assembler in PADLOCK_XCRYPT_ASM() depends on
265     * the order of items in this structure.  Don't blindly modify, reorder,
266     * etc!
267     */
struct padlock_cipher_data {
    unsigned char iv[AES_BLOCK_SIZE]; /* Initialization vector */
    union {
        unsigned int pad[4];    /* pads the control word to 16 bytes */
        struct {
            int rounds:4;       /* AES round count: 10/12/14 */
            int dgst:1;         /* n/a in C3 */
            int align:1;        /* n/a in C3 */
            int ciphr:1;        /* n/a in C3 */
            unsigned int keygen:1; /* 0 = hardware expands the 128-bit key,
                                    * 1 = full schedule supplied in 'ks' */
            int interm:1;       /* intermediate result (not set here) */
            unsigned int encdec:1; /* 0 = encrypt, 1 = decrypt */
            int ksize:2;        /* 0/1/2 = 128/192/256-bit key */
        } b;
    } cword;                    /* Control word */
    AES_KEY ks;                 /* Encryption key */
};
285
286/*
287 * Essentially this variable belongs in thread local storage.
288 * Having this variable global on the other hand can only cause
289 * few bogus key reloads [if any at all on single-CPU system],
290 * so we accept the penatly...
291 */
292static volatile struct padlock_cipher_data *padlock_saved_context;
293#   endif
294
295/*-
296 * =======================================================
297 * Inline assembler section(s).
298 * =======================================================
299 * Order of arguments is chosen to facilitate Windows port
300 * using __fastcall calling convention. If you wish to add
301 * more routines, keep in mind that first __fastcall
302 * argument is passed in %ecx and second - in %edx.
303 * =======================================================
304 */
305#   if defined(__GNUC__) && __GNUC__>=2
306/*
307 * As for excessive "push %ebx"/"pop %ebx" found all over.
308 * When generating position-independent code GCC won't let
309 * us use "b" in assembler templates nor even respect "ebx"
310 * in "clobber description." Therefore the trouble...
311 */
312
313/*
314 * Helper function - check if a CPUID instruction is available on this CPU
315 */
/*
 * Helper function - check if a CPUID instruction is available on this CPU.
 * Returns 1 when CPUID is supported, 0 otherwise.
 */
static int padlock_insn_cpuid_available(void)
{
    int result = -1;

    /*
     * We're checking if the bit #21 of EFLAGS can be toggled. If yes =
     * CPUID is available.  The sequence below flips the bit, writes it
     * back via popf, reads EFLAGS again, and XORs the read-back bit with
     * the value we wrote.
     */
    asm volatile ("pushf\n"
                  "popl %%eax\n"
                  "xorl $0x200000, %%eax\n"
                  "movl %%eax, %%ecx\n"
                  "andl $0x200000, %%ecx\n"
                  "pushl %%eax\n"
                  "popf\n"
                  "pushf\n"
                  "popl %%eax\n"
                  "andl $0x200000, %%eax\n"
                  "xorl %%eax, %%ecx\n"
                  "movl %%ecx, %0\n":"=r" (result)::"eax", "ecx");

    /* result == 0 means the bit took the value we wrote: CPUID exists */
    return (result == 0);
}
339
340/*
341 * Load supported features of the CPU to see if the PadLock is available.
342 */
/*
 * Load supported features of the CPU to see if the PadLock is available:
 * verify CPUID support, check for the Centaur ("CentaurHauls") vendor
 * string, then read the Centaur Extended Feature Flags (CPUID leaf
 * 0xC0000001) and record ACE/RNG availability in the file-scope globals.
 * Returns the number of PadLock units found (0, 1 or 2).
 */
static int padlock_available(void)
{
    char vendor_string[16];
    unsigned int eax, edx;

    /* First check if the CPUID instruction is available at all... */
    if (!padlock_insn_cpuid_available())
        return 0;

    /* Are we running on the Centaur (VIA) CPU? */
    eax = 0x00000000;
    vendor_string[12] = 0;
    /*
     * CPUID leaf 0 returns the 12-byte vendor string in ebx:edx:ecx;
     * %ebx is saved/restored by hand because PIC builds reserve it.
     * NOTE(review): vendor_string is written through %edi without a
     * "memory" clobber or output operand - presumably fine with the
     * compilers this targets, but worth confirming.
     */
    asm volatile ("pushl  %%ebx\n"
                  "cpuid\n"
                  "movl   %%ebx,(%%edi)\n"
                  "movl   %%edx,4(%%edi)\n"
                  "movl   %%ecx,8(%%edi)\n"
                  "popl   %%ebx":"+a" (eax):"D"(vendor_string):"ecx", "edx");
    if (strcmp(vendor_string, "CentaurHauls") != 0)
        return 0;

    /* Check for Centaur Extended Feature Flags presence */
    eax = 0xC0000000;
    asm volatile ("pushl %%ebx; cpuid; popl %%ebx":"+a" (eax)::"ecx", "edx");
    if (eax < 0xC0000001)
        return 0;

    /* Read the Centaur Extended Feature Flags */
    eax = 0xC0000001;
    asm volatile ("pushl %%ebx; cpuid; popl %%ebx":"+a" (eax),
                  "=d"(edx)::"ecx");

    /* Fill up some flags: both bits of each pair (6-7 for ACE, 2-3 for
     * RNG) must be set for the unit to count as usable */
    padlock_use_ace = ((edx & (0x3 << 6)) == (0x3 << 6));
    padlock_use_rng = ((edx & (0x3 << 2)) == (0x3 << 2));

    return padlock_use_ace + padlock_use_rng;
}
381
382#    ifndef OPENSSL_NO_AES
383#     ifndef AES_ASM
384/* Our own htonl()/ntohl() */
/*
 * Byte-swap every 32-bit word of the expanded AES key in place,
 * converting between the word order produced by OpenSSL's C key-setup
 * routines and the order the PadLock unit consumes (see the note in
 * padlock_aes_init_key()).
 */
static inline void padlock_bswapl(AES_KEY *ks)
{
    size_t i = sizeof(ks->rd_key) / sizeof(ks->rd_key[0]);
    unsigned int *key = ks->rd_key;

    while (i--) {
        asm volatile ("bswapl %0":"+r" (*key));
        key++;
    }
}
395#     endif
396#    endif
397
398/*
399 * Force key reload from memory to the CPU microcode. Loading EFLAGS from the
400 * stack clears EFLAGS[30] which does the trick.
401 */
/* Loading EFLAGS from the stack clears EFLAGS[30], forcing a key reload. */
static inline void padlock_reload_key(void)
{
    asm volatile ("pushfl; popfl");
}
406
407#    ifndef OPENSSL_NO_AES
408/*
409 * This is heuristic key context tracing. At first one
410 * believes that one should use atomic swap instructions,
411 * but it's not actually necessary. Point is that if
412 * padlock_saved_context was changed by another thread
413 * after we've read it and before we compare it with cdata,
414 * our key *shall* be reloaded upon thread context switch
415 * and we are therefore set in either case...
416 */
static inline void padlock_verify_context(struct padlock_cipher_data *cdata)
{
    /*
     * If EFLAGS[30] is set and the saved context differs from cdata,
     * pop EFLAGS to force the next xcrypt to reload the key (cf.
     * padlock_reload_key); either way record cdata as the current
     * context.  The sub/add of %esp keeps the stack balanced on both
     * paths.
     */
    asm volatile ("pushfl\n"
                  "       btl     $30,(%%esp)\n"
                  "       jnc     1f\n"
                  "       cmpl    %2,%1\n"
                  "       je      1f\n"
                  "       popfl\n"
                  "       subl    $4,%%esp\n"
                  "1:     addl    $4,%%esp\n"
                  "       movl    %2,%0":"+m" (padlock_saved_context)
                  :"r"(padlock_saved_context), "r"(cdata):"cc");
}
430
431/* Template for padlock_xcrypt_* modes */
432/*
433 * BIG FAT WARNING: The offsets used with 'leal' instructions describe items
434 * of the 'padlock_cipher_data' structure.
435 */
/*
 * Template generating the four padlock_xcrypt_* helpers.  Each expands
 * into a function taking (block count, cipher context, output, input)
 * and returning the pointer left in %eax by the instruction (used by
 * callers as the final IV).  %edx is pointed at cdata->cword (offset 16)
 * and %ebx at cdata->ks (offset 32); %ebx is saved/restored by hand
 * because PIC builds reserve it (see commentary above).
 */
#     define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)     \
static inline void *name(size_t cnt,            \
        struct padlock_cipher_data *cdata,      \
        void *out, const void *inp)             \
{       void *iv;                               \
        asm volatile ( "pushl   %%ebx\n"        \
                "       leal    16(%0),%%edx\n" \
                "       leal    32(%0),%%ebx\n" \
                        rep_xcrypt "\n"         \
                "       popl    %%ebx"          \
                : "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
                : "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
                : "edx", "cc", "memory");       \
        return iv;                              \
}

/* Generate all functions with appropriate opcodes */
/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")
/* rep xcryptcbc */
    PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")
/* rep xcryptcfb */
    PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")
/* rep xcryptofb */
    PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")
461#    endif
462/* The RNG call itself */
/*
 * Issue the PadLock RNG 'xstore' instruction: stores random data at
 * 'addr' with %edx loaded from edx_in, and returns the status word the
 * instruction leaves in %eax.
 */
static inline unsigned int padlock_xstore(void *addr, unsigned int edx_in)
{
    unsigned int eax_out;

    asm volatile (".byte 0x0f,0xa7,0xc0" /* xstore */
                  :"=a" (eax_out), "=m"(*(unsigned *)addr)
                  :"D"(addr), "d"(edx_in)
        );

    return eax_out;
}
474
475/*
476 * Why not inline 'rep movsd'? I failed to find information on what value in
477 * Direction Flag one can expect and consequently have to apply
478 * "better-safe-than-sorry" approach and assume "undefined." I could
479 * explicitly clear it and restore the original value upon return from
480 * padlock_aes_cipher, but it's presumably too much trouble for too little
481 * gain... In case you wonder 'rep xcrypt*' instructions above are *not*
482 * affected by the Direction Flag and pointers advance toward larger
483 * addresses unconditionally.
484 */
/*
 * Word-granular copy: transfers n bytes from src to dst one 'long' at a
 * time and returns dst.  n is expected to be a positive multiple of
 * sizeof(long); any tail shorter than a word is not copied.  See the
 * commentary above for why 'rep movsd' is not used.
 */
static inline unsigned char *padlock_memcpy(void *dst, const void *src,
                                            size_t n)
{
    long *to = dst;
    const long *from = src;
    size_t words = n / sizeof(*to);

    do {
        *to++ = *from++;
    } while (--words);

    return dst;
}
498
499#   elif defined(_MSC_VER)
500/*
501 * Unlike GCC these are real functions. In order to minimize impact
502 * on performance we adhere to __fastcall calling convention in
503 * order to get two first arguments passed through %ecx and %edx.
504 * Which kind of suits very well, as instructions in question use
505 * both %ecx and %edx as input:-)
506 */
/* Emit the 'rep xcrypt*' opcode sequence: 0xf3 0x0f 0xa7 <code>. */
#    define REP_XCRYPT(code)                \
        _asm _emit 0xf3                 \
        _asm _emit 0x0f _asm _emit 0xa7 \
        _asm _emit code

/*
 * BIG FAT WARNING: The offsets used with 'lea' instructions describe items
 * of the 'padlock_cipher_data' structure.
 */
/*
 * MSC counterpart of the GCC template above.  __fastcall delivers cnt
 * in ecx and cdata in edx; edx and ebx are then pointed at the control
 * word (+16) and key schedule (+32) inside cdata before the opcode.
 */
#    define PADLOCK_XCRYPT_ASM(name,code)   \
static void * __fastcall                \
        name (size_t cnt, void *cdata,  \
        void *outp, const void *inp)    \
{       _asm    mov     eax,edx         \
        _asm    lea     edx,[eax+16]    \
        _asm    lea     ebx,[eax+32]    \
        _asm    mov     edi,outp        \
        _asm    mov     esi,inp         \
        REP_XCRYPT(code)                \
}
527
/* Instantiate the ECB/CBC/CFB/OFB variants with their xcrypt opcodes. */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb,0xc8)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc,0xd0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb,0xe0)
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb,0xe8)

/*
 * RNG 'xstore' wrapper.  The function deliberately has no C return
 * statement: the instruction's status word is left in eax, which is the
 * __fastcall return register.
 */
static int __fastcall padlock_xstore(void *outp, unsigned int code)
{
    _asm    mov edi,ecx
    _asm _emit 0x0f _asm _emit 0xa7 _asm _emit 0xc0
}
538
/*
 * Force key reload: popping EFLAGS from the stack clears EFLAGS[30]
 * (same trick as the GCC version above).
 */
static void __fastcall padlock_reload_key(void)
{
    _asm pushfd
    _asm popfd
}
544
/*
 * MSC counterpart of padlock_verify_context() above: if EFLAGS[30] is
 * set and 'cdata' (arriving in ecx per __fastcall) differs from the
 * last saved context, pop EFLAGS to force a key reload; then record
 * cdata as the current context.  The sub/add of esp keeps the stack
 * balanced on both paths.
 */
static void __fastcall padlock_verify_context(void *cdata)
{
    _asm    {
        pushfd
        bt  DWORD PTR[esp],30
        jnc skip
        cmp ecx,padlock_saved_context
        je  skip
        popfd
        sub esp,4
    skip:   add esp,4
        mov padlock_saved_context,ecx
    }
}
559
/*
 * MSC counterpart of the GCC padlock_available() above: check that
 * CPUID exists (EFLAGS bit 21 toggles), that the vendor string is
 * "CentaurHauls" (compared as the 'tneC'/'Hrua'/'slua' dword chunks
 * CPUID leaves in ebx/edx/ecx), then read the Centaur Extended Feature
 * Flags (leaf 0xC0000001) and set padlock_use_ace / padlock_use_rng
 * when both bits of the respective pair (6-7 for ACE, 2-3 for RNG) are
 * set.  The unit count is accumulated in eax, which doubles as the
 * return value - hence no C return statement.
 */
static int
padlock_available(void)
{
    _asm    {
        pushfd
        pop eax
        mov ecx,eax
        xor eax,1<<21
        push    eax
        popfd
        pushfd
        pop eax
        xor eax,ecx
        bt  eax,21
        jnc noluck
        mov eax,0
        cpuid
        xor eax,eax
        cmp ebx,'tneC'
        jne noluck
        cmp edx,'Hrua'
        jne noluck
        cmp ecx,'slua'
        jne noluck
        mov eax,0xC0000000
        cpuid
        mov edx,eax
        xor eax,eax
        cmp edx,0xC0000001
        jb  noluck
        mov eax,0xC0000001
        cpuid
        xor eax,eax
        bt  edx,6
        jnc skip_a
        bt  edx,7
        jnc skip_a
        mov padlock_use_ace,1
        inc eax
    skip_a: bt  edx,2
        jnc skip_r
        bt  edx,3
        jnc skip_r
        mov padlock_use_rng,1
        inc eax
    skip_r:
    noluck:
    }
}
609
/*
 * MSC counterpart of padlock_bswapl() above: byte-swap the 60 dwords of
 * the expanded AES key in place.  EFLAGS is saved around 'cld' so the
 * caller's direction flag is preserved.
 */
static void __fastcall padlock_bswapl(void *key)
{
    _asm    {
        pushfd
        cld
        mov esi,ecx
        mov edi,ecx
        mov ecx,60
    up: lodsd
        bswap   eax
        stosd
        loop    up
        popfd
    }
}
625
626/*
627 * MS actually specifies status of Direction Flag and compiler even manages
628 * to compile following as 'rep movsd' all by itself...
629 */
630#    define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
631#   endif
632/* ===== AES encryption/decryption ===== */
633#   ifndef OPENSSL_NO_AES
634#    if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
635#     define NID_aes_128_cfb NID_aes_128_cfb128
636#    endif
637#    if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
638#     define NID_aes_128_ofb NID_aes_128_ofb128
639#    endif
640#    if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
641#     define NID_aes_192_cfb NID_aes_192_cfb128
642#    endif
643#    if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
644#     define NID_aes_192_ofb NID_aes_192_ofb128
645#    endif
646#    if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
647#     define NID_aes_256_cfb NID_aes_256_cfb128
648#    endif
649#    if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
650#     define NID_aes_256_ofb NID_aes_256_ofb128
651#    endif
/*
 * List of supported ciphers: the AES NIDs this engine can accelerate,
 * reported to OpenSSL by padlock_ciphers() when queried with a NULL
 * cipher argument.
 */
static int padlock_cipher_nids[] = {
    NID_aes_128_ecb,
    NID_aes_128_cbc,
    NID_aes_128_cfb,
    NID_aes_128_ofb,

    NID_aes_192_ecb,
    NID_aes_192_cbc,
    NID_aes_192_cfb,
    NID_aes_192_ofb,

    NID_aes_256_ecb,
    NID_aes_256_cbc,
    NID_aes_256_cfb,
    NID_aes_256_ofb,
};

/* Number of entries in the table above. */
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids) /
                                      sizeof(padlock_cipher_nids[0]));
673
674/* Function prototypes ... */
675static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
676                                const unsigned char *iv, int enc);
677static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
678                              const unsigned char *in, size_t nbytes);
679
/* Round 'ptr' up to the next 16-byte boundary. */
#    define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) +         \
        ( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F )      )
/*
 * The cipher_data buffer is allocated 16 bytes oversized (see
 * DECLARE_AES_EVP below), so it can always be realigned to the 16-byte
 * boundary required here.
 */
#    define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
        NEAREST_ALIGNED(ctx->cipher_data))

/* Per-mode EVP block sizes: ECB/CBC are block-oriented, CFB/OFB byte-
 * oriented. */
#    define EVP_CIPHER_block_size_ECB       AES_BLOCK_SIZE
#    define EVP_CIPHER_block_size_CBC       AES_BLOCK_SIZE
#    define EVP_CIPHER_block_size_OFB       1
#    define EVP_CIPHER_block_size_CFB       1

/*
 * Declaring so many ciphers by hand would be a pain. Instead introduce a bit
 * of preprocessor magic :-)  Note the "+ 16" on ctx_size: slack for the
 * 16-byte realignment performed by ALIGNED_CIPHER_DATA().
 */
#    define DECLARE_AES_EVP(ksize,lmode,umode)      \
static const EVP_CIPHER padlock_aes_##ksize##_##lmode = {       \
        NID_aes_##ksize##_##lmode,              \
        EVP_CIPHER_block_size_##umode,  \
        AES_KEY_SIZE_##ksize,           \
        AES_BLOCK_SIZE,                 \
        0 | EVP_CIPH_##umode##_MODE,    \
        padlock_aes_init_key,           \
        padlock_aes_cipher,             \
        NULL,                           \
        sizeof(struct padlock_cipher_data) + 16,        \
        EVP_CIPHER_set_asn1_iv,         \
        EVP_CIPHER_get_asn1_iv,         \
        NULL,                           \
        NULL                            \
}

/* All 12 supported key-size/mode combinations. */
DECLARE_AES_EVP(128, ecb, ECB);
DECLARE_AES_EVP(128, cbc, CBC);
DECLARE_AES_EVP(128, cfb, CFB);
DECLARE_AES_EVP(128, ofb, OFB);

DECLARE_AES_EVP(192, ecb, ECB);
DECLARE_AES_EVP(192, cbc, CBC);
DECLARE_AES_EVP(192, cfb, CFB);
DECLARE_AES_EVP(192, ofb, OFB);

DECLARE_AES_EVP(256, ecb, ECB);
DECLARE_AES_EVP(256, cbc, CBC);
DECLARE_AES_EVP(256, cfb, CFB);
DECLARE_AES_EVP(256, ofb, OFB);
725
726static int
727padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids,
728                int nid)
729{
730    /* No specific cipher => return a list of supported nids ... */
731    if (!cipher) {
732        *nids = padlock_cipher_nids;
733        return padlock_cipher_nids_num;
734    }
735
736    /* ... or the requested "cipher" otherwise */
737    switch (nid) {
738    case NID_aes_128_ecb:
739        *cipher = &padlock_aes_128_ecb;
740        break;
741    case NID_aes_128_cbc:
742        *cipher = &padlock_aes_128_cbc;
743        break;
744    case NID_aes_128_cfb:
745        *cipher = &padlock_aes_128_cfb;
746        break;
747    case NID_aes_128_ofb:
748        *cipher = &padlock_aes_128_ofb;
749        break;
750
751    case NID_aes_192_ecb:
752        *cipher = &padlock_aes_192_ecb;
753        break;
754    case NID_aes_192_cbc:
755        *cipher = &padlock_aes_192_cbc;
756        break;
757    case NID_aes_192_cfb:
758        *cipher = &padlock_aes_192_cfb;
759        break;
760    case NID_aes_192_ofb:
761        *cipher = &padlock_aes_192_ofb;
762        break;
763
764    case NID_aes_256_ecb:
765        *cipher = &padlock_aes_256_ecb;
766        break;
767    case NID_aes_256_cbc:
768        *cipher = &padlock_aes_256_cbc;
769        break;
770    case NID_aes_256_cfb:
771        *cipher = &padlock_aes_256_cfb;
772        break;
773    case NID_aes_256_ofb:
774        *cipher = &padlock_aes_256_ofb;
775        break;
776
777    default:
778        /* Sorry, we don't support this NID */
779        *cipher = NULL;
780        return 0;
781    }
782
783    return 1;
784}
785
786/* Prepare the encryption key for PadLock usage */
/*
 * Prepare the encryption key for PadLock usage: fill in the control
 * word (direction, rounds, key size) and either store the raw 128-bit
 * key for hardware expansion or generate the full key schedule in
 * software for 192/256-bit keys.  Returns 1 on success, 0 on error.
 */
static int
padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                     const unsigned char *iv, int enc)
{
    struct padlock_cipher_data *cdata;
    int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8; /* key size in bits */

    if (key == NULL)
        return 0;               /* ERROR */

    cdata = ALIGNED_CIPHER_DATA(ctx);
    memset(cdata, 0, sizeof(struct padlock_cipher_data));

    /* Prepare Control word. */
    /* OFB always runs the engine in the encrypt direction */
    if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
        cdata->cword.b.encdec = 0;
    else
        cdata->cword.b.encdec = (ctx->encrypt == 0);
    cdata->cword.b.rounds = 10 + (key_len - 128) / 32; /* 10/12/14 */
    cdata->cword.b.ksize = (key_len - 128) / 64; /* 0/1/2 = 128/192/256 */

    switch (key_len) {
    case 128:
        /*
         * PadLock can generate an extended key for AES128 in hardware
         */
        memcpy(cdata->ks.rd_key, key, AES_KEY_SIZE_128);
        cdata->cword.b.keygen = 0;
        break;

    case 192:
    case 256:
        /*
         * Generate an extended AES key in software. Needed for AES192/AES256
         */
        /*
         * Well, the above applies to Stepping 8 CPUs and is listed as
         * hardware errata. They most likely will fix it at some point and
         * then a check for stepping would be due here.
         */
        if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE ||
            EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE || enc)
            AES_set_encrypt_key(key, key_len, &cdata->ks);
        else
            AES_set_decrypt_key(key, key_len, &cdata->ks);
#    ifndef AES_ASM
        /*
         * OpenSSL C functions use byte-swapped extended key.
         */
        padlock_bswapl(&cdata->ks);
#    endif
        cdata->cword.b.keygen = 1;
        break;

    default:
        /* ERROR: unsupported key length */
        return 0;
    }

    /*
     * This is done to cover for cases when user reuses the
     * context for new key. The catch is that if we don't do
     * this, padlock_aes_cipher might proceed with old key...
     */
    padlock_reload_key();

    return 1;
}
855
856/*-
857 * Simplified version of padlock_aes_cipher() used when
858 * 1) both input and output buffers are at aligned addresses.
859 * or when
860 * 2) running on a newer CPU that doesn't require aligned buffers.
861 */
static int
padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
                              const unsigned char *in_arg, size_t nbytes)
{
    struct padlock_cipher_data *cdata;
    void *iv;

    cdata = ALIGNED_CIPHER_DATA(ctx);
    padlock_verify_context(cdata); /* make sure the CPU uses this key */

    switch (EVP_CIPHER_CTX_mode(ctx)) {
    case EVP_CIPH_ECB_MODE:
        /* ECB has no IV to carry between calls */
        padlock_xcrypt_ecb(nbytes / AES_BLOCK_SIZE, cdata, out_arg, in_arg);
        break;

    case EVP_CIPH_CBC_MODE:
        /* load the IV, run, save the final IV returned in %eax */
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        iv = padlock_xcrypt_cbc(nbytes / AES_BLOCK_SIZE, cdata, out_arg,
                                in_arg);
        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_CFB_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        iv = padlock_xcrypt_cfb(nbytes / AES_BLOCK_SIZE, cdata, out_arg,
                                in_arg);
        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_OFB_MODE:
        /* OFB leaves the final IV in cdata->iv rather than returning it */
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        padlock_xcrypt_ofb(nbytes / AES_BLOCK_SIZE, cdata, out_arg, in_arg);
        memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
        break;

    default:
        return 0;
    }

    /* clear the IV copy left behind in the cipher context */
    memset(cdata->iv, 0, AES_BLOCK_SIZE);

    return 1;
}
905
906#    ifndef  PADLOCK_CHUNK
907#     define PADLOCK_CHUNK  512 /* Must be a power of 2 larger than 16 */
908#    endif
909#    if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
910#     error "insane PADLOCK_CHUNK..."
911#    endif
912
913/*
914 * Re-align the arguments to 16-Bytes boundaries and run the encryption
915 * function itself. This function is not AES-specific.
916 */
static int
padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
                   const unsigned char *in_arg, size_t nbytes)
{
    struct padlock_cipher_data *cdata;
    const void *inp;            /* aligned input handed to the xcrypt calls */
    unsigned char *out;         /* aligned output: bounce buffer or out_arg */
    void *iv;
    int inp_misaligned, out_misaligned, realign_in_loop;
    size_t chunk, allocated = 0;

    /*
     * ctx->num is maintained in byte-oriented modes, such as CFB and OFB...
     */
    /*
     * First drain any partially consumed IV/keystream block left over from
     * the previous call; ctx->num counts the bytes already used.
     */
    if ((chunk = ctx->num)) {   /* borrow chunk variable */
        unsigned char *ivp = ctx->iv;

        switch (EVP_CIPHER_CTX_mode(ctx)) {
        case EVP_CIPH_CFB_MODE:
            if (chunk >= AES_BLOCK_SIZE)
                return 0;       /* bogus value */

            if (ctx->encrypt)
                /* CFB encrypt: each ciphertext byte is fed back into the IV */
                while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
                    ivp[chunk] = *(out_arg++) = *(in_arg++) ^ ivp[chunk];
                    chunk++, nbytes--;
            } else
                /* CFB decrypt: the incoming ciphertext byte becomes the
                 * next IV byte, so save it before overwriting */
                while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
                    unsigned char c = *(in_arg++);
                    *(out_arg++) = c ^ ivp[chunk];
                    ivp[chunk++] = c, nbytes--;
                }

            ctx->num = chunk % AES_BLOCK_SIZE;
            break;
        case EVP_CIPH_OFB_MODE:
            if (chunk >= AES_BLOCK_SIZE)
                return 0;       /* bogus value */

            /* OFB: keystream is independent of the data, just XOR it in */
            while (chunk < AES_BLOCK_SIZE && nbytes != 0) {
                *(out_arg++) = *(in_arg++) ^ ivp[chunk];
                chunk++, nbytes--;
            }

            ctx->num = chunk % AES_BLOCK_SIZE;
            break;
        }
    }

    if (nbytes == 0)
        return 1;
#    if 0
    if (nbytes % AES_BLOCK_SIZE)
        return 0;               /* are we expected to do tail processing? */
#    else
    /*
     * nbytes is always multiple of AES_BLOCK_SIZE in ECB and CBC modes and
     * arbitrary value in byte-oriented modes, such as CFB and OFB...
     */
#    endif

    /*
     * VIA promises CPUs that won't require alignment in the future. For now
     * padlock_aes_align_required is initialized to 1 and the condition is
     * never met...
     */
    /*
     * C7 core is capable to manage unaligned input in non-ECB[!] mode, but
     * performance penalties appear to be approximately same as for software
     * alignment below or ~3x. They promise to improve it in the future, but
     * for now we can just as well pretend that it can only handle aligned
     * input...
     */
    if (!padlock_aes_align_required && (nbytes % AES_BLOCK_SIZE) == 0)
        return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

    /* the hardware wants 16-byte aligned buffers; test the low four bits */
    inp_misaligned = (((size_t)in_arg) & 0x0F);
    out_misaligned = (((size_t)out_arg) & 0x0F);

    /*
     * Note that even if output is aligned and input not, I still prefer to
     * loop instead of copy the whole input and then encrypt in one stroke.
     * This is done in order to improve L1 cache utilization...
     */
    realign_in_loop = out_misaligned | inp_misaligned;

    if (!realign_in_loop && (nbytes % AES_BLOCK_SIZE) == 0)
        return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);

    /* this takes one "if" out of the loops */
    /*
     * Process the residue-sized chunk first, so that every subsequent
     * iteration can use a full PADLOCK_CHUNK unconditionally.
     */
    chunk = nbytes;
    chunk %= PADLOCK_CHUNK;
    if (chunk == 0)
        chunk = PADLOCK_CHUNK;

    if (out_misaligned) {
        /* optimize for small input: bounce buffer no larger than needed */
        allocated = (chunk < nbytes ? PADLOCK_CHUNK : nbytes);
        out = alloca(0x10 + allocated);
        out = NEAREST_ALIGNED(out);
    } else
        out = out_arg;

    cdata = ALIGNED_CIPHER_DATA(ctx);
    padlock_verify_context(cdata);

    switch (EVP_CIPHER_CTX_mode(ctx)) {
    case EVP_CIPH_ECB_MODE:
        do {
            if (inp_misaligned)
                inp = padlock_memcpy(out, in_arg, chunk);
            else
                inp = in_arg;
            in_arg += chunk;

            padlock_xcrypt_ecb(chunk / AES_BLOCK_SIZE, cdata, out, inp);

            /* bounce-buffered results must be copied back to the caller */
            if (out_misaligned)
                out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
            else
                out = out_arg += chunk;

            nbytes -= chunk;
            chunk = PADLOCK_CHUNK;
        } while (nbytes);
        break;

    case EVP_CIPH_CBC_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        /*
         * Jump into the middle of the loop: on the first pass cdata->iv is
         * already loaded and chunk holds the residue size, so the loop-top
         * IV copy (and the still-uninitialized `iv`) must be skipped.
         */
        goto cbc_shortcut;
        do {
            if (iv != cdata->iv)
                memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
            chunk = PADLOCK_CHUNK;
 cbc_shortcut:                 /* optimize for small input */
            if (inp_misaligned)
                inp = padlock_memcpy(out, in_arg, chunk);
            else
                inp = in_arg;
            in_arg += chunk;

            /* result is used as the chaining IV for the next chunk */
            iv = padlock_xcrypt_cbc(chunk / AES_BLOCK_SIZE, cdata, out, inp);

            if (out_misaligned)
                out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
            else
                out = out_arg += chunk;

        } while (nbytes -= chunk);
        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_CFB_MODE:
        memcpy(iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        /* only whole blocks go through the hardware loop; any sub-block
         * tail is handled byte-wise after cfb_skiploop */
        chunk &= ~(AES_BLOCK_SIZE - 1);
        if (chunk)
            goto cfb_shortcut;
        else
            goto cfb_skiploop;
        do {
            if (iv != cdata->iv)
                memcpy(cdata->iv, iv, AES_BLOCK_SIZE);
            chunk = PADLOCK_CHUNK;
 cfb_shortcut:                 /* optimize for small input */
            if (inp_misaligned)
                inp = padlock_memcpy(out, in_arg, chunk);
            else
                inp = in_arg;
            in_arg += chunk;

            iv = padlock_xcrypt_cfb(chunk / AES_BLOCK_SIZE, cdata, out, inp);

            if (out_misaligned)
                out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
            else
                out = out_arg += chunk;

            nbytes -= chunk;
        } while (nbytes >= AES_BLOCK_SIZE);

 cfb_skiploop:
        if (nbytes) {
            /* sub-block tail: ECB-encrypt the IV once to get a keystream
             * block, then XOR the remaining bytes one at a time */
            unsigned char *ivp = cdata->iv;

            if (iv != ivp) {
                memcpy(ivp, iv, AES_BLOCK_SIZE);
                iv = ivp;
            }
            ctx->num = nbytes;  /* record partial-block progress for the
                                 * byte-oriented pre-loop on the next call */
            if (cdata->cword.b.encdec) {
                /* decrypting: temporarily flip the control word to encrypt,
                 * since CFB always encrypts the IV to make the keystream */
                cdata->cword.b.encdec = 0;
                padlock_reload_key();
                padlock_xcrypt_ecb(1, cdata, ivp, ivp);
                cdata->cword.b.encdec = 1;
                padlock_reload_key();
                while (nbytes) {
                    unsigned char c = *(in_arg++);
                    *(out_arg++) = c ^ *ivp;
                    *(ivp++) = c, nbytes--;
                }
            } else {
                padlock_reload_key();
                padlock_xcrypt_ecb(1, cdata, ivp, ivp);
                padlock_reload_key();
                while (nbytes) {
                    *ivp = *(out_arg++) = *(in_arg++) ^ *ivp;
                    ivp++, nbytes--;
                }
            }
        }

        memcpy(ctx->iv, iv, AES_BLOCK_SIZE);
        break;

    case EVP_CIPH_OFB_MODE:
        memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
        chunk &= ~(AES_BLOCK_SIZE - 1);
        if (chunk)
            do {
                if (inp_misaligned)
                    inp = padlock_memcpy(out, in_arg, chunk);
                else
                    inp = in_arg;
                in_arg += chunk;

                padlock_xcrypt_ofb(chunk / AES_BLOCK_SIZE, cdata, out, inp);

                if (out_misaligned)
                    out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
                else
                    out = out_arg += chunk;

                nbytes -= chunk;
                chunk = PADLOCK_CHUNK;
            } while (nbytes >= AES_BLOCK_SIZE);

        if (nbytes) {
            /* sub-block tail: same one-block keystream trick as CFB above */
            unsigned char *ivp = cdata->iv;

            ctx->num = nbytes;
            padlock_reload_key(); /* empirically found */
            padlock_xcrypt_ecb(1, cdata, ivp, ivp);
            padlock_reload_key(); /* empirically found */
            while (nbytes) {
                *(out_arg++) = *(in_arg++) ^ *ivp;
                ivp++, nbytes--;
            }
        }

        memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE);
        break;

    default:
        return 0;
    }

    /* Clean the realign buffer if it was used */
    if (out_misaligned) {
        /* volatile stores keep the scrub from being optimized away */
        volatile unsigned long *p = (void *)out;
        size_t n = allocated / sizeof(*p);
        while (n--)
            *p++ = 0;
    }

    /* don't leave key-dependent IV material behind either */
    memset(cdata->iv, 0, AES_BLOCK_SIZE);

    return 1;
}
1185
1186#   endif                       /* OPENSSL_NO_AES */
1187
1188/* ===== Random Number Generator ===== */
1189/*
1190 * This code is not engaged. The reason is that it does not comply
1191 * with recommendations for VIA RNG usage for secure applications
1192 * (posted at http://www.via.com.tw/en/viac3/c3.jsp) nor does it
1193 * provide meaningful error control...
1194 */
1195/*
1196 * Wrapper that provides an interface between the API and the raw PadLock
1197 * RNG
1198 */
/*
 * Classify a status word returned by padlock_xstore.  Returns 1 when
 * exactly `expect` bytes of random data were delivered, 0 when the RNG
 * simply had no data yet (caller should retry the same position), and
 * -1 on any fatal condition (generator disabled or a quality-test bit
 * raised).
 */
static int padlock_rand_check(unsigned int status, unsigned int expect)
{
    if (!(status & (1 << 6)))
        return -1;              /* RNG disabled */
    /* these bits cover DC bias, Raw Bits and String Filter */
    if (status & (0x1F << 10))
        return -1;
    if ((status & 0x1F) == 0)
        return 0;               /* no data, retry... */
    return ((status & 0x1F) == expect) ? 1 : -1;
}

static int padlock_rand_bytes(unsigned char *output, int count)
{
    unsigned int status, scratch;

    /* Bulk phase: divisor 0 makes XSTORE deposit 8 bytes per call. */
    while (count >= 8) {
        int rc;

        status = padlock_xstore(output, 0);
        rc = padlock_rand_check(status, 8);
        if (rc < 0)
            return 0;
        if (rc == 0)
            continue;           /* nothing delivered, try again */
        output += 8;
        count -= 8;
    }

    /* Tail phase: divisor 3 yields one byte at a time into a scratch word. */
    while (count > 0) {
        int rc;

        status = padlock_xstore(&scratch, 3);
        rc = padlock_rand_check(status, 1);
        if (rc < 0)
            return 0;
        if (rc == 0)
            continue;
        *output++ = (unsigned char)scratch;
        count--;
    }

    /* scrub the last random word off the stack; the volatile store keeps
     * the compiler from eliding it as a dead write */
    *(volatile unsigned int *)&scratch = 0;

    return 1;
}
1235
/*
 * Dummy status callback for the RAND_METHOD table below: there is no
 * generator state to query, so it unconditionally reports success (1).
 */
static int padlock_rand_status(void)
{
    return 1;
}
1241
/*
 * Prepare structure for registration: the hardware RNG backs both the
 * bytes and pseudorand slots; seed/add/cleanup are left NULL (not
 * implemented by this engine).
 */
static RAND_METHOD padlock_rand = {
    NULL,                       /* seed */
    padlock_rand_bytes,         /* bytes */
    NULL,                       /* cleanup */
    NULL,                       /* add */
    padlock_rand_bytes,         /* pseudorand */
    padlock_rand_status,        /* rand status */
};
1251
1252#  else                         /* !COMPILE_HW_PADLOCK */
1253#   ifndef OPENSSL_NO_DYNAMIC_ENGINE
OPENSSL_EXPORT
    int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns);
/*
 * Stub bind function for dynamic-engine builds on targets where the
 * PadLock code was not compiled in (!COMPILE_HW_PADLOCK): it always
 * returns 0, i.e. never binds.
 */
OPENSSL_EXPORT
    int bind_engine(ENGINE *e, const char *id, const dynamic_fns *fns)
{
    return 0;
}
1261
1262IMPLEMENT_DYNAMIC_CHECK_FN()
1263#   endif
1264#  endif                        /* COMPILE_HW_PADLOCK */
1265# endif                         /* !OPENSSL_NO_HW_PADLOCK */
1266#endif                          /* !OPENSSL_NO_HW */
1267