/*
 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <openssl/crypto.h>
#include "modes_local.h"
#include <string.h>

#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
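
/*
 * REDUCE1BIT(V) computes V := V * x in GF(2^128) as defined by GCM:
 * with the field's reflected bit order, multiplication by x is a
 * one-bit right shift of the 128-bit value, and if the bit shifted
 * out was set, the reduction polynomial x^128 + x^7 + x^2 + x + 1 is
 * folded back in as the constant 0xe1 at the most significant byte.
 * The else branch is the same operation with the conditional mask
 * computed in 32-bit arithmetic for 32-bit targets.
 */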

/*-
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8. 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification.
 * In other words, OpenSSL does not cover the whole spectrum of
 * possible table-driven implementations. Why? In the non-"Shoup's"
 * case the memory access pattern is segmented in such a manner that
 * cache-timing information can reveal a fair portion of the
 * intermediate hash value. Given that the ciphertext is always
 * available to an attacker, the attacker could attempt to deduce the
 * secret parameter H and, if successful, tamper with messages (which
 * is trivial in CTR mode). In the "Shoup's" case it's not as easy,
 * but there is no reason to believe that it's resistant to
 * cache-timing attacks either. The thing about the "8-bit"
 * implementation is that it consumes 16 (sixteen) times more memory,
 * 4KB per individual key + 1KB shared. On the pro side, it should be
 * about twice as fast as the "4-bit" version. For gcc-generated
 * x86[_64] code the "8-bit" version was observed to run ~75% faster,
 * closer to 100% for commercial compilers... Yet the "4-bit"
 * procedure is preferred, because it's believed to provide a better
 * security-performance balance and adequate all-round performance.
 * "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - a larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc() would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
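/*
 * For reference, the table sizes quoted above follow directly from the
 * entry counts: each u128 entry is 16 bytes, so the "4-bit" Htable is
 * 16 * 16 = 256 bytes per key, while the "8-bit" Htable is
 * 256 * 16 = 4KB per key, plus the shared rem_8bit table of 256
 * size_t entries (1KB on 32-bit targets, 2KB on 64-bit ones).
 */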
#if     TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif   TABLE_BITS==4

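/*
 * gcm_init_4bit() populates Htable with the 16 multiples of the hash
 * key: Htable[i] = i * H in GF(2^128), with the 4-bit index read in
 * GCM's reflected bit order (so Htable[8] is H itself and each halving
 * of the index is a further multiplication by x via REDUCE1BIT()).
 * The remaining entries follow by linearity as XOR combinations of the
 * power-of-two entries.
 */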
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}

# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};
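
/*
 * rem_4bit[] holds the 16 possible reduction values for the four bits
 * shifted out of the low end of Z during a 4-bit step, pre-positioned
 * at the most significant end of a size_t by PACK() so that a single
 * XOR against the top of Z.hi applies the reduction.
 */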

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
#  ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#  else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
#  endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

#  if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_ghash_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

#   if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
#   else
    /*
     * An extra 256+16 bytes per key plus 512 bytes of shared tables
     * [should] give a ~50% improvement... One could have PACK()-ed
     * rem_8bit even here, but the priority is to minimize the
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows the procedure down by approximately
     * the same amount of time as it speeds up each loop iteration. In
     * other words, single-block performance is approximately the same as
     * for the straightforward "4-bit" implementation, and from there it
     * only gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#   endif

        if (is_endian.little) {
#   ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
#   else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
#   endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
#  endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif

# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" meant to mitigate cache-thrashing
 * effects. In other words, the idea is to hash data while it's still in
 * the L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif

#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }
#if     TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if   defined(GHASH_ASM_X86)  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif  defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}

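/*
 * CRYPTO_gcm128_setiv() computes the pre-counter block J0 as specified
 * in NIST SP 800-38D: a 96-bit IV is used directly with the 32-bit
 * counter part set to 1, while any other IV length is padded and run
 * through GHASH together with its bit length. It also resets the
 * AAD/message length accumulators for a fresh operation.
 */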
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);

        if (is_endian.little)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }

    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (is_endian.little)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}

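/*
 * CRYPTO_gcm128_aad() feeds additional authenticated data into GHASH.
 * It must be called before any CRYPTO_gcm128_encrypt/decrypt call for
 * the same operation: once message data has been processed it returns
 * -2. It returns -1 if the accumulated AAD length would exceed the
 * 2^61-byte (2^64-bit) limit, and 0 on success.
 */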
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}

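/*
 * CRYPTO_gcm128_encrypt() CTR-encrypts |len| bytes from |in| to |out|
 * and folds the resulting ciphertext into GHASH. The total message
 * length is capped at 2^36 - 32 bytes, the maximum SP 800-38D permits
 * with a 32-bit counter; -1 is returned if the cap would be exceeded.
 */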
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

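/*
 * CRYPTO_gcm128_decrypt() mirrors CRYPTO_gcm128_encrypt(), except that
 * the *input* (i.e. the ciphertext) is what gets folded into GHASH,
 * which is why the bulk paths below hash the bytes before, or
 * independently of, the CTR decryption of those same bytes.
 */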
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

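/*
 * The *_ctr32 variants below accept a ctr128_f |stream| callback that
 * processes multiple counter-mode blocks per call (for example an
 * assembler AES-CTR routine), avoiding the per-block cipher-call
 * overhead of the generic paths above. Only the low 32 bits of the
 * counter are maintained here, matching the GCM counter layout.
 */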
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c ^ ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

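/*
 * CRYPTO_gcm128_finish() completes GHASH with the encoded bit lengths
 * of the AAD and the message, XORs in EK0 to produce the tag, and, if
 * |tag| is non-NULL, compares it against the expected value in
 * constant time via CRYPTO_memcmp(). It returns 0 on a successful
 * match and nonzero otherwise (including when |len| exceeds the
 * 16-byte tag size).
 */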
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif

    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif

    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}

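/*-
 * Typical usage of this API, as an illustrative sketch only (AES is
 * just one possible block128_f provider; the key/IV/buffer variables
 * are assumed to be set up by the caller and error checks are
 * omitted):
 *
 *      AES_KEY ks;
 *      GCM128_CONTEXT *gcm;
 *      unsigned char tag[16];
 *
 *      AES_set_encrypt_key(key, 128, &ks);
 *      gcm = CRYPTO_gcm128_new(&ks, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(gcm, iv, iv_len);
 *      CRYPTO_gcm128_aad(gcm, aad, aad_len);   (must precede message data)
 *      CRYPTO_gcm128_encrypt(gcm, plaintext, ciphertext, len);
 *      CRYPTO_gcm128_tag(gcm, tag, sizeof(tag));
 *      CRYPTO_gcm128_release(gcm);
 *
 * A receiver would call CRYPTO_gcm128_decrypt() and then verify with
 * CRYPTO_gcm128_finish(gcm, tag, sizeof(tag)) == 0.
 */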