/*
 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <string.h>
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "internal/endian.h"
#include "crypto/modes.h"

#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)

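/*-
 * REDUCE1BIT advances V to V*x in GCM's bit-reflected representation of
 * GF(2^128): the 128-bit value (V.hi,V.lo) is shifted right by one bit
 * and, if the bit shifted out was set, the constant 0xE1 || 0^120 -- the
 * reflected image of the reduction polynomial x^128+x^7+x^2+x+1 -- is
 * folded back in. A bit-at-a-time reference multiplication built on it
 * (a minimal sketch for illustration only, not used by the code below)
 * would look like:
 *
 *     static void gf128_mul_ref(u128 *Z, u128 X, u128 V)
 *     {
 *         int i;
 *
 *         Z->hi = Z->lo = 0;
 *         for (i = 0; i < 128; ++i) {    // scan X MSB-first
 *             u64 M = 0 - (X.hi >> 63);  // all-ones if bit set
 *             Z->hi ^= V.hi & M;
 *             Z->lo ^= V.lo & M;
 *             X.hi = (X.hi << 1) | (X.lo >> 63);
 *             X.lo <<= 1;
 *             REDUCE1BIT(V);
 *         }
 *     }
 *
 * which is essentially what gcm_gmult_1bit (the TABLE_BITS==1 path
 * further below) does one machine word at a time.
 */
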
/*-
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8; 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 selects the lookup-table-driven implementations referred
 * to as "Shoup's" in the GCM specification. In other words OpenSSL does
 * not cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it's trivial to see that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it's
 * possible to attempt to deduce the secret parameter H and, if
 * successful, tamper with messages [which is trivial in CTR mode]. In
 * the "Shoup's" case it's not as easy, but there is no reason to believe
 * that it's resistant to cache-timing attacks either. As for the "8-bit"
 * implementation, it consumes 16 (sixteen) times more memory, 4KB per
 * individual key + 1KB shared. On the pro side, it should be twice as
 * fast as the "4-bit" version, and for gcc-generated x86[_64] code it
 * was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it's
 * believed to provide a better security-performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working-set trimming, meaning that a consequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
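/*
 * For scale: Htable entries are 16-byte u128 values, so TABLE_BITS==8
 * means a 256-entry table (4KB) per key, whereas TABLE_BITS==4 means a
 * 16-entry table (256 bytes) per key plus the small shared rem_4bit
 * table below.
 */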
#if     TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}

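/*
 * Only the power-of-two Htable entries above require REDUCE1BIT; the
 * rest follow from linearity of the multiplication over GF(2),
 * Htable[i^j] = Htable[i] ^ Htable[j], e.g.
 * Htable[3] = Htable[2] ^ Htable[1]. gcm_init_4bit below relies on the
 * same identity.
 */
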
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    DECLARE_IS_ENDIAN;
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif   TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        DECLARE_IS_ENDIAN;

        if (IS_LITTLE_ENDIAN)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}

# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};

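/*
 * rem_4bit[b] is the reduction constant contributed by the four bits b
 * that fall off the low end of Z when it is shifted right by 4 places:
 * the XOR, over the set bits of b, of the reflected polynomial 0xE1
 * shifted into position, e.g. rem_4bit[8] == PACK(0xE100),
 * rem_4bit[1] == PACK(0xE100 >> 3) and
 * rem_4bit[3] == PACK(0x3840 ^ 0x1C20). PACK() places the 16-bit
 * constant in the top of a size_t so that on 64-bit platforms it can be
 * XORed straight into Z.hi.
 */
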
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    DECLARE_IS_ENDIAN;

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (IS_LITTLE_ENDIAN) {
#  ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#  else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
#  endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

#  if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    DECLARE_IS_ENDIAN;

#   if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
#   else
    /*
     * An extra 256+16 bytes per key plus 512 bytes of shared tables
     * [should] give a ~50% improvement... One could have PACK()-ed
     * rem_8bit even here, but the priority is to minimize the
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows the procedure down by approximately
     * as much as it speeds up each loop iteration. In other words,
     * single-block performance is approximately the same as with the
     * straightforward "4-bit" implementation, and from there on it only
     * gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#   endif

        if (IS_LITTLE_ENDIAN) {
#   ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
#   else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
#   endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
#  endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif

# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" meant to mitigate cache-thrashing
 * effects. In other words, the idea is to hash the data while it's still
 * in the L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
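/*
 * 3KB [presumably] fits comfortably in a 32KB L1 data cache alongside
 * the key schedule and tables, while still amortizing per-call overhead
 * over 192 blocks.
 */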
# endif

#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    DECLARE_IS_ENDIAN;

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (IS_LITTLE_ENDIAN) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }

    if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif (defined(__arm__) || defined(__arm) || defined(__aarch64__)) && defined(GHASH_ASM)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "crypto/sparc_arch.h"
#  if defined(__arch64__)
#   define GHASH_ASM_SPARC
#   define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
#  endif /* __arch64__ */
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "crypto/ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
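/*
 * When an assembler implementation is selected at run time (see
 * CRYPTO_gcm128_init below), GCM_MUL and GHASH dispatch through the
 * local function pointers gcm_gmult_p/gcm_ghash_p that each caller
 * loads from ctx->gmult/ctx->ghash.
 */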

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    DECLARE_IS_ENDIAN;

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (IS_LITTLE_ENDIAN) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }
#if     TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if   defined(GHASH_ASM_X86)  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif  defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}

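/*-
 * Per NIST SP 800-38D the pre-counter block J0 is formed in one of two
 * ways: a 96-bit IV is used directly with a 32-bit block counter of 1
 * appended, while an IV of any other length is hashed first,
 * J0 = GHASH(IV || 0-padding || [64-bit bit-length of IV]).
 * CRYPTO_gcm128_setiv below implements both branches, borrowing ctx->Xi
 * as scratch space for the second one.
 */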
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    DECLARE_IS_ENDIAN;
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (IS_LITTLE_ENDIAN) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);

        if (IS_LITTLE_ENDIAN)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }

    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (IS_LITTLE_ENDIAN)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    DECLARE_IS_ENDIAN;
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (IS_LITTLE_ENDIAN)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (IS_LITTLE_ENDIAN)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    DECLARE_IS_ENDIAN;
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (IS_LITTLE_ENDIAN)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (IS_LITTLE_ENDIAN)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    DECLARE_IS_ENDIAN;
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    DECLARE_IS_ENDIAN;
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }

    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c ^ ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
{
    DECLARE_IS_ENDIAN;
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif

    if (IS_LITTLE_ENDIAN) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif

    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}

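/*-
 * Typical use of this module, as a minimal sketch (aes_block stands in
 * for any block128_f, e.g. one wrapping AES_encrypt, and is not defined
 * here):
 *
 *     GCM128_CONTEXT gcm;
 *
 *     CRYPTO_gcm128_init(&gcm, &aes_key, aes_block);
 *     CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *     CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *     CRYPTO_gcm128_encrypt(&gcm, plaintext, ciphertext, len);
 *     CRYPTO_gcm128_tag(&gcm, tag, 16);
 *
 * A decrypting peer would call CRYPTO_gcm128_decrypt instead and verify
 * the tag with CRYPTO_gcm128_finish, which compares in constant time
 * and returns 0 on match.
 */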