1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/zfs_context.h>
26#include <sys/cmn_err.h>
27#include <modes/modes.h>
28#include <sys/crypto/common.h>
29#include <sys/crypto/icp.h>
30#include <sys/crypto/impl.h>
31#include <sys/byteorder.h>
32#include <sys/simd.h>
33#include <modes/gcm_impl.h>
34#ifdef CAN_USE_GCM_ASM
35#include <aes/aes_impl.h>
36#endif
37
38#define	GHASH(c, d, t, o) \
39	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
40	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
41	(uint64_t *)(void *)(t));
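/*
 * GHASH absorbs one 16-byte block into the running hash: per NIST SP 800-38D,
 * ghash = (ghash ^ block) * H, where '*' is carry-less multiplication in
 * GF(2^128) reduced modulo the GCM polynomial. Callers pass gcm_ghash as the
 * target (t) so the product overwrites the running hash in place.
 */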
42
43/* Select GCM implementation */
44#define	IMPL_FASTEST	(UINT32_MAX)
45#define	IMPL_CYCLE	(UINT32_MAX-1)
46#ifdef CAN_USE_GCM_ASM
47#define	IMPL_AVX	(UINT32_MAX-2)
48#endif
49#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
50static uint32_t icp_gcm_impl = IMPL_FASTEST;
51static uint32_t user_sel_impl = IMPL_FASTEST;
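/*
 * icp_gcm_impl holds either one of the sentinel values above or, for a
 * specific user-selected implementation, an index into the gcm_supp_impl[]
 * table (see gcm_impl_set() and gcm_impl_get_ops()).
 */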
52
53static inline int gcm_init_ctx_impl(boolean_t, gcm_ctx_t *, char *, size_t,
54    int (*)(const void *, const uint8_t *, uint8_t *),
55    void (*)(uint8_t *, uint8_t *),
56    void (*)(uint8_t *, uint8_t *));
57
58#ifdef CAN_USE_GCM_ASM
59/* Does the architecture we run on support the MOVBE instruction? */
60boolean_t gcm_avx_can_use_movbe = B_FALSE;
61/*
62 * Whether to use the optimized openssl gcm and ghash implementations.
63 * Set to true if module parameter icp_gcm_impl == "avx".
64 */
65static boolean_t gcm_use_avx = B_FALSE;
66#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)
67
68extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
69
70static inline boolean_t gcm_avx_will_work(void);
71static inline void gcm_set_avx(boolean_t);
72static inline boolean_t gcm_toggle_avx(void);
73static inline size_t gcm_simd_get_htab_size(boolean_t);
74
75static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
76    crypto_data_t *, size_t);
77
78static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
79static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
80static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
81    size_t, size_t);
82#endif /* ifdef CAN_USE_GCM_ASM */
83
84/*
 * Encrypt multiple blocks of data in GCM mode.  Decryption for GCM mode
 * is done in a separate function.
87 */
88int
89gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
90    crypto_data_t *out, size_t block_size,
91    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
92    void (*copy_block)(uint8_t *, uint8_t *),
93    void (*xor_block)(uint8_t *, uint8_t *))
94{
95#ifdef CAN_USE_GCM_ASM
96	if (ctx->gcm_use_avx == B_TRUE)
97		return (gcm_mode_encrypt_contiguous_blocks_avx(
98		    ctx, data, length, out, block_size));
99#endif
100
101	const gcm_impl_ops_t *gops;
102	size_t remainder = length;
103	size_t need = 0;
104	uint8_t *datap = (uint8_t *)data;
105	uint8_t *blockp;
106	uint8_t *lastp;
107	void *iov_or_mp;
108	offset_t offset;
109	uint8_t *out_data_1;
110	uint8_t *out_data_2;
111	size_t out_data_1_len;
112	uint64_t counter;
113	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
114
115	if (length + ctx->gcm_remainder_len < block_size) {
116		/* accumulate bytes here and return */
117		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
118		    datap,
119		    length);
120		ctx->gcm_remainder_len += length;
121		if (ctx->gcm_copy_to == NULL) {
122			ctx->gcm_copy_to = datap;
123		}
124		return (CRYPTO_SUCCESS);
125	}
126
127	crypto_init_ptrs(out, &iov_or_mp, &offset);
128
129	gops = gcm_impl_get_ops();
130	do {
131		/* Unprocessed data from last call. */
132		if (ctx->gcm_remainder_len > 0) {
133			need = block_size - ctx->gcm_remainder_len;
134
135			if (need > remainder)
136				return (CRYPTO_DATA_LEN_RANGE);
137
138			memcpy(&((uint8_t *)ctx->gcm_remainder)
139			    [ctx->gcm_remainder_len], datap, need);
140
141			blockp = (uint8_t *)ctx->gcm_remainder;
142		} else {
143			blockp = datap;
144		}
145
146		/*
147		 * Increment counter. Counter bits are confined
148		 * to the bottom 32 bits of the counter block.
149		 */
150		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
151		counter = htonll(counter + 1);
152		counter &= counter_mask;
153		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
154
155		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
156		    (uint8_t *)ctx->gcm_tmp);
157		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
158
159		lastp = (uint8_t *)ctx->gcm_tmp;
160
161		ctx->gcm_processed_data_len += block_size;
162
163		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
164		    &out_data_1_len, &out_data_2, block_size);
165
166		/* copy block to where it belongs */
167		if (out_data_1_len == block_size) {
168			copy_block(lastp, out_data_1);
169		} else {
170			memcpy(out_data_1, lastp, out_data_1_len);
171			if (out_data_2 != NULL) {
172				memcpy(out_data_2,
173				    lastp + out_data_1_len,
174				    block_size - out_data_1_len);
175			}
176		}
177		/* update offset */
178		out->cd_offset += block_size;
179
180		/* add ciphertext to the hash */
181		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
182
183		/* Update pointer to next block of data to be processed. */
184		if (ctx->gcm_remainder_len != 0) {
185			datap += need;
186			ctx->gcm_remainder_len = 0;
187		} else {
188			datap += block_size;
189		}
190
191		remainder = (size_t)&data[length] - (size_t)datap;
192
193		/* Incomplete last block. */
194		if (remainder > 0 && remainder < block_size) {
195			memcpy(ctx->gcm_remainder, datap, remainder);
196			ctx->gcm_remainder_len = remainder;
197			ctx->gcm_copy_to = datap;
198			goto out;
199		}
200		ctx->gcm_copy_to = NULL;
201
202	} while (remainder > 0);
203out:
204	return (CRYPTO_SUCCESS);
205}
206
207int
208gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
209    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
210    void (*copy_block)(uint8_t *, uint8_t *),
211    void (*xor_block)(uint8_t *, uint8_t *))
212{
213	(void) copy_block;
214#ifdef CAN_USE_GCM_ASM
215	if (ctx->gcm_use_avx == B_TRUE)
216		return (gcm_encrypt_final_avx(ctx, out, block_size));
217#endif
218
219	const gcm_impl_ops_t *gops;
220	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
221	uint8_t *ghash, *macp = NULL;
222	int i, rv;
223
224	if (out->cd_length <
225	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
226		return (CRYPTO_DATA_LEN_RANGE);
227	}
228
229	gops = gcm_impl_get_ops();
230	ghash = (uint8_t *)ctx->gcm_ghash;
231
232	if (ctx->gcm_remainder_len > 0) {
233		uint64_t counter;
234		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
235
236		/*
237		 * Here is where we deal with data that is not a
238		 * multiple of the block size.
239		 */
240
241		/*
242		 * Increment counter.
243		 */
244		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
245		counter = htonll(counter + 1);
246		counter &= counter_mask;
247		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
248
249		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
250		    (uint8_t *)ctx->gcm_tmp);
251
252		macp = (uint8_t *)ctx->gcm_remainder;
253		memset(macp + ctx->gcm_remainder_len, 0,
254		    block_size - ctx->gcm_remainder_len);
255
256		/* XOR with counter block */
257		for (i = 0; i < ctx->gcm_remainder_len; i++) {
258			macp[i] ^= tmpp[i];
259		}
260
261		/* add ciphertext to the hash */
262		GHASH(ctx, macp, ghash, gops);
263
264		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
265	}
266
267	ctx->gcm_len_a_len_c[1] =
268	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
269	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
270	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
271	    (uint8_t *)ctx->gcm_J0);
272	xor_block((uint8_t *)ctx->gcm_J0, ghash);
273
274	if (ctx->gcm_remainder_len > 0) {
275		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
276		if (rv != CRYPTO_SUCCESS)
277			return (rv);
278	}
279	out->cd_offset += ctx->gcm_remainder_len;
280	ctx->gcm_remainder_len = 0;
281	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
282	if (rv != CRYPTO_SUCCESS)
283		return (rv);
284	out->cd_offset += ctx->gcm_tag_len;
285
286	return (CRYPTO_SUCCESS);
287}
288
/*
 * This only handles decrypting the last block of the input, which
 * might not be a full block.
 */
293static void
294gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
295    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
296    void (*xor_block)(uint8_t *, uint8_t *))
297{
298	uint8_t *datap, *outp, *counterp;
299	uint64_t counter;
300	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
301	int i;
302
303	/*
304	 * Increment counter.
305	 * Counter bits are confined to the bottom 32 bits
306	 */
307	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
308	counter = htonll(counter + 1);
309	counter &= counter_mask;
310	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
311
312	datap = (uint8_t *)ctx->gcm_remainder;
313	outp = &((ctx->gcm_pt_buf)[index]);
314	counterp = (uint8_t *)ctx->gcm_tmp;
315
	/* zero pad the remaining ciphertext for the hash */
317	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
318	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
319
320	/* add ciphertext to the hash */
321	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
322
323	/* decrypt remaining ciphertext */
324	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
325
326	/* XOR with counter block */
327	for (i = 0; i < ctx->gcm_remainder_len; i++) {
328		outp[i] = datap[i] ^ counterp[i];
329	}
330}
331
332int
333gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
334    crypto_data_t *out, size_t block_size,
335    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
336    void (*copy_block)(uint8_t *, uint8_t *),
337    void (*xor_block)(uint8_t *, uint8_t *))
338{
339	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
340	    (void) xor_block;
341	size_t new_len;
342	uint8_t *new;
343
	/*
	 * Copy contiguous ciphertext input blocks to the plaintext buffer.
	 * The ciphertext is decrypted when gcm_decrypt_final() is called.
	 */
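	/*
	 * Decryption has to be deferred since the authentication tag occupies
	 * the final gcm_tag_len bytes of the input and its position is only
	 * known once all input has been seen, i.e. in gcm_decrypt_final().
	 */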
348	if (length > 0) {
349		new_len = ctx->gcm_pt_buf_len + length;
350		new = vmem_alloc(new_len, KM_SLEEP);
351		if (new == NULL) {
352			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
353			ctx->gcm_pt_buf = NULL;
354			return (CRYPTO_HOST_MEMORY);
355		}
356
357		if (ctx->gcm_pt_buf != NULL) {
358			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
359			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
360		} else {
361			ASSERT0(ctx->gcm_pt_buf_len);
362		}
363
364		ctx->gcm_pt_buf = new;
365		ctx->gcm_pt_buf_len = new_len;
366		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
367		    length);
368		ctx->gcm_processed_data_len += length;
369	}
370
371	ctx->gcm_remainder_len = 0;
372	return (CRYPTO_SUCCESS);
373}
374
375int
376gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
377    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
378    void (*xor_block)(uint8_t *, uint8_t *))
379{
380#ifdef CAN_USE_GCM_ASM
381	if (ctx->gcm_use_avx == B_TRUE)
382		return (gcm_decrypt_final_avx(ctx, out, block_size));
383#endif
384
385	const gcm_impl_ops_t *gops;
386	size_t pt_len;
387	size_t remainder;
388	uint8_t *ghash;
389	uint8_t *blockp;
390	uint8_t *cbp;
391	uint64_t counter;
392	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
393	int processed = 0, rv;
394
395	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
396
397	gops = gcm_impl_get_ops();
398	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
399	ghash = (uint8_t *)ctx->gcm_ghash;
400	blockp = ctx->gcm_pt_buf;
401	remainder = pt_len;
402	while (remainder > 0) {
403		/* Incomplete last block */
404		if (remainder < block_size) {
405			memcpy(ctx->gcm_remainder, blockp, remainder);
406			ctx->gcm_remainder_len = remainder;
			/*
			 * Not expecting any more ciphertext; just
			 * compute the plaintext for the remaining input.
			 */
411			gcm_decrypt_incomplete_block(ctx, block_size,
412			    processed, encrypt_block, xor_block);
413			ctx->gcm_remainder_len = 0;
414			goto out;
415		}
416		/* add ciphertext to the hash */
417		GHASH(ctx, blockp, ghash, gops);
418
419		/*
420		 * Increment counter.
421		 * Counter bits are confined to the bottom 32 bits
422		 */
423		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
424		counter = htonll(counter + 1);
425		counter &= counter_mask;
426		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
427
428		cbp = (uint8_t *)ctx->gcm_tmp;
429		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
430
431		/* XOR with ciphertext */
432		xor_block(cbp, blockp);
433
434		processed += block_size;
435		blockp += block_size;
436		remainder -= block_size;
437	}
438out:
439	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
440	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
441	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
442	    (uint8_t *)ctx->gcm_J0);
443	xor_block((uint8_t *)ctx->gcm_J0, ghash);
444
445	/* compare the input authentication tag with what we calculated */
446	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
447		/* They don't match */
448		return (CRYPTO_INVALID_MAC);
449	} else {
450		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
451		if (rv != CRYPTO_SUCCESS)
452			return (rv);
453		out->cd_offset += pt_len;
454	}
455	return (CRYPTO_SUCCESS);
456}
457
458static int
459gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
460{
461	size_t tag_len;
462
463	/*
464	 * Check the length of the authentication tag (in bits).
465	 */
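	/*
	 * NIST SP 800-38D permits tag lengths of 128, 120, 112, 104 or 96
	 * bits, plus 64 or 32 bits for certain applications (Appendix C);
	 * those are exactly the values accepted below.
	 */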
466	tag_len = gcm_param->ulTagBits;
467	switch (tag_len) {
468	case 32:
469	case 64:
470	case 96:
471	case 104:
472	case 112:
473	case 120:
474	case 128:
475		break;
476	default:
477		return (CRYPTO_MECHANISM_PARAM_INVALID);
478	}
479
480	if (gcm_param->ulIvLen == 0)
481		return (CRYPTO_MECHANISM_PARAM_INVALID);
482
483	return (CRYPTO_SUCCESS);
484}
485
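/*
 * Format the initial counter block J0 as specified in NIST SP 800-38D:
 * for a 96-bit IV, J0 = IV || 0^31 || 1; for any other IV length,
 * J0 = GHASH(IV zero-padded to a block boundary || 0^64 || [len(IV)]_64).
 * The result is left in ctx->gcm_cb and saved in ctx->gcm_J0 for the
 * final tag computation.
 */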
486static void
487gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
488    gcm_ctx_t *ctx, size_t block_size,
489    void (*copy_block)(uint8_t *, uint8_t *),
490    void (*xor_block)(uint8_t *, uint8_t *))
491{
492	const gcm_impl_ops_t *gops;
493	uint8_t *cb;
494	ulong_t remainder = iv_len;
495	ulong_t processed = 0;
496	uint8_t *datap, *ghash;
497	uint64_t len_a_len_c[2];
498
499	gops = gcm_impl_get_ops();
500	ghash = (uint8_t *)ctx->gcm_ghash;
501	cb = (uint8_t *)ctx->gcm_cb;
502	if (iv_len == 12) {
503		memcpy(cb, iv, 12);
504		cb[12] = 0;
505		cb[13] = 0;
506		cb[14] = 0;
507		cb[15] = 1;
508		/* J0 will be used again in the final */
509		copy_block(cb, (uint8_t *)ctx->gcm_J0);
510	} else {
511		/* GHASH the IV */
512		do {
513			if (remainder < block_size) {
514				memset(cb, 0, block_size);
515				memcpy(cb, &(iv[processed]), remainder);
516				datap = (uint8_t *)cb;
517				remainder = 0;
518			} else {
519				datap = (uint8_t *)(&(iv[processed]));
520				processed += block_size;
521				remainder -= block_size;
522			}
523			GHASH(ctx, datap, ghash, gops);
524		} while (remainder > 0);
525
526		len_a_len_c[0] = 0;
527		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
528		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
529
530		/* J0 will be used again in the final */
531		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
532	}
533}
534
535static int
536gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
537    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
538    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
539    void (*copy_block)(uint8_t *, uint8_t *),
540    void (*xor_block)(uint8_t *, uint8_t *))
541{
542	const gcm_impl_ops_t *gops;
543	uint8_t *ghash, *datap, *authp;
544	size_t remainder, processed;
545
546	/* encrypt zero block to get subkey H */
547	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
548	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
549	    (uint8_t *)ctx->gcm_H);
550
551	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
552	    copy_block, xor_block);
553
554	gops = gcm_impl_get_ops();
555	authp = (uint8_t *)ctx->gcm_tmp;
556	ghash = (uint8_t *)ctx->gcm_ghash;
557	memset(authp, 0, block_size);
558	memset(ghash, 0, block_size);
559
560	processed = 0;
561	remainder = auth_data_len;
562	do {
563		if (remainder < block_size) {
			/*
			 * There's not a full block of data; pad the rest
			 * of the buffer with zeros.
			 */
568
569			if (auth_data != NULL) {
570				memset(authp, 0, block_size);
571				memcpy(authp, &(auth_data[processed]),
572				    remainder);
573			} else {
574				ASSERT0(remainder);
575			}
576
577			datap = (uint8_t *)authp;
578			remainder = 0;
579		} else {
580			datap = (uint8_t *)(&(auth_data[processed]));
581			processed += block_size;
582			remainder -= block_size;
583		}
584
585		/* add auth data to the hash */
586		GHASH(ctx, datap, ghash, gops);
587
588	} while (remainder > 0);
589
590	return (CRYPTO_SUCCESS);
591}
592
593/*
594 * The following function is called at encrypt or decrypt init time
595 * for AES GCM mode.
596 */
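/*
 * A minimal sketch of the streaming call sequence for GCM encryption, shown
 * for illustration only; real consumers drive these functions through the
 * ICP provider interfaces. aes_copy_block and aes_xor_block are the block
 * callbacks used elsewhere in this file; the aes_encrypt_block name and the
 * params setup are assumptions here:
 *
 *	gcm_ctx_t *ctx = gcm_alloc_ctx(KM_SLEEP);
 *	ctx->gcm_keysched = keysched;	// prepared AES key schedule
 *	gcm_init_ctx(ctx, (char *)&params, block_size,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	gcm_mode_encrypt_contiguous_blocks(ctx, data, len, out, block_size,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	gcm_encrypt_final(ctx, out, block_size,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *
 * The decrypt path is analogous, ending in gcm_decrypt_final().
 */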
597int
598gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
599    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
600    void (*copy_block)(uint8_t *, uint8_t *),
601    void (*xor_block)(uint8_t *, uint8_t *))
602{
603	return (gcm_init_ctx_impl(B_FALSE, gcm_ctx, param, block_size,
604	    encrypt_block, copy_block, xor_block));
605}
606
607/*
608 * The following function is called at encrypt or decrypt init time
609 * for AES GMAC mode.
610 */
611int
612gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
613    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
614    void (*copy_block)(uint8_t *, uint8_t *),
615    void (*xor_block)(uint8_t *, uint8_t *))
616{
617	return (gcm_init_ctx_impl(B_TRUE, gcm_ctx, param, block_size,
618	    encrypt_block, copy_block, xor_block));
619}
620
621/*
622 * Init the GCM context struct. Handle the cycle and avx implementations here.
623 * Initialization of a GMAC context differs slightly from a GCM context.
624 */
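/*
 * A minimal sketch of the CK_AES_GCM_PARAMS setup this function expects,
 * using only the fields referenced below (the structure may define
 * additional fields; the field values are illustrative assumptions):
 *
 *	CK_AES_GCM_PARAMS params;
 *	params.pIv = iv;		// IV buffer, 12 bytes recommended
 *	params.ulIvLen = 12;		// must be non-zero
 *	params.pAAD = aad;		// optional additional auth data
 *	params.ulAADLen = aad_len;
 *	params.ulTagBits = 128;		// validated by gcm_validate_args()
 */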
625static inline int
626gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param,
627    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
628    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
629    void (*xor_block)(uint8_t *, uint8_t *))
630{
631	CK_AES_GCM_PARAMS *gcm_param;
632	int rv = CRYPTO_SUCCESS;
633	size_t tag_len, iv_len;
634
635	if (param != NULL) {
636		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
637
638		if (gmac_mode == B_FALSE) {
639			/* GCM mode. */
640			if ((rv = gcm_validate_args(gcm_param)) != 0) {
641				return (rv);
642			}
643			gcm_ctx->gcm_flags |= GCM_MODE;
644
645			size_t tbits = gcm_param->ulTagBits;
646			tag_len = CRYPTO_BITS2BYTES(tbits);
647			iv_len = gcm_param->ulIvLen;
648		} else {
649			/* GMAC mode. */
650			gcm_ctx->gcm_flags |= GMAC_MODE;
651			tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
652			iv_len = AES_GMAC_IV_LEN;
653		}
654		gcm_ctx->gcm_tag_len = tag_len;
655		gcm_ctx->gcm_processed_data_len = 0;
656
657		/* these values are in bits */
658		gcm_ctx->gcm_len_a_len_c[0]
659		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
660	} else {
661		return (CRYPTO_MECHANISM_PARAM_INVALID);
662	}
663
664	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
665	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
666	size_t aad_len = gcm_param->ulAADLen;
667
668#ifdef CAN_USE_GCM_ASM
669	boolean_t needs_bswap =
670	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
671
672	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
673		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
674	} else {
675		/*
676		 * Handle the "cycle" implementation by creating avx and
677		 * non-avx contexts alternately.
678		 */
679		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
680
681		/* The avx impl. doesn't handle byte swapped key schedules. */
682		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
683			gcm_ctx->gcm_use_avx = B_FALSE;
684		}
		/*
		 * If this is a GCM context, use the MOVBE and the BSWAP
		 * variants alternately. The GMAC code paths do not use
		 * the MOVBE instruction.
		 */
690		if (gcm_ctx->gcm_use_avx == B_TRUE && gmac_mode == B_FALSE &&
691		    zfs_movbe_available() == B_TRUE) {
692			(void) atomic_toggle_boolean_nv(
693			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
694		}
695	}
	/*
	 * The avx code path does not handle byte swapped key schedules, but
	 * the generic aes implementation may still create them. Make sure
	 * not to use such a schedule here, since doing so would corrupt data.
	 */
701	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
702		gcm_ctx->gcm_use_avx = B_FALSE;
703
704		cmn_err_once(CE_WARN,
705		    "ICP: Can't use the aes generic or cycle implementations "
706		    "in combination with the gcm avx implementation!");
707		cmn_err_once(CE_WARN,
708		    "ICP: Falling back to a compatible implementation, "
709		    "aes-gcm performance will likely be degraded.");
710		cmn_err_once(CE_WARN,
711		    "ICP: Choose at least the x86_64 aes implementation to "
712		    "restore performance.");
713	}
714
715	/* Allocate Htab memory as needed. */
716	if (gcm_ctx->gcm_use_avx == B_TRUE) {
717		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
718
719		if (htab_len == 0) {
720			return (CRYPTO_MECHANISM_PARAM_INVALID);
721		}
722		gcm_ctx->gcm_htab_len = htab_len;
723		gcm_ctx->gcm_Htable =
724		    kmem_alloc(htab_len, KM_SLEEP);
725
726		if (gcm_ctx->gcm_Htable == NULL) {
727			return (CRYPTO_HOST_MEMORY);
728		}
729	}
	/* Avx and non-avx context initialization differs from here on. */
731	if (gcm_ctx->gcm_use_avx == B_FALSE) {
732#endif /* ifdef CAN_USE_GCM_ASM */
733		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
734		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
735			rv = CRYPTO_MECHANISM_PARAM_INVALID;
736		}
737#ifdef CAN_USE_GCM_ASM
738	} else {
739		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
740		    block_size) != CRYPTO_SUCCESS) {
741			rv = CRYPTO_MECHANISM_PARAM_INVALID;
742		}
743	}
744#endif /* ifdef CAN_USE_GCM_ASM */
745
746	return (rv);
747}
748
749void *
750gcm_alloc_ctx(int kmflag)
751{
752	gcm_ctx_t *gcm_ctx;
753
754	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
755		return (NULL);
756
757	gcm_ctx->gcm_flags = GCM_MODE;
758	return (gcm_ctx);
759}
760
761void *
762gmac_alloc_ctx(int kmflag)
763{
764	gcm_ctx_t *gcm_ctx;
765
766	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
767		return (NULL);
768
769	gcm_ctx->gcm_flags = GMAC_MODE;
770	return (gcm_ctx);
771}
772
773/* GCM implementation that contains the fastest methods */
774static gcm_impl_ops_t gcm_fastest_impl = {
775	.name = "fastest"
776};
777
778/* All compiled in implementations */
779static const gcm_impl_ops_t *gcm_all_impl[] = {
780	&gcm_generic_impl,
781#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
782	&gcm_pclmulqdq_impl,
783#endif
784};
785
/* Indicate that implementation initialization has completed */
787static boolean_t gcm_impl_initialized = B_FALSE;
788
789/* Hold all supported implementations */
790static size_t gcm_supp_impl_cnt = 0;
791static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
792
/*
 * Returns the GCM operations for encrypt/decrypt/key setup.  When a
 * SIMD implementation is not allowed in the current context, fall back
 * to the fastest generic implementation.
 */
798const gcm_impl_ops_t *
799gcm_impl_get_ops(void)
800{
801	if (!kfpu_allowed())
802		return (&gcm_generic_impl);
803
804	const gcm_impl_ops_t *ops = NULL;
805	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
806
807	switch (impl) {
808	case IMPL_FASTEST:
809		ASSERT(gcm_impl_initialized);
810		ops = &gcm_fastest_impl;
811		break;
812	case IMPL_CYCLE:
813		/* Cycle through supported implementations */
814		ASSERT(gcm_impl_initialized);
815		ASSERT3U(gcm_supp_impl_cnt, >, 0);
816		static size_t cycle_impl_idx = 0;
817		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
818		ops = gcm_supp_impl[idx];
819		break;
820#ifdef CAN_USE_GCM_ASM
821	case IMPL_AVX:
822		/*
823		 * Make sure that we return a valid implementation while
824		 * switching to the avx implementation since there still
825		 * may be unfinished non-avx contexts around.
826		 */
827		ops = &gcm_generic_impl;
828		break;
829#endif
830	default:
831		ASSERT3U(impl, <, gcm_supp_impl_cnt);
832		ASSERT3U(gcm_supp_impl_cnt, >, 0);
833		if (impl < ARRAY_SIZE(gcm_all_impl))
834			ops = gcm_supp_impl[impl];
835		break;
836	}
837
838	ASSERT3P(ops, !=, NULL);
839
840	return (ops);
841}
842
843/*
844 * Initialize all supported implementations.
845 */
846void
847gcm_impl_init(void)
848{
849	gcm_impl_ops_t *curr_impl;
850	int i, c;
851
	/* Move supported implementations into gcm_supp_impl */
853	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
854		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
855
856		if (curr_impl->is_supported())
857			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
858	}
859	gcm_supp_impl_cnt = c;
860
861	/*
862	 * Set the fastest implementation given the assumption that the
863	 * hardware accelerated version is the fastest.
864	 */
865#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
866	if (gcm_pclmulqdq_impl.is_supported()) {
867		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
868		    sizeof (gcm_fastest_impl));
869	} else
870#endif
871	{
872		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
873		    sizeof (gcm_fastest_impl));
874	}
875
876	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
877
878#ifdef CAN_USE_GCM_ASM
879	/*
880	 * Use the avx implementation if it's available and the implementation
881	 * hasn't changed from its default value of fastest on module load.
882	 */
883	if (gcm_avx_will_work()) {
884#ifdef HAVE_MOVBE
885		if (zfs_movbe_available() == B_TRUE) {
886			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
887		}
888#endif
889		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
890			gcm_set_avx(B_TRUE);
891		}
892	}
893#endif
894	/* Finish initialization */
895	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
896	gcm_impl_initialized = B_TRUE;
897}
898
899static const struct {
900	const char *name;
901	uint32_t sel;
902} gcm_impl_opts[] = {
903		{ "cycle",	IMPL_CYCLE },
904		{ "fastest",	IMPL_FASTEST },
905#ifdef CAN_USE_GCM_ASM
906		{ "avx",	IMPL_AVX },
907#endif
908};
909
/*
 * Set the desired gcm implementation.
 *
 * If we are called before init(), the user preference is saved in
 * user_sel_impl and applied in a later init() call. This occurs when the
 * module parameter is specified on module load. Otherwise, icp_gcm_impl
 * is updated directly.
 *
 * @val		Name of gcm implementation to use
 */
921int
922gcm_impl_set(const char *val)
923{
924	int err = -EINVAL;
925	char req_name[GCM_IMPL_NAME_MAX];
926	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
927	size_t i;
928
929	/* sanitize input */
930	i = strnlen(val, GCM_IMPL_NAME_MAX);
931	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
932		return (err);
933
934	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
935	while (i > 0 && isspace(req_name[i-1]))
936		i--;
937	req_name[i] = '\0';
938
939	/* Check mandatory options */
940	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
941#ifdef CAN_USE_GCM_ASM
942		/* Ignore avx implementation if it won't work. */
943		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
944			continue;
945		}
946#endif
947		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
948			impl = gcm_impl_opts[i].sel;
949			err = 0;
950			break;
951		}
952	}
953
954	/* check all supported impl if init() was already called */
955	if (err != 0 && gcm_impl_initialized) {
956		/* check all supported implementations */
957		for (i = 0; i < gcm_supp_impl_cnt; i++) {
958			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
959				impl = i;
960				err = 0;
961				break;
962			}
963		}
964	}
965#ifdef CAN_USE_GCM_ASM
966	/*
967	 * Use the avx implementation if available and the requested one is
968	 * avx or fastest.
969	 */
970	if (gcm_avx_will_work() == B_TRUE &&
971	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
972		gcm_set_avx(B_TRUE);
973	} else {
974		gcm_set_avx(B_FALSE);
975	}
976#endif
977
978	if (err == 0) {
979		if (gcm_impl_initialized)
980			atomic_swap_32(&icp_gcm_impl, impl);
981		else
982			atomic_swap_32(&user_sel_impl, impl);
983	}
984
985	return (err);
986}
987
988#if defined(_KERNEL) && defined(__linux__)
989
990static int
991icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
992{
993	return (gcm_impl_set(val));
994}
995
996static int
997icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
998{
999	int i, cnt = 0;
1000	char *fmt;
1001	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1002
1003	ASSERT(gcm_impl_initialized);
1004
1005	/* list mandatory options */
1006	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1007#ifdef CAN_USE_GCM_ASM
1008		/* Ignore avx implementation if it won't work. */
1009		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1010			continue;
1011		}
1012#endif
1013		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1014		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1015		    gcm_impl_opts[i].name);
1016	}
1017
1018	/* list all supported implementations */
1019	for (i = 0; i < gcm_supp_impl_cnt; i++) {
1020		fmt = (i == impl) ? "[%s] " : "%s ";
1021		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1022		    gcm_supp_impl[i]->name);
1023	}
1024
1025	return (cnt);
1026}
1027
1028module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
1029    NULL, 0644);
1030MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
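/*
 * When the icp module parameters are exposed under sysfs (the usual case on
 * Linux), the active implementation can be inspected and changed at runtime,
 * e.g.:
 *
 *	cat /sys/module/icp/parameters/icp_gcm_impl
 *	echo avx > /sys/module/icp/parameters/icp_gcm_impl
 */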
#endif /* defined(_KERNEL) && defined(__linux__) */
1032
1033#ifdef CAN_USE_GCM_ASM
1034#define	GCM_BLOCK_LEN 16
1035/*
1036 * The openssl asm routines are 6x aggregated and need that many bytes
1037 * at minimum.
1038 */
1039#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1040#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable, since we are allocating a buffer of
 * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
 */
1045#define	GCM_AVX_MAX_CHUNK_SIZE \
1046	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
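/*
 * With GCM_BLOCK_LEN == 16 the above works out to
 * GCM_AVX_MIN_DECRYPT_BYTES == 96, GCM_AVX_MIN_ENCRYPT_BYTES == 288 and
 * GCM_AVX_MAX_CHUNK_SIZE == (131072 / 96) * 96 == 131040 bytes.
 */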
1047
1048/* Clear the FPU registers since they hold sensitive internal state. */
1049#define	clear_fpu_regs() clear_fpu_regs_avx()
1050#define	GHASH_AVX(ctx, in, len) \
1051    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1052    in, len)
1053
1054#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1055
1056/* Get the chunk size module parameter. */
1057#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1058
/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to a multiple of GCM_AVX_MIN_DECRYPT_BYTES and required to be
 * at least GCM_AVX_MIN_ENCRYPT_BYTES (see icp_gcm_avx_set_chunk_size() below).
 */
1064static uint32_t gcm_avx_chunk_size =
1065	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
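/* With the constants above this evaluates to (32768 / 96) * 96 == 32736. */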
1066
1067extern void ASMABI clear_fpu_regs_avx(void);
1068extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1069extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1070    const uint32_t pt[4], uint32_t ct[4]);
1071
1072extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1073extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1074    const uint8_t *in, size_t len);
1075
1076extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1077    const void *, uint64_t *, uint64_t *);
1078
1079extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1080    const void *, uint64_t *, uint64_t *);
1081
1082static inline boolean_t
1083gcm_avx_will_work(void)
1084{
1085	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1086	return (kfpu_allowed() &&
1087	    zfs_avx_available() && zfs_aes_available() &&
1088	    zfs_pclmulqdq_available());
1089}
1090
1091static inline void
1092gcm_set_avx(boolean_t val)
1093{
1094	if (gcm_avx_will_work() == B_TRUE) {
1095		atomic_swap_32(&gcm_use_avx, val);
1096	}
1097}
1098
1099static inline boolean_t
1100gcm_toggle_avx(void)
1101{
1102	if (gcm_avx_will_work() == B_TRUE) {
1103		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1104	} else {
1105		return (B_FALSE);
1106	}
1107}
1108
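/*
 * Size of the hash table (Htable) expected by the AVX routines: 2 * 6
 * entries of 2 * sizeof (uint64_t), i.e. 192 bytes, presumably holding the
 * precomputed powers of H used by the 6x aggregated GHASH code. The exact
 * layout is defined by the OpenSSL-derived gcm_init_htab_avx() assembly.
 */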
1109static inline size_t
1110gcm_simd_get_htab_size(boolean_t simd_mode)
1111{
1112	switch (simd_mode) {
1113	case B_TRUE:
1114		return (2 * 6 * 2 * sizeof (uint64_t));
1115
1116	default:
1117		return (0);
1118	}
1119}
1120
1121
1122/* Increment the GCM counter block by n. */
1123static inline void
1124gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1125{
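	/*
	 * The counter block is kept in big-endian byte order: swap the masked
	 * low-order 32 bits to host order, add n, swap back and merge,
	 * leaving the upper 96 bits of the block untouched.
	 */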
1126	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1127	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1128
1129	counter = htonll(counter + n);
1130	counter &= counter_mask;
1131	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1132}
1133
1134/*
1135 * Encrypt multiple blocks of data in GCM mode.
1136 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1137 * if possible. While processing a chunk the FPU is "locked".
1138 */
1139static int
1140gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1141    size_t length, crypto_data_t *out, size_t block_size)
1142{
1143	size_t bleft = length;
1144	size_t need = 0;
1145	size_t done = 0;
1146	uint8_t *datap = (uint8_t *)data;
1147	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1148	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1149	uint64_t *ghash = ctx->gcm_ghash;
1150	uint64_t *cb = ctx->gcm_cb;
1151	uint8_t *ct_buf = NULL;
1152	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1153	int rv = CRYPTO_SUCCESS;
1154
1155	ASSERT(block_size == GCM_BLOCK_LEN);
1156	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1157	    B_FALSE);
1158	/*
1159	 * If the last call left an incomplete block, try to fill
1160	 * it first.
1161	 */
1162	if (ctx->gcm_remainder_len > 0) {
1163		need = block_size - ctx->gcm_remainder_len;
1164		if (length < need) {
1165			/* Accumulate bytes here and return. */
1166			memcpy((uint8_t *)ctx->gcm_remainder +
1167			    ctx->gcm_remainder_len, datap, length);
1168
1169			ctx->gcm_remainder_len += length;
1170			if (ctx->gcm_copy_to == NULL) {
1171				ctx->gcm_copy_to = datap;
1172			}
1173			return (CRYPTO_SUCCESS);
1174		} else {
1175			/* Complete incomplete block. */
1176			memcpy((uint8_t *)ctx->gcm_remainder +
1177			    ctx->gcm_remainder_len, datap, need);
1178
1179			ctx->gcm_copy_to = NULL;
1180		}
1181	}
1182
1183	/* Allocate a buffer to encrypt to if there is enough input. */
1184	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1185		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1186		if (ct_buf == NULL) {
1187			return (CRYPTO_HOST_MEMORY);
1188		}
1189	}
1190
1191	/* If we completed an incomplete block, encrypt and write it out. */
1192	if (ctx->gcm_remainder_len > 0) {
1193		kfpu_begin();
1194		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1195		    (const uint32_t *)cb, (uint32_t *)tmp);
1196
1197		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1198		GHASH_AVX(ctx, tmp, block_size);
1199		clear_fpu_regs();
1200		kfpu_end();
1201		rv = crypto_put_output_data(tmp, out, block_size);
1202		out->cd_offset += block_size;
1203		gcm_incr_counter_block(ctx);
1204		ctx->gcm_processed_data_len += block_size;
1205		bleft -= need;
1206		datap += need;
1207		ctx->gcm_remainder_len = 0;
1208	}
1209
1210	/* Do the bulk encryption in chunk_size blocks. */
1211	for (; bleft >= chunk_size; bleft -= chunk_size) {
1212		kfpu_begin();
1213		done = aesni_gcm_encrypt(
1214		    datap, ct_buf, chunk_size, key, cb, ghash);
1215
1216		clear_fpu_regs();
1217		kfpu_end();
1218		if (done != chunk_size) {
1219			rv = CRYPTO_FAILED;
1220			goto out_nofpu;
1221		}
1222		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1223		if (rv != CRYPTO_SUCCESS) {
1224			goto out_nofpu;
1225		}
1226		out->cd_offset += chunk_size;
1227		datap += chunk_size;
1228		ctx->gcm_processed_data_len += chunk_size;
1229	}
1230	/* Check if we are already done. */
1231	if (bleft == 0) {
1232		goto out_nofpu;
1233	}
1234	/* Bulk encrypt the remaining data. */
1235	kfpu_begin();
1236	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1237		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1238		if (done == 0) {
1239			rv = CRYPTO_FAILED;
1240			goto out;
1241		}
1242		rv = crypto_put_output_data(ct_buf, out, done);
1243		if (rv != CRYPTO_SUCCESS) {
1244			goto out;
1245		}
1246		out->cd_offset += done;
1247		ctx->gcm_processed_data_len += done;
1248		datap += done;
1249		bleft -= done;
1250
1251	}
1252	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1253	while (bleft > 0) {
1254		if (bleft < block_size) {
1255			memcpy(ctx->gcm_remainder, datap, bleft);
1256			ctx->gcm_remainder_len = bleft;
1257			ctx->gcm_copy_to = datap;
1258			goto out;
1259		}
1260		/* Encrypt, hash and write out. */
1261		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1262		    (const uint32_t *)cb, (uint32_t *)tmp);
1263
1264		gcm_xor_avx(datap, tmp);
1265		GHASH_AVX(ctx, tmp, block_size);
1266		rv = crypto_put_output_data(tmp, out, block_size);
1267		if (rv != CRYPTO_SUCCESS) {
1268			goto out;
1269		}
1270		out->cd_offset += block_size;
1271		gcm_incr_counter_block(ctx);
1272		ctx->gcm_processed_data_len += block_size;
1273		datap += block_size;
1274		bleft -= block_size;
1275	}
1276out:
1277	clear_fpu_regs();
1278	kfpu_end();
1279out_nofpu:
1280	if (ct_buf != NULL) {
1281		vmem_free(ct_buf, chunk_size);
1282	}
1283	return (rv);
1284}
1285
/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out any
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
1290static int
1291gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1292{
1293	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1294	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1295	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1296	size_t rem_len = ctx->gcm_remainder_len;
1297	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1298	int aes_rounds = ((aes_key_t *)keysched)->nr;
1299	int rv;
1300
1301	ASSERT(block_size == GCM_BLOCK_LEN);
1302	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1303	    B_FALSE);
1304
1305	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1306		return (CRYPTO_DATA_LEN_RANGE);
1307	}
1308
1309	kfpu_begin();
1310	/* Pad last incomplete block with zeros, encrypt and hash. */
1311	if (rem_len > 0) {
1312		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1313		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1314
1315		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1316		memset(remainder + rem_len, 0, block_size - rem_len);
1317		for (int i = 0; i < rem_len; i++) {
1318			remainder[i] ^= tmp[i];
1319		}
1320		GHASH_AVX(ctx, remainder, block_size);
1321		ctx->gcm_processed_data_len += rem_len;
1322		/* No need to increment counter_block, it's the last block. */
1323	}
1324	/* Finish tag. */
1325	ctx->gcm_len_a_len_c[1] =
1326	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1327	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1328	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1329
1330	gcm_xor_avx((uint8_t *)J0, ghash);
1331	clear_fpu_regs();
1332	kfpu_end();
1333
1334	/* Output remainder. */
1335	if (rem_len > 0) {
1336		rv = crypto_put_output_data(remainder, out, rem_len);
1337		if (rv != CRYPTO_SUCCESS)
1338			return (rv);
1339	}
1340	out->cd_offset += rem_len;
1341	ctx->gcm_remainder_len = 0;
1342	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1343	if (rv != CRYPTO_SUCCESS)
1344		return (rv);
1345
1346	out->cd_offset += ctx->gcm_tag_len;
1347	return (CRYPTO_SUCCESS);
1348}
1349
/*
 * Finalize decryption: So far we have only accumulated the ciphertext, so
 * now we decrypt it here in place.
 */
1354static int
1355gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1356{
1357	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1358	ASSERT3U(block_size, ==, 16);
1359	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1360	    B_FALSE);
1361
1362	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1363	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1364	uint8_t *datap = ctx->gcm_pt_buf;
1365	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1366	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1367	uint64_t *ghash = ctx->gcm_ghash;
1368	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1369	int rv = CRYPTO_SUCCESS;
1370	size_t bleft, done;
1371
	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
1377	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1378		kfpu_begin();
1379		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1380		    (const void *)key, ctx->gcm_cb, ghash);
1381		clear_fpu_regs();
1382		kfpu_end();
1383		if (done != chunk_size) {
1384			return (CRYPTO_FAILED);
1385		}
1386		datap += done;
1387	}
1388	/* Decrypt remainder, which is less than chunk size, in one go. */
1389	kfpu_begin();
1390	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1391		done = aesni_gcm_decrypt(datap, datap, bleft,
1392		    (const void *)key, ctx->gcm_cb, ghash);
1393		if (done == 0) {
1394			clear_fpu_regs();
1395			kfpu_end();
1396			return (CRYPTO_FAILED);
1397		}
1398		datap += done;
1399		bleft -= done;
1400	}
1401	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1402
	/*
	 * Now fewer than GCM_AVX_MIN_DECRYPT_BYTES bytes remain;
	 * decrypt them block by block.
	 */
1407	while (bleft > 0) {
1408		/* Incomplete last block. */
1409		if (bleft < block_size) {
1410			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1411
1412			memset(lastb, 0, block_size);
1413			memcpy(lastb, datap, bleft);
1414			/* The GCM processing. */
1415			GHASH_AVX(ctx, lastb, block_size);
1416			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1417			for (size_t i = 0; i < bleft; i++) {
1418				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1419			}
1420			break;
1421		}
1422		/* The GCM processing. */
1423		GHASH_AVX(ctx, datap, block_size);
1424		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1425		gcm_xor_avx((uint8_t *)tmp, datap);
1426		gcm_incr_counter_block(ctx);
1427
1428		datap += block_size;
1429		bleft -= block_size;
1430	}
1431	if (rv != CRYPTO_SUCCESS) {
1432		clear_fpu_regs();
1433		kfpu_end();
1434		return (rv);
1435	}
1436	/* Decryption done, finish the tag. */
1437	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1438	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1439	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1440	    (uint32_t *)ctx->gcm_J0);
1441
1442	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1443
1444	/* We are done with the FPU, restore its state. */
1445	clear_fpu_regs();
1446	kfpu_end();
1447
1448	/* Compare the input authentication tag with what we calculated. */
1449	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1450		/* They don't match. */
1451		return (CRYPTO_INVALID_MAC);
1452	}
1453	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1454	if (rv != CRYPTO_SUCCESS) {
1455		return (rv);
1456	}
1457	out->cd_offset += pt_len;
1458	return (CRYPTO_SUCCESS);
1459}
1460
/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
1465static int
1466gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1467    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1468{
1469	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1470	uint64_t *H = ctx->gcm_H;
1471	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1472	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1473	const uint8_t *datap = auth_data;
1474	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1475	size_t bleft;
1476
1477	ASSERT(block_size == GCM_BLOCK_LEN);
1478	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1479	    B_FALSE);
1480
1481	/* Init H (encrypt zero block) and create the initial counter block. */
1482	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1483	memset(H, 0, sizeof (ctx->gcm_H));
1484	kfpu_begin();
1485	aes_encrypt_intel(keysched, aes_rounds,
1486	    (const uint32_t *)H, (uint32_t *)H);
1487
1488	gcm_init_htab_avx(ctx->gcm_Htable, H);
1489
1490	if (iv_len == 12) {
1491		memcpy(cb, iv, 12);
1492		cb[12] = 0;
1493		cb[13] = 0;
1494		cb[14] = 0;
1495		cb[15] = 1;
1496		/* We need the ICB later. */
1497		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1498	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes; just avoid nesting
		 * kfpu_begin calls.
		 */
1504		clear_fpu_regs();
1505		kfpu_end();
1506		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1507		    aes_copy_block, aes_xor_block);
1508		kfpu_begin();
1509	}
1510
	/* OpenSSL post-increments the counter; adjust for that. */
1512	gcm_incr_counter_block(ctx);
1513
1514	/* Ghash AAD in chunk_size blocks. */
1515	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1516		GHASH_AVX(ctx, datap, chunk_size);
1517		datap += chunk_size;
1518		clear_fpu_regs();
1519		kfpu_end();
1520		kfpu_begin();
1521	}
1522	/* Ghash the remainder and handle possible incomplete GCM block. */
1523	if (bleft > 0) {
1524		size_t incomp = bleft % block_size;
1525
1526		bleft -= incomp;
1527		if (bleft > 0) {
1528			GHASH_AVX(ctx, datap, bleft);
1529			datap += bleft;
1530		}
1531		if (incomp > 0) {
1532			/* Zero pad and hash incomplete last block. */
1533			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1534
1535			memset(authp, 0, block_size);
1536			memcpy(authp, datap, incomp);
1537			GHASH_AVX(ctx, authp, block_size);
1538		}
1539	}
1540	clear_fpu_regs();
1541	kfpu_end();
1542	return (CRYPTO_SUCCESS);
1543}
1544
1545#if defined(_KERNEL)
1546static int
1547icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1548{
1549	unsigned long val;
1550	char val_rounded[16];
1551	int error = 0;
1552
1553	error = kstrtoul(buf, 0, &val);
1554	if (error)
1555		return (error);
1556
1557	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1558
1559	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1560		return (-EINVAL);
1561
1562	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1563	error = param_set_uint(val_rounded, kp);
1564	return (error);
1565}
1566
1567module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1568    param_get_uint, &gcm_avx_chunk_size, 0644);
1569
1570MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1571	"How many bytes to process while owning the FPU");
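/*
 * Example (assuming the icp module parameters are exposed under sysfs):
 *
 *	echo 65536 > /sys/module/icp/parameters/icp_gcm_avx_chunk_size
 *
 * icp_gcm_avx_set_chunk_size() rounds the value down to a multiple of
 * GCM_AVX_MIN_DECRYPT_BYTES, so 65536 would be stored as 65472.
 */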
1572
#endif /* defined(_KERNEL) */
1574#endif /* ifdef CAN_USE_GCM_ASM */
1575