/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif
36
37#define	GHASH(c, d, t, o) \
38	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
39	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
40	(uint64_t *)(void *)(t));
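
/*
 * Illustrative sketch (comment only, not compiled): one GHASH step folds a
 * 16 byte block into the running hash, i.e. ghash = (ghash ^ block) * H in
 * GF(2^128). With the macro above that reads, for one block of ciphertext:
 *
 *	const gcm_impl_ops_t *gops = gcm_impl_get_ops();
 *	GHASH(ctx, cipher_block, ctx->gcm_ghash, gops);
 *
 * where cipher_block points to 16 bytes of ciphertext or AAD and the result
 * is accumulated in ctx->gcm_ghash.
 */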
41
42/* Select GCM implementation */
43#define	IMPL_FASTEST	(UINT32_MAX)
44#define	IMPL_CYCLE	(UINT32_MAX-1)
45#ifdef CAN_USE_GCM_ASM
46#define	IMPL_AVX	(UINT32_MAX-2)
47#endif
48#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
49static uint32_t icp_gcm_impl = IMPL_FASTEST;
50static uint32_t user_sel_impl = IMPL_FASTEST;
51
52#ifdef CAN_USE_GCM_ASM
53/* Does the architecture we run on support the MOVBE instruction? */
54boolean_t gcm_avx_can_use_movbe = B_FALSE;
55/*
56 * Whether to use the optimized openssl gcm and ghash implementations.
57 * Set to true if module parameter icp_gcm_impl == "avx".
58 */
59static boolean_t gcm_use_avx = B_FALSE;
60#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)
61
62extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
63
64static inline boolean_t gcm_avx_will_work(void);
65static inline void gcm_set_avx(boolean_t);
66static inline boolean_t gcm_toggle_avx(void);
67static inline size_t gcm_simd_get_htab_size(boolean_t);
68
69static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
70    crypto_data_t *, size_t);
71
72static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
73static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
74static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
75    size_t, size_t);
76#endif /* ifdef CAN_USE_GCM_ASM */
77
/*
 * Encrypt multiple blocks of data in GCM mode. Decryption for GCM mode
 * is handled by gcm_mode_decrypt_contiguous_blocks() and gcm_decrypt_final().
 */
82int
83gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
84    crypto_data_t *out, size_t block_size,
85    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
86    void (*copy_block)(uint8_t *, uint8_t *),
87    void (*xor_block)(uint8_t *, uint8_t *))
88{
89#ifdef CAN_USE_GCM_ASM
90	if (ctx->gcm_use_avx == B_TRUE)
91		return (gcm_mode_encrypt_contiguous_blocks_avx(
92		    ctx, data, length, out, block_size));
93#endif
94
95	const gcm_impl_ops_t *gops;
96	size_t remainder = length;
97	size_t need = 0;
98	uint8_t *datap = (uint8_t *)data;
99	uint8_t *blockp;
100	uint8_t *lastp;
101	void *iov_or_mp;
102	offset_t offset;
103	uint8_t *out_data_1;
104	uint8_t *out_data_2;
105	size_t out_data_1_len;
106	uint64_t counter;
107	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
108
109	if (length + ctx->gcm_remainder_len < block_size) {
110		/* accumulate bytes here and return */
111		bcopy(datap,
112		    (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
113		    length);
114		ctx->gcm_remainder_len += length;
115		if (ctx->gcm_copy_to == NULL) {
116			ctx->gcm_copy_to = datap;
117		}
118		return (CRYPTO_SUCCESS);
119	}
120
121	lastp = (uint8_t *)ctx->gcm_cb;
122	crypto_init_ptrs(out, &iov_or_mp, &offset);
123
124	gops = gcm_impl_get_ops();
125	do {
126		/* Unprocessed data from last call. */
127		if (ctx->gcm_remainder_len > 0) {
128			need = block_size - ctx->gcm_remainder_len;
129
130			if (need > remainder)
131				return (CRYPTO_DATA_LEN_RANGE);
132
133			bcopy(datap, &((uint8_t *)ctx->gcm_remainder)
134			    [ctx->gcm_remainder_len], need);
135
136			blockp = (uint8_t *)ctx->gcm_remainder;
137		} else {
138			blockp = datap;
139		}
140
141		/*
142		 * Increment counter. Counter bits are confined
143		 * to the bottom 32 bits of the counter block.
144		 */
145		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
146		counter = htonll(counter + 1);
147		counter &= counter_mask;
148		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
149
150		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
151		    (uint8_t *)ctx->gcm_tmp);
152		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
153
154		lastp = (uint8_t *)ctx->gcm_tmp;
155
156		ctx->gcm_processed_data_len += block_size;
157
158		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
159		    &out_data_1_len, &out_data_2, block_size);
160
161		/* copy block to where it belongs */
162		if (out_data_1_len == block_size) {
163			copy_block(lastp, out_data_1);
164		} else {
165			bcopy(lastp, out_data_1, out_data_1_len);
166			if (out_data_2 != NULL) {
167				bcopy(lastp + out_data_1_len,
168				    out_data_2,
169				    block_size - out_data_1_len);
170			}
171		}
172		/* update offset */
173		out->cd_offset += block_size;
174
175		/* add ciphertext to the hash */
176		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
177
178		/* Update pointer to next block of data to be processed. */
179		if (ctx->gcm_remainder_len != 0) {
180			datap += need;
181			ctx->gcm_remainder_len = 0;
182		} else {
183			datap += block_size;
184		}
185
186		remainder = (size_t)&data[length] - (size_t)datap;
187
188		/* Incomplete last block. */
189		if (remainder > 0 && remainder < block_size) {
190			bcopy(datap, ctx->gcm_remainder, remainder);
191			ctx->gcm_remainder_len = remainder;
192			ctx->gcm_copy_to = datap;
193			goto out;
194		}
195		ctx->gcm_copy_to = NULL;
196
197	} while (remainder > 0);
198out:
199	return (CRYPTO_SUCCESS);
200}
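
/*
 * Usage sketch (illustrative; the helper names are the AES provider's
 * callbacks and are assumptions, not defined in this file): a caller streams
 * data through the update/final pair roughly like
 *
 *	(void) gcm_mode_encrypt_contiguous_blocks(ctx, buf, buflen, out,
 *	    AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	(void) gcm_encrypt_final(ctx, out, AES_BLOCK_LEN, aes_encrypt_block,
 *	    aes_copy_block, aes_xor_block);
 *
 * Any trailing partial block is buffered in ctx->gcm_remainder across calls
 * and only emitted by gcm_encrypt_final().
 */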
201
202/* ARGSUSED */
203int
204gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
205    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
206    void (*copy_block)(uint8_t *, uint8_t *),
207    void (*xor_block)(uint8_t *, uint8_t *))
208{
209#ifdef CAN_USE_GCM_ASM
210	if (ctx->gcm_use_avx == B_TRUE)
211		return (gcm_encrypt_final_avx(ctx, out, block_size));
212#endif
213
214	const gcm_impl_ops_t *gops;
215	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
216	uint8_t *ghash, *macp = NULL;
217	int i, rv;
218
219	if (out->cd_length <
220	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
221		return (CRYPTO_DATA_LEN_RANGE);
222	}
223
224	gops = gcm_impl_get_ops();
225	ghash = (uint8_t *)ctx->gcm_ghash;
226
227	if (ctx->gcm_remainder_len > 0) {
228		uint64_t counter;
229		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
230
231		/*
232		 * Here is where we deal with data that is not a
233		 * multiple of the block size.
234		 */
235
236		/*
237		 * Increment counter.
238		 */
239		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
240		counter = htonll(counter + 1);
241		counter &= counter_mask;
242		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
243
244		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
245		    (uint8_t *)ctx->gcm_tmp);
246
247		macp = (uint8_t *)ctx->gcm_remainder;
248		bzero(macp + ctx->gcm_remainder_len,
249		    block_size - ctx->gcm_remainder_len);
250
251		/* XOR with counter block */
252		for (i = 0; i < ctx->gcm_remainder_len; i++) {
253			macp[i] ^= tmpp[i];
254		}
255
256		/* add ciphertext to the hash */
257		GHASH(ctx, macp, ghash, gops);
258
259		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
260	}
261
262	ctx->gcm_len_a_len_c[1] =
263	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
264	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
265	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
266	    (uint8_t *)ctx->gcm_J0);
267	xor_block((uint8_t *)ctx->gcm_J0, ghash);
268
269	if (ctx->gcm_remainder_len > 0) {
270		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
271		if (rv != CRYPTO_SUCCESS)
272			return (rv);
273	}
274	out->cd_offset += ctx->gcm_remainder_len;
275	ctx->gcm_remainder_len = 0;
276	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
277	if (rv != CRYPTO_SUCCESS)
278		return (rv);
279	out->cd_offset += ctx->gcm_tag_len;
280
281	return (CRYPTO_SUCCESS);
282}
283
/*
 * Decrypt the last, possibly incomplete block of the input. It is called
 * from gcm_decrypt_final() when the remaining ciphertext is shorter than
 * the block size.
 */
288static void
289gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
290    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
291    void (*xor_block)(uint8_t *, uint8_t *))
292{
293	uint8_t *datap, *outp, *counterp;
294	uint64_t counter;
295	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
296	int i;
297
298	/*
299	 * Increment counter.
300	 * Counter bits are confined to the bottom 32 bits
301	 */
302	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
303	counter = htonll(counter + 1);
304	counter &= counter_mask;
305	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
306
307	datap = (uint8_t *)ctx->gcm_remainder;
308	outp = &((ctx->gcm_pt_buf)[index]);
309	counterp = (uint8_t *)ctx->gcm_tmp;
310
	/* zero pad the last ciphertext block for the tag computation */
312	bzero((uint8_t *)ctx->gcm_tmp, block_size);
313	bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);
314
315	/* add ciphertext to the hash */
316	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
317
318	/* decrypt remaining ciphertext */
319	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
320
321	/* XOR with counter block */
322	for (i = 0; i < ctx->gcm_remainder_len; i++) {
323		outp[i] = datap[i] ^ counterp[i];
324	}
325}
326
327/* ARGSUSED */
328int
329gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
330    crypto_data_t *out, size_t block_size,
331    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
332    void (*copy_block)(uint8_t *, uint8_t *),
333    void (*xor_block)(uint8_t *, uint8_t *))
334{
335	size_t new_len;
336	uint8_t *new;
337
	/*
	 * Copy the contiguous ciphertext input blocks into the plaintext
	 * buffer. The ciphertext is decrypted in gcm_decrypt_final().
	 */
342	if (length > 0) {
343		new_len = ctx->gcm_pt_buf_len + length;
344		new = vmem_alloc(new_len, ctx->gcm_kmflag);
345		if (new == NULL) {
346			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
347			ctx->gcm_pt_buf = NULL;
348			return (CRYPTO_HOST_MEMORY);
349		}
350		bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
351		vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
352		ctx->gcm_pt_buf = new;
353		ctx->gcm_pt_buf_len = new_len;
354		bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len],
355		    length);
356		ctx->gcm_processed_data_len += length;
357	}
358
359	ctx->gcm_remainder_len = 0;
360	return (CRYPTO_SUCCESS);
361}
362
363int
364gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
365    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
366    void (*xor_block)(uint8_t *, uint8_t *))
367{
368#ifdef CAN_USE_GCM_ASM
369	if (ctx->gcm_use_avx == B_TRUE)
370		return (gcm_decrypt_final_avx(ctx, out, block_size));
371#endif
372
373	const gcm_impl_ops_t *gops;
374	size_t pt_len;
375	size_t remainder;
376	uint8_t *ghash;
377	uint8_t *blockp;
378	uint8_t *cbp;
379	uint64_t counter;
380	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
381	int processed = 0, rv;
382
383	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
384
385	gops = gcm_impl_get_ops();
386	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
387	ghash = (uint8_t *)ctx->gcm_ghash;
388	blockp = ctx->gcm_pt_buf;
389	remainder = pt_len;
390	while (remainder > 0) {
391		/* Incomplete last block */
392		if (remainder < block_size) {
393			bcopy(blockp, ctx->gcm_remainder, remainder);
394			ctx->gcm_remainder_len = remainder;
			/*
			 * Not expecting any more ciphertext, just
			 * compute the plaintext for the remaining input.
			 */
399			gcm_decrypt_incomplete_block(ctx, block_size,
400			    processed, encrypt_block, xor_block);
401			ctx->gcm_remainder_len = 0;
402			goto out;
403		}
404		/* add ciphertext to the hash */
405		GHASH(ctx, blockp, ghash, gops);
406
407		/*
408		 * Increment counter.
409		 * Counter bits are confined to the bottom 32 bits
410		 */
411		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
412		counter = htonll(counter + 1);
413		counter &= counter_mask;
414		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
415
416		cbp = (uint8_t *)ctx->gcm_tmp;
417		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
418
419		/* XOR with ciphertext */
420		xor_block(cbp, blockp);
421
422		processed += block_size;
423		blockp += block_size;
424		remainder -= block_size;
425	}
426out:
427	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
428	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
429	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
430	    (uint8_t *)ctx->gcm_J0);
431	xor_block((uint8_t *)ctx->gcm_J0, ghash);
432
433	/* compare the input authentication tag with what we calculated */
434	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
435		/* They don't match */
436		return (CRYPTO_INVALID_MAC);
437	} else {
438		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
439		if (rv != CRYPTO_SUCCESS)
440			return (rv);
441		out->cd_offset += pt_len;
442	}
443	return (CRYPTO_SUCCESS);
444}
445
446static int
447gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
448{
449	size_t tag_len;
450
451	/*
452	 * Check the length of the authentication tag (in bits).
453	 */
454	tag_len = gcm_param->ulTagBits;
455	switch (tag_len) {
456	case 32:
457	case 64:
458	case 96:
459	case 104:
460	case 112:
461	case 120:
462	case 128:
463		break;
464	default:
465		return (CRYPTO_MECHANISM_PARAM_INVALID);
466	}
467
468	if (gcm_param->ulIvLen == 0)
469		return (CRYPTO_MECHANISM_PARAM_INVALID);
470
471	return (CRYPTO_SUCCESS);
472}
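
/*
 * Example (illustrative only) of parameters that pass the validation above:
 *
 *	CK_AES_GCM_PARAMS p = {
 *		.pIv = iv, .ulIvLen = 12,
 *		.pAAD = aad, .ulAADLen = aad_len,
 *		.ulTagBits = 128
 *	};
 *
 * The short 32 and 64 bit tags are accepted here, but NIST SP 800-38D only
 * endorses them for narrowly constrained applications.
 */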
473
474static void
475gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
476    gcm_ctx_t *ctx, size_t block_size,
477    void (*copy_block)(uint8_t *, uint8_t *),
478    void (*xor_block)(uint8_t *, uint8_t *))
479{
480	const gcm_impl_ops_t *gops;
481	uint8_t *cb;
482	ulong_t remainder = iv_len;
483	ulong_t processed = 0;
484	uint8_t *datap, *ghash;
485	uint64_t len_a_len_c[2];
486
487	gops = gcm_impl_get_ops();
488	ghash = (uint8_t *)ctx->gcm_ghash;
489	cb = (uint8_t *)ctx->gcm_cb;
490	if (iv_len == 12) {
491		bcopy(iv, cb, 12);
492		cb[12] = 0;
493		cb[13] = 0;
494		cb[14] = 0;
495		cb[15] = 1;
496		/* J0 will be used again in the final */
497		copy_block(cb, (uint8_t *)ctx->gcm_J0);
498	} else {
499		/* GHASH the IV */
500		do {
501			if (remainder < block_size) {
502				bzero(cb, block_size);
503				bcopy(&(iv[processed]), cb, remainder);
504				datap = (uint8_t *)cb;
505				remainder = 0;
506			} else {
507				datap = (uint8_t *)(&(iv[processed]));
508				processed += block_size;
509				remainder -= block_size;
510			}
511			GHASH(ctx, datap, ghash, gops);
512		} while (remainder > 0);
513
514		len_a_len_c[0] = 0;
515		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
516		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
517
518		/* J0 will be used again in the final */
519		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
520	}
521}
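
/*
 * Worked example: for the common 12 byte (96 bit) IV the counter block is
 * simply J0 = IV || 0x00000001, so an IV of 000102030405060708090a0b yields
 * cb = 00 01 02 03 04 05 06 07 08 09 0a 0b 00 00 00 01. For any other IV
 * length, J0 = GHASH(IV zero padded to a block multiple || 0^64 ||
 * [len(IV) in bits]), which is what the else branch above computes.
 */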
522
523static int
524gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
525    unsigned char *auth_data, size_t auth_data_len, size_t block_size,
526    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
527    void (*copy_block)(uint8_t *, uint8_t *),
528    void (*xor_block)(uint8_t *, uint8_t *))
529{
530	const gcm_impl_ops_t *gops;
531	uint8_t *ghash, *datap, *authp;
532	size_t remainder, processed;
533
534	/* encrypt zero block to get subkey H */
535	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
536	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
537	    (uint8_t *)ctx->gcm_H);
538
539	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
540	    copy_block, xor_block);
541
542	gops = gcm_impl_get_ops();
543	authp = (uint8_t *)ctx->gcm_tmp;
544	ghash = (uint8_t *)ctx->gcm_ghash;
545	bzero(authp, block_size);
546	bzero(ghash, block_size);
547
548	processed = 0;
549	remainder = auth_data_len;
550	do {
551		if (remainder < block_size) {
			/*
			 * There's less than a full block of data; pad the
			 * rest of the buffer with zeros.
			 */
556			bzero(authp, block_size);
557			bcopy(&(auth_data[processed]), authp, remainder);
558			datap = (uint8_t *)authp;
559			remainder = 0;
560		} else {
561			datap = (uint8_t *)(&(auth_data[processed]));
562			processed += block_size;
563			remainder -= block_size;
564		}
565
566		/* add auth data to the hash */
567		GHASH(ctx, datap, ghash, gops);
568
569	} while (remainder > 0);
570
571	return (CRYPTO_SUCCESS);
572}
573
574/*
575 * The following function is called at encrypt or decrypt init time
576 * for AES GCM mode.
577 *
578 * Init the GCM context struct. Handle the cycle and avx implementations here.
579 */
580int
581gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
582    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
583    void (*copy_block)(uint8_t *, uint8_t *),
584    void (*xor_block)(uint8_t *, uint8_t *))
585{
586	int rv;
587	CK_AES_GCM_PARAMS *gcm_param;
588
589	if (param != NULL) {
590		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
591
592		if ((rv = gcm_validate_args(gcm_param)) != 0) {
593			return (rv);
594		}
595
596		gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
597		gcm_ctx->gcm_tag_len >>= 3;
598		gcm_ctx->gcm_processed_data_len = 0;
599
600		/* these values are in bits */
601		gcm_ctx->gcm_len_a_len_c[0]
602		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
603
604		rv = CRYPTO_SUCCESS;
605		gcm_ctx->gcm_flags |= GCM_MODE;
606	} else {
607		return (CRYPTO_MECHANISM_PARAM_INVALID);
608	}
609
610#ifdef CAN_USE_GCM_ASM
611	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
612		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
613	} else {
614		/*
615		 * Handle the "cycle" implementation by creating avx and
616		 * non-avx contexts alternately.
617		 */
618		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
619		/*
620		 * We don't handle byte swapped key schedules in the avx
621		 * code path.
622		 */
623		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
624		if (ks->ops->needs_byteswap == B_TRUE) {
625			gcm_ctx->gcm_use_avx = B_FALSE;
626		}
627		/* Use the MOVBE and the BSWAP variants alternately. */
628		if (gcm_ctx->gcm_use_avx == B_TRUE &&
629		    zfs_movbe_available() == B_TRUE) {
630			(void) atomic_toggle_boolean_nv(
631			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
632		}
633	}
634	/* Allocate Htab memory as needed. */
635	if (gcm_ctx->gcm_use_avx == B_TRUE) {
636		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
637
638		if (htab_len == 0) {
639			return (CRYPTO_MECHANISM_PARAM_INVALID);
640		}
641		gcm_ctx->gcm_htab_len = htab_len;
642		gcm_ctx->gcm_Htable =
643		    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
644
645		if (gcm_ctx->gcm_Htable == NULL) {
646			return (CRYPTO_HOST_MEMORY);
647		}
648	}
	/* AVX and non-AVX context initialization differ from here on. */
650	if (gcm_ctx->gcm_use_avx == B_FALSE) {
651#endif /* ifdef CAN_USE_GCM_ASM */
652		if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
653		    gcm_param->pAAD, gcm_param->ulAADLen, block_size,
654		    encrypt_block, copy_block, xor_block) != 0) {
655			rv = CRYPTO_MECHANISM_PARAM_INVALID;
656		}
657#ifdef CAN_USE_GCM_ASM
658	} else {
659		if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
660		    gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
661			rv = CRYPTO_MECHANISM_PARAM_INVALID;
662		}
663	}
664#endif /* ifdef CAN_USE_GCM_ASM */
665
666	return (rv);
667}
668
669int
670gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
671    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
672    void (*copy_block)(uint8_t *, uint8_t *),
673    void (*xor_block)(uint8_t *, uint8_t *))
674{
675	int rv;
676	CK_AES_GMAC_PARAMS *gmac_param;
677
678	if (param != NULL) {
679		gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
680
681		gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
682		gcm_ctx->gcm_processed_data_len = 0;
683
684		/* these values are in bits */
685		gcm_ctx->gcm_len_a_len_c[0]
686		    = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
687
688		rv = CRYPTO_SUCCESS;
689		gcm_ctx->gcm_flags |= GMAC_MODE;
690	} else {
691		return (CRYPTO_MECHANISM_PARAM_INVALID);
692	}
693
694#ifdef CAN_USE_GCM_ASM
695	/*
696	 * Handle the "cycle" implementation by creating avx and non avx
697	 * contexts alternately.
698	 */
699	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
700		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
701	} else {
702		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
703	}
704	/* We don't handle byte swapped key schedules in the avx code path. */
705	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
706	if (ks->ops->needs_byteswap == B_TRUE) {
707		gcm_ctx->gcm_use_avx = B_FALSE;
708	}
709	/* Allocate Htab memory as needed. */
710	if (gcm_ctx->gcm_use_avx == B_TRUE) {
711		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
712
713		if (htab_len == 0) {
714			return (CRYPTO_MECHANISM_PARAM_INVALID);
715		}
716		gcm_ctx->gcm_htab_len = htab_len;
717		gcm_ctx->gcm_Htable =
718		    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
719
720		if (gcm_ctx->gcm_Htable == NULL) {
721			return (CRYPTO_HOST_MEMORY);
722		}
723	}
724
	/* AVX and non-AVX context initialization differ from here on. */
726	if (gcm_ctx->gcm_use_avx == B_FALSE) {
727#endif	/* ifdef CAN_USE_GCM_ASM */
728		if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
729		    gmac_param->pAAD, gmac_param->ulAADLen, block_size,
730		    encrypt_block, copy_block, xor_block) != 0) {
731			rv = CRYPTO_MECHANISM_PARAM_INVALID;
732		}
733#ifdef CAN_USE_GCM_ASM
734	} else {
735		if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
736		    gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
737			rv = CRYPTO_MECHANISM_PARAM_INVALID;
738		}
739	}
740#endif /* ifdef CAN_USE_GCM_ASM */
741
742	return (rv);
743}
744
745void *
746gcm_alloc_ctx(int kmflag)
747{
748	gcm_ctx_t *gcm_ctx;
749
750	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
751		return (NULL);
752
753	gcm_ctx->gcm_flags = GCM_MODE;
754	return (gcm_ctx);
755}
756
757void *
758gmac_alloc_ctx(int kmflag)
759{
760	gcm_ctx_t *gcm_ctx;
761
762	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
763		return (NULL);
764
765	gcm_ctx->gcm_flags = GMAC_MODE;
766	return (gcm_ctx);
767}
768
769void
770gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
771{
772	ctx->gcm_kmflag = kmflag;
773}
774
775/* GCM implementation that contains the fastest methods */
776static gcm_impl_ops_t gcm_fastest_impl = {
777	.name = "fastest"
778};
779
780/* All compiled in implementations */
781const gcm_impl_ops_t *gcm_all_impl[] = {
782	&gcm_generic_impl,
783#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
784	&gcm_pclmulqdq_impl,
785#endif
786};
787
/* Indicate that the implementation initialization has completed */
789static boolean_t gcm_impl_initialized = B_FALSE;
790
791/* Hold all supported implementations */
792static size_t gcm_supp_impl_cnt = 0;
793static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
794
/*
 * Returns the GCM operations for encrypt/decrypt/key setup. When a
 * SIMD implementation is not allowed in the current context, fall
 * back to the generic implementation.
 */
800const gcm_impl_ops_t *
801gcm_impl_get_ops()
802{
803	if (!kfpu_allowed())
804		return (&gcm_generic_impl);
805
806	const gcm_impl_ops_t *ops = NULL;
807	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
808
809	switch (impl) {
810	case IMPL_FASTEST:
811		ASSERT(gcm_impl_initialized);
812		ops = &gcm_fastest_impl;
813		break;
814	case IMPL_CYCLE:
815		/* Cycle through supported implementations */
816		ASSERT(gcm_impl_initialized);
817		ASSERT3U(gcm_supp_impl_cnt, >, 0);
818		static size_t cycle_impl_idx = 0;
819		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
820		ops = gcm_supp_impl[idx];
821		break;
822#ifdef CAN_USE_GCM_ASM
823	case IMPL_AVX:
824		/*
825		 * Make sure that we return a valid implementation while
826		 * switching to the avx implementation since there still
827		 * may be unfinished non-avx contexts around.
828		 */
829		ops = &gcm_generic_impl;
830		break;
831#endif
832	default:
833		ASSERT3U(impl, <, gcm_supp_impl_cnt);
834		ASSERT3U(gcm_supp_impl_cnt, >, 0);
835		if (impl < ARRAY_SIZE(gcm_all_impl))
836			ops = gcm_supp_impl[impl];
837		break;
838	}
839
840	ASSERT3P(ops, !=, NULL);
841
842	return (ops);
843}
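
/*
 * Illustrative mapping of icp_gcm_impl values to the ops returned above,
 * assuming both x86 implementations are compiled in and kfpu_allowed():
 *
 *	IMPL_FASTEST -> gcm_fastest_impl (pclmulqdq if supported, else generic)
 *	IMPL_CYCLE   -> rotates through gcm_supp_impl[]
 *	IMPL_AVX     -> gcm_generic_impl here; the AVX path is selected per
 *	                context via ctx->gcm_use_avx instead
 *	0 .. n-1     -> gcm_supp_impl[impl]
 */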
844
845/*
846 * Initialize all supported implementations.
847 */
848void
849gcm_impl_init(void)
850{
851	gcm_impl_ops_t *curr_impl;
852	int i, c;
853
854	/* Move supported implementations into gcm_supp_impls */
855	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
856		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
857
858		if (curr_impl->is_supported())
859			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
860	}
861	gcm_supp_impl_cnt = c;
862
863	/*
864	 * Set the fastest implementation given the assumption that the
865	 * hardware accelerated version is the fastest.
866	 */
867#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
868	if (gcm_pclmulqdq_impl.is_supported()) {
869		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
870		    sizeof (gcm_fastest_impl));
871	} else
872#endif
873	{
874		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
875		    sizeof (gcm_fastest_impl));
876	}
877
878	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
879
880#ifdef CAN_USE_GCM_ASM
881	/*
882	 * Use the avx implementation if it's available and the implementation
883	 * hasn't changed from its default value of fastest on module load.
884	 */
885	if (gcm_avx_will_work()) {
886#ifdef HAVE_MOVBE
887		if (zfs_movbe_available() == B_TRUE) {
888			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
889		}
890#endif
891		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
892			gcm_set_avx(B_TRUE);
893		}
894	}
895#endif
896	/* Finish initialization */
897	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
898	gcm_impl_initialized = B_TRUE;
899}
900
901static const struct {
902	char *name;
903	uint32_t sel;
904} gcm_impl_opts[] = {
905		{ "cycle",	IMPL_CYCLE },
906		{ "fastest",	IMPL_FASTEST },
907#ifdef CAN_USE_GCM_ASM
908		{ "avx",	IMPL_AVX },
909#endif
910};
911
/*
 * Set the desired GCM implementation.
 *
 * If we are called before init(), the user preference is saved in
 * user_sel_impl and applied in the later init() call. This happens when the
 * module parameter is specified on module load. Otherwise icp_gcm_impl is
 * updated directly.
 *
 * @val		Name of the GCM implementation to use
 */
923int
924gcm_impl_set(const char *val)
925{
926	int err = -EINVAL;
927	char req_name[GCM_IMPL_NAME_MAX];
928	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
929	size_t i;
930
931	/* sanitize input */
932	i = strnlen(val, GCM_IMPL_NAME_MAX);
933	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
934		return (err);
935
936	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
937	while (i > 0 && isspace(req_name[i-1]))
938		i--;
939	req_name[i] = '\0';
940
941	/* Check mandatory options */
942	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
943#ifdef CAN_USE_GCM_ASM
944		/* Ignore avx implementation if it won't work. */
945		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
946			continue;
947		}
948#endif
949		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
950			impl = gcm_impl_opts[i].sel;
951			err = 0;
952			break;
953		}
954	}
955
956	/* check all supported impl if init() was already called */
957	if (err != 0 && gcm_impl_initialized) {
958		/* check all supported implementations */
959		for (i = 0; i < gcm_supp_impl_cnt; i++) {
960			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
961				impl = i;
962				err = 0;
963				break;
964			}
965		}
966	}
967#ifdef CAN_USE_GCM_ASM
968	/*
969	 * Use the avx implementation if available and the requested one is
970	 * avx or fastest.
971	 */
972	if (gcm_avx_will_work() == B_TRUE &&
973	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
974		gcm_set_avx(B_TRUE);
975	} else {
976		gcm_set_avx(B_FALSE);
977	}
978#endif
979
980	if (err == 0) {
981		if (gcm_impl_initialized)
982			atomic_swap_32(&icp_gcm_impl, impl);
983		else
984			atomic_swap_32(&user_sel_impl, impl);
985	}
986
987	return (err);
988}
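
/*
 * Usage sketch (Linux; assumes the ICP is loaded as its own module named
 * "icp"): the implementation can be switched at runtime through the module
 * parameter registered below, e.g.
 *
 *	echo avx > /sys/module/icp/parameters/icp_gcm_impl
 *
 * which reaches gcm_impl_set() via icp_gcm_impl_set().
 */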
989
990#if defined(_KERNEL) && defined(__linux__)
991
992static int
993icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
994{
995	return (gcm_impl_set(val));
996}
997
998static int
999icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
1000{
1001	int i, cnt = 0;
1002	char *fmt;
1003	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1004
1005	ASSERT(gcm_impl_initialized);
1006
1007	/* list mandatory options */
1008	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1009#ifdef CAN_USE_GCM_ASM
1010		/* Ignore avx implementation if it won't work. */
1011		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1012			continue;
1013		}
1014#endif
1015		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1016		cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
1017	}
1018
1019	/* list all supported implementations */
1020	for (i = 0; i < gcm_supp_impl_cnt; i++) {
1021		fmt = (i == impl) ? "[%s] " : "%s ";
1022		cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
1023	}
1024
1025	return (cnt);
1026}
1027
1028module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
1029    NULL, 0644);
1030MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
1032
1033#ifdef CAN_USE_GCM_ASM
1034#define	GCM_BLOCK_LEN 16
/*
 * The OpenSSL asm routines are 6x aggregated (they process six 16 byte
 * blocks at a time) and need at least the byte counts defined below.
 */
1039#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1040#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a buffer of
 * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts
 * while processing it.
 */
1045#define	GCM_AVX_MAX_CHUNK_SIZE \
1046	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1047
1048/* Get the chunk size module parameter. */
1049#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1050
1051/* Clear the FPU registers since they hold sensitive internal state. */
1052#define	clear_fpu_regs() clear_fpu_regs_avx()
1053#define	GHASH_AVX(ctx, in, len) \
1054    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1055    in, len)
1056
1057#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1058
/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * ensured by the setter to be at least GCM_AVX_MIN_ENCRYPT_BYTES.
 */
1064static uint32_t gcm_avx_chunk_size =
1065	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
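
/*
 * Worked example: with GCM_AVX_MIN_DECRYPT_BYTES == 96 the default above
 * evaluates to (32768 / 96) * 96 == 32736 bytes, and GCM_AVX_MAX_CHUNK_SIZE
 * to (131072 / 96) * 96 == 131040 bytes.
 */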
1066
1067extern void clear_fpu_regs_avx(void);
1068extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1069extern void aes_encrypt_intel(const uint32_t rk[], int nr,
1070    const uint32_t pt[4], uint32_t ct[4]);
1071
1072extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1073extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1074    const uint8_t *in, size_t len);
1075
1076extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1077    const void *, uint64_t *, uint64_t *);
1078
1079extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1080    const void *, uint64_t *, uint64_t *);
1081
1082static inline boolean_t
1083gcm_avx_will_work(void)
1084{
1085	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1086	return (kfpu_allowed() &&
1087	    zfs_avx_available() && zfs_aes_available() &&
1088	    zfs_pclmulqdq_available());
1089}
1090
1091static inline void
1092gcm_set_avx(boolean_t val)
1093{
1094	if (gcm_avx_will_work() == B_TRUE) {
1095		atomic_swap_32(&gcm_use_avx, val);
1096	}
1097}
1098
1099static inline boolean_t
1100gcm_toggle_avx(void)
1101{
1102	if (gcm_avx_will_work() == B_TRUE) {
1103		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1104	} else {
1105		return (B_FALSE);
1106	}
1107}
1108
1109static inline size_t
1110gcm_simd_get_htab_size(boolean_t simd_mode)
1111{
1112	switch (simd_mode) {
1113	case B_TRUE:
1114		return (2 * 6 * 2 * sizeof (uint64_t));
1115
1116	default:
1117		return (0);
1118	}
1119}
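
/*
 * For the AVX case this is 2 * 6 * 2 * sizeof (uint64_t) == 192 bytes, i.e.
 * twelve 16 byte entries of values precomputed from the hash subkey H for
 * the 6x aggregated OpenSSL GHASH routine (the exact table layout is defined
 * by the assembler code, not here).
 */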
1120
/*
 * Clear sensitive data in the context.
 *
 * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and
 * ctx->gcm_Htable contain the hash subkey which protects authentication.
 *
 * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for
 * a known plaintext attack; they consist of the IV and the first and last
 * counter block respectively. Whether they should be cleared is debatable.
 */
1131static inline void
1132gcm_clear_ctx(gcm_ctx_t *ctx)
1133{
1134	bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
1135	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
1136	bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
1137	bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
1138}
1139
1140/* Increment the GCM counter block by n. */
1141static inline void
1142gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1143{
1144	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1145	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1146
1147	counter = htonll(counter + n);
1148	counter &= counter_mask;
1149	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1150}
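
/*
 * Worked example: the counter occupies the low 32 bits of the big endian
 * counter block, so if the last four bytes of gcm_cb are 00 00 00 ff,
 * gcm_incr_counter_block_by(ctx, 1) leaves them as 00 00 01 00 while the
 * upper 96 bits of the block stay untouched. The increment is taken modulo
 * 2^32, matching GCM's inc32() function.
 */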
1151
1152/*
1153 * Encrypt multiple blocks of data in GCM mode.
1154 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1155 * if possible. While processing a chunk the FPU is "locked".
1156 */
1157static int
1158gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1159    size_t length, crypto_data_t *out, size_t block_size)
1160{
1161	size_t bleft = length;
1162	size_t need = 0;
1163	size_t done = 0;
1164	uint8_t *datap = (uint8_t *)data;
1165	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1166	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1167	uint64_t *ghash = ctx->gcm_ghash;
1168	uint64_t *cb = ctx->gcm_cb;
1169	uint8_t *ct_buf = NULL;
1170	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1171	int rv = CRYPTO_SUCCESS;
1172
1173	ASSERT(block_size == GCM_BLOCK_LEN);
1174	/*
1175	 * If the last call left an incomplete block, try to fill
1176	 * it first.
1177	 */
1178	if (ctx->gcm_remainder_len > 0) {
1179		need = block_size - ctx->gcm_remainder_len;
1180		if (length < need) {
1181			/* Accumulate bytes here and return. */
1182			bcopy(datap, (uint8_t *)ctx->gcm_remainder +
1183			    ctx->gcm_remainder_len, length);
1184
1185			ctx->gcm_remainder_len += length;
1186			if (ctx->gcm_copy_to == NULL) {
1187				ctx->gcm_copy_to = datap;
1188			}
1189			return (CRYPTO_SUCCESS);
1190		} else {
1191			/* Complete incomplete block. */
1192			bcopy(datap, (uint8_t *)ctx->gcm_remainder +
1193			    ctx->gcm_remainder_len, need);
1194
1195			ctx->gcm_copy_to = NULL;
1196		}
1197	}
1198
1199	/* Allocate a buffer to encrypt to if there is enough input. */
1200	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1201		ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag);
1202		if (ct_buf == NULL) {
1203			return (CRYPTO_HOST_MEMORY);
1204		}
1205	}
1206
1207	/* If we completed an incomplete block, encrypt and write it out. */
1208	if (ctx->gcm_remainder_len > 0) {
1209		kfpu_begin();
1210		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1211		    (const uint32_t *)cb, (uint32_t *)tmp);
1212
1213		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1214		GHASH_AVX(ctx, tmp, block_size);
1215		clear_fpu_regs();
1216		kfpu_end();
1217		rv = crypto_put_output_data(tmp, out, block_size);
1218		out->cd_offset += block_size;
1219		gcm_incr_counter_block(ctx);
1220		ctx->gcm_processed_data_len += block_size;
1221		bleft -= need;
1222		datap += need;
1223		ctx->gcm_remainder_len = 0;
1224	}
1225
1226	/* Do the bulk encryption in chunk_size blocks. */
1227	for (; bleft >= chunk_size; bleft -= chunk_size) {
1228		kfpu_begin();
1229		done = aesni_gcm_encrypt(
1230		    datap, ct_buf, chunk_size, key, cb, ghash);
1231
1232		clear_fpu_regs();
1233		kfpu_end();
1234		if (done != chunk_size) {
1235			rv = CRYPTO_FAILED;
1236			goto out_nofpu;
1237		}
1238		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1239		if (rv != CRYPTO_SUCCESS) {
1240			goto out_nofpu;
1241		}
1242		out->cd_offset += chunk_size;
1243		datap += chunk_size;
1244		ctx->gcm_processed_data_len += chunk_size;
1245	}
1246	/* Check if we are already done. */
1247	if (bleft == 0) {
1248		goto out_nofpu;
1249	}
1250	/* Bulk encrypt the remaining data. */
1251	kfpu_begin();
1252	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1253		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1254		if (done == 0) {
1255			rv = CRYPTO_FAILED;
1256			goto out;
1257		}
1258		rv = crypto_put_output_data(ct_buf, out, done);
1259		if (rv != CRYPTO_SUCCESS) {
1260			goto out;
1261		}
1262		out->cd_offset += done;
1263		ctx->gcm_processed_data_len += done;
1264		datap += done;
1265		bleft -= done;
1266
1267	}
1268	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1269	while (bleft > 0) {
1270		if (bleft < block_size) {
1271			bcopy(datap, ctx->gcm_remainder, bleft);
1272			ctx->gcm_remainder_len = bleft;
1273			ctx->gcm_copy_to = datap;
1274			goto out;
1275		}
1276		/* Encrypt, hash and write out. */
1277		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1278		    (const uint32_t *)cb, (uint32_t *)tmp);
1279
1280		gcm_xor_avx(datap, tmp);
1281		GHASH_AVX(ctx, tmp, block_size);
1282		rv = crypto_put_output_data(tmp, out, block_size);
1283		if (rv != CRYPTO_SUCCESS) {
1284			goto out;
1285		}
1286		out->cd_offset += block_size;
1287		gcm_incr_counter_block(ctx);
1288		ctx->gcm_processed_data_len += block_size;
1289		datap += block_size;
1290		bleft -= block_size;
1291	}
1292out:
1293	clear_fpu_regs();
1294	kfpu_end();
1295out_nofpu:
1296	if (ct_buf != NULL) {
1297		vmem_free(ct_buf, chunk_size);
1298	}
1299	return (rv);
1300}
1301
/*
 * Finalize the encryption: zero fill, encrypt, hash and write out any
 * remaining incomplete last block. Encrypt the ICB. Calculate the tag and
 * write it out.
 */
1306static int
1307gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1308{
1309	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1310	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1311	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1312	size_t rem_len = ctx->gcm_remainder_len;
1313	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1314	int aes_rounds = ((aes_key_t *)keysched)->nr;
1315	int rv;
1316
1317	ASSERT(block_size == GCM_BLOCK_LEN);
1318
1319	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1320		return (CRYPTO_DATA_LEN_RANGE);
1321	}
1322
1323	kfpu_begin();
1324	/* Pad last incomplete block with zeros, encrypt and hash. */
1325	if (rem_len > 0) {
1326		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1327		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1328
1329		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1330		bzero(remainder + rem_len, block_size - rem_len);
1331		for (int i = 0; i < rem_len; i++) {
1332			remainder[i] ^= tmp[i];
1333		}
1334		GHASH_AVX(ctx, remainder, block_size);
1335		ctx->gcm_processed_data_len += rem_len;
1336		/* No need to increment counter_block, it's the last block. */
1337	}
1338	/* Finish tag. */
1339	ctx->gcm_len_a_len_c[1] =
1340	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1341	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1342	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1343
1344	gcm_xor_avx((uint8_t *)J0, ghash);
1345	clear_fpu_regs();
1346	kfpu_end();
1347
1348	/* Output remainder. */
1349	if (rem_len > 0) {
1350		rv = crypto_put_output_data(remainder, out, rem_len);
1351		if (rv != CRYPTO_SUCCESS)
1352			return (rv);
1353	}
1354	out->cd_offset += rem_len;
1355	ctx->gcm_remainder_len = 0;
1356	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1357	if (rv != CRYPTO_SUCCESS)
1358		return (rv);
1359
1360	out->cd_offset += ctx->gcm_tag_len;
1361	/* Clear sensitive data in the context before returning. */
1362	gcm_clear_ctx(ctx);
1363	return (CRYPTO_SUCCESS);
1364}
1365
/*
 * Finalize the decryption: so far we have only accumulated ciphertext, so
 * now we decrypt it here in place.
 */
1370static int
1371gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1372{
1373	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1374	ASSERT3U(block_size, ==, 16);
1375
1376	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1377	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1378	uint8_t *datap = ctx->gcm_pt_buf;
1379	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1380	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1381	uint64_t *ghash = ctx->gcm_ghash;
1382	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1383	int rv = CRYPTO_SUCCESS;
1384	size_t bleft, done;
1385
	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is guaranteed by
	 * the module parameter setter to be greater than or equal to
	 * GCM_AVX_MIN_ENCRYPT_BYTES and a multiple of
	 * GCM_AVX_MIN_DECRYPT_BYTES.
	 */
1391	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1392		kfpu_begin();
1393		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1394		    (const void *)key, ctx->gcm_cb, ghash);
1395		clear_fpu_regs();
1396		kfpu_end();
1397		if (done != chunk_size) {
1398			return (CRYPTO_FAILED);
1399		}
1400		datap += done;
1401	}
1402	/* Decrypt remainder, which is less than chunk size, in one go. */
1403	kfpu_begin();
1404	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1405		done = aesni_gcm_decrypt(datap, datap, bleft,
1406		    (const void *)key, ctx->gcm_cb, ghash);
1407		if (done == 0) {
1408			clear_fpu_regs();
1409			kfpu_end();
1410			return (CRYPTO_FAILED);
1411		}
1412		datap += done;
1413		bleft -= done;
1414	}
1415	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1416
1417	/*
1418	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1419	 * decrypt them block by block.
1420	 */
1421	while (bleft > 0) {
1422		/* Incomplete last block. */
1423		if (bleft < block_size) {
1424			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1425
1426			bzero(lastb, block_size);
1427			bcopy(datap, lastb, bleft);
1428			/* The GCM processing. */
1429			GHASH_AVX(ctx, lastb, block_size);
1430			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1431			for (size_t i = 0; i < bleft; i++) {
1432				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1433			}
1434			break;
1435		}
1436		/* The GCM processing. */
1437		GHASH_AVX(ctx, datap, block_size);
1438		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1439		gcm_xor_avx((uint8_t *)tmp, datap);
1440		gcm_incr_counter_block(ctx);
1441
1442		datap += block_size;
1443		bleft -= block_size;
1444	}
1445	if (rv != CRYPTO_SUCCESS) {
1446		clear_fpu_regs();
1447		kfpu_end();
1448		return (rv);
1449	}
1450	/* Decryption done, finish the tag. */
1451	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1452	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1453	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1454	    (uint32_t *)ctx->gcm_J0);
1455
1456	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1457
1458	/* We are done with the FPU, restore its state. */
1459	clear_fpu_regs();
1460	kfpu_end();
1461
1462	/* Compare the input authentication tag with what we calculated. */
1463	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1464		/* They don't match. */
1465		return (CRYPTO_INVALID_MAC);
1466	}
1467	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1468	if (rv != CRYPTO_SUCCESS) {
1469		return (rv);
1470	}
1471	out->cd_offset += pt_len;
1472	gcm_clear_ctx(ctx);
1473	return (CRYPTO_SUCCESS);
1474}
1475
/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
1480static int
1481gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
1482    unsigned char *auth_data, size_t auth_data_len, size_t block_size)
1483{
1484	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1485	uint64_t *H = ctx->gcm_H;
1486	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1487	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1488	uint8_t *datap = auth_data;
1489	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1490	size_t bleft;
1491
1492	ASSERT(block_size == GCM_BLOCK_LEN);
1493
1494	/* Init H (encrypt zero block) and create the initial counter block. */
1495	bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash));
1496	bzero(H, sizeof (ctx->gcm_H));
1497	kfpu_begin();
1498	aes_encrypt_intel(keysched, aes_rounds,
1499	    (const uint32_t *)H, (uint32_t *)H);
1500
1501	gcm_init_htab_avx(ctx->gcm_Htable, H);
1502
1503	if (iv_len == 12) {
1504		bcopy(iv, cb, 12);
1505		cb[12] = 0;
1506		cb[13] = 0;
1507		cb[14] = 0;
1508		cb[15] = 1;
1509		/* We need the ICB later. */
1510		bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0));
1511	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes; just avoid nesting
		 * kfpu_begin() calls.
		 */
1517		clear_fpu_regs();
1518		kfpu_end();
1519		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1520		    aes_copy_block, aes_xor_block);
1521		kfpu_begin();
1522	}
1523
	/* OpenSSL post-increments the counter; adjust for that. */
1525	gcm_incr_counter_block(ctx);
1526
1527	/* Ghash AAD in chunk_size blocks. */
1528	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1529		GHASH_AVX(ctx, datap, chunk_size);
1530		datap += chunk_size;
1531		clear_fpu_regs();
1532		kfpu_end();
1533		kfpu_begin();
1534	}
1535	/* Ghash the remainder and handle possible incomplete GCM block. */
1536	if (bleft > 0) {
1537		size_t incomp = bleft % block_size;
1538
1539		bleft -= incomp;
1540		if (bleft > 0) {
1541			GHASH_AVX(ctx, datap, bleft);
1542			datap += bleft;
1543		}
1544		if (incomp > 0) {
1545			/* Zero pad and hash incomplete last block. */
1546			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1547
1548			bzero(authp, block_size);
1549			bcopy(datap, authp, incomp);
1550			GHASH_AVX(ctx, authp, block_size);
1551		}
1552	}
1553	clear_fpu_regs();
1554	kfpu_end();
1555	return (CRYPTO_SUCCESS);
1556}
1557
1558#if defined(_KERNEL)
1559static int
1560icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1561{
1562	unsigned long val;
1563	char val_rounded[16];
1564	int error = 0;
1565
1566	error = kstrtoul(buf, 0, &val);
1567	if (error)
1568		return (error);
1569
1570	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1571
1572	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1573		return (-EINVAL);
1574
1575	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1576	error = param_set_uint(val_rounded, kp);
1577	return (error);
1578}
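
/*
 * Example (illustrative): writing 100000 to icp_gcm_avx_chunk_size rounds
 * down to (100000 / 96) * 96 == 99936 bytes; after rounding, values below
 * GCM_AVX_MIN_ENCRYPT_BYTES (288) or above GCM_AVX_MAX_CHUNK_SIZE (131040)
 * are rejected with -EINVAL.
 */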
1579
1580module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1581    param_get_uint, &gcm_avx_chunk_size, 0644);
1582
1583MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1584	"How many bytes to process while owning the FPU");
1585
#endif /* defined(_KERNEL) */
1587#endif /* ifdef CAN_USE_GCM_ASM */
1588