1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2018 Intel Corporation.
23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
24 */
25
26#include <stdio.h>
27#include <zlib.h>
28#include <zfs_fletcher.h>
29#include <sys/vdev_draid.h>
30#include <sys/nvpair.h>
31#include <sys/stat.h>
32
33/*
34 * The number of rows to generate for new permutation maps.
35 */
36#define	MAP_ROWS_DEFAULT	256
37
38/*
39 * Key values for dRAID maps when stored as nvlists.
40 */
41#define	MAP_SEED		"seed"
42#define	MAP_CHECKSUM		"checksum"
43#define	MAP_WORST_RATIO		"worst_ratio"
44#define	MAP_AVG_RATIO		"avg_ratio"
45#define	MAP_CHILDREN		"children"
46#define	MAP_NPERMS		"nperms"
47#define	MAP_PERMS		"perms"
48
49static void
50draid_usage(void)
51{
52	(void) fprintf(stderr,
53	    "usage: draid command args ...\n"
54	    "Available commands are:\n"
55	    "\n"
56	    "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
57	    "\tdraid verify [-rv] FILE\n"
58	    "\tdraid dump [-v] [-m min] [-n max] FILE\n"
59	    "\tdraid table FILE\n"
60	    "\tdraid merge FILE SRC SRC...\n");
61	exit(1);
62}
63
64static int
65read_map(const char *filename, nvlist_t **allcfgs)
66{
67	int block_size = 131072;
68	int buf_size = 131072;
69	int tmp_size, error;
70	char *tmp_buf;
71
72	struct stat64 stat;
73	if (lstat64(filename, &stat) != 0)
74		return (errno);
75
76	if (stat.st_size == 0 ||
77	    !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
78		return (EINVAL);
79	}
80
81	gzFile fp = gzopen(filename, "rb");
82	if (fp == Z_NULL)
83		return (errno);
84
85	char *buf = malloc(buf_size);
86	if (buf == NULL) {
87		(void) gzclose(fp);
88		return (ENOMEM);
89	}
90
91	ssize_t rc, bytes = 0;
92	while (!gzeof(fp)) {
93		rc = gzread(fp, buf + bytes, block_size);
94		if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
95			free(buf);
96			(void) gzerror(fp, &error);
97			(void) gzclose(fp);
98			return (error);
99		} else {
100			bytes += rc;
101
102			if (bytes + block_size >= buf_size) {
103				tmp_size = 2 * buf_size;
104				tmp_buf = malloc(tmp_size);
105				if (tmp_buf == NULL) {
106					free(buf);
107					(void) gzclose(fp);
108					return (ENOMEM);
109				}
110
111				memcpy(tmp_buf, buf, bytes);
112				free(buf);
113				buf = tmp_buf;
114				buf_size = tmp_size;
115			}
116		}
117	}
118
119	(void) gzclose(fp);
120
121	error = nvlist_unpack(buf, bytes, allcfgs, 0);
122	free(buf);
123
124	return (error);
125}
126
127/*
128 * Read a map from the specified filename.  A file contains multiple maps
129 * which are indexed by the number of children. The caller is responsible
130 * for freeing the configuration returned.
131 */
132static int
133read_map_key(const char *filename, const char *key, nvlist_t **cfg)
134{
135	nvlist_t *allcfgs, *foundcfg = NULL;
136	int error;
137
138	error = read_map(filename, &allcfgs);
139	if (error != 0)
140		return (error);
141
142	(void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
143	if (foundcfg != NULL) {
144		nvlist_dup(foundcfg, cfg, KM_SLEEP);
145		error = 0;
146	} else {
147		error = ENOENT;
148	}
149
150	nvlist_free(allcfgs);
151
152	return (error);
153}
154
155/*
156 * Write all mappings to the map file.
157 */
158static int
159write_map(const char *filename, nvlist_t *allcfgs)
160{
161	size_t buflen = 0;
162	int error;
163
164	error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
165	if (error)
166		return (error);
167
168	char *buf = malloc(buflen);
169	if (buf == NULL)
170		return (ENOMEM);
171
172	error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
173	if (error) {
174		free(buf);
175		return (error);
176	}
177
178	/*
179	 * Atomically update the file using a temporary file and the
180	 * traditional unlink then rename steps.  This code provides
181	 * no locking, it only guarantees the packed nvlist on disk
182	 * is updated atomically and is internally consistent.
183	 */
184	char *tmpname = calloc(1, MAXPATHLEN);
185	if (tmpname == NULL) {
186		free(buf);
187		return (ENOMEM);
188	}
189
190	snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
191
192	int fd = mkstemp(tmpname);
193	if (fd < 0) {
194		error = errno;
195		free(buf);
196		free(tmpname);
197		return (error);
198	}
199	(void) close(fd);
200
201	gzFile fp = gzopen(tmpname, "w9b");
202	if (fp == Z_NULL) {
203		error = errno;
204		free(buf);
205		free(tmpname);
206		return (errno);
207	}
208
209	ssize_t rc, bytes = 0;
210	while (bytes < buflen) {
211		size_t size = MIN(buflen - bytes, 131072);
212		rc = gzwrite(fp, buf + bytes, size);
213		if (rc < 0) {
214			free(buf);
215			(void) gzerror(fp, &error);
216			(void) gzclose(fp);
217			(void) unlink(tmpname);
218			free(tmpname);
219			return (error);
220		} else if (rc == 0) {
221			break;
222		} else {
223			bytes += rc;
224		}
225	}
226
227	free(buf);
228	(void) gzclose(fp);
229
230	if (bytes != buflen) {
231		(void) unlink(tmpname);
232		free(tmpname);
233		return (EIO);
234	}
235
236	/*
237	 * Unlink the previous config file and replace it with the updated
238	 * version.  If we're able to unlink the file then directory is
239	 * writable by us and the subsequent rename should never fail.
240	 */
241	error = unlink(filename);
242	if (error != 0 && errno != ENOENT) {
243		error = errno;
244		(void) unlink(tmpname);
245		free(tmpname);
246		return (error);
247	}
248
249	error = rename(tmpname, filename);
250	if (error != 0) {
251		error = errno;
252		(void) unlink(tmpname);
253		free(tmpname);
254		return (error);
255	}
256
257	free(tmpname);
258
259	return (0);
260}
261
262/*
263 * Add the dRAID map to the file and write it out.
264 */
265static int
266write_map_key(const char *filename, char *key, draid_map_t *map,
267    double worst_ratio, double avg_ratio)
268{
269	nvlist_t *nv_cfg, *allcfgs;
270	int error;
271
272	/*
273	 * Add the configuration to an existing or new file.  The new
274	 * configuration will replace an existing configuration with the
275	 * same key if it has a lower ratio and is therefore better.
276	 */
277	error = read_map(filename, &allcfgs);
278	if (error == ENOENT) {
279		allcfgs = fnvlist_alloc();
280	} else if (error != 0) {
281		return (error);
282	}
283
284	error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
285	if (error == 0) {
286		uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
287		    MAP_WORST_RATIO);
288		double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
289
290		if (worst_ratio < nv_worst_ratio) {
291			/* Replace old map with the more balanced new map. */
292			fnvlist_remove(allcfgs, key);
293		} else {
294			/* The old map is preferable, keep it. */
295			nvlist_free(allcfgs);
296			return (EEXIST);
297		}
298	}
299
300	nvlist_t *cfg = fnvlist_alloc();
301	fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
302	fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
303	fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
304	fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
305	fnvlist_add_uint8_array(cfg, MAP_PERMS,  map->dm_perms,
306	    map->dm_children * map->dm_nperms * sizeof (uint8_t));
307
308	fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
309	    (uint64_t)(worst_ratio * 1000.0));
310	fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
311	    (uint64_t)(avg_ratio * 1000.0));
312
313	error = nvlist_add_nvlist(allcfgs, key, cfg);
314	if (error == 0)
315		error = write_map(filename, allcfgs);
316
317	nvlist_free(cfg);
318	nvlist_free(allcfgs);
319	return (error);
320}
321
322static void
323dump_map(draid_map_t *map, const char *key, double worst_ratio,
324    double avg_ratio, int verbose)
325{
326	if (verbose == 0) {
327		return;
328	} else if (verbose == 1) {
329		printf("    \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
330		    "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
331		    worst_ratio, avg_ratio);
332		return;
333	} else {
334		printf("    \"%s\":\n"
335		    "        seed: 0x%016llx\n"
336		    "        checksum: 0x%016llx\n"
337		    "        worst_ratio: %2.03f\n"
338		    "        avg_ratio: %2.03f\n"
339		    "        children: %llu\n"
340		    "        nperms: %llu\n",
341		    key, (u_longlong_t)map->dm_seed,
342		    (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
343		    (u_longlong_t)map->dm_children,
344		    (u_longlong_t)map->dm_nperms);
345
346		if (verbose > 2) {
347			printf("        perms = {\n");
348			for (int i = 0; i < map->dm_nperms; i++) {
349				printf("            { ");
350				for (int j = 0; j < map->dm_children; j++) {
351					printf("%3d%s ", map->dm_perms[
352					    i * map->dm_children + j],
353					    j < map->dm_children - 1 ?
354					    "," : "");
355				}
356				printf(" },\n");
357			}
358			printf("        }\n");
359		} else if (verbose == 2) {
360			printf("        draid_perms = <omitted>\n");
361		}
362	}
363}
364
365static void
366dump_map_nv(const char *key, nvlist_t *cfg, int verbose)
367{
368	draid_map_t map;
369	uint_t c;
370
371	uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
372	uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
373
374	map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
375	map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
376	map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
377	map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
378	map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c);
379
380	dump_map(&map, key, (double)worst_ratio / 1000.0,
381	    avg_ratio / 1000.0, verbose);
382}
383
384/*
385 * Print a summary of the mapping.
386 */
387static int
388dump_map_key(const char *filename, const char *key, int verbose)
389{
390	nvlist_t *cfg;
391	int error;
392
393	error = read_map_key(filename, key, &cfg);
394	if (error != 0)
395		return (error);
396
397	dump_map_nv(key, cfg, verbose);
398
399	return (0);
400}
401
402/*
403 * Allocate a new permutation map for evaluation.
404 */
405static int
406alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
407    draid_map_t **mapp)
408{
409	draid_map_t *map;
410	int error;
411
412	map = malloc(sizeof (draid_map_t));
413	if (map == NULL)
414		return (ENOMEM);
415
416	map->dm_children = children;
417	map->dm_nperms = nperms;
418	map->dm_seed = seed;
419	map->dm_checksum = 0;
420
421	error = vdev_draid_generate_perms(map, &map->dm_perms);
422	if (error) {
423		free(map);
424		return (error);
425	}
426
427	*mapp = map;
428
429	return (0);
430}
431
432/*
433 * Allocate the fixed permutation map for N children.
434 */
435static int
436alloc_fixed_map(uint64_t children, draid_map_t **mapp)
437{
438	const draid_map_t *fixed_map;
439	draid_map_t *map;
440	int error;
441
442	error = vdev_draid_lookup_map(children, &fixed_map);
443	if (error)
444		return (error);
445
446	map = malloc(sizeof (draid_map_t));
447	if (map == NULL)
448		return (ENOMEM);
449
450	memcpy(map, fixed_map, sizeof (draid_map_t));
451	VERIFY3U(map->dm_checksum, !=, 0);
452
453	error = vdev_draid_generate_perms(map, &map->dm_perms);
454	if (error) {
455		free(map);
456		return (error);
457	}
458
459	*mapp = map;
460
461	return (0);
462}
463
464/*
465 * Free a permutation map.
466 */
467static void
468free_map(draid_map_t *map)
469{
470	free(map->dm_perms);
471	free(map);
472}
473
474/*
475 * Check if dev is in the provided list of faulted devices.
476 */
477static inline boolean_t
478is_faulted(int *faulted_devs, int nfaulted, int dev)
479{
480	for (int i = 0; i < nfaulted; i++)
481		if (faulted_devs[i] == dev)
482			return (B_TRUE);
483
484	return (B_FALSE);
485}
486
487/*
488 * Evaluate how resilvering I/O will be distributed given a list of faulted
489 * vdevs.  As a simplification we assume one IO is sufficient to repair each
490 * damaged device in a group.
491 */
492static double
493eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
494    int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
495{
496	uint64_t children = map->dm_children;
497	uint64_t ngroups = 1;
498	uint64_t ndisks = children - nspares;
499
500	/*
501	 * Calculate the minimum number of groups required to fill a slice.
502	 */
503	while (ngroups * (groupwidth) % (children - nspares) != 0)
504		ngroups++;
505
506	int *ios = calloc(map->dm_children, sizeof (uint64_t));
507
508	ASSERT3P(ios, !=, NULL);
509
510	/* Resilver all rows */
511	for (int i = 0; i < map->dm_nperms; i++) {
512		uint8_t *row = &map->dm_perms[i * map->dm_children];
513
514		/* Resilver all groups with faulted drives */
515		for (int j = 0; j < ngroups; j++) {
516			uint64_t spareidx = map->dm_children - nspares;
517			boolean_t repair_needed = B_FALSE;
518
519			/* See if any devices in this group are faulted */
520			uint64_t groupstart = (j * groupwidth) % ndisks;
521
522			for (int k = 0; k < groupwidth; k++) {
523				uint64_t groupidx = (groupstart + k) % ndisks;
524
525				repair_needed = is_faulted(faulted_devs,
526				    nfaulted, row[groupidx]);
527				if (repair_needed)
528					break;
529			}
530
531			if (repair_needed == B_FALSE)
532				continue;
533
534			/*
535			 * This group is degraded. Calculate the number of
536			 * reads the non-faulted drives require and the number
537			 * of writes to the distributed hot spare for this row.
538			 */
539			for (int k = 0; k < groupwidth; k++) {
540				uint64_t groupidx = (groupstart + k) % ndisks;
541
542				if (!is_faulted(faulted_devs, nfaulted,
543				    row[groupidx])) {
544					ios[row[groupidx]]++;
545				} else if (nspares > 0) {
546					while (is_faulted(faulted_devs,
547					    nfaulted, row[spareidx])) {
548						spareidx++;
549					}
550
551					ASSERT3U(spareidx, <, map->dm_children);
552					ios[row[spareidx]]++;
553					spareidx++;
554				}
555			}
556		}
557	}
558
559	*min_child_ios = INT_MAX;
560	*max_child_ios = 0;
561
562	/*
563	 * Find the drives with fewest and most required I/O.  These values
564	 * are used to calculate the imbalance ratio.  To avoid returning an
565	 * infinite value for permutations which have children that perform
566	 * no IO a floor of 1 IO per child is set.  This ensures a meaningful
567	 * ratio is returned for comparison and it is not an uncommon when
568	 * there are a large number of children.
569	 */
570	for (int i = 0; i < map->dm_children; i++) {
571
572		if (is_faulted(faulted_devs, nfaulted, i)) {
573			ASSERT0(ios[i]);
574			continue;
575		}
576
577		if (ios[i] == 0)
578			ios[i] = 1;
579
580		if (ios[i] < *min_child_ios)
581			*min_child_ios = ios[i];
582
583		if (ios[i] > *max_child_ios)
584			*max_child_ios = ios[i];
585	}
586
587	ASSERT3S(*min_child_ios, !=, INT_MAX);
588	ASSERT3S(*max_child_ios, !=, 0);
589
590	double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
591
592	free(ios);
593
594	return (ratio);
595}
596
597/*
598 * Evaluate the quality of the permutation mapping by considering possible
599 * device failures.  Returns the imbalance ratio for the worst mapping which
600 * is defined to be the largest number of child IOs over the fewest number
601 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and
602 * all children perform an equal amount of work during reconstruction.
603 */
604static void
605eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
606{
607	uint64_t children = map->dm_children;
608	double worst_ratio = 1.0;
609	double sum = 0;
610	int worst_min_ios = 0, worst_max_ios = 0;
611	int n = 0;
612
613	/*
614	 * When there are only 2 children there can be no distributed
615	 * spare and no resilver to evaluate.  Default to a ratio of 1.0
616	 * for this degenerate case.
617	 */
618	if (children == VDEV_DRAID_MIN_CHILDREN) {
619		*worst_ratiop = 1.0;
620		*avg_ratiop = 1.0;
621		return;
622	}
623
624	/*
625	 * Score the mapping as if it had either 1 or 2 distributed spares.
626	 */
627	for (int nspares = 1; nspares <= 2; nspares++) {
628		uint64_t faults = nspares;
629
630		/*
631		 * Score groupwidths up to 19.  This value was chosen as the
632		 * largest reasonable width (16d+3p).  dRAID pools may be still
633		 * be created with wider stripes but they are not considered in
634		 * this analysis in order to optimize for the most common cases.
635		 */
636		for (uint64_t groupwidth = 2;
637		    groupwidth <= MIN(children - nspares, 19);
638		    groupwidth++) {
639			int faulted_devs[2];
640			int min_ios, max_ios;
641
642			/*
643			 * Score possible devices faults.  This is limited
644			 * to exactly one fault per distributed spare for
645			 * the purposes of this similation.
646			 */
647			for (int f1 = 0; f1 < children; f1++) {
648				faulted_devs[0] = f1;
649				double ratio;
650
651				if (faults == 1) {
652					ratio = eval_resilver(map, groupwidth,
653					    nspares, faulted_devs, faults,
654					    &min_ios, &max_ios);
655
656					if (ratio > worst_ratio) {
657						worst_ratio = ratio;
658						worst_min_ios = min_ios;
659						worst_max_ios = max_ios;
660					}
661
662					sum += ratio;
663					n++;
664				} else if (faults == 2) {
665					for (int f2 = f1 + 1; f2 < children;
666					    f2++) {
667						faulted_devs[1] = f2;
668
669						ratio = eval_resilver(map,
670						    groupwidth, nspares,
671						    faulted_devs, faults,
672						    &min_ios, &max_ios);
673
674						if (ratio > worst_ratio) {
675							worst_ratio = ratio;
676							worst_min_ios = min_ios;
677							worst_max_ios = max_ios;
678						}
679
680						sum += ratio;
681						n++;
682					}
683				}
684			}
685		}
686	}
687
688	*worst_ratiop = worst_ratio;
689	*avg_ratiop = sum / n;
690
691	/*
692	 * Log the min/max io values for particularly unbalanced maps.
693	 * Since the maps are generated entirely randomly these are possible
694	 * be exceedingly unlikely.  We log it for possible investigation.
695	 */
696	if (worst_ratio > 100.0) {
697		dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
698		printf("worst_min_ios=%d worst_max_ios=%d\n",
699		    worst_min_ios, worst_max_ios);
700	}
701}
702
703static int
704eval_maps(uint64_t children, int passes, uint64_t *map_seed,
705    draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
706{
707	draid_map_t *best_map = NULL;
708	double best_worst_ratio = 1000.0;
709	double best_avg_ratio = 1000.0;
710
711	/*
712	 * Perform the requested number of passes evaluating randomly
713	 * generated permutation maps.  Only the best version is kept.
714	 */
715	for (int i = 0; i < passes; i++) {
716		double worst_ratio, avg_ratio;
717		draid_map_t *map;
718		int error;
719
720		/*
721		 * Calculate the next seed and generate a new candidate map.
722		 */
723		error = alloc_new_map(children, MAP_ROWS_DEFAULT,
724		    vdev_draid_rand(map_seed), &map);
725		if (error) {
726			if (best_map != NULL)
727				free_map(best_map);
728			return (error);
729		}
730
731		/*
732		 * Consider maps with a lower worst_ratio to be of higher
733		 * quality.  Some maps may have a lower avg_ratio but they
734		 * are discarded since they might include some particularly
735		 * imbalanced permutations.  The average is tracked to in
736		 * order to get a sense of the average permutation quality.
737		 */
738		eval_decluster(map, &worst_ratio, &avg_ratio);
739
740		if (best_map == NULL || worst_ratio < best_worst_ratio) {
741
742			if (best_map != NULL)
743				free_map(best_map);
744
745			best_map = map;
746			best_worst_ratio = worst_ratio;
747			best_avg_ratio = avg_ratio;
748		} else {
749			free_map(map);
750		}
751	}
752
753	/*
754	 * After determining the best map generate a checksum over the full
755	 * permutation array.  This checksum is verified when opening a dRAID
756	 * pool to ensure the generated in memory permutations are correct.
757	 */
758	zio_cksum_t cksum;
759	fletcher_4_native_varsize(best_map->dm_perms,
760	    sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
761	    &cksum);
762	best_map->dm_checksum = cksum.zc_word[0];
763
764	*best_mapp = best_map;
765	*best_ratiop = best_worst_ratio;
766	*avg_ratiop = best_avg_ratio;
767
768	return (0);
769}
770
771static int
772draid_generate(int argc, char *argv[])
773{
774	char filename[MAXPATHLEN] = {0};
775	uint64_t map_seed[2];
776	int c, fd, error, verbose = 0, passes = 1, continuous = 0;
777	int min_children = VDEV_DRAID_MIN_CHILDREN;
778	int max_children = VDEV_DRAID_MAX_CHILDREN;
779	int restarts = 0;
780
781	while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
782		switch (c) {
783		case 'c':
784			continuous++;
785			break;
786		case 'm':
787			min_children = (int)strtol(optarg, NULL, 0);
788			if (min_children < VDEV_DRAID_MIN_CHILDREN) {
789				(void) fprintf(stderr, "A minimum of 2 "
790				    "children are required.\n");
791				return (1);
792			}
793
794			break;
795		case 'n':
796			max_children = (int)strtol(optarg, NULL, 0);
797			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
798				(void) fprintf(stderr, "A maximum of %d "
799				    "children are allowed.\n",
800				    VDEV_DRAID_MAX_CHILDREN);
801				return (1);
802			}
803			break;
804		case 'p':
805			passes = (int)strtol(optarg, NULL, 0);
806			break;
807		case 'v':
808			/*
809			 * 0 - Only log when a better map is added to the file.
810			 * 1 - Log the current best map for each child count.
811			 *     Minimal output on a single summary line.
812			 * 2 - Log the current best map for each child count.
813			 *     More verbose includes most map fields.
814			 * 3 - Log the current best map for each child count.
815			 *     Very verbose all fields including the full map.
816			 */
817			verbose++;
818			break;
819		case ':':
820			(void) fprintf(stderr,
821			    "missing argument for '%c' option\n", optopt);
822			draid_usage();
823			break;
824		case '?':
825			(void) fprintf(stderr, "invalid option '%c'\n",
826			    optopt);
827			draid_usage();
828			break;
829		}
830	}
831
832	if (argc > optind)
833		strlcpy(filename, argv[optind], sizeof (filename));
834	else {
835		(void) fprintf(stderr, "A FILE must be specified.\n");
836		return (1);
837	}
838
839restart:
840	/*
841	 * Start with a fresh seed from /dev/urandom.
842	 */
843	fd = open("/dev/urandom", O_RDONLY);
844	if (fd < 0) {
845		printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
846		return (1);
847	} else {
848		ssize_t bytes = sizeof (map_seed);
849		ssize_t bytes_read = 0;
850
851		while (bytes_read < bytes) {
852			ssize_t rc = read(fd, ((char *)map_seed) + bytes_read,
853			    bytes - bytes_read);
854			if (rc < 0) {
855				printf("Unable to read /dev/urandom: %s\n:",
856				    strerror(errno));
857				close(fd);
858				return (1);
859			}
860			bytes_read += rc;
861		}
862
863		(void) close(fd);
864	}
865
866	if (restarts == 0)
867		printf("Writing generated mappings to '%s':\n", filename);
868
869	/*
870	 * Generate maps for all requested child counts. The best map for
871	 * each child count is written out to the specified file.  If the file
872	 * already contains a better mapping this map will not be added.
873	 */
874	for (uint64_t children = min_children;
875	    children <= max_children; children++) {
876		char key[8] = { 0 };
877		draid_map_t *map;
878		double worst_ratio = 1000.0;
879		double avg_ratio = 1000.0;
880
881		error = eval_maps(children, passes, map_seed, &map,
882		    &worst_ratio, &avg_ratio);
883		if (error) {
884			printf("Error eval_maps(): %s\n", strerror(error));
885			return (1);
886		}
887
888		if (worst_ratio < 1.0 || avg_ratio < 1.0) {
889			printf("Error ratio < 1.0: worst_ratio = %2.03f "
890			    "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
891			return (1);
892		}
893
894		snprintf(key, 7, "%llu", (u_longlong_t)children);
895		error = write_map_key(filename, key, map, worst_ratio,
896		    avg_ratio);
897		if (error == 0) {
898			/* The new map was added to the file. */
899			dump_map(map, key, worst_ratio, avg_ratio,
900			    MAX(verbose, 1));
901		} else if (error == EEXIST) {
902			/* The existing map was preferable and kept. */
903			if (verbose > 0)
904				dump_map_key(filename, key, verbose);
905		} else {
906			printf("Error write_map_key(): %s\n", strerror(error));
907			return (1);
908		}
909
910		free_map(map);
911	}
912
913	/*
914	 * When the continuous option is set restart at the minimum number of
915	 * children instead of exiting. This option is useful as a mechanism
916	 * to continuous try and refine the discovered permutations.
917	 */
918	if (continuous) {
919		restarts++;
920		printf("Restarting by request (-c): %d\n", restarts);
921		goto restart;
922	}
923
924	return (0);
925}
926
927/*
928 * Verify each map in the file by generating its in-memory permutation array
929 * and comfirming its checksum is correct.
930 */
931static int
932draid_verify(int argc, char *argv[])
933{
934	char filename[MAXPATHLEN] = {0};
935	int n = 0, c, error, verbose = 1;
936	int check_ratios = 0;
937
938	while ((c = getopt(argc, argv, ":rv")) != -1) {
939		switch (c) {
940		case 'r':
941			check_ratios++;
942			break;
943		case 'v':
944			verbose++;
945			break;
946		case ':':
947			(void) fprintf(stderr,
948			    "missing argument for '%c' option\n", optopt);
949			draid_usage();
950			break;
951		case '?':
952			(void) fprintf(stderr, "invalid option '%c'\n",
953			    optopt);
954			draid_usage();
955			break;
956		}
957	}
958
959	if (argc > optind) {
960		char *abspath = malloc(MAXPATHLEN);
961		if (abspath == NULL)
962			return (ENOMEM);
963
964		if (realpath(argv[optind], abspath) != NULL)
965			strlcpy(filename, abspath, sizeof (filename));
966		else
967			strlcpy(filename, argv[optind], sizeof (filename));
968
969		free(abspath);
970	} else {
971		(void) fprintf(stderr, "A FILE must be specified.\n");
972		return (1);
973	}
974
975	printf("Verifying permutation maps: '%s'\n", filename);
976
977	/*
978	 * Lookup hardcoded permutation map for each valid number of children
979	 * and verify a generated map has the correct checksum.  Then compare
980	 * the generated map values with the nvlist map values read from the
981	 * reference file to cross-check the permutation.
982	 */
983	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
984	    children <= VDEV_DRAID_MAX_CHILDREN;
985	    children++) {
986		draid_map_t *map;
987		char key[8] = {0};
988
989		snprintf(key, 8, "%llu", (u_longlong_t)children);
990
991		error = alloc_fixed_map(children, &map);
992		if (error) {
993			printf("Error alloc_fixed_map() failed: %s\n",
994			    error == ECKSUM ? "Invalid checksum" :
995			    strerror(error));
996			return (1);
997		}
998
999		uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
1000		uint8_t *nv_perms;
1001		nvlist_t *cfg;
1002		uint_t c;
1003
1004		error = read_map_key(filename, key, &cfg);
1005		if (error != 0) {
1006			printf("Error read_map_key() failed: %s\n",
1007			    strerror(error));
1008			free_map(map);
1009			return (1);
1010		}
1011
1012		nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1013		nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1014		nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1015		nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1016		nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1017
1018		/*
1019		 * Compare draid_map_t and nvlist reference values.
1020		 */
1021		if (map->dm_seed != nv_seed) {
1022			printf("Error different seeds: 0x%016llx != "
1023			    "0x%016llx\n", (u_longlong_t)map->dm_seed,
1024			    (u_longlong_t)nv_seed);
1025			error = EINVAL;
1026		}
1027
1028		if (map->dm_checksum != nv_checksum) {
1029			printf("Error different checksums: 0x%016llx "
1030			    "!= 0x%016llx\n",
1031			    (u_longlong_t)map->dm_checksum,
1032			    (u_longlong_t)nv_checksum);
1033			error = EINVAL;
1034		}
1035
1036		if (map->dm_children != nv_children) {
1037			printf("Error different children: %llu "
1038			    "!= %llu\n", (u_longlong_t)map->dm_children,
1039			    (u_longlong_t)nv_children);
1040			error = EINVAL;
1041		}
1042
1043		if (map->dm_nperms != nv_nperms) {
1044			printf("Error different nperms: %llu "
1045			    "!= %llu\n", (u_longlong_t)map->dm_nperms,
1046			    (u_longlong_t)nv_nperms);
1047			error = EINVAL;
1048		}
1049
1050		for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1051			if (map->dm_perms[i] != nv_perms[i]) {
1052				printf("Error different perms[%llu]: "
1053				    "%d != %d\n", (u_longlong_t)i,
1054				    (int)map->dm_perms[i],
1055				    (int)nv_perms[i]);
1056				error = EINVAL;
1057				break;
1058			}
1059		}
1060
1061		/*
1062		 * For good measure recalculate the worst and average
1063		 * ratios and confirm they match the nvlist values.
1064		 */
1065		if (check_ratios) {
1066			uint64_t nv_worst_ratio, nv_avg_ratio;
1067			double worst_ratio, avg_ratio;
1068
1069			eval_decluster(map, &worst_ratio, &avg_ratio);
1070
1071			nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1072			    MAP_WORST_RATIO);
1073			nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1074			    MAP_AVG_RATIO);
1075
1076			if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1077				printf("Error ratio out of range %2.03f, "
1078				    "%2.03f\n", worst_ratio, avg_ratio);
1079				error = EINVAL;
1080			}
1081
1082			if ((uint64_t)(worst_ratio * 1000.0) !=
1083			    nv_worst_ratio) {
1084				printf("Error different worst_ratio %2.03f "
1085				    "!= %2.03f\n", (double)nv_worst_ratio /
1086				    1000.0, worst_ratio);
1087				error = EINVAL;
1088			}
1089
1090			if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1091				printf("Error different average_ratio %2.03f "
1092				    "!= %2.03f\n", (double)nv_avg_ratio /
1093				    1000.0, avg_ratio);
1094				error = EINVAL;
1095			}
1096		}
1097
1098		if (error) {
1099			free_map(map);
1100			nvlist_free(cfg);
1101			return (1);
1102		}
1103
1104		if (verbose > 0) {
1105			printf("- %llu children: good\n",
1106			    (u_longlong_t)children);
1107		}
1108		n++;
1109
1110		free_map(map);
1111		nvlist_free(cfg);
1112	}
1113
1114	if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1115		printf("Error permutation maps missing: %d / %d checked\n",
1116		    n, VDEV_DRAID_MAX_CHILDREN - 1);
1117		return (1);
1118	}
1119
1120	printf("Successfully verified %d / %d permutation maps\n",
1121	    n, VDEV_DRAID_MAX_CHILDREN - 1);
1122
1123	return (0);
1124}
1125
1126/*
1127 * Dump the contents of the specified mapping(s) for inspection.
1128 */
1129static int
1130draid_dump(int argc, char *argv[])
1131{
1132	char filename[MAXPATHLEN] = {0};
1133	int c, error, verbose = 1;
1134	int min_children = VDEV_DRAID_MIN_CHILDREN;
1135	int max_children = VDEV_DRAID_MAX_CHILDREN;
1136
1137	while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1138		switch (c) {
1139		case 'm':
1140			min_children = (int)strtol(optarg, NULL, 0);
1141			if (min_children < 2) {
1142				(void) fprintf(stderr, "A minimum of 2 "
1143				    "children are required.\n");
1144				return (1);
1145			}
1146
1147			break;
1148		case 'n':
1149			max_children = (int)strtol(optarg, NULL, 0);
1150			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1151				(void) fprintf(stderr, "A maximum of %d "
1152				    "children are allowed.\n",
1153				    VDEV_DRAID_MAX_CHILDREN);
1154				return (1);
1155			}
1156			break;
1157		case 'v':
1158			verbose++;
1159			break;
1160		case ':':
1161			(void) fprintf(stderr,
1162			    "missing argument for '%c' option\n", optopt);
1163			draid_usage();
1164			break;
1165		case '?':
1166			(void) fprintf(stderr, "invalid option '%c'\n",
1167			    optopt);
1168			draid_usage();
1169			break;
1170		}
1171	}
1172
1173	if (argc > optind)
1174		strlcpy(filename, argv[optind], sizeof (filename));
1175	else {
1176		(void) fprintf(stderr, "A FILE must be specified.\n");
1177		return (1);
1178	}
1179
1180	/*
1181	 * Dump maps for the requested child counts.
1182	 */
1183	for (uint64_t children = min_children;
1184	    children <= max_children; children++) {
1185		char key[8] = { 0 };
1186
1187		snprintf(key, 7, "%llu", (u_longlong_t)children);
1188		error = dump_map_key(filename, key, verbose);
1189		if (error) {
1190			printf("Error dump_map_key(): %s\n", strerror(error));
1191			return (1);
1192		}
1193	}
1194
1195	return (0);
1196}
1197
1198/*
1199 * Print all of the mappings as a C formatted draid_map_t array.  This table
1200 * is found in the module/zcommon/zfs_draid.c file and is the definitive
1201 * source for all mapping used by dRAID.  It cannot be updated without
1202 * changing the dRAID on disk format.
1203 */
1204static int
1205draid_table(int argc, char *argv[])
1206{
1207	char filename[MAXPATHLEN] = {0};
1208	int error;
1209
1210	if (argc > optind)
1211		strlcpy(filename, argv[optind], sizeof (filename));
1212	else {
1213		(void) fprintf(stderr, "A FILE must be specified.\n");
1214		return (1);
1215	}
1216
1217	printf("static const draid_map_t "
1218	    "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1219
1220	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1221	    children <= VDEV_DRAID_MAX_CHILDREN;
1222	    children++) {
1223		uint64_t seed, checksum, nperms, avg_ratio;
1224		nvlist_t *cfg;
1225		char key[8] = {0};
1226
1227		snprintf(key, 8, "%llu", (u_longlong_t)children);
1228
1229		error = read_map_key(filename, key, &cfg);
1230		if (error != 0) {
1231			printf("Error read_map_key() failed: %s\n",
1232			    strerror(error));
1233			return (1);
1234		}
1235
1236		seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1237		checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1238		children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1239		nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1240		avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1241
1242		printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1243		    "/* %2.03f */\n", (u_longlong_t)children,
1244		    (u_longlong_t)nperms, (u_longlong_t)seed,
1245		    (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1246
1247		nvlist_free(cfg);
1248	}
1249
1250	printf("};\n");
1251
1252	return (0);
1253}
1254
1255static int
1256draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1257{
1258	nvlist_t *srccfgs;
1259	nvpair_t *elem = NULL;
1260	int error, merged = 0;
1261
1262	error = read_map(srcfilename, &srccfgs);
1263	if (error != 0)
1264		return (error);
1265
1266	while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1267		uint64_t nv_worst_ratio;
1268		uint64_t allcfg_worst_ratio;
1269		nvlist_t *cfg, *allcfg;
1270		const char *key;
1271
1272		switch (nvpair_type(elem)) {
1273		case DATA_TYPE_NVLIST:
1274
1275			(void) nvpair_value_nvlist(elem, &cfg);
1276			key = nvpair_name(elem);
1277
1278			nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1279			    MAP_WORST_RATIO);
1280
1281			error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1282			if (error == 0) {
1283				allcfg_worst_ratio = fnvlist_lookup_uint64(
1284				    allcfg, MAP_WORST_RATIO);
1285
1286				if (nv_worst_ratio < allcfg_worst_ratio) {
1287					fnvlist_remove(allcfgs, key);
1288					fnvlist_add_nvlist(allcfgs, key, cfg);
1289					merged++;
1290				}
1291			} else if (error == ENOENT) {
1292				fnvlist_add_nvlist(allcfgs, key, cfg);
1293				merged++;
1294			} else {
1295				return (error);
1296			}
1297
1298			break;
1299		default:
1300			continue;
1301		}
1302	}
1303
1304	nvlist_free(srccfgs);
1305
1306	*mergedp = merged;
1307
1308	return (0);
1309}
1310
1311/*
1312 * Merge the best map for each child count found in the listed files into
1313 * a new file.  This allows 'draid generate' to be run in parallel and for
1314 * the results maps to be combined.
1315 */
1316static int
1317draid_merge(int argc, char *argv[])
1318{
1319	char filename[MAXPATHLEN] = {0};
1320	int c, error, total_merged = 0;
1321	nvlist_t *allcfgs;
1322
1323	while ((c = getopt(argc, argv, ":")) != -1) {
1324		switch (c) {
1325		case ':':
1326			(void) fprintf(stderr,
1327			    "missing argument for '%c' option\n", optopt);
1328			draid_usage();
1329			break;
1330		case '?':
1331			(void) fprintf(stderr, "invalid option '%c'\n",
1332			    optopt);
1333			draid_usage();
1334			break;
1335		}
1336	}
1337
1338	if (argc < 4) {
1339		(void) fprintf(stderr,
1340		    "A FILE and multiple SRCs must be specified.\n");
1341		return (1);
1342	}
1343
1344	strlcpy(filename, argv[optind], sizeof (filename));
1345	optind++;
1346
1347	error = read_map(filename, &allcfgs);
1348	if (error == ENOENT) {
1349		allcfgs = fnvlist_alloc();
1350	} else if (error != 0) {
1351		printf("Error read_map(): %s\n", strerror(error));
1352		return (error);
1353	}
1354
1355	while (optind < argc) {
1356		char srcfilename[MAXPATHLEN] = {0};
1357		int merged = 0;
1358
1359		strlcpy(srcfilename, argv[optind], sizeof (srcfilename));
1360
1361		error = draid_merge_impl(allcfgs, srcfilename, &merged);
1362		if (error) {
1363			printf("Error draid_merge_impl(): %s\n",
1364			    strerror(error));
1365			nvlist_free(allcfgs);
1366			return (1);
1367		}
1368
1369		total_merged += merged;
1370		printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1371		    srcfilename, filename);
1372
1373		optind++;
1374	}
1375
1376	if (total_merged > 0)
1377		write_map(filename, allcfgs);
1378
1379	printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1380	    filename);
1381
1382	nvlist_free(allcfgs);
1383
1384	return (0);
1385}
1386
1387int
1388main(int argc, char *argv[])
1389{
1390	if (argc < 2)
1391		draid_usage();
1392
1393	char *subcommand = argv[1];
1394
1395	if (strcmp(subcommand, "generate") == 0) {
1396		return (draid_generate(argc - 1, argv + 1));
1397	} else if (strcmp(subcommand, "verify") == 0) {
1398		return (draid_verify(argc - 1, argv + 1));
1399	} else if (strcmp(subcommand, "dump") == 0) {
1400		return (draid_dump(argc - 1, argv + 1));
1401	} else if (strcmp(subcommand, "table") == 0) {
1402		return (draid_table(argc - 1, argv + 1));
1403	} else if (strcmp(subcommand, "merge") == 0) {
1404		return (draid_merge(argc - 1, argv + 1));
1405	} else {
1406		draid_usage();
1407	}
1408}
1409