/*
 * Gather top-level ZFS pool and resilver/scan statistics and print using
 * influxdb line protocol
 * usage: [options] [pool_name]
 * where options are:
 *   --execd, -e           run in telegraf execd input plugin mode: a [CR] on
 *                         stdin causes a sample to be printed, then the
 *                         program waits for the next [CR]
 *   --no-histograms, -n   don't print histogram data (reduces cardinality
 *                         if you don't care about histograms)
 *   --sum-histogram-buckets, -s
 *                         sum histogram bucket values
 *
 * To integrate into telegraf use one of:
 * 1. the `inputs.execd` plugin with the `--execd` option
 * 2. the `inputs.exec` plugin to simply run with no options
 *
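 * For example, a minimal telegraf configuration for the `inputs.execd`
 * plugin might look like the following (the binary path and settings shown
 * here are illustrative; adjust them for your installation):
 *
 *   [[inputs.execd]]
 *     command = ["/usr/libexec/zfs/zpool_influxdb", "--execd"]
 *     signal = "STDIN"
 *     data_format = "influx"
 *
 * For `inputs.exec`, an equivalent sketch is `commands =
 * ["/usr/libexec/zfs/zpool_influxdb"]` with `data_format = "influx"`.
 *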
 * NOTE: libzfs is an unstable interface. YMMV.
 *
 * The design goals of this software include:
 * + be as lightweight as possible
 * + reduce the number of external dependencies as far as possible, hence
 *   there is no dependency on a client library for managing the metric
 *   collection -- info is printed, KISS
 * + broken pools or kernel bugs can cause this process to hang in an
 *   unkillable state. For this reason, it is best to keep the damage limited
 *   to a small process like zpool_influxdb rather than a larger collector.
 *
 * Copyright 2018-2020 Richard Elling
 *
 * This software is dual-licensed MIT and CDDL.
 *
 * The MIT License (MIT)
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * CDDL HEADER END
 */
#include <string.h>
#include <getopt.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
#include <libzfs.h>

#define	POOL_MEASUREMENT	"zpool_stats"
#define	SCAN_MEASUREMENT	"zpool_scan_stats"
#define	VDEV_MEASUREMENT	"zpool_vdev_stats"
#define	POOL_LATENCY_MEASUREMENT	"zpool_latency"
#define	POOL_QUEUE_MEASUREMENT	"zpool_vdev_queue"
#define	MIN_LAT_INDEX	10  /* minimum latency index 10 = 1024ns */
#define	POOL_IO_SIZE_MEASUREMENT	"zpool_io_size"
#define	MIN_SIZE_INDEX	9  /* minimum size index 9 = 512 bytes */

/* global options */
int execd_mode = 0;
int no_histograms = 0;
int sum_histogram_buckets = 0;
char metric_data_type = 'u';
uint64_t metric_value_mask = UINT64_MAX;
uint64_t timestamp = 0;
int complained_about_sync = 0;
const char *tags = "";

typedef int (*stat_printer_f)(nvlist_t *, const char *, const char *);

/*
 * influxdb line protocol rules for escaping are important because the
 * zpool name can include characters that need to be escaped
 *
 * caller is responsible for freeing result
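 *
 * for example (illustrative), the pool name "my pool" is escaped to
 * "my\ pool" and "a=b" to "a\=b" so that influxdb tag parsing still works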
 */
static char *
escape_string(const char *s)
{
	const char *c;
	char *d;
	char *t = (char *)malloc(ZFS_MAX_DATASET_NAME_LEN * 2);
	if (t == NULL) {
		fprintf(stderr, "error: cannot allocate memory\n");
		exit(1);
	}

	for (c = s, d = t; *c != '\0'; c++, d++) {
		switch (*c) {
		case ' ':
		case ',':
		case '=':
		case '\\':
			*d++ = '\\';
			zfs_fallthrough;
		default:
			*d = *c;
		}
	}
	*d = '\0';
	return (t);
}

/*
 * print key=value where value is a uint64_t
 */
static void
print_kv(const char *key, uint64_t value)
{
	printf("%s=%llu%c", key,
	    (u_longlong_t)value & metric_value_mask, metric_data_type);
}

/*
 * print_scan_status() prints the details as often seen in the "zpool status"
 * output. However, unlike the zpool command, which is intended for humans,
 * this output is suitable for long-term tracking in influxdb.
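 *
 * An illustrative output line (one line in practice, wrapped here) is:
 *   zpool_scan_stats,function=scrub,name=tank,state=finished
 *       end_ts=1600000000u,errors=0u,examined=1000u,... 1600000000000000000
 *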
 * TODO: update to include issued scan data
 */
static int
print_scan_status(nvlist_t *nvroot, const char *pool_name)
{
	uint_t c;
	int64_t elapsed;
	uint64_t examined, pass_exam, paused_time, paused_ts, rate;
	uint64_t remaining_time;
	pool_scan_stat_t *ps = NULL;
	double pct_done;
	const char *const state[DSS_NUM_STATES] = {
	    "none", "scanning", "finished", "canceled"};
	const char *func;

	(void) nvlist_lookup_uint64_array(nvroot,
	    ZPOOL_CONFIG_SCAN_STATS,
	    (uint64_t **)&ps, &c);

	/*
	 * ignore if there are no stats
	 */
	if (ps == NULL)
		return (0);

	/*
	 * return error if state is bogus
	 */
	if (ps->pss_state >= DSS_NUM_STATES ||
	    ps->pss_func >= POOL_SCAN_FUNCS) {
		if (complained_about_sync % 1000 == 0) {
			fprintf(stderr, "error: cannot decode scan stats: "
			    "ZFS is out of sync with compiled "
			    "zpool_influxdb\n");
			complained_about_sync++;
		}
		return (1);
	}

	switch (ps->pss_func) {
	case POOL_SCAN_NONE:
		func = "none_requested";
		break;
	case POOL_SCAN_SCRUB:
		func = "scrub";
		break;
	case POOL_SCAN_RESILVER:
		func = "resilver";
		break;
#ifdef POOL_SCAN_REBUILD
	case POOL_SCAN_REBUILD:
		func = "rebuild";
		break;
#endif
	default:
		func = "scan";
	}

	/* overall progress */
	examined = ps->pss_examined ? ps->pss_examined : 1;
	pct_done = 0.0;
	if (ps->pss_to_examine > 0)
		pct_done = 100.0 * examined / ps->pss_to_examine;

#ifdef EZFS_SCRUB_PAUSED
	paused_ts = ps->pss_pass_scrub_pause;
	paused_time = ps->pss_pass_scrub_spent_paused;
#else
	paused_ts = 0;
	paused_time = 0;
#endif

	/* calculations for this pass */
	if (ps->pss_state == DSS_SCANNING) {
		elapsed = (int64_t)time(NULL) - (int64_t)ps->pss_pass_start -
		    (int64_t)paused_time;
		elapsed = (elapsed > 0) ? elapsed : 1;
		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
		rate = pass_exam / elapsed;
		rate = (rate > 0) ? rate : 1;
		remaining_time = (ps->pss_to_examine - examined) / rate;
	} else {
		elapsed =
		    (int64_t)ps->pss_end_time - (int64_t)ps->pss_pass_start -
		    (int64_t)paused_time;
		elapsed = (elapsed > 0) ? elapsed : 1;
		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
		rate = pass_exam / elapsed;
		remaining_time = 0;
	}
	rate = rate ? rate : 1;

	/* influxdb line protocol format: "tags metrics timestamp" */
	printf("%s%s,function=%s,name=%s,state=%s ",
	    SCAN_MEASUREMENT, tags, func, pool_name, state[ps->pss_state]);
	print_kv("end_ts", ps->pss_end_time);
	print_kv(",errors", ps->pss_errors);
	print_kv(",examined", examined);
	print_kv(",skipped", ps->pss_skipped);
	print_kv(",issued", ps->pss_issued);
	print_kv(",pass_examined", pass_exam);
	print_kv(",pass_issued", ps->pss_pass_issued);
	print_kv(",paused_ts", paused_ts);
	print_kv(",paused_t", paused_time);
	printf(",pct_done=%.2f", pct_done);
	print_kv(",processed", ps->pss_processed);
	print_kv(",rate", rate);
	print_kv(",remaining_t", remaining_time);
	print_kv(",start_ts", ps->pss_start_time);
	print_kv(",to_examine", ps->pss_to_examine);
	printf(" %llu\n", (u_longlong_t)timestamp);
	return (0);
}

/*
 * get a vdev name that corresponds to the top-level vdev names
 * printed by `zpool status`
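 *
 * For example (illustrative), a top-level mirror that is child 0 of the
 * root vdev is named "root/mirror-0".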
 */
static char *
get_vdev_name(nvlist_t *nvroot, const char *parent_name)
{
	static char vdev_name[256];
	uint64_t vdev_id = 0;

	const char *vdev_type = "unknown";
	(void) nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type);

	if (nvlist_lookup_uint64(
	    nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0)
		vdev_id = UINT64_MAX;

	if (parent_name == NULL) {
		(void) snprintf(vdev_name, sizeof (vdev_name), "%s",
		    vdev_type);
	} else {
		(void) snprintf(vdev_name, sizeof (vdev_name),
		    "%.220s/%s-%llu",
		    parent_name, vdev_type, (u_longlong_t)vdev_id);
	}
	return (vdev_name);
}

/*
 * get a string suitable for an influxdb tag that describes this vdev
 *
 * By default only the vdev hierarchical name is shown, separated by '/'
 * If the vdev has an associated path, which is typical of leaf vdevs,
 * then the path is added.
 * It would be nice to have the devid instead of the path, but under
 * Linux we cannot be sure a devid will exist and we'd rather have
 * something than nothing, so we'll use path instead.
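 *
 * For example (illustrative), a leaf disk under a top-level mirror may be
 * described as "path=/dev/sda1,vdev=root/mirror-0/disk-1".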
 */
static char *
get_vdev_desc(nvlist_t *nvroot, const char *parent_name)
{
	static char vdev_desc[2 * MAXPATHLEN];
	char vdev_value[MAXPATHLEN];
	char *s, *t;

	const char *vdev_type = "unknown";
	uint64_t vdev_id = UINT64_MAX;
	const char *vdev_path = NULL;
	(void) nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type);
	(void) nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &vdev_id);
	(void) nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &vdev_path);

	if (parent_name == NULL) {
		s = escape_string(vdev_type);
		(void) snprintf(vdev_value, sizeof (vdev_value), "vdev=%s", s);
		free(s);
	} else {
		s = escape_string(parent_name);
		t = escape_string(vdev_type);
		(void) snprintf(vdev_value, sizeof (vdev_value),
		    "vdev=%s/%s-%llu", s, t, (u_longlong_t)vdev_id);
		free(s);
		free(t);
	}
	if (vdev_path == NULL) {
		(void) snprintf(vdev_desc, sizeof (vdev_desc), "%s",
		    vdev_value);
	} else {
		s = escape_string(vdev_path);
		(void) snprintf(vdev_desc, sizeof (vdev_desc), "path=%s,%s",
		    s, vdev_value);
		free(s);
	}
	return (vdev_desc);
}

/*
 * vdev summary stats are a combination of the data shown by
 * `zpool status` and `zpool list -v`
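 *
 * An illustrative output line (one line in practice, wrapped here) is:
 *   zpool_stats,name=tank,state=ONLINE,vdev=root
 *       alloc=1234u,free=5678u,size=6912u,... 1600000000000000000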
 */
static int
print_summary_stats(nvlist_t *nvroot, const char *pool_name,
    const char *parent_name)
{
	uint_t c;
	vdev_stat_t *vs;
	char *vdev_desc = get_vdev_desc(nvroot, parent_name);
	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) != 0) {
		return (1);
	}
	printf("%s%s,name=%s,state=%s,%s ", POOL_MEASUREMENT, tags,
	    pool_name, zpool_state_to_name((vdev_state_t)vs->vs_state,
	    (vdev_aux_t)vs->vs_aux), vdev_desc);
	print_kv("alloc", vs->vs_alloc);
	print_kv(",free", vs->vs_space - vs->vs_alloc);
	print_kv(",size", vs->vs_space);
	print_kv(",read_bytes", vs->vs_bytes[ZIO_TYPE_READ]);
	print_kv(",read_errors", vs->vs_read_errors);
	print_kv(",read_ops", vs->vs_ops[ZIO_TYPE_READ]);
	print_kv(",write_bytes", vs->vs_bytes[ZIO_TYPE_WRITE]);
	print_kv(",write_errors", vs->vs_write_errors);
	print_kv(",write_ops", vs->vs_ops[ZIO_TYPE_WRITE]);
	print_kv(",checksum_errors", vs->vs_checksum_errors);
	print_kv(",fragmentation", vs->vs_fragmentation);
	printf(" %llu\n", (u_longlong_t)timestamp);
	return (0);
}

/*
 * vdev latency stats are histograms stored as nvlist arrays of uint64.
 * Latency stats include the ZIO scheduler classes plus lower-level
 * vdev latencies.
 *
 * In many cases, the top-level "root" view obscures the underlying
 * top-level vdev operations. For example, if a pool has a log, special,
 * or cache device, then each can behave very differently. It is useful
 * to see how each is responding.
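 *
 * An illustrative output line (one line in practice, wrapped here), where
 * the "le" tag is the upper bound of the latency bucket in seconds, is:
 *   zpool_latency,le=0.000131,name=tank,vdev=root
 *       total_read=1234u,total_write=567u,... 1600000000000000000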
 */
static int
print_vdev_latency_stats(nvlist_t *nvroot, const char *pool_name,
    const char *parent_name)
{
	uint_t c, end = 0;
	nvlist_t *nv_ex;
	char *vdev_desc = NULL;

	/* short_names become part of the metric name and are influxdb-ready */
	struct lat_lookup {
	    const char *name;
	    const char *short_name;
	    uint64_t sum;
	    uint64_t *array;
	};
	struct lat_lookup lat_type[] = {
	    {ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,   "total_read", 0},
	    {ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,   "total_write", 0},
	    {ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,  "disk_read", 0},
	    {ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,  "disk_write", 0},
	    {ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,  "sync_read", 0},
	    {ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,  "sync_write", 0},
	    {ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, "async_read", 0},
	    {ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, "async_write", 0},
	    {ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,   "scrub", 0},
#ifdef ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO
	    {ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,    "trim", 0},
#endif
	    {ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, "rebuild", 0},
	    {NULL,	NULL}
	};

	if (nvlist_lookup_nvlist(nvroot,
	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
		return (6);
	}

	vdev_desc = get_vdev_desc(nvroot, parent_name);

	for (int i = 0; lat_type[i].name; i++) {
		if (nvlist_lookup_uint64_array(nv_ex,
		    lat_type[i].name, &lat_type[i].array, &c) != 0) {
			fprintf(stderr, "error: can't get %s\n",
			    lat_type[i].name);
			return (3);
		}
		/* end index; all of the arrays are the same size */
		end = c - 1;
	}

	for (int bucket = 0; bucket <= end; bucket++) {
		if (bucket < MIN_LAT_INDEX) {
			/* don't print, but collect the sum */
			for (int i = 0; lat_type[i].name; i++) {
				lat_type[i].sum += lat_type[i].array[bucket];
			}
			continue;
		}
		if (bucket < end) {
			printf("%s%s,le=%0.6f,name=%s,%s ",
			    POOL_LATENCY_MEASUREMENT, tags,
			    (float)(1ULL << bucket) * 1e-9,
			    pool_name, vdev_desc);
		} else {
			printf("%s%s,le=+Inf,name=%s,%s ",
			    POOL_LATENCY_MEASUREMENT, tags, pool_name,
			    vdev_desc);
		}
		for (int i = 0; lat_type[i].name; i++) {
			if (bucket <= MIN_LAT_INDEX || sum_histogram_buckets) {
				lat_type[i].sum += lat_type[i].array[bucket];
			} else {
				lat_type[i].sum = lat_type[i].array[bucket];
			}
			print_kv(lat_type[i].short_name, lat_type[i].sum);
			if (lat_type[i + 1].name != NULL) {
				printf(",");
			}
		}
		printf(" %llu\n", (u_longlong_t)timestamp);
	}
	return (0);
}

/*
 * vdev request size stats are histograms stored as nvlist arrays of uint64.
 * Request size stats include the ZIO scheduler classes plus lower-level
 * vdev sizes. Both independent (ind) and aggregated (agg) sizes are reported.
 *
 * In many cases, the top-level "root" view obscures the underlying
 * top-level vdev operations. For example, if a pool has a log, special,
 * or cache device, then each can behave very differently. It is useful
 * to see how each is responding.
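 *
 * An illustrative output line (one line in practice, wrapped here), where
 * the "le" tag is the upper bound of the I/O size bucket in bytes, is:
 *   zpool_io_size,le=4096,name=tank,vdev=root
 *       sync_read_ind=1234u,sync_write_ind=567u,... 1600000000000000000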
 */
static int
print_vdev_size_stats(nvlist_t *nvroot, const char *pool_name,
    const char *parent_name)
{
	uint_t c, end = 0;
	nvlist_t *nv_ex;
	char *vdev_desc = NULL;

	/* short_names become the field name */
	struct size_lookup {
	    const char *name;
	    const char *short_name;
	    uint64_t sum;
	    uint64_t *array;
	};
	struct size_lookup size_type[] = {
	    {ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,   "sync_read_ind"},
	    {ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,   "sync_write_ind"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,  "async_read_ind"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,  "async_write_ind"},
	    {ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,    "scrub_read_ind"},
	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,   "sync_read_agg"},
	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,   "sync_write_agg"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,  "async_read_agg"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,  "async_write_agg"},
	    {ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,    "scrub_read_agg"},
#ifdef ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO
	    {ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,     "trim_write_ind"},
	    {ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,     "trim_write_agg"},
#endif
	    {ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO,  "rebuild_write_ind"},
	    {ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO,  "rebuild_write_agg"},
	    {NULL,	NULL}
	};

	if (nvlist_lookup_nvlist(nvroot,
	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
		return (6);
	}

	vdev_desc = get_vdev_desc(nvroot, parent_name);

	for (int i = 0; size_type[i].name; i++) {
		if (nvlist_lookup_uint64_array(nv_ex, size_type[i].name,
		    &size_type[i].array, &c) != 0) {
			fprintf(stderr, "error: can't get %s\n",
			    size_type[i].name);
			return (3);
		}
		/* end index; all of the arrays are the same size */
		end = c - 1;
	}

	for (int bucket = 0; bucket <= end; bucket++) {
		if (bucket < MIN_SIZE_INDEX) {
			/* don't print, but collect the sum */
			for (int i = 0; size_type[i].name; i++) {
				size_type[i].sum += size_type[i].array[bucket];
			}
			continue;
		}

		if (bucket < end) {
			printf("%s%s,le=%llu,name=%s,%s ",
			    POOL_IO_SIZE_MEASUREMENT, tags, 1ULL << bucket,
			    pool_name, vdev_desc);
		} else {
			printf("%s%s,le=+Inf,name=%s,%s ",
			    POOL_IO_SIZE_MEASUREMENT, tags, pool_name,
			    vdev_desc);
		}
		for (int i = 0; size_type[i].name; i++) {
			if (bucket <= MIN_SIZE_INDEX || sum_histogram_buckets) {
				size_type[i].sum += size_type[i].array[bucket];
			} else {
				size_type[i].sum = size_type[i].array[bucket];
			}
			print_kv(size_type[i].short_name, size_type[i].sum);
			if (size_type[i + 1].name != NULL) {
				printf(",");
			}
		}
		printf(" %llu\n", (u_longlong_t)timestamp);
	}
	return (0);
}

/*
 * ZIO scheduler queue stats are stored as gauges. This is unfortunate
 * because the values can change very rapidly and any point-in-time
 * value will quickly be obsoleted. It is also not easy to downsample.
 * Thus only the top-level queue stats might be beneficial... maybe.
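 *
 * An illustrative output line (one line in practice, wrapped here) is:
 *   zpool_vdev_queue,name=tank,vdev=root
 *       sync_r_active=0u,sync_w_active=2u,... 1600000000000000000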
 */
static int
print_queue_stats(nvlist_t *nvroot, const char *pool_name,
    const char *parent_name)
{
	nvlist_t *nv_ex;
	uint64_t value;

	/* short_names are used for the field name */
	struct queue_lookup {
	    const char *name;
	    const char *short_name;
	};
	struct queue_lookup queue_type[] = {
	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,	"sync_r_active"},
	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,	"sync_w_active"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,	"async_r_active"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,	"async_w_active"},
	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,	"async_scrub_active"},
	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE,	"rebuild_active"},
	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,	"sync_r_pend"},
	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,	"sync_w_pend"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,	"async_r_pend"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,	"async_w_pend"},
	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,	"async_scrub_pend"},
	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE,	"rebuild_pend"},
	    {NULL,	NULL}
	};

	if (nvlist_lookup_nvlist(nvroot,
	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
		return (6);
	}

	printf("%s%s,name=%s,%s ", POOL_QUEUE_MEASUREMENT, tags, pool_name,
	    get_vdev_desc(nvroot, parent_name));
	for (int i = 0; queue_type[i].name; i++) {
		if (nvlist_lookup_uint64(nv_ex,
		    queue_type[i].name, &value) != 0) {
			fprintf(stderr, "error: can't get %s\n",
			    queue_type[i].name);
			return (3);
		}
		print_kv(queue_type[i].short_name, value);
		if (queue_type[i + 1].name != NULL) {
			printf(",");
		}
	}
	printf(" %llu\n", (u_longlong_t)timestamp);
	return (0);
}

/*
 * top-level vdev stats are at the pool level
 */
static int
print_top_level_vdev_stats(nvlist_t *nvroot, const char *pool_name)
{
	nvlist_t *nv_ex;
	uint64_t value;

	/* short_names become part of the metric name */
	struct queue_lookup {
	    const char *name;
	    const char *short_name;
	};
	struct queue_lookup queue_type[] = {
	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active_queue"},
	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active_queue"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active_queue"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active_queue"},
	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active_queue"},
	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, "rebuild_active_queue"},
	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend_queue"},
	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend_queue"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend_queue"},
	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend_queue"},
	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend_queue"},
	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, "rebuild_pend_queue"},
	    {NULL, NULL}
	};

	if (nvlist_lookup_nvlist(nvroot,
	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
		return (6);
	}

	printf("%s%s,name=%s,vdev=root ", VDEV_MEASUREMENT, tags,
	    pool_name);
	for (int i = 0; queue_type[i].name; i++) {
		if (nvlist_lookup_uint64(nv_ex,
		    queue_type[i].name, &value) != 0) {
			fprintf(stderr, "error: can't get %s\n",
			    queue_type[i].name);
			return (3);
		}
		if (i > 0)
			printf(",");
		print_kv(queue_type[i].short_name, value);
	}

	printf(" %llu\n", (u_longlong_t)timestamp);
	return (0);
}

/*
 * recursive stats printer
 */
static int
print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
    const char *pool_name, const char *parent_name, int descend)
{
	uint_t c, children;
	nvlist_t **child;
	char vdev_name[256];
	int err;

	err = func(nvroot, pool_name, parent_name);
	if (err)
		return (err);

	if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		(void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name),
		    sizeof (vdev_name));

		for (c = 0; c < children; c++) {
			err = print_recursive_stats(func, child[c], pool_name,
			    vdev_name, descend);
			if (err)
				return (err);
		}
	}
	return (0);
}

/*
 * call-back to print the stats from the pool config
 *
 * Note: if the pool is broken, this can hang indefinitely and perhaps in an
 * unkillable state.
 */
static int
print_stats(zpool_handle_t *zhp, void *data)
{
	uint_t c;
	int err;
	boolean_t missing;
	nvlist_t *config, *nvroot;
	vdev_stat_t *vs;
	struct timespec tv;
	char *pool_name;

	/* if not this pool return quickly */
	if (data &&
	    strncmp(data, zpool_get_name(zhp), ZFS_MAX_DATASET_NAME_LEN) != 0) {
		zpool_close(zhp);
		return (0);
	}

	if (zpool_refresh_stats(zhp, &missing) != 0) {
		zpool_close(zhp);
		return (1);
	}

	config = zpool_get_config(zhp, NULL);
	if (clock_gettime(CLOCK_REALTIME, &tv) != 0)
		timestamp = (uint64_t)time(NULL) * 1000000000;
	else
		timestamp =
		    ((uint64_t)tv.tv_sec * 1000000000) + (uint64_t)tv.tv_nsec;

	if (nvlist_lookup_nvlist(
	    config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) {
		zpool_close(zhp);
		return (2);
	}
	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) != 0) {
		zpool_close(zhp);
		return (3);
	}

	pool_name = escape_string(zpool_get_name(zhp));
	err = print_recursive_stats(print_summary_stats, nvroot,
	    pool_name, NULL, 1);
	/* if any of these return an error, skip the rest */
	if (err == 0)
		err = print_top_level_vdev_stats(nvroot, pool_name);

	if (no_histograms == 0) {
		if (err == 0)
			err = print_recursive_stats(print_vdev_latency_stats,
			    nvroot, pool_name, NULL, 1);
		if (err == 0)
			err = print_recursive_stats(print_vdev_size_stats,
			    nvroot, pool_name, NULL, 1);
		if (err == 0)
			err = print_recursive_stats(print_queue_stats, nvroot,
			    pool_name, NULL, 0);
	}
	if (err == 0)
		err = print_scan_status(nvroot, pool_name);

	free(pool_name);
	zpool_close(zhp);
	return (err);
}

static void
usage(char *name)
{
	fprintf(stderr, "usage: %s [--execd] [--no-histograms] "
	    "[--sum-histogram-buckets] [--signed-int] [--tags <tags>] "
	    "[poolname]\n", name);
	exit(EXIT_FAILURE);
}

int
main(int argc, char *argv[])
{
	int opt;
	int ret = 8;
	char *line = NULL, *ttags = NULL;
	size_t len = 0, tagslen = 0;
	struct option long_options[] = {
	    {"execd", no_argument, NULL, 'e'},
	    {"help", no_argument, NULL, 'h'},
	    {"no-histograms", no_argument, NULL, 'n'},
	    {"signed-int", no_argument, NULL, 'i'},
	    {"sum-histogram-buckets", no_argument, NULL, 's'},
	    {"tags", required_argument, NULL, 't'},
	    {0, 0, 0, 0}
	};
	while ((opt = getopt_long(
	    argc, argv, "ehinst:", long_options, NULL)) != -1) {
		switch (opt) {
		case 'e':
			execd_mode = 1;
			break;
		case 'i':
			metric_data_type = 'i';
			metric_value_mask = INT64_MAX;
			break;
		case 'n':
			no_histograms = 1;
			break;
		case 's':
			sum_histogram_buckets = 1;
			break;
		case 't':
			free(ttags);
			tagslen = strlen(optarg) + 2;
			ttags = calloc(1, tagslen);
			if (ttags == NULL) {
				fprintf(stderr,
				    "error: cannot allocate memory "
				    "for tags\n");
				exit(1);
			}
			(void) snprintf(ttags, tagslen, ",%s", optarg);
			tags = ttags;
			break;
		default:
			usage(argv[0]);
		}
	}

	libzfs_handle_t *g_zfs;
	if ((g_zfs = libzfs_init()) == NULL) {
		fprintf(stderr,
		    "error: cannot initialize libzfs. "
		    "Is the zfs module loaded or zrepl running?\n");
		exit(EXIT_FAILURE);
	}
	if (execd_mode == 0) {
		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
		return (ret);
	}
	while (getline(&line, &len, stdin) != -1) {
		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
		fflush(stdout);
	}
	return (ret);
}