1/*
2 * Copyright (c) 2021 Netflix, Inc
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7
8#include <sys/param.h>
9#include <sys/sysctl.h>
10#include <sys/resource.h>
11
12#include <devstat.h>
13#include <err.h>
14#include <errno.h>
15#include <math.h>
16#include <stdbool.h>
17#include <stdlib.h>
18#include <string.h>
19
20#include <sys/queue.h>
21#include <sys/sysctl.h>
22
23#include "systat.h"
24#include "extern.h"
25#include "devs.h"
26
/* sysctl subtree whose OID is looked up and walked for latency nodes. */
#define CAM_BASE "kern.cam"
/* Suffix a sysctl name must end with for walk_sysctl() to parse it. */
#define LATENCY ".latencies"
/* sysctl read by initiolat(); its value times 1e-6 becomes baselat. */
#define CAM_IOSCHED_BASE "kern.cam.iosched.bucket_base_us"

/* Size of iosched_stat.dev_name. */
#define DEV_NAMSIZE	32
/* NOTE(review): not referenced anywhere in the visible part of this file. */
#define OP_NAMSIZE	16
/* Capacity of the per-operation latency bucket arrays. */
#define MAX_LATS	32

/*
 * Display thresholds used by latout(): a value above high_thresh is drawn
 * with color pair 3, one above med_thresh with color pair 2.  Both can be
 * changed at run time through cmdiolat().
 */
static double high_thresh = 500;
static double med_thresh = 300;
/* When false, latout() does not touch color attributes at all. */
static bool docolor = true;

/* Number of entries on curlist; incremented by alloc_dev(). */
static int ndevs;
/* Every device for which a latency sysctl has been seen. */
static SLIST_HEAD(, iosched_stat)	curlist;
41
/*
 * Latency histogram for one operation type on one device.
 * update_dev() moves lats into prev_lats before storing a new sample;
 * showiolat() displays percentiles of the difference between the two.
 */
struct iosched_op_stat {
	/* Number of valid entries in lats[] and prev_lats[]. */
	int		nlats;
	/* Most recently fetched bucket counts. */
	uint64_t	lats[MAX_LATS];
	/* Bucket counts from the fetch before that. */
	uint64_t	prev_lats[MAX_LATS];
};
47
/* Operation indices; they index both ops[] and iosched_stat.op_stats[]. */
enum { OP_READ = 0, OP_WRITE, OP_TRIM, NUM_OPS };
/* Names matched by op2num() against a component of the sysctl name. */
static const char *ops[NUM_OPS] = { "read", "write", "trim" };
/* One bit per operation, at the bit position given by its index. */
#define OP_READ_MASK (1 << OP_READ)
#define OP_WRITE_MASK (1 << OP_WRITE)
#define OP_TRIM_MASK (1 << OP_TRIM)

/* Operations currently displayed; cmdiolat() toggles individual bits. */
static uint32_t flags = OP_READ_MASK | OP_WRITE_MASK | OP_TRIM_MASK;
55
/*
 * One device on curlist, identified by driver name plus unit number,
 * holding one latency histogram per operation type.
 */
struct iosched_stat {
	/* Linkage on curlist. */
	SLIST_ENTRY(iosched_stat)	 link;
	/* Name component parsed from the sysctl name (without the unit). */
	char		dev_name[DEV_NAMSIZE];
	/* Unit number parsed from the sysctl name. */
	int		unit;
	/* Indexed by OP_READ / OP_WRITE / OP_TRIM. */
	struct iosched_op_stat op_stats[NUM_OPS];
};
62
/* Forward declarations for the sysctl helpers defined below. */
static int	name2oid(const char *, int *);
static int	walk_sysctl(int *, size_t);
65
/*
 * Translate a dotted sysctl name into its numeric OID.
 *
 * oidp must have room for CTL_MAXNAME integers.  Returns the number of
 * OID components stored in oidp, or the negative value returned by
 * sysctl(3) when the lookup fails.
 */
static int
name2oid(const char *name, int *oidp)
{
	int query[2] = { CTL_SYSCTL, CTL_SYSCTL_NAME2OID };
	size_t nbytes = CTL_MAXNAME * sizeof(int);
	int rv;

	rv = sysctl(query, 2, oidp, &nbytes, name, strlen(name));
	if (rv < 0)
		return (rv);
	/* sysctl reports bytes; callers want a component count. */
	return (nbytes / sizeof(int));
}
83
/*
 * Look up the dotted name of a numeric OID.
 *
 * oid/nlen give the OID and its component count; the name is written to
 * name, which is zeroed first and is namlen bytes long.  Returns the length
 * reported by sysctl(3), which the original annotation says includes the
 * trailing NUL.  Exits through err(3) if the lookup fails or yields nothing.
 */
static size_t
oid2name(int *oid, size_t nlen, char *name, size_t namlen)
{
	int query[CTL_MAXNAME + 2];
	size_t outlen = namlen;
	int rv;

	memset(name, 0, namlen);
	query[0] = CTL_SYSCTL;
	query[1] = CTL_SYSCTL_NAME;
	memcpy(&query[2], oid, nlen * sizeof(int));

	rv = sysctl(query, nlen + 2, name, &outlen, 0, 0);
	if (rv != 0 || outlen == 0)
		err(1, "sysctl name %d %zu %d", rv, outlen, errno);
	return (outlen);
}
101
/*
 * Fetch the "kind" word (type and flag bits) of a sysctl OID.
 *
 * oid/len give the OID and its component count.  On success the leading
 * u_int of the OIDFMT reply is stored in *kind and 0 is returned.  Exits
 * through err(3)/errx(3) if the query fails or the reply is too short.
 */
static int
oidfmt(int *oid, int len, u_int *kind)
{
	int qoid[CTL_MAXNAME+2];
	u_char buf[BUFSIZ];
	int i;
	size_t j;

	qoid[0] = CTL_SYSCTL;
	qoid[1] = CTL_SYSCTL_OIDFMT;
	memcpy(qoid + 2, oid, len * sizeof(int));

	j = sizeof(buf);
	i = sysctl(qoid, len + 2, buf, &j, 0, 0);
	if (i)
		err(1, "sysctl fmt %d %zu %d", i, j, errno);
	if (j < sizeof(*kind))
		errx(1, "sysctl fmt: short reply (%zu bytes)", j);
	/*
	 * buf is a byte array with no alignment guarantee for u_int, so copy
	 * the value out instead of dereferencing a cast pointer.
	 */
	memcpy(kind, buf, sizeof(*kind));
	return (0);
}
121
/*
 * Parse a delimiter-separated list of unsigned decimal numbers.
 *
 * str is modified in place by strsep(3).  On entry *nbuckets is the
 * capacity of buckets, which is zeroed before parsing.  Returns 1 and
 * stores the number of parsed values in *nbuckets when fewer than the
 * capacity were found; returns 0 and leaves *nbuckets untouched when the
 * array was filled completely (including an exact fit).
 */
static int
split_u64(char *str, const char *delim, uint64_t *buckets, int *nbuckets)
{
	const int cap = *nbuckets;
	int count = 0;
	char *tok;

	memset(buckets, 0, cap * sizeof(*buckets));
	for (;;) {
		/* The token is fetched before the capacity check on purpose. */
		tok = strsep(&str, delim);
		if (tok == NULL || count >= cap)
			break;
		buckets[count++] = strtoull(tok, NULL, 10);
	}
	if (count >= cap)
		return (0);
	*nbuckets = count;
	return (1);
}
136
/*
 * Upper bound of latency bucket 0.  Bucket i (i >= 1) is treated by pest()
 * as covering [baselat * 2^(i-1), baselat * 2^i).  Presumably in seconds:
 * initiolat() replaces this with a microsecond sysctl value scaled by 1e-6.
 */
static double baselat = 0.000020;

/*
 * Estimate a percentile from a latency histogram.
 *
 * permill is the percentile in thousandths (500 = p50, 999 = p99.9) and
 * must be in [0, 1000).  lats holds nlat bucket counts.  Returns the
 * estimated latency in the units of baselat, or NaN when permill is out of
 * range, when there are no samples at all, or when there are too few
 * samples for the requested percentile.
 */
static float
pest(int permill, uint64_t *lats, int nlat)
{
	uint64_t tot, samp;
	int i;
	float b1, b2;

	/* permill == 1000 would divide by zero in the sample-count check. */
	if (permill < 0 || permill >= 1000)
		return NAN;
	for (tot = 0, i = 0; i < nlat; i++)
		tot += lats[i];
	if (tot == 0)
		return -NAN;
	if (tot < (uint64_t)2000 / (1000 - permill))
		return NAN;
	samp = tot * permill / 1000;
	if (samp < lats[0])
		return baselat * (float)samp / lats[0]; /* linear interpolation between 0 and baselat */
	/* Find the bucket in which the running total first exceeds samp. */
	for (tot = 0, i = 0; samp >= tot && i < nlat; i++)
		tot += lats[i];
	i--;
	/*
	 * samp >= lats[0] here, so i >= 1.  Shift a 64-bit one so that a
	 * bucket index of 31 or more is not a signed-overflow shift.
	 */
	b1 = baselat * (double)((uint64_t)1 << (i - 1));
	b2 = baselat * (double)((uint64_t)1 << i);
	/* Should exponentially interpolate between buckets -- doing linear instead */
	return b1 + (b2 - b1) * (float)(lats[i] - (tot - samp)) / lats[i];
}
163
164static int
165op2num(const char *op)
166{
167	for (int i = 0; i < NUM_OPS; i++)
168		if (strcmp(op, ops[i]) == 0)
169			return i;
170	return -1;
171}
172
173static struct iosched_op_stat *
174find_dev(const char *dev, int unit, int op)
175{
176	struct iosched_stat *isp;
177	struct iosched_op_stat *iosp;
178
179	SLIST_FOREACH(isp, &curlist, link) {
180		if (strcmp(isp->dev_name, dev) != 0 || isp->unit != unit)
181			continue;
182		iosp = &isp->op_stats[op];
183		return iosp;
184	}
185	return NULL;
186}
187
188static struct iosched_op_stat *
189alloc_dev(const char *dev, int unit, int op)
190{
191	struct iosched_stat *isp;
192	struct iosched_op_stat *iosp;
193
194	isp = malloc(sizeof(*isp));
195	if (isp == NULL)
196		return NULL;
197	strlcpy(isp->dev_name, dev, sizeof(isp->dev_name));
198	isp->unit = unit;
199	SLIST_INSERT_HEAD(&curlist, isp, link);
200	ndevs++;
201	iosp = &isp->op_stats[op];
202	return iosp;
203}
204
205#define E3 1000.0
206static void
207update_dev(const char *dev, int unit, int op, uint64_t *lats, int nlat)
208{
209	struct iosched_op_stat *iosp;
210
211	iosp = find_dev(dev, unit, op);
212	if (iosp == NULL)
213		iosp = alloc_dev(dev, unit, op);
214	if (iosp == NULL)
215		return;
216	iosp->nlats = nlat;
217	memcpy(iosp->prev_lats, iosp->lats, iosp->nlats * sizeof(uint64_t));
218	memcpy(iosp->lats, lats, iosp->nlats * sizeof(uint64_t));
219//	printf("%s%d: %-6s %.3f %.3f %.3f %.3f\r\n",
220//	    dev, unit, operation, E3 * pest(500, lats, nlat), E3 * pest(900, lats, nlat),
221//	    E3 * pest(990, lats, nlat), E3 * pest(999, lats, nlat));
222}
223
/*
 * Walk every sysctl below base_oid (len components) and feed each node
 * whose name ends in LATENCY to update_dev().
 *
 * The name is expected to look like
 *   CAM_BASE.<dev>.<unit>.<skipped component>.<op>.latencies
 * and its value to be a comma-separated list of bucket counts.  Nodes that
 * do not fit this shape are skipped.  Always returns 0; unexpected sysctl
 * failures exit through err(3).
 */
static int
walk_sysctl(int *base_oid, size_t len)
{
	int qoid[CTL_MAXNAME + 2], oid[CTL_MAXNAME];
	size_t l1, l2;
	char name[BUFSIZ];

	/* Not an errno failure, so report it without strerror text. */
	if (len > CTL_MAXNAME)
		errx(1, "Length %zu too long", len);

	qoid[0] = CTL_SYSCTL;
	qoid[1] = CTL_SYSCTL_NEXT;
	memcpy(qoid + 2, base_oid, len * sizeof(int));
	l1 = 2 + len;
	for (;;) {
		/*
		 * Get the next one or return when we get to the end of the
		 * sysctls in the kernel.
		 */
		l2 = sizeof(oid);
		if (sysctl(qoid, l1, oid, &l2, 0, 0) != 0) {
			if (errno == ENOENT)
				return (0);
			err(1, "sysctl(getnext) %zu", l2);
		}

		l2 /= sizeof(int);

		/*
		 * Bail if we're seeing OIDs that don't have the
		 * same prefix or can't have the same prefix.
		 */
		if (l2 < len ||
		    memcmp(oid, base_oid, len * sizeof(int)) != 0)
			return (0);

		/*
		 * Get the name, validate it's one we're looking for,
		 * parse the latency and add to list.  The do/while(false)
		 * exists only so that "break" skips the current node.
		 */
		do {
			int nlat;
			size_t namelen, vallen;
			char val[BUFSIZ];
			char *walker, *dev, *unitstr, *opstr;
			uint64_t latvals[MAX_LATS];
			u_int kind;
			int unit, op;

			/* namelen counts the trailing NUL, hence the extra 1. */
			namelen = oid2name(oid, l2, name, sizeof(name));
			if (namelen < strlen(LATENCY) + 1 ||
			    strcmp(name + namelen - strlen(LATENCY) - 1,
			    LATENCY) != 0)
				break;
			if (oidfmt(oid, l2, &kind) != 0)
				err(1, "oidfmt");
			if ((kind & CTLTYPE) != CTLTYPE_STRING)
				errx(1, "string");
			vallen = sizeof(val);
			if (sysctl(oid, l2, val, &vallen, 0, 0) != 0)
				err(1, "sysctl");
			/* Keep the terminator inside val even for a full buffer. */
			if (vallen >= sizeof(val))
				vallen = sizeof(val) - 1;
			val[vallen] = '\0';
			nlat = nitems(latvals);
			if (split_u64(val, ",", latvals, &nlat) == 0)
				break;
			walker = name + strlen(CAM_BASE) + 1;
			dev = strsep(&walker, ".");
			unitstr = strsep(&walker, ".");
			(void)strsep(&walker, ".");	/* component not used */
			opstr = strsep(&walker, ".");
			/* strsep() yields NULL once the name runs out. */
			if (dev == NULL || unitstr == NULL || opstr == NULL)
				break;
			unit = (int)strtol(unitstr, NULL, 10);
			op = op2num(opstr);
			if (op < 0)
				break;
			update_dev(dev, unit, op, latvals, nlat);
		} while (false);

		/* Continue the walk from the node just examined. */
		memcpy(qoid + 2, oid, l2 * sizeof(int));
		l1 = 2 + l2;
	}
}
303
304void
305closeiolat(WINDOW *w)
306{
307	if (w == NULL)
308		return;
309	wclear(w);
310	wrefresh(w);
311	delwin(w);
312}
313
/*
 * Parse the number following '=' in cmd (e.g. "high=250") and store it
 * in *v.  *v is left unchanged when cmd has no '=' or when no number can
 * be parsed after it.
 */
static void
doublecmd(const char *cmd, double *v)
{
	const char *eq = strchr(cmd, '=');
	double parsed;

	/* XXX Tell the user something when either step fails? */
	if (eq != NULL && sscanf(eq + 1, "%lf", &parsed) == 1)
		*v = parsed;
}
327
328int
329cmdiolat(const char *cmd __unused, const char *args __unused)
330{
331	fprintf(stderr, "CMD IS '%s'\n\n", cmd);
332	if (prefix(cmd, "trim"))
333		flags ^= OP_TRIM_MASK;
334	else if (prefix(cmd, "read"))
335		flags ^= OP_READ_MASK;
336	else if (prefix(cmd, "write"))
337		flags ^= OP_WRITE_MASK;
338	else if (prefix(cmd, "color"))
339		docolor = !docolor;
340	else if (prefix("high", cmd))
341		doublecmd(cmd, &high_thresh);
342	else if (prefix("med", cmd))
343		doublecmd(cmd, &med_thresh);
344	else
345		return (0);
346	wclear(wnd);
347	labeliolat();
348	refresh();
349	return (1);
350}
351
/*
 * Initialize the display: reset the device list, determine the latency
 * bucket base, and take a first sample of every latency sysctl below
 * CAM_BASE.  Always returns 1.
 */
int
initiolat(void)
{
	int cam[CTL_MAXNAME];
	uint64_t sbt_base;
	size_t len = sizeof(sbt_base);
	int camlen;

	SLIST_INIT(&curlist);

	baselat = 1e-3;		/* old default */
	/* The sysctl name says microseconds; scale to seconds (presumably). */
	if (sysctlbyname(CAM_IOSCHED_BASE, &sbt_base, &len, NULL, 0) == 0)
		baselat = sbt_base * 1e-6;

	/*
	 * Only walk the tree when the lookup produced an OID; on failure cam
	 * is uninitialized.  Use the length the lookup reported.
	 */
	camlen = name2oid(CAM_BASE, cam);
	if (camlen > 0)
		walk_sysctl(cam, camlen);
	return (1);
}
369
/*
 * Take a new sample of every latency sysctl below CAM_BASE.  The previous
 * sample of each device is kept by update_dev() for differencing.
 */
void
fetchiolat(void)
{
	int cam[CTL_MAXNAME];
	int camlen;

	/* On lookup failure cam is uninitialized, so skip the walk. */
	camlen = name2oid(CAM_BASE, cam);
	if (camlen > 0)
		walk_sysctl(cam, camlen);
}
378
379#define	INSET	10
380
381void
382labeliolat(void)
383{
384	int _col, ndrives, lpr, row, j;
385	int regions __unused;
386	struct iosched_stat *isp;
387	char tmpstr[32];
388#define COLWIDTH	29
389#define DRIVESPERLINE	((getmaxx(wnd) - 1 - INSET) / COLWIDTH)
390	ndrives = ndevs; // XXX FILTER XXX
391	regions = howmany(ndrives, DRIVESPERLINE);
392	lpr = 2; /* for headers */
393	for (int i = 0; i < NUM_OPS; i++) {
394		if (flags & (1 << i))
395			lpr++;
396	}
397	row = 0;
398	_col = INSET;
399	j = 2;
400	if (flags & OP_READ_MASK)
401		mvwaddstr(wnd, row + j++, 1, "read");
402	if (flags & OP_WRITE_MASK)
403		mvwaddstr(wnd, row + j++, 1, "write");
404	if (flags & OP_TRIM_MASK)
405		mvwaddstr(wnd, row + j++, 1, "trim");
406	SLIST_FOREACH(isp, &curlist, link) {
407		if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) {
408			_col = INSET;
409			row += lpr + 1;
410			if (row > getmaxy(wnd) - 1 - (lpr + 1))
411				break;
412			j = 2;
413			if (flags & OP_READ_MASK)
414				mvwaddstr(wnd, row + j++, 1, "read");
415			if (flags & OP_WRITE_MASK)
416				mvwaddstr(wnd, row + j++, 1, "write");
417			if (flags & OP_TRIM_MASK)
418				mvwaddstr(wnd, row + j++, 1, "trim");
419		}
420		snprintf(tmpstr, sizeof(tmpstr), "%s%d", isp->dev_name, isp->unit);
421		mvwaddstr(wnd, row, _col + (COLWIDTH - strlen(tmpstr)) / 2, tmpstr);
422		mvwaddstr(wnd, row + 1, _col, "   p50    p90    p99  p99.9");
423		_col += COLWIDTH;
424	}
425}
426
427WINDOW *
428openiolat(void)
429{
430	return (subwin(stdscr, LINES-3-1, 0, MAINWIN_ROW, 0));
431}
432
/*
 * Format f into buf (len bytes) as a six-character field.
 *
 * NaN is shown as a dash.  Values of 1000 and above are shown as a
 * truncated integer, smaller ones with one, two or three decimals so the
 * field width stays constant.  Values too large for an int (including
 * +infinity) are printed with "%6.0f" because converting them to int is
 * undefined; they overflow the field but do not misbehave.
 */
static void
fmt(float f, char *buf, size_t len)
{
	/* ~0u >> 1 has the value of the largest int; as a float, 2^31 here. */
	const float int_limit = (float)(~0u >> 1);

	if (isnan(f))
		snprintf(buf, len, "%s", "   -  ");
	else if (f >= int_limit)
		snprintf(buf, len, "%6.0f", f);
	else if (f >= 1000.0)
		snprintf(buf, len, "%6d", (int)f);
	else if (f >= 100.0)
		snprintf(buf, len, "%6.1f", f);
	else if (f >= 10.0)
		snprintf(buf, len, "%6.2f", f);
	else
		snprintf(buf, len, "%6.3f", f);
}
447
448static void
449latout(double lat, int y, int x)
450{
451	int i;
452	char tmpstr[32];
453
454	fmt(lat, tmpstr, sizeof(tmpstr));
455	if (isnan(lat))
456		i = 4;
457	else if (lat > high_thresh)
458		i = 3;
459	else if (lat > med_thresh)
460		i = 2;
461	else
462		i = 1;
463	if (docolor)
464		wattron(wnd, COLOR_PAIR(i));
465	mvwaddstr(wnd, y, x, tmpstr);
466	if (docolor)
467		wattroff(wnd, COLOR_PAIR(i));
468}
469
470void
471showiolat(void)
472{
473	int _col, ndrives, lpr, row, k;
474	int regions __unused;
475	struct iosched_stat *isp;
476	struct iosched_op_stat *iosp;
477#define COLWIDTH	29
478#define DRIVESPERLINE	((getmaxx(wnd) - 1 - INSET) / COLWIDTH)
479	ndrives = ndevs; // XXX FILTER XXX
480	regions = howmany(ndrives, DRIVESPERLINE);
481	lpr = 2; /* XXX */
482	for (int i = 0; i < NUM_OPS; i++) {
483		if (flags & (1 << i))
484			lpr++;
485	}
486	row = 0;
487	_col = INSET;
488	SLIST_FOREACH(isp, &curlist, link) {
489		if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) {
490			_col = INSET;
491			row += lpr + 1;
492			if (row > getmaxy(wnd) - 1 - (lpr + 1))
493				break;
494		}
495		k = 2;
496		for (int i = 0; i < NUM_OPS; i++) {
497			uint64_t lats[MAX_LATS];
498			int nlats;
499			float p50, p90, p99, p999;
500
501			if ((flags & (1 << i)) == 0)
502				continue;
503			iosp = &isp->op_stats[i];
504			nlats = iosp->nlats;
505			memset(lats, 0, sizeof(lats));
506			for (int j = 0; j < iosp->nlats; j++)
507				lats[j] = iosp->lats[j] - iosp->prev_lats[j];
508			p50 = pest(500, lats, nlats) * E3;
509			p90 = pest(900, lats, nlats) * E3;
510			p99 = pest(990, lats, nlats) * E3;
511			p999 = pest(999, lats, nlats) * E3;
512			latout(p50, row + k, _col);
513			latout(p90, row + k, _col + 7);
514			latout(p99, row + k, _col + 14);
515			latout(p999, row + k, _col + 21);
516			k++;
517		}
518		_col += COLWIDTH;
519	}
520}
521