1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <mdb/mdb_param.h>
27#include <mdb/mdb_modapi.h>
28#include <mdb/mdb_ctf.h>
29#include <mdb/mdb_whatis.h>
30#include <sys/cpuvar.h>
31#include <sys/kmem_impl.h>
32#include <sys/vmem_impl.h>
33#include <sys/machelf.h>
34#include <sys/modctl.h>
35#include <sys/kobj.h>
36#include <sys/panic.h>
37#include <sys/stack.h>
38#include <sys/sysmacros.h>
39#include <vm/page.h>
40
41#include "avl.h"
42#include "combined.h"
43#include "dist.h"
44#include "kmem.h"
45#include "list.h"
46
47#define	dprintf(x) if (mdb_debug_level) { \
48	mdb_printf("kmem debug: ");  \
49	/*CSTYLED*/\
50	mdb_printf x ;\
51}
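/*
 * Note that dprintf() takes a doubly parenthesized argument list, e.g.
 * dprintf(("reading cpu cache %p\n", addr)), so that the entire
 * mdb_printf() argument list passes through the macro as one argument.
 */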
52
53#define	KM_ALLOCATED		0x01
54#define	KM_FREE			0x02
55#define	KM_BUFCTL		0x04
56#define	KM_CONSTRUCTED		0x08	/* only constructed free buffers */
57#define	KM_HASH			0x10
58
59static int mdb_debug_level = 0;
60
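/*
 * kmem_init_walkers() has the signature of a "kmem_cache" walker callback:
 * for each cache it registers a walker named after the cache itself, with
 * the cache address as that walker's init argument, so that individual
 * caches can be walked by name.
 */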
61/*ARGSUSED*/
62static int
63kmem_init_walkers(uintptr_t addr, const kmem_cache_t *c, void *ignored)
64{
65	mdb_walker_t w;
66	char descr[64];
67
68	(void) mdb_snprintf(descr, sizeof (descr),
69	    "walk the %s cache", c->cache_name);
70
71	w.walk_name = c->cache_name;
72	w.walk_descr = descr;
73	w.walk_init = kmem_walk_init;
74	w.walk_step = kmem_walk_step;
75	w.walk_fini = kmem_walk_fini;
76	w.walk_init_arg = (void *)addr;
77
78	if (mdb_add_walker(&w) == -1)
79		mdb_warn("failed to add %s walker", c->cache_name);
80
81	return (WALK_NEXT);
82}
83
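/*
 * ::kmem_debug toggles mdb_debug_level, and with it the dprintf() output
 * used throughout this module.
 */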
84/*ARGSUSED*/
85int
86kmem_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
87{
88	mdb_debug_level ^= 1;
89
90	mdb_printf("kmem: debugging is now %s\n",
91	    mdb_debug_level ? "on" : "off");
92
93	return (DCMD_OK);
94}
95
96int
97kmem_cache_walk_init(mdb_walk_state_t *wsp)
98{
99	GElf_Sym sym;
100
101	if (mdb_lookup_by_name("kmem_caches", &sym) == -1) {
102		mdb_warn("couldn't find kmem_caches");
103		return (WALK_ERR);
104	}
105
106	wsp->walk_addr = (uintptr_t)sym.st_value;
107
108	return (list_walk_init_named(wsp, "cache list", "cache"));
109}
110
111int
112kmem_cpu_cache_walk_init(mdb_walk_state_t *wsp)
113{
114	if (wsp->walk_addr == NULL) {
115		mdb_warn("kmem_cpu_cache doesn't support global walks");
116		return (WALK_ERR);
117	}
118
119	if (mdb_layered_walk("cpu", wsp) == -1) {
120		mdb_warn("couldn't walk 'cpu'");
121		return (WALK_ERR);
122	}
123
124	wsp->walk_data = (void *)wsp->walk_addr;
125
126	return (WALK_NEXT);
127}
128
129int
130kmem_cpu_cache_walk_step(mdb_walk_state_t *wsp)
131{
132	uintptr_t caddr = (uintptr_t)wsp->walk_data;
133	const cpu_t *cpu = wsp->walk_layer;
134	kmem_cpu_cache_t cc;
135
136	caddr += OFFSETOF(kmem_cache_t, cache_cpu[cpu->cpu_seqid]);
137
138	if (mdb_vread(&cc, sizeof (kmem_cpu_cache_t), caddr) == -1) {
139		mdb_warn("couldn't read kmem_cpu_cache at %p", caddr);
140		return (WALK_ERR);
141	}
142
143	return (wsp->walk_callback(caddr, &cc, wsp->walk_cbdata));
144}
145
146static int
147kmem_slab_check(void *p, uintptr_t saddr, void *arg)
148{
149	kmem_slab_t *sp = p;
150	uintptr_t caddr = (uintptr_t)arg;
151	if ((uintptr_t)sp->slab_cache != caddr) {
152		mdb_warn("slab %p isn't in cache %p (in cache %p)\n",
153		    saddr, caddr, sp->slab_cache);
154		return (-1);
155	}
156
157	return (0);
158}
159
160static int
161kmem_partial_slab_check(void *p, uintptr_t saddr, void *arg)
162{
163	kmem_slab_t *sp = p;
164
165	int rc = kmem_slab_check(p, saddr, arg);
166	if (rc != 0) {
167		return (rc);
168	}
169
170	if (!KMEM_SLAB_IS_PARTIAL(sp)) {
171		mdb_warn("slab %p is not a partial slab\n", saddr);
172		return (-1);
173	}
174
175	return (0);
176}
177
178static int
179kmem_complete_slab_check(void *p, uintptr_t saddr, void *arg)
180{
181	kmem_slab_t *sp = p;
182
183	int rc = kmem_slab_check(p, saddr, arg);
184	if (rc != 0) {
185		return (rc);
186	}
187
188	if (!KMEM_SLAB_IS_ALL_USED(sp)) {
189		mdb_warn("slab %p is not completely allocated\n", saddr);
190		return (-1);
191	}
192
193	return (0);
194}
195
196typedef struct {
197	uintptr_t kns_cache_addr;
198	int kns_nslabs;
199} kmem_nth_slab_t;
200
201static int
202kmem_nth_slab_check(void *p, uintptr_t saddr, void *arg)
203{
204	kmem_nth_slab_t *chkp = arg;
205
206	int rc = kmem_slab_check(p, saddr, (void *)chkp->kns_cache_addr);
207	if (rc != 0) {
208		return (rc);
209	}
210
211	return (chkp->kns_nslabs-- == 0 ? 1 : 0);
212}
213
214static int
215kmem_complete_slab_walk_init(mdb_walk_state_t *wsp)
216{
217	uintptr_t caddr = wsp->walk_addr;
218
219	wsp->walk_addr = (uintptr_t)(caddr +
220	    offsetof(kmem_cache_t, cache_complete_slabs));
221
222	return (list_walk_init_checked(wsp, "slab list", "slab",
223	    kmem_complete_slab_check, (void *)caddr));
224}
225
226static int
227kmem_partial_slab_walk_init(mdb_walk_state_t *wsp)
228{
229	uintptr_t caddr = wsp->walk_addr;
230
231	wsp->walk_addr = (uintptr_t)(caddr +
232	    offsetof(kmem_cache_t, cache_partial_slabs));
233
234	return (avl_walk_init_checked(wsp, "slab list", "slab",
235	    kmem_partial_slab_check, (void *)caddr));
236}
237
238int
239kmem_slab_walk_init(mdb_walk_state_t *wsp)
240{
241	uintptr_t caddr = wsp->walk_addr;
242
243	if (caddr == NULL) {
244		mdb_warn("kmem_slab doesn't support global walks\n");
245		return (WALK_ERR);
246	}
247
248	combined_walk_init(wsp);
249	combined_walk_add(wsp,
250	    kmem_complete_slab_walk_init, list_walk_step, list_walk_fini);
251	combined_walk_add(wsp,
252	    kmem_partial_slab_walk_init, avl_walk_step, avl_walk_fini);
253
254	return (WALK_NEXT);
255}
256
257static int
258kmem_first_complete_slab_walk_init(mdb_walk_state_t *wsp)
259{
260	uintptr_t caddr = wsp->walk_addr;
261	kmem_nth_slab_t *chk;
262
263	chk = mdb_alloc(sizeof (kmem_nth_slab_t),
264	    UM_SLEEP | UM_GC);
265	chk->kns_cache_addr = caddr;
266	chk->kns_nslabs = 1;
267	wsp->walk_addr = (uintptr_t)(caddr +
268	    offsetof(kmem_cache_t, cache_complete_slabs));
269
270	return (list_walk_init_checked(wsp, "slab list", "slab",
271	    kmem_nth_slab_check, chk));
272}
273
274int
275kmem_slab_walk_partial_init(mdb_walk_state_t *wsp)
276{
277	uintptr_t caddr = wsp->walk_addr;
278	kmem_cache_t c;
279
280	if (caddr == NULL) {
281		mdb_warn("kmem_slab_partial doesn't support global walks\n");
282		return (WALK_ERR);
283	}
284
285	if (mdb_vread(&c, sizeof (c), caddr) == -1) {
286		mdb_warn("couldn't read kmem_cache at %p", caddr);
287		return (WALK_ERR);
288	}
289
290	combined_walk_init(wsp);
291
292	/*
293	 * Some consumers (kmem_walk_step(), in particular) require at
294	 * least one callback if there are any buffers in the cache.  So
295	 * if there are *no* partial slabs, report the first full slab, if
296	 * any.
297	 *
298	 * Yes, this is ugly, but it's cleaner than the other possibilities.
299	 */
300	if (c.cache_partial_slabs.avl_numnodes == 0) {
301		combined_walk_add(wsp, kmem_first_complete_slab_walk_init,
302		    list_walk_step, list_walk_fini);
303	} else {
304		combined_walk_add(wsp, kmem_partial_slab_walk_init,
305		    avl_walk_step, avl_walk_fini);
306	}
307
308	return (WALK_NEXT);
309}
310
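/*
 * Example usage (a sketch; the cache names are only illustrative):
 *	::kmem_cache			prints one line per cache
 *	::kmem_cache -n kmem_alloc	prints caches whose name contains
 *					"kmem_alloc"
 *	<cache addr>::kmem_cache	prints a single cache
 */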
311int
312kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv)
313{
314	kmem_cache_t c;
315	const char *filter = NULL;
316
317	if (mdb_getopts(ac, argv,
318	    'n', MDB_OPT_STR, &filter,
319	    NULL) != ac) {
320		return (DCMD_USAGE);
321	}
322
323	if (!(flags & DCMD_ADDRSPEC)) {
324		if (mdb_walk_dcmd("kmem_cache", "kmem_cache", ac, argv) == -1) {
325			mdb_warn("can't walk kmem_cache");
326			return (DCMD_ERR);
327		}
328		return (DCMD_OK);
329	}
330
331	if (DCMD_HDRSPEC(flags))
332		mdb_printf("%-?s %-25s %4s %6s %8s %8s\n", "ADDR", "NAME",
333		    "FLAG", "CFLAG", "BUFSIZE", "BUFTOTL");
334
335	if (mdb_vread(&c, sizeof (c), addr) == -1) {
336		mdb_warn("couldn't read kmem_cache at %p", addr);
337		return (DCMD_ERR);
338	}
339
340	if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL))
341		return (DCMD_OK);
342
343	mdb_printf("%0?p %-25s %04x %06x %8ld %8lld\n", addr, c.cache_name,
344	    c.cache_flags, c.cache_cflags, c.cache_bufsize, c.cache_buftotal);
345
346	return (DCMD_OK);
347}
348
349void
350kmem_cache_help(void)
351{
352	mdb_printf("%s", "Print kernel memory caches.\n\n");
353	mdb_dec_indent(2);
354	mdb_printf("%<b>OPTIONS%</b>\n");
355	mdb_inc_indent(2);
356	mdb_printf("%s",
357"  -n name\n"
358"        name of kmem cache (or matching partial name)\n"
359"\n"
360"Column\tDescription\n"
361"\n"
362"ADDR\t\taddress of kmem cache\n"
363"NAME\t\tname of kmem cache\n"
364"FLAG\t\tvarious cache state flags\n"
365"CFLAG\t\tcache creation flags\n"
366"BUFSIZE\tobject size in bytes\n"
367"BUFTOTL\tcurrent total buffers in cache (allocated and free)\n");
368}
369
370#define	LABEL_WIDTH	11
371static void
372kmem_slabs_print_dist(uint_t *ks_bucket, size_t buffers_per_slab,
373    size_t maxbuckets, size_t minbucketsize)
374{
375	uint64_t total;
376	int buckets;
377	int i;
378	const int *distarray;
379	int complete[2];
380
381	buckets = buffers_per_slab;
382
383	total = 0;
384	for (i = 0; i <= buffers_per_slab; i++)
385		total += ks_bucket[i];
386
387	if (maxbuckets > 1)
388		buckets = MIN(buckets, maxbuckets);
389
390	if (minbucketsize > 1) {
391		/*
392		 * minbucketsize does not apply to the first bucket reserved
393		 * for completely allocated slabs
394		 */
395		buckets = MIN(buckets, 1 + ((buffers_per_slab - 1) /
396		    minbucketsize));
397		if ((buckets < 2) && (buffers_per_slab > 1)) {
398			buckets = 2;
399			minbucketsize = (buffers_per_slab - 1);
400		}
401	}
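	/*
	 * Worked example: with buffers_per_slab = 64, maxbuckets = 1 (the
	 * default) and minbucketsize = 16, buckets = MIN(64, 1 + 63/16) = 4:
	 * the reserved bucket for completely allocated slabs plus three
	 * ranges covering refcnts 1 through 63.
	 */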
402
403	/*
404	 * The first printed bucket is reserved for completely allocated slabs.
405	 * Passing (buckets - 1) excludes that bucket from the generated
406	 * distribution, since we're handling it as a special case.
407	 */
408	complete[0] = buffers_per_slab;
409	complete[1] = buffers_per_slab + 1;
410	distarray = dist_linear(buckets - 1, 1, buffers_per_slab - 1);
411
412	mdb_printf("%*s\n", LABEL_WIDTH, "Allocated");
413	dist_print_header("Buffers", LABEL_WIDTH, "Slabs");
414
415	dist_print_bucket(complete, 0, ks_bucket, total, LABEL_WIDTH);
416	/*
417	 * Print bucket ranges in descending order after the first bucket for
418	 * completely allocated slabs, so a person can see immediately whether
419	 * or not there is fragmentation without having to scan possibly
420	 * multiple screens of output. Starting at (buckets - 2) excludes the
421	 * extra terminating bucket.
422	 */
423	for (i = buckets - 2; i >= 0; i--) {
424		dist_print_bucket(distarray, i, ks_bucket, total, LABEL_WIDTH);
425	}
426	mdb_printf("\n");
427}
428#undef LABEL_WIDTH
429
430/*ARGSUSED*/
431static int
432kmem_first_slab(uintptr_t addr, const kmem_slab_t *sp, boolean_t *is_slab)
433{
434	*is_slab = B_TRUE;
435	return (WALK_DONE);
436}
437
438/*ARGSUSED*/
439static int
440kmem_first_partial_slab(uintptr_t addr, const kmem_slab_t *sp,
441    boolean_t *is_slab)
442{
443	/*
444	 * The "kmem_partial_slab" walker reports the first full slab if there
445	 * are no partial slabs (for the sake of consumers that require at
446	 * least one callback if there are any buffers in the cache).
447	 */
448	*is_slab = KMEM_SLAB_IS_PARTIAL(sp);
449	return (WALK_DONE);
450}
451
452typedef struct kmem_slab_usage {
453	int ksu_refcnt;			/* count of allocated buffers on slab */
454	boolean_t ksu_nomove;		/* slab marked non-reclaimable */
455} kmem_slab_usage_t;
456
457typedef struct kmem_slab_stats {
458	const kmem_cache_t *ks_cp;
459	int ks_slabs;			/* slabs in cache */
460	int ks_partial_slabs;		/* partially allocated slabs in cache */
461	uint64_t ks_unused_buffers;	/* total unused buffers in cache */
462	int ks_max_buffers_per_slab;	/* max buffers per slab */
463	int ks_usage_len;		/* ks_usage array length */
464	kmem_slab_usage_t *ks_usage;	/* partial slab usage */
465	uint_t *ks_bucket;		/* slab usage distribution */
466} kmem_slab_stats_t;
467
468/*ARGSUSED*/
469static int
470kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp,
471    kmem_slab_stats_t *ks)
472{
473	kmem_slab_usage_t *ksu;
474	long unused;
475
476	ks->ks_slabs++;
477	ks->ks_bucket[sp->slab_refcnt]++;
478
479	unused = (sp->slab_chunks - sp->slab_refcnt);
480	if (unused == 0) {
481		return (WALK_NEXT);
482	}
483
484	ks->ks_partial_slabs++;
485	ks->ks_unused_buffers += unused;
486
487	if (ks->ks_partial_slabs > ks->ks_usage_len) {
488		kmem_slab_usage_t *usage;
489		int len = ks->ks_usage_len;
490
491		len = (len == 0 ? 16 : len * 2);
492		usage = mdb_zalloc(len * sizeof (kmem_slab_usage_t), UM_SLEEP);
493		if (ks->ks_usage != NULL) {
494			bcopy(ks->ks_usage, usage,
495			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
496			mdb_free(ks->ks_usage,
497			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
498		}
499		ks->ks_usage = usage;
500		ks->ks_usage_len = len;
501	}
502
503	ksu = &ks->ks_usage[ks->ks_partial_slabs - 1];
504	ksu->ksu_refcnt = sp->slab_refcnt;
505	ksu->ksu_nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
506	return (WALK_NEXT);
507}
508
509static void
510kmem_slabs_header()
511{
512	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
513	    "", "", "Partial", "", "Unused", "");
514	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
515	    "Cache Name", "Slabs", "Slabs", "Buffers", "Buffers", "Waste");
516	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
517	    "-------------------------", "--------", "--------", "---------",
518	    "---------", "------");
519}
520
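/*
 * Example usage (a sketch; the cache name is only illustrative):
 *	::kmem_slabs			per-cache slab usage summary
 *	::kmem_slabs -N zio_cache	only the cache named exactly "zio_cache"
 *	::kmem_slabs -v			also lists each partial slab's
 *					allocated buffer count
 *	::kmem_slabs -b 10		adds a distribution of allocated
 *					buffers per slab using at most 10 bins
 */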
521int
522kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
523{
524	kmem_cache_t c;
525	kmem_slab_stats_t stats;
526	mdb_walk_cb_t cb;
527	int pct;
528	int tenths_pct;
529	size_t maxbuckets = 1;
530	size_t minbucketsize = 0;
531	const char *filter = NULL;
532	const char *name = NULL;
533	uint_t opt_v = FALSE;
534	boolean_t buckets = B_FALSE;
535	boolean_t skip = B_FALSE;
536
537	if (mdb_getopts(argc, argv,
538	    'B', MDB_OPT_UINTPTR, &minbucketsize,
539	    'b', MDB_OPT_UINTPTR, &maxbuckets,
540	    'n', MDB_OPT_STR, &filter,
541	    'N', MDB_OPT_STR, &name,
542	    'v', MDB_OPT_SETBITS, TRUE, &opt_v,
543	    NULL) != argc) {
544		return (DCMD_USAGE);
545	}
546
547	if ((maxbuckets != 1) || (minbucketsize != 0)) {
548		buckets = B_TRUE;
549	}
550
551	if (!(flags & DCMD_ADDRSPEC)) {
552		if (mdb_walk_dcmd("kmem_cache", "kmem_slabs", argc,
553		    argv) == -1) {
554			mdb_warn("can't walk kmem_cache");
555			return (DCMD_ERR);
556		}
557		return (DCMD_OK);
558	}
559
560	if (mdb_vread(&c, sizeof (c), addr) == -1) {
561		mdb_warn("couldn't read kmem_cache at %p", addr);
562		return (DCMD_ERR);
563	}
564
565	if (name == NULL) {
566		skip = ((filter != NULL) &&
567		    (strstr(c.cache_name, filter) == NULL));
568	} else if (filter == NULL) {
569		skip = (strcmp(c.cache_name, name) != 0);
570	} else {
571		/* match either -n or -N */
572		skip = ((strcmp(c.cache_name, name) != 0) &&
573		    (strstr(c.cache_name, filter) == NULL));
574	}
575
576	if (!(opt_v || buckets) && DCMD_HDRSPEC(flags)) {
577		kmem_slabs_header();
578	} else if ((opt_v || buckets) && !skip) {
579		if (DCMD_HDRSPEC(flags)) {
580			kmem_slabs_header();
581		} else {
582			boolean_t is_slab = B_FALSE;
583			const char *walker_name;
584			if (opt_v) {
585				cb = (mdb_walk_cb_t)kmem_first_partial_slab;
586				walker_name = "kmem_slab_partial";
587			} else {
588				cb = (mdb_walk_cb_t)kmem_first_slab;
589				walker_name = "kmem_slab";
590			}
591			(void) mdb_pwalk(walker_name, cb, &is_slab, addr);
592			if (is_slab) {
593				kmem_slabs_header();
594			}
595		}
596	}
597
598	if (skip) {
599		return (DCMD_OK);
600	}
601
602	bzero(&stats, sizeof (kmem_slab_stats_t));
603	stats.ks_cp = &c;
604	stats.ks_max_buffers_per_slab = c.cache_maxchunks;
605	/* +1 to include a zero bucket */
606	stats.ks_bucket = mdb_zalloc((stats.ks_max_buffers_per_slab + 1) *
607	    sizeof (*stats.ks_bucket), UM_SLEEP);
608	cb = (mdb_walk_cb_t)kmem_slablist_stat;
609	(void) mdb_pwalk("kmem_slab", cb, &stats, addr);
610
611	if (c.cache_buftotal == 0) {
612		pct = 0;
613		tenths_pct = 0;
614	} else {
615		uint64_t n = stats.ks_unused_buffers * 10000;
616		pct = (int)(n / c.cache_buftotal);
617		tenths_pct = pct - ((pct / 100) * 100);
618		tenths_pct = (tenths_pct + 5) / 10; /* round nearest tenth */
619		if (tenths_pct == 10) {
620			pct += 100;
621			tenths_pct = 0;
622		}
623	}
624
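	/*
	 * pct currently holds hundredths of a percent.  Worked example: 3
	 * unused buffers out of 80 total gives n = 30000, pct = 375 and
	 * tenths_pct = 8, which prints as "3.8%" below.
	 */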
625	pct /= 100;
626	mdb_printf("%-25s %8d %8d %9lld %9lld %3d.%1d%%\n", c.cache_name,
627	    stats.ks_slabs, stats.ks_partial_slabs, c.cache_buftotal,
628	    stats.ks_unused_buffers, pct, tenths_pct);
629
630	if (maxbuckets == 0) {
631		maxbuckets = stats.ks_max_buffers_per_slab;
632	}
633
634	if (((maxbuckets > 1) || (minbucketsize > 0)) &&
635	    (stats.ks_slabs > 0)) {
636		mdb_printf("\n");
637		kmem_slabs_print_dist(stats.ks_bucket,
638		    stats.ks_max_buffers_per_slab, maxbuckets, minbucketsize);
639	}
640
641	mdb_free(stats.ks_bucket, (stats.ks_max_buffers_per_slab + 1) *
642	    sizeof (*stats.ks_bucket));
643
644	if (!opt_v) {
645		return (DCMD_OK);
646	}
647
648	if (opt_v && (stats.ks_partial_slabs > 0)) {
649		int i;
650		kmem_slab_usage_t *ksu;
651
652		mdb_printf("  %d complete (%d), %d partial:",
653		    (stats.ks_slabs - stats.ks_partial_slabs),
654		    stats.ks_max_buffers_per_slab,
655		    stats.ks_partial_slabs);
656
657		for (i = 0; i < stats.ks_partial_slabs; i++) {
658			ksu = &stats.ks_usage[i];
659			mdb_printf(" %d%s", ksu->ksu_refcnt,
660			    (ksu->ksu_nomove ? "*" : ""));
661		}
662		mdb_printf("\n\n");
663	}
664
665	if (stats.ks_usage_len > 0) {
666		mdb_free(stats.ks_usage,
667		    stats.ks_usage_len * sizeof (kmem_slab_usage_t));
668	}
669
670	return (DCMD_OK);
671}
672
673void
674kmem_slabs_help(void)
675{
676	mdb_printf("%s",
677"Display slab usage per kmem cache.\n\n");
678	mdb_dec_indent(2);
679	mdb_printf("%<b>OPTIONS%</b>\n");
680	mdb_inc_indent(2);
681	mdb_printf("%s",
682"  -n name\n"
683"        name of kmem cache (or matching partial name)\n"
684"  -N name\n"
685"        exact name of kmem cache\n"
686"  -b maxbins\n"
687"        Print a distribution of allocated buffers per slab using at\n"
688"        most maxbins bins. The first bin is reserved for completely\n"
689"        allocated slabs. Setting maxbins to zero (-b 0) has the same\n"
690"        effect as specifying the maximum allocated buffers per slab\n"
691"        or setting minbinsize to 1 (-B 1).\n"
692"  -B minbinsize\n"
693"        Print a distribution of allocated buffers per slab, making\n"
694"        all bins (except the first, reserved for completely allocated\n"
695"        slabs) at least minbinsize buffers apart.\n"
696"  -v    verbose output: List the allocated buffer count of each partial\n"
697"        slab on the free list in order from front to back to show how\n"
698"        closely the slabs are ordered by usage. For example\n"
699"\n"
700"          10 complete, 3 partial (8): 7 3 1\n"
701"\n"
702"        means there are thirteen slabs with eight buffers each, including\n"
703"        three partially allocated slabs with less than all eight buffers\n"
704"        allocated.\n"
705"\n"
706"        Buffer allocations are always from the front of the partial slab\n"
707"        list. When a buffer is freed from a completely used slab, that\n"
708"        slab is added to the front of the partial slab list. Assuming\n"
709"        that all buffers are equally likely to be freed soon, the\n"
710"        desired order of partial slabs is most-used at the front of the\n"
711"        list and least-used at the back (as in the example above).\n"
712"        However, if a slab contains an allocated buffer that will not\n"
713"        soon be freed, it would be better for that slab to be at the\n"
714"        front where all of its buffers can be allocated. Taking a slab\n"
715"        off the partial slab list (either with all buffers freed or all\n"
716"        buffers allocated) reduces cache fragmentation.\n"
717"\n"
718"        A slab's allocated buffer count representing a partial slab (9 in\n"
719"        the example below) may be marked as follows:\n"
720"\n"
721"        9*   An asterisk indicates that kmem has marked the slab non-\n"
722"        reclaimable because the kmem client refused to move one of the\n"
723"        slab's buffers. Since kmem does not expect to completely free the\n"
724"        slab, it moves it to the front of the list in the hope of\n"
725"        completely allocating it instead. A slab marked with an asterisk\n"
726"        stays marked for as long as it remains on the partial slab list.\n"
727"\n"
728"Column\t\tDescription\n"
729"\n"
730"Cache Name\t\tname of kmem cache\n"
731"Slabs\t\t\ttotal slab count\n"
732"Partial Slabs\t\tcount of partially allocated slabs on the free list\n"
733"Buffers\t\ttotal buffer count (Slabs * (buffers per slab))\n"
734"Unused Buffers\tcount of unallocated buffers across all partial slabs\n"
735"Waste\t\t\t(Unused Buffers / Buffers) does not include space\n"
736"\t\t\t  for accounting structures (debug mode), slab\n"
737"\t\t\t  coloring (incremental small offsets to stagger\n"
738"\t\t\t  buffer alignment), or the per-CPU magazine layer\n");
739}
740
741static int
742addrcmp(const void *lhs, const void *rhs)
743{
744	uintptr_t p1 = *((uintptr_t *)lhs);
745	uintptr_t p2 = *((uintptr_t *)rhs);
746
747	if (p1 < p2)
748		return (-1);
749	if (p1 > p2)
750		return (1);
751	return (0);
752}
753
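/*
 * Compare bufctls by timestamp, descending, so that a qsort() using this
 * comparator puts the most recent transactions first (see
 * kmem_log_walk_init() below).
 */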
754static int
755bufctlcmp(const kmem_bufctl_audit_t **lhs, const kmem_bufctl_audit_t **rhs)
756{
757	const kmem_bufctl_audit_t *bcp1 = *lhs;
758	const kmem_bufctl_audit_t *bcp2 = *rhs;
759
760	if (bcp1->bc_timestamp > bcp2->bc_timestamp)
761		return (-1);
762
763	if (bcp1->bc_timestamp < bcp2->bc_timestamp)
764		return (1);
765
766	return (0);
767}
768
769typedef struct kmem_hash_walk {
770	uintptr_t *kmhw_table;
771	size_t kmhw_nelems;
772	size_t kmhw_pos;
773	kmem_bufctl_t kmhw_cur;
774} kmem_hash_walk_t;
775
776int
777kmem_hash_walk_init(mdb_walk_state_t *wsp)
778{
779	kmem_hash_walk_t *kmhw;
780	uintptr_t *hash;
781	kmem_cache_t c;
782	uintptr_t haddr, addr = wsp->walk_addr;
783	size_t nelems;
784	size_t hsize;
785
786	if (addr == NULL) {
787		mdb_warn("kmem_hash doesn't support global walks\n");
788		return (WALK_ERR);
789	}
790
791	if (mdb_vread(&c, sizeof (c), addr) == -1) {
792		mdb_warn("couldn't read cache at addr %p", addr);
793		return (WALK_ERR);
794	}
795
796	if (!(c.cache_flags & KMF_HASH)) {
797		mdb_warn("cache %p doesn't have a hash table\n", addr);
798		return (WALK_DONE);		/* nothing to do */
799	}
800
801	kmhw = mdb_zalloc(sizeof (kmem_hash_walk_t), UM_SLEEP);
802	kmhw->kmhw_cur.bc_next = NULL;
803	kmhw->kmhw_pos = 0;
804
805	kmhw->kmhw_nelems = nelems = c.cache_hash_mask + 1;
806	hsize = nelems * sizeof (uintptr_t);
807	haddr = (uintptr_t)c.cache_hash_table;
808
809	kmhw->kmhw_table = hash = mdb_alloc(hsize, UM_SLEEP);
810	if (mdb_vread(hash, hsize, haddr) == -1) {
811		mdb_warn("failed to read hash table at %p", haddr);
812		mdb_free(hash, hsize);
813		mdb_free(kmhw, sizeof (kmem_hash_walk_t));
814		return (WALK_ERR);
815	}
816
817	wsp->walk_data = kmhw;
818
819	return (WALK_NEXT);
820}
821
822int
823kmem_hash_walk_step(mdb_walk_state_t *wsp)
824{
825	kmem_hash_walk_t *kmhw = wsp->walk_data;
826	uintptr_t addr = NULL;
827
828	if ((addr = (uintptr_t)kmhw->kmhw_cur.bc_next) == NULL) {
829		while (kmhw->kmhw_pos < kmhw->kmhw_nelems) {
830			if ((addr = kmhw->kmhw_table[kmhw->kmhw_pos++]) != NULL)
831				break;
832		}
833	}
834	if (addr == NULL)
835		return (WALK_DONE);
836
837	if (mdb_vread(&kmhw->kmhw_cur, sizeof (kmem_bufctl_t), addr) == -1) {
838		mdb_warn("couldn't read kmem_bufctl_t at addr %p", addr);
839		return (WALK_ERR);
840	}
841
842	return (wsp->walk_callback(addr, &kmhw->kmhw_cur, wsp->walk_cbdata));
843}
844
845void
846kmem_hash_walk_fini(mdb_walk_state_t *wsp)
847{
848	kmem_hash_walk_t *kmhw = wsp->walk_data;
849
850	if (kmhw == NULL)
851		return;
852
853	mdb_free(kmhw->kmhw_table, kmhw->kmhw_nelems * sizeof (uintptr_t));
854	mdb_free(kmhw, sizeof (kmem_hash_walk_t));
855}
856
857/*
858 * Find the address of the bufctl structure for the address 'buf' in cache
859 * 'cp', which is at address caddr, and place it in *out.
860 */
861static int
862kmem_hash_lookup(kmem_cache_t *cp, uintptr_t caddr, void *buf, uintptr_t *out)
863{
864	uintptr_t bucket = (uintptr_t)KMEM_HASH(cp, buf);
865	kmem_bufctl_t *bcp;
866	kmem_bufctl_t bc;
867
868	if (mdb_vread(&bcp, sizeof (kmem_bufctl_t *), bucket) == -1) {
869		mdb_warn("unable to read hash bucket for %p in cache %p",
870		    buf, caddr);
871		return (-1);
872	}
873
874	while (bcp != NULL) {
875		if (mdb_vread(&bc, sizeof (kmem_bufctl_t),
876		    (uintptr_t)bcp) == -1) {
877			mdb_warn("unable to read bufctl at %p", bcp);
878			return (-1);
879		}
880		if (bc.bc_addr == buf) {
881			*out = (uintptr_t)bcp;
882			return (0);
883		}
884		bcp = bc.bc_next;
885	}
886
887	mdb_warn("unable to find bufctl for %p in cache %p\n", buf, caddr);
888	return (-1);
889}
890
891int
892kmem_get_magsize(const kmem_cache_t *cp)
893{
894	uintptr_t addr = (uintptr_t)cp->cache_magtype;
895	GElf_Sym mt_sym;
896	kmem_magtype_t mt;
897	int res;
898
899	/*
900	 * if cpu 0 has a non-zero magsize, it must be correct.  caches
901	 * with KMF_NOMAGAZINE have disabled their magazine layers, so
902	 * it is okay to return 0 for them.
903	 */
904	if ((res = cp->cache_cpu[0].cc_magsize) != 0 ||
905	    (cp->cache_flags & KMF_NOMAGAZINE))
906		return (res);
907
908	if (mdb_lookup_by_name("kmem_magtype", &mt_sym) == -1) {
909		mdb_warn("unable to read 'kmem_magtype'");
910	} else if (addr < mt_sym.st_value ||
911	    addr + sizeof (mt) - 1 > mt_sym.st_value + mt_sym.st_size - 1 ||
912	    ((addr - mt_sym.st_value) % sizeof (mt)) != 0) {
913		mdb_warn("cache '%s' has invalid magtype pointer (%p)\n",
914		    cp->cache_name, addr);
915		return (0);
916	}
917	if (mdb_vread(&mt, sizeof (mt), addr) == -1) {
918		mdb_warn("unable to read magtype at %a", addr);
919		return (0);
920	}
921	return (mt.mt_magsize);
922}
923
924/*ARGSUSED*/
925static int
926kmem_estimate_slab(uintptr_t addr, const kmem_slab_t *sp, size_t *est)
927{
928	*est -= (sp->slab_chunks - sp->slab_refcnt);
929
930	return (WALK_NEXT);
931}
932
933/*
934 * Returns an upper bound on the number of allocated buffers in a given
935 * cache.
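 * The estimate is cache_buftotal minus the free buffers on partial slabs
 * and minus the buffers held in full depot magazines; free buffers still
 * sitting in the per-CPU (loaded and previously loaded) magazines are not
 * subtracted, which is why this is only an upper bound.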
936 */
937size_t
938kmem_estimate_allocated(uintptr_t addr, const kmem_cache_t *cp)
939{
940	int magsize;
941	size_t cache_est;
942
943	cache_est = cp->cache_buftotal;
944
945	(void) mdb_pwalk("kmem_slab_partial",
946	    (mdb_walk_cb_t)kmem_estimate_slab, &cache_est, addr);
947
948	if ((magsize = kmem_get_magsize(cp)) != 0) {
949		size_t mag_est = cp->cache_full.ml_total * magsize;
950
951		if (cache_est >= mag_est) {
952			cache_est -= mag_est;
953		} else {
954			mdb_warn("cache %p's magazine layer holds more buffers "
955			    "than the slab layer.\n", addr);
956		}
957	}
958	return (cache_est);
959}
960
961#define	READMAG_ROUNDS(rounds) { \
962	if (mdb_vread(mp, magbsize, (uintptr_t)kmp) == -1) { \
963		mdb_warn("couldn't read magazine at %p", kmp); \
964		goto fail; \
965	} \
966	for (i = 0; i < rounds; i++) { \
967		maglist[magcnt++] = mp->mag_round[i]; \
968		if (magcnt == magmax) { \
969			mdb_warn("%d magazines exceeds fudge factor\n", \
970			    magcnt); \
971			goto fail; \
972		} \
973	} \
974}
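/*
 * READMAG_ROUNDS() is only intended to be expanded inside
 * kmem_read_magazines() below: it relies on that function's kmp, mp,
 * magbsize, maglist, magcnt, magmax and i locals, and on its "fail:"
 * label for error handling.
 */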
975
976int
977kmem_read_magazines(kmem_cache_t *cp, uintptr_t addr, int ncpus,
978    void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags)
979{
980	kmem_magazine_t *kmp, *mp;
981	void **maglist = NULL;
982	int i, cpu;
983	size_t magsize, magmax, magbsize;
984	size_t magcnt = 0;
985
986	/*
987	 * Read the magtype out of the cache, after verifying the pointer's
988	 * correctness.
989	 */
990	magsize = kmem_get_magsize(cp);
991	if (magsize == 0) {
992		*maglistp = NULL;
993		*magcntp = 0;
994		*magmaxp = 0;
995		return (WALK_NEXT);
996	}
997
998	/*
999	 * There are several places where we need to go buffer hunting:
1000	 * the per-CPU loaded magazine, the per-CPU spare full magazine,
1001	 * and the full magazine list in the depot.
1002	 *
1003	 * For an upper bound on the number of buffers in the magazine
1004	 * layer, we have the number of magazines on the cache_full
1005	 * list plus at most two magazines per CPU (the loaded and the
1006	 * spare).  Toss in 100 magazines as a fudge factor in case this
1007	 * is live (the number "100" comes from the same fudge factor in
1008	 * crash(1M)).
1009	 */
1010	magmax = (cp->cache_full.ml_total + 2 * ncpus + 100) * magsize;
1011	magbsize = offsetof(kmem_magazine_t, mag_round[magsize]);
1012
1013	if (magbsize >= PAGESIZE / 2) {
1014		mdb_warn("magazine size for cache %p unreasonable (%x)\n",
1015		    addr, magbsize);
1016		return (WALK_ERR);
1017	}
1018
1019	maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags);
1020	mp = mdb_alloc(magbsize, alloc_flags);
1021	if (mp == NULL || maglist == NULL)
1022		goto fail;
1023
1024	/*
1025	 * First up: the magazines in the depot (i.e. on the cache_full list).
1026	 */
1027	for (kmp = cp->cache_full.ml_list; kmp != NULL; ) {
1028		READMAG_ROUNDS(magsize);
1029		kmp = mp->mag_next;
1030
1031		if (kmp == cp->cache_full.ml_list)
1032			break; /* cache_full list loop detected */
1033	}
1034
1035	dprintf(("cache_full list done\n"));
1036
1037	/*
1038	 * Now whip through the CPUs, snagging the loaded magazines
1039	 * and full spares.
1040	 *
1041	 * In order to prevent inconsistent dumps, rounds and prounds
1042	 * are copied aside before dumping begins.
1043	 */
1044	for (cpu = 0; cpu < ncpus; cpu++) {
1045		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu];
1046		short rounds, prounds;
1047
1048		if (KMEM_DUMPCC(ccp)) {
1049			rounds = ccp->cc_dump_rounds;
1050			prounds = ccp->cc_dump_prounds;
1051		} else {
1052			rounds = ccp->cc_rounds;
1053			prounds = ccp->cc_prounds;
1054		}
1055
1056		dprintf(("reading cpu cache %p\n",
1057		    (uintptr_t)ccp - (uintptr_t)cp + addr));
1058
1059		if (rounds > 0 &&
1060		    (kmp = ccp->cc_loaded) != NULL) {
1061			dprintf(("reading %d loaded rounds\n", rounds));
1062			READMAG_ROUNDS(rounds);
1063		}
1064
1065		if (prounds > 0 &&
1066		    (kmp = ccp->cc_ploaded) != NULL) {
1067			dprintf(("reading %d previously loaded rounds\n",
1068			    prounds));
1069			READMAG_ROUNDS(prounds);
1070		}
1071	}
1072
1073	dprintf(("magazine layer: %d buffers\n", magcnt));
1074
1075	if (!(alloc_flags & UM_GC))
1076		mdb_free(mp, magbsize);
1077
1078	*maglistp = maglist;
1079	*magcntp = magcnt;
1080	*magmaxp = magmax;
1081
1082	return (WALK_NEXT);
1083
1084fail:
1085	if (!(alloc_flags & UM_GC)) {
1086		if (mp)
1087			mdb_free(mp, magbsize);
1088		if (maglist)
1089			mdb_free(maglist, magmax * sizeof (void *));
1090	}
1091	return (WALK_ERR);
1092}
1093
1094static int
1095kmem_walk_callback(mdb_walk_state_t *wsp, uintptr_t buf)
1096{
1097	return (wsp->walk_callback(buf, NULL, wsp->walk_cbdata));
1098}
1099
1100static int
1101bufctl_walk_callback(kmem_cache_t *cp, mdb_walk_state_t *wsp, uintptr_t buf)
1102{
1103	kmem_bufctl_audit_t b;
1104
1105	/*
1106	 * if KMF_AUDIT is not set, we know that we're looking at a
1107	 * kmem_bufctl_t.
1108	 */
1109	if (!(cp->cache_flags & KMF_AUDIT) ||
1110	    mdb_vread(&b, sizeof (kmem_bufctl_audit_t), buf) == -1) {
1111		(void) memset(&b, 0, sizeof (b));
1112		if (mdb_vread(&b, sizeof (kmem_bufctl_t), buf) == -1) {
1113			mdb_warn("unable to read bufctl at %p", buf);
1114			return (WALK_ERR);
1115		}
1116	}
1117
1118	return (wsp->walk_callback(buf, &b, wsp->walk_cbdata));
1119}
1120
1121typedef struct kmem_walk {
1122	int kmw_type;
1123
1124	uintptr_t kmw_addr;		/* cache address */
1125	kmem_cache_t *kmw_cp;
1126	size_t kmw_csize;
1127
1128	/*
1129	 * magazine layer
1130	 */
1131	void **kmw_maglist;
1132	size_t kmw_max;
1133	size_t kmw_count;
1134	size_t kmw_pos;
1135
1136	/*
1137	 * slab layer
1138	 */
1139	char *kmw_valid;	/* to keep track of freed buffers */
1140	char *kmw_ubase;	/* buffer for slab data */
1141} kmem_walk_t;
1142
1143static int
1144kmem_walk_init_common(mdb_walk_state_t *wsp, int type)
1145{
1146	kmem_walk_t *kmw;
1147	int ncpus, csize;
1148	kmem_cache_t *cp;
1149	size_t vm_quantum;
1150
1151	size_t magmax, magcnt;
1152	void **maglist = NULL;
1153	uint_t chunksize, slabsize;
1154	int status = WALK_ERR;
1155	uintptr_t addr = wsp->walk_addr;
1156	const char *layered;
1157
1158	type &= ~KM_HASH;
1159
1160	if (addr == NULL) {
1161		mdb_warn("kmem walk doesn't support global walks\n");
1162		return (WALK_ERR);
1163	}
1164
1165	dprintf(("walking %p\n", addr));
1166
1167	/*
1168	 * First we need to figure out how many CPUs are configured in the
1169	 * system to know how much to slurp out.
1170	 */
1171	mdb_readvar(&ncpus, "max_ncpus");
1172
1173	csize = KMEM_CACHE_SIZE(ncpus);
1174	cp = mdb_alloc(csize, UM_SLEEP);
1175
1176	if (mdb_vread(cp, csize, addr) == -1) {
1177		mdb_warn("couldn't read cache at addr %p", addr);
1178		goto out2;
1179	}
1180
1181	/*
1182	 * It's easy for someone to hand us an invalid cache address.
1183	 * Unfortunately, it is hard for this walker to survive an
1184	 * invalid cache cleanly.  So we make sure that:
1185	 *
1186	 *	1. the vmem arena for the cache is readable,
1187	 *	2. the vmem arena's quantum is a power of 2,
1188	 *	3. our slabsize is a multiple of the quantum, and
1189	 *	4. our chunksize is >0 and less than our slabsize.
1190	 */
1191	if (mdb_vread(&vm_quantum, sizeof (vm_quantum),
1192	    (uintptr_t)&cp->cache_arena->vm_quantum) == -1 ||
1193	    vm_quantum == 0 ||
1194	    (vm_quantum & (vm_quantum - 1)) != 0 ||
1195	    cp->cache_slabsize < vm_quantum ||
1196	    P2PHASE(cp->cache_slabsize, vm_quantum) != 0 ||
1197	    cp->cache_chunksize == 0 ||
1198	    cp->cache_chunksize > cp->cache_slabsize) {
1199		mdb_warn("%p is not a valid kmem_cache_t\n", addr);
1200		goto out2;
1201	}
1202
1203	dprintf(("buf total is %d\n", cp->cache_buftotal));
1204
1205	if (cp->cache_buftotal == 0) {
1206		mdb_free(cp, csize);
1207		return (WALK_DONE);
1208	}
1209
1210	/*
1211	 * If they ask for bufctls, but it's a small-slab cache,
1212	 * there is nothing to report.
1213	 */
1214	if ((type & KM_BUFCTL) && !(cp->cache_flags & KMF_HASH)) {
1215		dprintf(("bufctl requested, not KMF_HASH (flags: %p)\n",
1216		    cp->cache_flags));
1217		mdb_free(cp, csize);
1218		return (WALK_DONE);
1219	}
1220
1221	/*
1222	 * If they want constructed buffers, but there's no constructor or
1223	 * the cache has DEADBEEF checking enabled, there is nothing to report.
1224	 */
1225	if ((type & KM_CONSTRUCTED) && (!(type & KM_FREE) ||
1226	    cp->cache_constructor == NULL ||
1227	    (cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) == KMF_DEADBEEF)) {
1228		mdb_free(cp, csize);
1229		return (WALK_DONE);
1230	}
1231
1232	/*
1233	 * Read in the contents of the magazine layer
1234	 */
1235	if (kmem_read_magazines(cp, addr, ncpus, &maglist, &magcnt,
1236	    &magmax, UM_SLEEP) == WALK_ERR)
1237		goto out2;
1238
1239	/*
1240	 * We have all of the buffers from the magazines;  if we are walking
1241	 * allocated buffers, sort them so we can bsearch them later.
1242	 */
1243	if (type & KM_ALLOCATED)
1244		qsort(maglist, magcnt, sizeof (void *), addrcmp);
1245
1246	wsp->walk_data = kmw = mdb_zalloc(sizeof (kmem_walk_t), UM_SLEEP);
1247
1248	kmw->kmw_type = type;
1249	kmw->kmw_addr = addr;
1250	kmw->kmw_cp = cp;
1251	kmw->kmw_csize = csize;
1252	kmw->kmw_maglist = maglist;
1253	kmw->kmw_max = magmax;
1254	kmw->kmw_count = magcnt;
1255	kmw->kmw_pos = 0;
1256
1257	/*
1258	 * When walking allocated buffers in a KMF_HASH cache, we walk the
1259	 * hash table instead of the slab layer.
1260	 */
1261	if ((cp->cache_flags & KMF_HASH) && (type & KM_ALLOCATED)) {
1262		layered = "kmem_hash";
1263
1264		kmw->kmw_type |= KM_HASH;
1265	} else {
1266		/*
1267		 * If we are walking freed buffers, we only need the
1268		 * magazine layer plus the partially allocated slabs.
1269		 * To walk allocated buffers, we need all of the slabs.
1270		 */
1271		if (type & KM_ALLOCATED)
1272			layered = "kmem_slab";
1273		else
1274			layered = "kmem_slab_partial";
1275
1276		/*
1277		 * for small-slab caches, we read in the entire slab.  For
1278		 * freed buffers, we can just walk the freelist.  For
1279		 * allocated buffers, we use a 'valid' array to track
1280		 * the freed buffers.
1281		 */
1282		if (!(cp->cache_flags & KMF_HASH)) {
1283			chunksize = cp->cache_chunksize;
1284			slabsize = cp->cache_slabsize;
1285
1286			kmw->kmw_ubase = mdb_alloc(slabsize +
1287			    sizeof (kmem_bufctl_t), UM_SLEEP);
1288
1289			if (type & KM_ALLOCATED)
1290				kmw->kmw_valid =
1291				    mdb_alloc(slabsize / chunksize, UM_SLEEP);
1292		}
1293	}
1294
1295	status = WALK_NEXT;
1296
1297	if (mdb_layered_walk(layered, wsp) == -1) {
1298		mdb_warn("unable to start layered '%s' walk", layered);
1299		status = WALK_ERR;
1300	}
1301
1302out1:
1303	if (status == WALK_ERR) {
1304		if (kmw->kmw_valid)
1305			mdb_free(kmw->kmw_valid, slabsize / chunksize);
1306
1307		if (kmw->kmw_ubase)
1308			mdb_free(kmw->kmw_ubase, slabsize +
1309			    sizeof (kmem_bufctl_t));
1310
1311		if (kmw->kmw_maglist)
1312			mdb_free(kmw->kmw_maglist,
1313			    kmw->kmw_max * sizeof (uintptr_t));
1314
1315		mdb_free(kmw, sizeof (kmem_walk_t));
1316		wsp->walk_data = NULL;
1317	}
1318
1319out2:
1320	if (status == WALK_ERR)
1321		mdb_free(cp, csize);
1322
1323	return (status);
1324}
1325
1326int
1327kmem_walk_step(mdb_walk_state_t *wsp)
1328{
1329	kmem_walk_t *kmw = wsp->walk_data;
1330	int type = kmw->kmw_type;
1331	kmem_cache_t *cp = kmw->kmw_cp;
1332
1333	void **maglist = kmw->kmw_maglist;
1334	int magcnt = kmw->kmw_count;
1335
1336	uintptr_t chunksize, slabsize;
1337	uintptr_t addr;
1338	const kmem_slab_t *sp;
1339	const kmem_bufctl_t *bcp;
1340	kmem_bufctl_t bc;
1341
1342	int chunks;
1343	char *kbase;
1344	void *buf;
1345	int i, ret;
1346
1347	char *valid, *ubase;
1348
1349	/*
1350	 * first, handle the 'kmem_hash' layered walk case
1351	 */
1352	if (type & KM_HASH) {
1353		/*
1354		 * We have a buffer which has been allocated out of the
1355		 * global layer. We need to make sure that it's not
1356		 * actually sitting in a magazine before we report it as
1357		 * an allocated buffer.
1358		 */
1359		buf = ((const kmem_bufctl_t *)wsp->walk_layer)->bc_addr;
1360
1361		if (magcnt > 0 &&
1362		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1363		    addrcmp) != NULL)
1364			return (WALK_NEXT);
1365
1366		if (type & KM_BUFCTL)
1367			return (bufctl_walk_callback(cp, wsp, wsp->walk_addr));
1368
1369		return (kmem_walk_callback(wsp, (uintptr_t)buf));
1370	}
1371
1372	ret = WALK_NEXT;
1373
1374	addr = kmw->kmw_addr;
1375
1376	/*
1377	 * If we're walking freed buffers, report everything in the
1378	 * magazine layer before processing the first slab.
1379	 */
1380	if ((type & KM_FREE) && magcnt != 0) {
1381		kmw->kmw_count = 0;		/* only do this once */
1382		for (i = 0; i < magcnt; i++) {
1383			buf = maglist[i];
1384
1385			if (type & KM_BUFCTL) {
1386				uintptr_t out;
1387
1388				if (cp->cache_flags & KMF_BUFTAG) {
1389					kmem_buftag_t *btp;
1390					kmem_buftag_t tag;
1391
1392					/* LINTED - alignment */
1393					btp = KMEM_BUFTAG(cp, buf);
1394					if (mdb_vread(&tag, sizeof (tag),
1395					    (uintptr_t)btp) == -1) {
1396						mdb_warn("reading buftag for "
1397						    "%p at %p", buf, btp);
1398						continue;
1399					}
1400					out = (uintptr_t)tag.bt_bufctl;
1401				} else {
1402					if (kmem_hash_lookup(cp, addr, buf,
1403					    &out) == -1)
1404						continue;
1405				}
1406				ret = bufctl_walk_callback(cp, wsp, out);
1407			} else {
1408				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1409			}
1410
1411			if (ret != WALK_NEXT)
1412				return (ret);
1413		}
1414	}
1415
1416	/*
1417	 * If they want constructed buffers, we're finished, since the
1418	 * magazine layer holds them all.
1419	 */
1420	if (type & KM_CONSTRUCTED)
1421		return (WALK_DONE);
1422
1423	/*
1424	 * Handle the buffers in the current slab
1425	 */
1426	chunksize = cp->cache_chunksize;
1427	slabsize = cp->cache_slabsize;
1428
1429	sp = wsp->walk_layer;
1430	chunks = sp->slab_chunks;
1431	kbase = sp->slab_base;
1432
1433	dprintf(("kbase is %p\n", kbase));
1434
1435	if (!(cp->cache_flags & KMF_HASH)) {
1436		valid = kmw->kmw_valid;
1437		ubase = kmw->kmw_ubase;
1438
1439		if (mdb_vread(ubase, chunks * chunksize,
1440		    (uintptr_t)kbase) == -1) {
1441			mdb_warn("failed to read slab contents at %p", kbase);
1442			return (WALK_ERR);
1443		}
1444
1445		/*
1446		 * Set up the valid map as fully allocated -- we'll punch
1447		 * out the freelist.
1448		 */
1449		if (type & KM_ALLOCATED)
1450			(void) memset(valid, 1, chunks);
1451	} else {
1452		valid = NULL;
1453		ubase = NULL;
1454	}
1455
1456	/*
1457	 * walk the slab's freelist
1458	 */
1459	bcp = sp->slab_head;
1460
1461	dprintf(("refcnt is %d; chunks is %d\n", sp->slab_refcnt, chunks));
1462
1463	/*
1464	 * since we could be in the middle of allocating a buffer,
1465	 * our refcnt could be one higher than it ought to be.  So we
1466	 * check one further on the freelist than the count allows.
1467	 */
1468	for (i = sp->slab_refcnt; i <= chunks; i++) {
1469		uint_t ndx;
1470
1471		dprintf(("bcp is %p\n", bcp));
1472
1473		if (bcp == NULL) {
1474			if (i == chunks)
1475				break;
1476			mdb_warn(
1477			    "slab %p in cache %p freelist too short by %d\n",
1478			    sp, addr, chunks - i);
1479			break;
1480		}
1481
1482		if (cp->cache_flags & KMF_HASH) {
1483			if (mdb_vread(&bc, sizeof (bc), (uintptr_t)bcp) == -1) {
1484				mdb_warn("failed to read bufctl ptr at %p",
1485				    bcp);
1486				break;
1487			}
1488			buf = bc.bc_addr;
1489		} else {
1490			/*
1491			 * Otherwise the buffer is in the slab which
1492			 * we've read in;  we just need to determine
1493			 * its offset in the slab to find the
1494			 * kmem_bufctl_t.
1495			 */
1496			bc = *((kmem_bufctl_t *)
1497			    ((uintptr_t)bcp - (uintptr_t)kbase +
1498			    (uintptr_t)ubase));
1499
1500			buf = KMEM_BUF(cp, bcp);
1501		}
1502
1503		ndx = ((uintptr_t)buf - (uintptr_t)kbase) / chunksize;
1504
1505		if (ndx > slabsize / cp->cache_bufsize) {
1506			/*
1507			 * This is very wrong; we have managed to find
1508			 * a buffer in the slab which shouldn't
1509			 * actually be here.  Emit a warning, and
1510			 * try to continue.
1511			 */
1512			mdb_warn("buf %p is out of range for "
1513			    "slab %p, cache %p\n", buf, sp, addr);
1514		} else if (type & KM_ALLOCATED) {
1515			/*
1516			 * we have found a buffer on the slab's freelist;
1517			 * clear its entry
1518			 */
1519			valid[ndx] = 0;
1520		} else {
1521			/*
1522			 * Report this freed buffer
1523			 */
1524			if (type & KM_BUFCTL) {
1525				ret = bufctl_walk_callback(cp, wsp,
1526				    (uintptr_t)bcp);
1527			} else {
1528				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1529			}
1530			if (ret != WALK_NEXT)
1531				return (ret);
1532		}
1533
1534		bcp = bc.bc_next;
1535	}
1536
1537	if (bcp != NULL) {
1538		dprintf(("slab %p in cache %p freelist too long (%p)\n",
1539		    sp, addr, bcp));
1540	}
1541
1542	/*
1543	 * If we are walking freed buffers, the loop above handled reporting
1544	 * them.
1545	 */
1546	if (type & KM_FREE)
1547		return (WALK_NEXT);
1548
1549	if (type & KM_BUFCTL) {
1550		mdb_warn("impossible situation: small-slab KM_BUFCTL walk for "
1551		    "cache %p\n", addr);
1552		return (WALK_ERR);
1553	}
1554
1555	/*
1556	 * Report allocated buffers, skipping buffers in the magazine layer.
1557	 * We only get this far for small-slab caches.
1558	 */
1559	for (i = 0; ret == WALK_NEXT && i < chunks; i++) {
1560		buf = (char *)kbase + i * chunksize;
1561
1562		if (!valid[i])
1563			continue;		/* on slab freelist */
1564
1565		if (magcnt > 0 &&
1566		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1567		    addrcmp) != NULL)
1568			continue;		/* in magazine layer */
1569
1570		ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1571	}
1572	return (ret);
1573}
1574
1575void
1576kmem_walk_fini(mdb_walk_state_t *wsp)
1577{
1578	kmem_walk_t *kmw = wsp->walk_data;
1579	uintptr_t chunksize;
1580	uintptr_t slabsize;
1581
1582	if (kmw == NULL)
1583		return;
1584
1585	if (kmw->kmw_maglist != NULL)
1586		mdb_free(kmw->kmw_maglist, kmw->kmw_max * sizeof (void *));
1587
1588	chunksize = kmw->kmw_cp->cache_chunksize;
1589	slabsize = kmw->kmw_cp->cache_slabsize;
1590
1591	if (kmw->kmw_valid != NULL)
1592		mdb_free(kmw->kmw_valid, slabsize / chunksize);
1593	if (kmw->kmw_ubase != NULL)
1594		mdb_free(kmw->kmw_ubase, slabsize + sizeof (kmem_bufctl_t));
1595
1596	mdb_free(kmw->kmw_cp, kmw->kmw_csize);
1597	mdb_free(kmw, sizeof (kmem_walk_t));
1598}
1599
1600/*ARGSUSED*/
1601static int
1602kmem_walk_all(uintptr_t addr, const kmem_cache_t *c, mdb_walk_state_t *wsp)
1603{
1604	/*
1605	 * Buffers allocated from NOTOUCH caches can also show up as freed
1606	 * memory in other caches.  This can be a little confusing, so we
1607	 * don't walk NOTOUCH caches when walking all caches (thereby assuring
1608	 * that "::walk kmem" and "::walk freemem" yield disjoint output).
1609	 */
1610	if (c->cache_cflags & KMC_NOTOUCH)
1611		return (WALK_NEXT);
1612
1613	if (mdb_pwalk(wsp->walk_data, wsp->walk_callback,
1614	    wsp->walk_cbdata, addr) == -1)
1615		return (WALK_DONE);
1616
1617	return (WALK_NEXT);
1618}
1619
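/*
 * KMEM_WALK_ALL() implements the global (no cache address) form of the
 * walkers below: it stashes the per-cache walker name in walk_data and
 * re-invokes that walker for every cache via kmem_walk_all() above.
 */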
1620#define	KMEM_WALK_ALL(name, wsp) { \
1621	wsp->walk_data = (name); \
1622	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_walk_all, wsp) == -1) \
1623		return (WALK_ERR); \
1624	return (WALK_DONE); \
1625}
1626
1627int
1628kmem_walk_init(mdb_walk_state_t *wsp)
1629{
1630	if (wsp->walk_arg != NULL)
1631		wsp->walk_addr = (uintptr_t)wsp->walk_arg;
1632
1633	if (wsp->walk_addr == NULL)
1634		KMEM_WALK_ALL("kmem", wsp);
1635	return (kmem_walk_init_common(wsp, KM_ALLOCATED));
1636}
1637
1638int
1639bufctl_walk_init(mdb_walk_state_t *wsp)
1640{
1641	if (wsp->walk_addr == NULL)
1642		KMEM_WALK_ALL("bufctl", wsp);
1643	return (kmem_walk_init_common(wsp, KM_ALLOCATED | KM_BUFCTL));
1644}
1645
1646int
1647freemem_walk_init(mdb_walk_state_t *wsp)
1648{
1649	if (wsp->walk_addr == NULL)
1650		KMEM_WALK_ALL("freemem", wsp);
1651	return (kmem_walk_init_common(wsp, KM_FREE));
1652}
1653
1654int
1655freemem_constructed_walk_init(mdb_walk_state_t *wsp)
1656{
1657	if (wsp->walk_addr == NULL)
1658		KMEM_WALK_ALL("freemem_constructed", wsp);
1659	return (kmem_walk_init_common(wsp, KM_FREE | KM_CONSTRUCTED));
1660}
1661
1662int
1663freectl_walk_init(mdb_walk_state_t *wsp)
1664{
1665	if (wsp->walk_addr == NULL)
1666		KMEM_WALK_ALL("freectl", wsp);
1667	return (kmem_walk_init_common(wsp, KM_FREE | KM_BUFCTL));
1668}
1669
1670int
1671freectl_constructed_walk_init(mdb_walk_state_t *wsp)
1672{
1673	if (wsp->walk_addr == NULL)
1674		KMEM_WALK_ALL("freectl_constructed", wsp);
1675	return (kmem_walk_init_common(wsp,
1676	    KM_FREE | KM_BUFCTL | KM_CONSTRUCTED));
1677}
1678
1679typedef struct bufctl_history_walk {
1680	void		*bhw_next;
1681	kmem_cache_t	*bhw_cache;
1682	kmem_slab_t	*bhw_slab;
1683	hrtime_t	bhw_timestamp;
1684} bufctl_history_walk_t;
1685
1686int
1687bufctl_history_walk_init(mdb_walk_state_t *wsp)
1688{
1689	bufctl_history_walk_t *bhw;
1690	kmem_bufctl_audit_t bc;
1691	kmem_bufctl_audit_t bcn;
1692
1693	if (wsp->walk_addr == NULL) {
1694		mdb_warn("bufctl_history walk doesn't support global walks\n");
1695		return (WALK_ERR);
1696	}
1697
1698	if (mdb_vread(&bc, sizeof (bc), wsp->walk_addr) == -1) {
1699		mdb_warn("unable to read bufctl at %p", wsp->walk_addr);
1700		return (WALK_ERR);
1701	}
1702
1703	bhw = mdb_zalloc(sizeof (*bhw), UM_SLEEP);
1704	bhw->bhw_timestamp = 0;
1705	bhw->bhw_cache = bc.bc_cache;
1706	bhw->bhw_slab = bc.bc_slab;
1707
1708	/*
1709	 * sometimes the first log entry matches the base bufctl;  in that
1710	 * case, skip the base bufctl.
1711	 */
1712	if (bc.bc_lastlog != NULL &&
1713	    mdb_vread(&bcn, sizeof (bcn), (uintptr_t)bc.bc_lastlog) != -1 &&
1714	    bc.bc_addr == bcn.bc_addr &&
1715	    bc.bc_cache == bcn.bc_cache &&
1716	    bc.bc_slab == bcn.bc_slab &&
1717	    bc.bc_timestamp == bcn.bc_timestamp &&
1718	    bc.bc_thread == bcn.bc_thread)
1719		bhw->bhw_next = bc.bc_lastlog;
1720	else
1721		bhw->bhw_next = (void *)wsp->walk_addr;
1722
1723	wsp->walk_addr = (uintptr_t)bc.bc_addr;
1724	wsp->walk_data = bhw;
1725
1726	return (WALK_NEXT);
1727}
1728
1729int
1730bufctl_history_walk_step(mdb_walk_state_t *wsp)
1731{
1732	bufctl_history_walk_t *bhw = wsp->walk_data;
1733	uintptr_t addr = (uintptr_t)bhw->bhw_next;
1734	uintptr_t baseaddr = wsp->walk_addr;
1735	kmem_bufctl_audit_t bc;
1736
1737	if (addr == NULL)
1738		return (WALK_DONE);
1739
1740	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
1741		mdb_warn("unable to read bufctl at %p", bhw->bhw_next);
1742		return (WALK_ERR);
1743	}
1744
1745	/*
1746	 * The bufctl is only valid if the address, cache, and slab are
1747	 * correct.  We also check that the timestamp is decreasing, to
1748	 * prevent infinite loops.
1749	 */
1750	if ((uintptr_t)bc.bc_addr != baseaddr ||
1751	    bc.bc_cache != bhw->bhw_cache ||
1752	    bc.bc_slab != bhw->bhw_slab ||
1753	    (bhw->bhw_timestamp != 0 && bc.bc_timestamp >= bhw->bhw_timestamp))
1754		return (WALK_DONE);
1755
1756	bhw->bhw_next = bc.bc_lastlog;
1757	bhw->bhw_timestamp = bc.bc_timestamp;
1758
1759	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
1760}
1761
1762void
1763bufctl_history_walk_fini(mdb_walk_state_t *wsp)
1764{
1765	bufctl_history_walk_t *bhw = wsp->walk_data;
1766
1767	mdb_free(bhw, sizeof (*bhw));
1768}
1769
1770typedef struct kmem_log_walk {
1771	kmem_bufctl_audit_t *klw_base;
1772	kmem_bufctl_audit_t **klw_sorted;
1773	kmem_log_header_t klw_lh;
1774	size_t klw_size;
1775	size_t klw_maxndx;
1776	size_t klw_ndx;
1777} kmem_log_walk_t;
1778
1779int
1780kmem_log_walk_init(mdb_walk_state_t *wsp)
1781{
1782	uintptr_t lp = wsp->walk_addr;
1783	kmem_log_walk_t *klw;
1784	kmem_log_header_t *lhp;
1785	int maxndx, i, j, k;
1786
1787	/*
1788	 * By default (global walk), walk the kmem_transaction_log.  Otherwise
1789	 * read the log whose kmem_log_header_t is stored at walk_addr.
1790	 */
1791	if (lp == NULL && mdb_readvar(&lp, "kmem_transaction_log") == -1) {
1792		mdb_warn("failed to read 'kmem_transaction_log'");
1793		return (WALK_ERR);
1794	}
1795
1796	if (lp == NULL) {
1797		mdb_warn("log is disabled\n");
1798		return (WALK_ERR);
1799	}
1800
1801	klw = mdb_zalloc(sizeof (kmem_log_walk_t), UM_SLEEP);
1802	lhp = &klw->klw_lh;
1803
1804	if (mdb_vread(lhp, sizeof (kmem_log_header_t), lp) == -1) {
1805		mdb_warn("failed to read log header at %p", lp);
1806		mdb_free(klw, sizeof (kmem_log_walk_t));
1807		return (WALK_ERR);
1808	}
1809
1810	klw->klw_size = lhp->lh_chunksize * lhp->lh_nchunks;
1811	klw->klw_base = mdb_alloc(klw->klw_size, UM_SLEEP);
1812	maxndx = lhp->lh_chunksize / sizeof (kmem_bufctl_audit_t) - 1;
1813
1814	if (mdb_vread(klw->klw_base, klw->klw_size,
1815	    (uintptr_t)lhp->lh_base) == -1) {
1816		mdb_warn("failed to read log at base %p", lhp->lh_base);
1817		mdb_free(klw->klw_base, klw->klw_size);
1818		mdb_free(klw, sizeof (kmem_log_walk_t));
1819		return (WALK_ERR);
1820	}
1821
1822	klw->klw_sorted = mdb_alloc(maxndx * lhp->lh_nchunks *
1823	    sizeof (kmem_bufctl_audit_t *), UM_SLEEP);
1824
1825	for (i = 0, k = 0; i < lhp->lh_nchunks; i++) {
1826		kmem_bufctl_audit_t *chunk = (kmem_bufctl_audit_t *)
1827		    ((uintptr_t)klw->klw_base + i * lhp->lh_chunksize);
1828
1829		for (j = 0; j < maxndx; j++)
1830			klw->klw_sorted[k++] = &chunk[j];
1831	}
1832
1833	qsort(klw->klw_sorted, k, sizeof (kmem_bufctl_audit_t *),
1834	    (int(*)(const void *, const void *))bufctlcmp);
1835
1836	klw->klw_maxndx = k;
1837	wsp->walk_data = klw;
1838
1839	return (WALK_NEXT);
1840}
1841
1842int
1843kmem_log_walk_step(mdb_walk_state_t *wsp)
1844{
1845	kmem_log_walk_t *klw = wsp->walk_data;
1846	kmem_bufctl_audit_t *bcp;
1847
1848	if (klw->klw_ndx == klw->klw_maxndx)
1849		return (WALK_DONE);
1850
1851	bcp = klw->klw_sorted[klw->klw_ndx++];
1852
1853	return (wsp->walk_callback((uintptr_t)bcp - (uintptr_t)klw->klw_base +
1854	    (uintptr_t)klw->klw_lh.lh_base, bcp, wsp->walk_cbdata));
1855}
1856
1857void
1858kmem_log_walk_fini(mdb_walk_state_t *wsp)
1859{
1860	kmem_log_walk_t *klw = wsp->walk_data;
1861
1862	mdb_free(klw->klw_base, klw->klw_size);
1863	mdb_free(klw->klw_sorted, klw->klw_maxndx *
1864	    sizeof (kmem_bufctl_audit_t *));
1865	mdb_free(klw, sizeof (kmem_log_walk_t));
1866}
1867
1868typedef struct allocdby_bufctl {
1869	uintptr_t abb_addr;
1870	hrtime_t abb_ts;
1871} allocdby_bufctl_t;
1872
1873typedef struct allocdby_walk {
1874	const char *abw_walk;
1875	uintptr_t abw_thread;
1876	size_t abw_nbufs;
1877	size_t abw_size;
1878	allocdby_bufctl_t *abw_buf;
1879	size_t abw_ndx;
1880} allocdby_walk_t;
1881
1882int
1883allocdby_walk_bufctl(uintptr_t addr, const kmem_bufctl_audit_t *bcp,
1884    allocdby_walk_t *abw)
1885{
1886	if ((uintptr_t)bcp->bc_thread != abw->abw_thread)
1887		return (WALK_NEXT);
1888
1889	if (abw->abw_nbufs == abw->abw_size) {
1890		allocdby_bufctl_t *buf;
1891		size_t oldsize = sizeof (allocdby_bufctl_t) * abw->abw_size;
1892
1893		buf = mdb_zalloc(oldsize << 1, UM_SLEEP);
1894
1895		bcopy(abw->abw_buf, buf, oldsize);
1896		mdb_free(abw->abw_buf, oldsize);
1897
1898		abw->abw_size <<= 1;
1899		abw->abw_buf = buf;
1900	}
1901
1902	abw->abw_buf[abw->abw_nbufs].abb_addr = addr;
1903	abw->abw_buf[abw->abw_nbufs].abb_ts = bcp->bc_timestamp;
1904	abw->abw_nbufs++;
1905
1906	return (WALK_NEXT);
1907}
1908
1909/*ARGSUSED*/
1910int
1911allocdby_walk_cache(uintptr_t addr, const kmem_cache_t *c, allocdby_walk_t *abw)
1912{
1913	if (mdb_pwalk(abw->abw_walk, (mdb_walk_cb_t)allocdby_walk_bufctl,
1914	    abw, addr) == -1) {
1915		mdb_warn("couldn't walk bufctl for cache %p", addr);
1916		return (WALK_DONE);
1917	}
1918
1919	return (WALK_NEXT);
1920}
1921
1922static int
1923allocdby_cmp(const allocdby_bufctl_t *lhs, const allocdby_bufctl_t *rhs)
1924{
1925	if (lhs->abb_ts < rhs->abb_ts)
1926		return (1);
1927	if (lhs->abb_ts > rhs->abb_ts)
1928		return (-1);
1929	return (0);
1930}
1931
1932static int
1933allocdby_walk_init_common(mdb_walk_state_t *wsp, const char *walk)
1934{
1935	allocdby_walk_t *abw;
1936
1937	if (wsp->walk_addr == NULL) {
1938		mdb_warn("allocdby walk doesn't support global walks\n");
1939		return (WALK_ERR);
1940	}
1941
1942	abw = mdb_zalloc(sizeof (allocdby_walk_t), UM_SLEEP);
1943
1944	abw->abw_thread = wsp->walk_addr;
1945	abw->abw_walk = walk;
1946	abw->abw_size = 128;	/* something reasonable */
1947	abw->abw_buf =
1948	    mdb_zalloc(abw->abw_size * sizeof (allocdby_bufctl_t), UM_SLEEP);
1949
1950	wsp->walk_data = abw;
1951
1952	if (mdb_walk("kmem_cache",
1953	    (mdb_walk_cb_t)allocdby_walk_cache, abw) == -1) {
1954		mdb_warn("couldn't walk kmem_cache");
1955		allocdby_walk_fini(wsp);
1956		return (WALK_ERR);
1957	}
1958
1959	qsort(abw->abw_buf, abw->abw_nbufs, sizeof (allocdby_bufctl_t),
1960	    (int(*)(const void *, const void *))allocdby_cmp);
1961
1962	return (WALK_NEXT);
1963}
1964
1965int
1966allocdby_walk_init(mdb_walk_state_t *wsp)
1967{
1968	return (allocdby_walk_init_common(wsp, "bufctl"));
1969}
1970
1971int
1972freedby_walk_init(mdb_walk_state_t *wsp)
1973{
1974	return (allocdby_walk_init_common(wsp, "freectl"));
1975}
1976
1977int
1978allocdby_walk_step(mdb_walk_state_t *wsp)
1979{
1980	allocdby_walk_t *abw = wsp->walk_data;
1981	kmem_bufctl_audit_t bc;
1982	uintptr_t addr;
1983
1984	if (abw->abw_ndx == abw->abw_nbufs)
1985		return (WALK_DONE);
1986
1987	addr = abw->abw_buf[abw->abw_ndx++].abb_addr;
1988
1989	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
1990		mdb_warn("couldn't read bufctl at %p", addr);
1991		return (WALK_DONE);
1992	}
1993
1994	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
1995}
1996
1997void
1998allocdby_walk_fini(mdb_walk_state_t *wsp)
1999{
2000	allocdby_walk_t *abw = wsp->walk_data;
2001
2002	mdb_free(abw->abw_buf, sizeof (allocdby_bufctl_t) * abw->abw_size);
2003	mdb_free(abw, sizeof (allocdby_walk_t));
2004}
2005
2006/*ARGSUSED*/
2007int
2008allocdby_walk(uintptr_t addr, const kmem_bufctl_audit_t *bcp, void *ignored)
2009{
2010	char c[MDB_SYM_NAMLEN];
2011	GElf_Sym sym;
2012	int i;
2013
2014	mdb_printf("%0?p %12llx ", addr, bcp->bc_timestamp);
2015	for (i = 0; i < bcp->bc_depth; i++) {
2016		if (mdb_lookup_by_addr(bcp->bc_stack[i],
2017		    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2018			continue;
2019		if (strncmp(c, "kmem_", 5) == 0)
2020			continue;
2021		mdb_printf("%s+0x%lx",
2022		    c, bcp->bc_stack[i] - (uintptr_t)sym.st_value);
2023		break;
2024	}
2025	mdb_printf("\n");
2026
2027	return (WALK_NEXT);
2028}
2029
2030static int
2031allocdby_common(uintptr_t addr, uint_t flags, const char *w)
2032{
2033	if (!(flags & DCMD_ADDRSPEC))
2034		return (DCMD_USAGE);
2035
2036	mdb_printf("%-?s %12s %s\n", "BUFCTL", "TIMESTAMP", "CALLER");
2037
2038	if (mdb_pwalk(w, (mdb_walk_cb_t)allocdby_walk, NULL, addr) == -1) {
2039		mdb_warn("can't walk '%s' for %p", w, addr);
2040		return (DCMD_ERR);
2041	}
2042
2043	return (DCMD_OK);
2044}
2045
2046/*ARGSUSED*/
2047int
2048allocdby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2049{
2050	return (allocdby_common(addr, flags, "allocdby"));
2051}
2052
2053/*ARGSUSED*/
2054int
2055freedby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2056{
2057	return (allocdby_common(addr, flags, "freedby"));
2058}
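
/*
 * Example usage (all addresses hypothetical): given a thread address, e.g.
 * from ::threadlist, ::allocdby and ::freedby list the bufctls recording
 * that thread's allocations and frees:
 *
 *	> ffffff0008c1c0a0::allocdby
 *	> ffffff0008c1c0a0::freedby
 *
 * Both rely on bufctl audit records, so they are only useful on caches
 * with KMF_AUDIT enabled (e.g. booted with kmem_flags set).
 */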
2059
2060/*
2061 * Return a string describing the address in relation to the given thread's
2062 * stack.
2063 *
2064 * - If the thread state is TS_FREE, return " (inactive interrupt thread)".
2065 *
2066 * - If the address is above the stack pointer, return an empty string
2067 *   signifying that the address is active.
2068 *
2069 * - If the address is below the stack pointer, and the thread is not on proc,
2070 *   return " (below sp)".
2071 *
2072 * - If the address is below the stack pointer, and the thread is on proc,
2073 *   return " (possibly below sp)".  Depending on context, we may or may not
2074 *   have an accurate t_sp.
2075 */
2076static const char *
2077stack_active(const kthread_t *t, uintptr_t addr)
2078{
2079	uintptr_t panicstk;
2080	GElf_Sym sym;
2081
2082	if (t->t_state == TS_FREE)
2083		return (" (inactive interrupt thread)");
2084
2085	/*
2086	 * Check to see if we're on the panic stack.  If so, ignore t_sp, as it
2087	 * no longer relates to the thread's real stack.
2088	 */
2089	if (mdb_lookup_by_name("panic_stack", &sym) == 0) {
2090		panicstk = (uintptr_t)sym.st_value;
2091
2092		if (t->t_sp >= panicstk && t->t_sp < panicstk + PANICSTKSIZE)
2093			return ("");
2094	}
2095
2096	if (addr >= t->t_sp + STACK_BIAS)
2097		return ("");
2098
2099	if (t->t_state == TS_ONPROC)
2100		return (" (possibly below sp)");
2101
2102	return (" (below sp)");
2103}
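
/*
 * For example (hypothetical addresses), a stack address reported by
 * ::whatis might read:
 *
 *	ffffff000fc0dbf0 is in thread ffffff0008c1c0a0's stack (below sp)
 */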
2104
2105/*
2106 * Additional state for the kmem and vmem ::whatis handlers
2107 */
2108typedef struct whatis_info {
2109	mdb_whatis_t *wi_w;
2110	const kmem_cache_t *wi_cache;
2111	const vmem_t *wi_vmem;
2112	vmem_t *wi_msb_arena;
2113	size_t wi_slab_size;
2114	uint_t wi_slab_found;
2115	uint_t wi_kmem_lite_count;
2116	uint_t wi_freemem;
2117} whatis_info_t;
2118
2119/* call one of our dcmd functions with "-v" and the provided address */
2120static void
2121whatis_call_printer(mdb_dcmd_f *dcmd, uintptr_t addr)
2122{
2123	mdb_arg_t a;
2124	a.a_type = MDB_TYPE_STRING;
2125	a.a_un.a_str = "-v";
2126
2127	mdb_printf(":\n");
2128	(void) (*dcmd)(addr, DCMD_ADDRSPEC, 1, &a);
2129}
2130
2131static void
2132whatis_print_kmf_lite(uintptr_t btaddr, size_t count)
2133{
2134#define	KMEM_LITE_MAX	16
2135	pc_t callers[KMEM_LITE_MAX];
2136	pc_t uninit = (pc_t)KMEM_UNINITIALIZED_PATTERN;
2137
2138	kmem_buftag_t bt;
2139	intptr_t stat;
2140	const char *plural = "";
2141	int i;
2142
2143	/* validate our arguments and read in the buftag */
2144	if (count == 0 || count > KMEM_LITE_MAX ||
2145	    mdb_vread(&bt, sizeof (bt), btaddr) == -1)
2146		return;
2147
2148	/* validate the buffer state and read in the callers */
2149	stat = (intptr_t)bt.bt_bufctl ^ bt.bt_bxstat;
2150
2151	if ((stat != KMEM_BUFTAG_ALLOC && stat != KMEM_BUFTAG_FREE) ||
2152	    mdb_vread(callers, count * sizeof (pc_t),
2153	    btaddr + offsetof(kmem_buftag_lite_t, bt_history)) == -1)
2154		return;
2155
2156	/* If there aren't any filled in callers, bail */
2157	if (callers[0] == uninit)
2158		return;
2159
2160	plural = (callers[1] == uninit) ? "" : "s";
2161
2162	/* Everything's done and checked; print them out */
2163	mdb_printf(":\n");
2164
2165	mdb_inc_indent(8);
2166	mdb_printf("recent caller%s: %a", plural, callers[0]);
2167	for (i = 1; i < count; i++) {
2168		if (callers[i] == uninit)
2169			break;
2170		mdb_printf(", %a", callers[i]);
2171	}
2172	mdb_dec_indent(8);
2173}
2174
2175static void
2176whatis_print_kmem(whatis_info_t *wi, uintptr_t maddr, uintptr_t addr,
2177    uintptr_t baddr)
2178{
2179	mdb_whatis_t *w = wi->wi_w;
2180
2181	const kmem_cache_t *cp = wi->wi_cache;
2182	/* LINTED pointer cast may result in improper alignment */
2183	uintptr_t btaddr = (uintptr_t)KMEM_BUFTAG(cp, addr);
2184	int quiet = (mdb_whatis_flags(w) & WHATIS_QUIET);
2185	int call_printer = (!quiet && (cp->cache_flags & KMF_AUDIT));
2186
2187	mdb_whatis_report_object(w, maddr, addr, "");
2188
2189	if (baddr != 0 && !call_printer)
2190		mdb_printf("bufctl %p ", baddr);
2191
2192	mdb_printf("%s from %s",
2193	    (wi->wi_freemem == FALSE) ? "allocated" : "freed", cp->cache_name);
2194
2195	if (baddr != 0 && call_printer) {
2196		whatis_call_printer(bufctl, baddr);
2197		return;
2198	}
2199
2200	/* for KMF_LITE caches, try to print out the previous callers */
2201	if (!quiet && (cp->cache_flags & KMF_LITE))
2202		whatis_print_kmf_lite(btaddr, wi->wi_kmem_lite_count);
2203
2204	mdb_printf("\n");
2205}
2206
2207/*ARGSUSED*/
2208static int
2209whatis_walk_kmem(uintptr_t addr, void *ignored, whatis_info_t *wi)
2210{
2211	mdb_whatis_t *w = wi->wi_w;
2212
2213	uintptr_t cur;
2214	size_t size = wi->wi_cache->cache_bufsize;
2215
2216	while (mdb_whatis_match(w, addr, size, &cur))
2217		whatis_print_kmem(wi, cur, addr, NULL);
2218
2219	return (WHATIS_WALKRET(w));
2220}
2221
2222/*ARGSUSED*/
2223static int
2224whatis_walk_bufctl(uintptr_t baddr, const kmem_bufctl_t *bcp, whatis_info_t *wi)
2225{
2226	mdb_whatis_t *w = wi->wi_w;
2227
2228	uintptr_t cur;
2229	uintptr_t addr = (uintptr_t)bcp->bc_addr;
2230	size_t size = wi->wi_cache->cache_bufsize;
2231
2232	while (mdb_whatis_match(w, addr, size, &cur))
2233		whatis_print_kmem(wi, cur, addr, baddr);
2234
2235	return (WHATIS_WALKRET(w));
2236}
2237
2238static int
2239whatis_walk_seg(uintptr_t addr, const vmem_seg_t *vs, whatis_info_t *wi)
2240{
2241	mdb_whatis_t *w = wi->wi_w;
2242
2243	size_t size = vs->vs_end - vs->vs_start;
2244	uintptr_t cur;
2245
2246	/* We're not interested in anything but alloc and free segments */
2247	if (vs->vs_type != VMEM_ALLOC && vs->vs_type != VMEM_FREE)
2248		return (WALK_NEXT);
2249
2250	while (mdb_whatis_match(w, vs->vs_start, size, &cur)) {
2251		mdb_whatis_report_object(w, cur, vs->vs_start, "");
2252
2253		/*
2254		 * If we're not printing it separately, provide the vmem_seg
2255		 * pointer if it has a stack trace.
2256		 */
2257		if ((mdb_whatis_flags(w) & WHATIS_QUIET) &&
2258		    (!(mdb_whatis_flags(w) & WHATIS_BUFCTL) ||
2259		    (vs->vs_type == VMEM_ALLOC && vs->vs_depth != 0))) {
2260			mdb_printf("vmem_seg %p ", addr);
2261		}
2262
2263		mdb_printf("%s from the %s vmem arena",
2264		    (vs->vs_type == VMEM_ALLOC) ? "allocated" : "freed",
2265		    wi->wi_vmem->vm_name);
2266
2267		if (!(mdb_whatis_flags(w) & WHATIS_QUIET))
2268			whatis_call_printer(vmem_seg, addr);
2269		else
2270			mdb_printf("\n");
2271	}
2272
2273	return (WHATIS_WALKRET(w));
2274}
2275
2276static int
2277whatis_walk_vmem(uintptr_t addr, const vmem_t *vmem, whatis_info_t *wi)
2278{
2279	mdb_whatis_t *w = wi->wi_w;
2280	const char *nm = vmem->vm_name;
2281
2282	int identifier = ((vmem->vm_cflags & VMC_IDENTIFIER) != 0);
2283	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2284
2285	if (identifier != idspace)
2286		return (WALK_NEXT);
2287
2288	wi->wi_vmem = vmem;
2289
2290	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2291		mdb_printf("Searching vmem arena %s...\n", nm);
2292
2293	if (mdb_pwalk("vmem_seg",
2294	    (mdb_walk_cb_t)whatis_walk_seg, wi, addr) == -1) {
2295		mdb_warn("can't walk vmem_seg for %p", addr);
2296		return (WALK_NEXT);
2297	}
2298
2299	return (WHATIS_WALKRET(w));
2300}
2301
2302/*ARGSUSED*/
2303static int
2304whatis_walk_slab(uintptr_t saddr, const kmem_slab_t *sp, whatis_info_t *wi)
2305{
2306	mdb_whatis_t *w = wi->wi_w;
2307
2308	/* It must overlap with the slab data, or it's not interesting */
2309	if (mdb_whatis_overlaps(w,
2310	    (uintptr_t)sp->slab_base, wi->wi_slab_size)) {
2311		wi->wi_slab_found++;
2312		return (WALK_DONE);
2313	}
2314	return (WALK_NEXT);
2315}
2316
2317static int
2318whatis_walk_cache(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2319{
2320	mdb_whatis_t *w = wi->wi_w;
2321
2322	char *walk, *freewalk;
2323	mdb_walk_cb_t func;
2324	int do_bufctl;
2325
2326	int identifier = ((c->cache_flags & KMC_IDENTIFIER) != 0);
2327	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2328
2329	if (identifier != idspace)
2330		return (WALK_NEXT);
2331
2332	/* Override the '-b' flag as necessary */
2333	if (!(c->cache_flags & KMF_HASH))
2334		do_bufctl = FALSE;	/* no bufctls to walk */
2335	else if (c->cache_flags & KMF_AUDIT)
2336		do_bufctl = TRUE;	/* we always want debugging info */
2337	else
2338		do_bufctl = ((mdb_whatis_flags(w) & WHATIS_BUFCTL) != 0);
2339
2340	if (do_bufctl) {
2341		walk = "bufctl";
2342		freewalk = "freectl";
2343		func = (mdb_walk_cb_t)whatis_walk_bufctl;
2344	} else {
2345		walk = "kmem";
2346		freewalk = "freemem";
2347		func = (mdb_walk_cb_t)whatis_walk_kmem;
2348	}
2349
2350	wi->wi_cache = c;
2351
2352	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2353		mdb_printf("Searching %s...\n", c->cache_name);
2354
2355	/*
2356	 * If more than two buffers live on each slab, figure out if we're
2357	 * interested in anything in any slab before doing the more expensive
2358	 * kmem/freemem (bufctl/freectl) walkers.
2359	 */
2360	wi->wi_slab_size = c->cache_slabsize - c->cache_maxcolor;
2361	if (!(c->cache_flags & KMF_HASH))
2362		wi->wi_slab_size -= sizeof (kmem_slab_t);
2363
2364	if ((wi->wi_slab_size / c->cache_chunksize) > 2) {
2365		wi->wi_slab_found = 0;
2366		if (mdb_pwalk("kmem_slab", (mdb_walk_cb_t)whatis_walk_slab, wi,
2367		    addr) == -1) {
2368			mdb_warn("can't find kmem_slab walker");
2369			return (WALK_DONE);
2370		}
2371		if (wi->wi_slab_found == 0)
2372			return (WALK_NEXT);
2373	}
2374
2375	wi->wi_freemem = FALSE;
2376	if (mdb_pwalk(walk, func, wi, addr) == -1) {
2377		mdb_warn("can't find %s walker", walk);
2378		return (WALK_DONE);
2379	}
2380
2381	if (mdb_whatis_done(w))
2382		return (WALK_DONE);
2383
2384	/*
2385	 * We have searched for allocated memory; now search for freed memory.
2386	 */
2387	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2388		mdb_printf("Searching %s for free memory...\n", c->cache_name);
2389
2390	wi->wi_freemem = TRUE;
2391	if (mdb_pwalk(freewalk, func, wi, addr) == -1) {
2392		mdb_warn("can't find %s walker", freewalk);
2393		return (WALK_DONE);
2394	}
2395
2396	return (WHATIS_WALKRET(w));
2397}
2398
2399static int
2400whatis_walk_touch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2401{
2402	if (c->cache_arena == wi->wi_msb_arena ||
2403	    (c->cache_cflags & KMC_NOTOUCH))
2404		return (WALK_NEXT);
2405
2406	return (whatis_walk_cache(addr, c, wi));
2407}
2408
2409static int
2410whatis_walk_metadata(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2411{
2412	if (c->cache_arena != wi->wi_msb_arena)
2413		return (WALK_NEXT);
2414
2415	return (whatis_walk_cache(addr, c, wi));
2416}
2417
2418static int
2419whatis_walk_notouch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2420{
2421	if (c->cache_arena == wi->wi_msb_arena ||
2422	    !(c->cache_cflags & KMC_NOTOUCH))
2423		return (WALK_NEXT);
2424
2425	return (whatis_walk_cache(addr, c, wi));
2426}
2427
2428static int
2429whatis_walk_thread(uintptr_t addr, const kthread_t *t, mdb_whatis_t *w)
2430{
2431	uintptr_t cur;
2432	uintptr_t saddr;
2433	size_t size;
2434
2435	/*
2436	 * Often, one calls ::whatis on an address from a thread structure.
2437	 * We use this opportunity to short circuit this case...
2438	 */
2439	while (mdb_whatis_match(w, addr, sizeof (kthread_t), &cur))
2440		mdb_whatis_report_object(w, cur, addr,
2441		    "allocated as a thread structure\n");
2442
2443	/*
2444	 * Now check the stack
2445	 */
2446	if (t->t_stkbase == NULL)
2447		return (WALK_NEXT);
2448
2449	/*
2450	 * This assumes that t_stk is the end of the stack, but it's really
2451	 * only the initial stack pointer for the thread.  Arguments to the
2452	 * initial procedure, SA(MINFRAME), etc. are all after t_stk.  So
2453	 * that 't->t_stk::whatis' reports "part of t's stack", we include
2454	 * t_stk in the range (the "+ 1", below), but the kernel should
2455	 * really include the full stack bounds where we can find it.
2456	 */
2457	saddr = (uintptr_t)t->t_stkbase;
2458	size = (uintptr_t)t->t_stk - saddr + 1;
2459	while (mdb_whatis_match(w, saddr, size, &cur))
2460		mdb_whatis_report_object(w, cur, cur,
2461		    "in thread %p's stack%s\n", addr, stack_active(t, cur));
2462
2463	return (WHATIS_WALKRET(w));
2464}
2465
2466static void
2467whatis_modctl_match(mdb_whatis_t *w, const char *name,
2468    uintptr_t base, size_t size, const char *where)
2469{
2470	uintptr_t cur;
2471
2472	/*
2473	 * Since we're searching for addresses inside a module, we report
2474	 * them as symbols.
2475	 */
2476	while (mdb_whatis_match(w, base, size, &cur))
2477		mdb_whatis_report_address(w, cur, "in %s's %s\n", name, where);
2478}
2479
2480static int
2481whatis_walk_modctl(uintptr_t addr, const struct modctl *m, mdb_whatis_t *w)
2482{
2483	char name[MODMAXNAMELEN];
2484	struct module mod;
2485	Shdr shdr;
2486
2487	if (m->mod_mp == NULL)
2488		return (WALK_NEXT);
2489
2490	if (mdb_vread(&mod, sizeof (mod), (uintptr_t)m->mod_mp) == -1) {
2491		mdb_warn("couldn't read modctl %p's module", addr);
2492		return (WALK_NEXT);
2493	}
2494
2495	if (mdb_readstr(name, sizeof (name), (uintptr_t)m->mod_modname) == -1)
2496		(void) mdb_snprintf(name, sizeof (name), "0x%p", addr);
2497
2498	whatis_modctl_match(w, name,
2499	    (uintptr_t)mod.text, mod.text_size, "text segment");
2500	whatis_modctl_match(w, name,
2501	    (uintptr_t)mod.data, mod.data_size, "data segment");
2502	whatis_modctl_match(w, name,
2503	    (uintptr_t)mod.bss, mod.bss_size, "bss segment");
2504
2505	if (mdb_vread(&shdr, sizeof (shdr), (uintptr_t)mod.symhdr) == -1) {
2506		mdb_warn("couldn't read symbol header for %p's module", addr);
2507		return (WALK_NEXT);
2508	}
2509
2510	whatis_modctl_match(w, name,
2511	    (uintptr_t)mod.symtbl, mod.nsyms * shdr.sh_entsize, "symtab");
2512	whatis_modctl_match(w, name,
2513	    (uintptr_t)mod.symspace, mod.symsize, "symtab");
2514
2515	return (WHATIS_WALKRET(w));
2516}
2517
2518/*ARGSUSED*/
2519static int
2520whatis_walk_memseg(uintptr_t addr, const struct memseg *seg, mdb_whatis_t *w)
2521{
2522	uintptr_t cur;
2523
2524	uintptr_t base = (uintptr_t)seg->pages;
2525	size_t size = (uintptr_t)seg->epages - base;
2526
2527	while (mdb_whatis_match(w, base, size, &cur)) {
2528		/* round our found pointer down to the page_t base. */
2529		size_t offset = (cur - base) % sizeof (page_t);
2530
2531		mdb_whatis_report_object(w, cur, cur - offset,
2532		    "allocated as a page structure\n");
2533	}
2534
2535	return (WHATIS_WALKRET(w));
2536}
2537
2538/*ARGSUSED*/
2539static int
2540whatis_run_modules(mdb_whatis_t *w, void *arg)
2541{
2542	if (mdb_walk("modctl", (mdb_walk_cb_t)whatis_walk_modctl, w) == -1) {
2543		mdb_warn("couldn't find modctl walker");
2544		return (1);
2545	}
2546	return (0);
2547}
2548
2549/*ARGSUSED*/
2550static int
2551whatis_run_threads(mdb_whatis_t *w, void *ignored)
2552{
2553	/*
2554	 * Now search all thread stacks.  Yes, this is a little weak; we
2555	 * can save a lot of work by first checking to see if the
2556	 * address is in segkp vs. segkmem.  But hey, computers are
2557	 * fast.
2558	 */
2559	if (mdb_walk("thread", (mdb_walk_cb_t)whatis_walk_thread, w) == -1) {
2560		mdb_warn("couldn't find thread walker");
2561		return (1);
2562	}
2563	return (0);
2564}
2565
2566/*ARGSUSED*/
2567static int
2568whatis_run_pages(mdb_whatis_t *w, void *ignored)
2569{
2570	if (mdb_walk("memseg", (mdb_walk_cb_t)whatis_walk_memseg, w) == -1) {
2571		mdb_warn("couldn't find memseg walker");
2572		return (1);
2573	}
2574	return (0);
2575}
2576
2577/*ARGSUSED*/
2578static int
2579whatis_run_kmem(mdb_whatis_t *w, void *ignored)
2580{
2581	whatis_info_t wi;
2582
2583	bzero(&wi, sizeof (wi));
2584	wi.wi_w = w;
2585
2586	if (mdb_readvar(&wi.wi_msb_arena, "kmem_msb_arena") == -1)
2587		mdb_warn("unable to readvar \"kmem_msb_arena\"");
2588
2589	if (mdb_readvar(&wi.wi_kmem_lite_count,
2590	    "kmem_lite_count") == -1 || wi.wi_kmem_lite_count > 16)
2591		wi.wi_kmem_lite_count = 0;
2592
2593	/*
2594	 * We process kmem caches in the following order:
2595	 *
2596	 *	non-KMC_NOTOUCH, non-metadata	(typically the most interesting)
2597	 *	metadata			(can be huge with KMF_AUDIT)
2598	 *	KMC_NOTOUCH, non-metadata	(see kmem_walk_all())
2599	 */
2600	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_touch,
2601	    &wi) == -1 ||
2602	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_metadata,
2603	    &wi) == -1 ||
2604	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_notouch,
2605	    &wi) == -1) {
2606		mdb_warn("couldn't find kmem_cache walker");
2607		return (1);
2608	}
2609	return (0);
2610}
2611
2612/*ARGSUSED*/
2613static int
2614whatis_run_vmem(mdb_whatis_t *w, void *ignored)
2615{
2616	whatis_info_t wi;
2617
2618	bzero(&wi, sizeof (wi));
2619	wi.wi_w = w;
2620
2621	if (mdb_walk("vmem_postfix",
2622	    (mdb_walk_cb_t)whatis_walk_vmem, &wi) == -1) {
2623		mdb_warn("couldn't find vmem_postfix walker");
2624		return (1);
2625	}
2626	return (0);
2627}
2628
2629typedef struct kmem_log_cpu {
2630	uintptr_t kmc_low;
2631	uintptr_t kmc_high;
2632} kmem_log_cpu_t;
2633
2634typedef struct kmem_log_data {
2635	uintptr_t kmd_addr;
2636	kmem_log_cpu_t *kmd_cpu;
2637} kmem_log_data_t;
2638
2639int
2640kmem_log_walk(uintptr_t addr, const kmem_bufctl_audit_t *b,
2641    kmem_log_data_t *kmd)
2642{
2643	int i;
2644	kmem_log_cpu_t *kmc = kmd->kmd_cpu;
2645	size_t bufsize;
2646
2647	for (i = 0; i < NCPU; i++) {
2648		if (addr >= kmc[i].kmc_low && addr < kmc[i].kmc_high)
2649			break;
2650	}
2651
2652	if (kmd->kmd_addr) {
2653		if (b->bc_cache == NULL)
2654			return (WALK_NEXT);
2655
2656		if (mdb_vread(&bufsize, sizeof (bufsize),
2657		    (uintptr_t)&b->bc_cache->cache_bufsize) == -1) {
2658			mdb_warn(
2659			    "failed to read cache_bufsize for cache at %p",
2660			    b->bc_cache);
2661			return (WALK_ERR);
2662		}
2663
2664		if (kmd->kmd_addr < (uintptr_t)b->bc_addr ||
2665		    kmd->kmd_addr >= (uintptr_t)b->bc_addr + bufsize)
2666			return (WALK_NEXT);
2667	}
2668
2669	if (i == NCPU)
2670		mdb_printf("   ");
2671	else
2672		mdb_printf("%3d", i);
2673
2674	mdb_printf(" %0?p %0?p %16llx %0?p\n", addr, b->bc_addr,
2675	    b->bc_timestamp, b->bc_thread);
2676
2677	return (WALK_NEXT);
2678}
2679
2680/*ARGSUSED*/
2681int
2682kmem_log(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2683{
2684	kmem_log_header_t lh;
2685	kmem_cpu_log_header_t clh;
2686	uintptr_t lhp, clhp;
2687	int ncpus;
2688	uintptr_t *cpu;
2689	GElf_Sym sym;
2690	kmem_log_cpu_t *kmc;
2691	int i;
2692	kmem_log_data_t kmd;
2693	uint_t opt_b = FALSE;
2694
2695	if (mdb_getopts(argc, argv,
2696	    'b', MDB_OPT_SETBITS, TRUE, &opt_b, NULL) != argc)
2697		return (DCMD_USAGE);
2698
2699	if (mdb_readvar(&lhp, "kmem_transaction_log") == -1) {
2700		mdb_warn("failed to read 'kmem_transaction_log'");
2701		return (DCMD_ERR);
2702	}
2703
2704	if (lhp == NULL) {
2705		mdb_warn("no kmem transaction log\n");
2706		return (DCMD_ERR);
2707	}
2708
2709	mdb_readvar(&ncpus, "ncpus");
2710
2711	if (mdb_vread(&lh, sizeof (kmem_log_header_t), lhp) == -1) {
2712		mdb_warn("failed to read log header at %p", lhp);
2713		return (DCMD_ERR);
2714	}
2715
2716	clhp = lhp + ((uintptr_t)&lh.lh_cpu[0] - (uintptr_t)&lh);
2717
2718	cpu = mdb_alloc(sizeof (uintptr_t) * NCPU, UM_SLEEP | UM_GC);
2719
2720	if (mdb_lookup_by_name("cpu", &sym) == -1) {
2721		mdb_warn("couldn't find 'cpu' array");
2722		return (DCMD_ERR);
2723	}
2724
2725	if (sym.st_size != NCPU * sizeof (uintptr_t)) {
2726		mdb_warn("expected 'cpu' to be of size %d; found %d\n",
2727		    NCPU * sizeof (uintptr_t), sym.st_size);
2728		return (DCMD_ERR);
2729	}
2730
2731	if (mdb_vread(cpu, sym.st_size, (uintptr_t)sym.st_value) == -1) {
2732		mdb_warn("failed to read cpu array at %p", sym.st_value);
2733		return (DCMD_ERR);
2734	}
2735
2736	kmc = mdb_zalloc(sizeof (kmem_log_cpu_t) * NCPU, UM_SLEEP | UM_GC);
2737	kmd.kmd_addr = NULL;
2738	kmd.kmd_cpu = kmc;
2739
2740	for (i = 0; i < NCPU; i++) {
2741
2742		if (cpu[i] == NULL)
2743			continue;
2744
2745		if (mdb_vread(&clh, sizeof (clh), clhp) == -1) {
2746			mdb_warn("cannot read cpu %d's log header at %p",
2747			    i, clhp);
2748			return (DCMD_ERR);
2749		}
2750
2751		kmc[i].kmc_low = clh.clh_chunk * lh.lh_chunksize +
2752		    (uintptr_t)lh.lh_base;
2753		kmc[i].kmc_high = (uintptr_t)clh.clh_current;
2754
2755		clhp += sizeof (kmem_cpu_log_header_t);
2756	}
2757
2758	mdb_printf("%3s %-?s %-?s %16s %-?s\n", "CPU", "ADDR", "BUFADDR",
2759	    "TIMESTAMP", "THREAD");
2760
2761	/*
2762	 * If we have been passed an address, print out only log entries
2763	 * corresponding to that address: with '-b', the address is a buffer
2764	 * address; otherwise it is a bufctl and only that entry is displayed.
2765	 */
2766	if (flags & DCMD_ADDRSPEC) {
2767		kmem_bufctl_audit_t b;
2768
2769		if (opt_b) {
2770			kmd.kmd_addr = addr;
2771		} else {
2772			if (mdb_vread(&b,
2773			    sizeof (kmem_bufctl_audit_t), addr) == -1) {
2774				mdb_warn("failed to read bufctl at %p", addr);
2775				return (DCMD_ERR);
2776			}
2777
2778			(void) kmem_log_walk(addr, &b, &kmd);
2779
2780			return (DCMD_OK);
2781		}
2782	}
2783
2784	if (mdb_walk("kmem_log", (mdb_walk_cb_t)kmem_log_walk, &kmd) == -1) {
2785		mdb_warn("can't find kmem log walker");
2786		return (DCMD_ERR);
2787	}
2788
2789	return (DCMD_OK);
2790}
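
/*
 * Example usage (hypothetical address): with no address, every entry in the
 * transaction log is displayed; '-b' treats the given address as a buffer
 * and shows all entries for it:
 *
 *	> ::kmem_log
 *	> ffffff01d2a38000::kmem_log -b
 *
 * The transaction log is normally only present when kernel memory auditing
 * (KMF_AUDIT via kmem_flags) is enabled.
 */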
2791
2792typedef struct bufctl_history_cb {
2793	int		bhc_flags;
2794	int		bhc_argc;
2795	const mdb_arg_t	*bhc_argv;
2796	int		bhc_ret;
2797} bufctl_history_cb_t;
2798
2799/*ARGSUSED*/
2800static int
2801bufctl_history_callback(uintptr_t addr, const void *ign, void *arg)
2802{
2803	bufctl_history_cb_t *bhc = arg;
2804
2805	bhc->bhc_ret =
2806	    bufctl(addr, bhc->bhc_flags, bhc->bhc_argc, bhc->bhc_argv);
2807
2808	bhc->bhc_flags &= ~DCMD_LOOPFIRST;
2809
2810	return ((bhc->bhc_ret == DCMD_OK)? WALK_NEXT : WALK_DONE);
2811}
2812
2813void
2814bufctl_help(void)
2815{
2816	mdb_printf("%s",
2817"Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n\n");
2818	mdb_dec_indent(2);
2819	mdb_printf("%<b>OPTIONS%</b>\n");
2820	mdb_inc_indent(2);
2821	mdb_printf("%s",
2822"  -v    Display the full content of the bufctl, including its stack trace\n"
2823"  -h    retrieve the bufctl's transaction history, if available\n"
2824"  -a addr\n"
2825"        filter out bufctls not involving the buffer at addr\n"
2826"  -c caller\n"
2827"        filter out bufctls without the function/PC in their stack trace\n"
2828"  -e earliest\n"
2829"        filter out bufctls timestamped before earliest\n"
2830"  -l latest\n"
2831"        filter out bufctls timestamped after latest\n"
2832"  -t thread\n"
2833"        filter out bufctls not involving thread\n");
2834}
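
/*
 * Example usage (hypothetical addresses): bufctl addresses typically come
 * from a cache's "bufctl" walker, and can then be filtered and displayed:
 *
 *	> ffffff01c0024008::walk bufctl | ::bufctl -v
 *	> ffffff01c0024008::walk bufctl | ::bufctl -t ffffff0008c1c0a0
 */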
2835
2836int
2837bufctl(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2838{
2839	kmem_bufctl_audit_t bc;
2840	uint_t verbose = FALSE;
2841	uint_t history = FALSE;
2842	uint_t in_history = FALSE;
2843	uintptr_t caller = NULL, thread = NULL;
2844	uintptr_t laddr, haddr, baddr = NULL;
2845	hrtime_t earliest = 0, latest = 0;
2846	int i, depth;
2847	char c[MDB_SYM_NAMLEN];
2848	GElf_Sym sym;
2849
2850	if (mdb_getopts(argc, argv,
2851	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
2852	    'h', MDB_OPT_SETBITS, TRUE, &history,
2853	    'H', MDB_OPT_SETBITS, TRUE, &in_history,		/* internal */
2854	    'c', MDB_OPT_UINTPTR, &caller,
2855	    't', MDB_OPT_UINTPTR, &thread,
2856	    'e', MDB_OPT_UINT64, &earliest,
2857	    'l', MDB_OPT_UINT64, &latest,
2858	    'a', MDB_OPT_UINTPTR, &baddr, NULL) != argc)
2859		return (DCMD_USAGE);
2860
2861	if (!(flags & DCMD_ADDRSPEC))
2862		return (DCMD_USAGE);
2863
2864	if (in_history && !history)
2865		return (DCMD_USAGE);
2866
2867	if (history && !in_history) {
2868		mdb_arg_t *nargv = mdb_zalloc(sizeof (*nargv) * (argc + 1),
2869		    UM_SLEEP | UM_GC);
2870		bufctl_history_cb_t bhc;
2871
2872		nargv[0].a_type = MDB_TYPE_STRING;
2873		nargv[0].a_un.a_str = "-H";		/* prevent recursion */
2874
2875		for (i = 0; i < argc; i++)
2876			nargv[i + 1] = argv[i];
2877
2878		/*
2879		 * When in history mode, we treat each element as if it
2880		 * were in a separate loop, so that the headers group
2881		 * bufctls with similar histories.
2882		 */
2883		bhc.bhc_flags = flags | DCMD_LOOP | DCMD_LOOPFIRST;
2884		bhc.bhc_argc = argc + 1;
2885		bhc.bhc_argv = nargv;
2886		bhc.bhc_ret = DCMD_OK;
2887
2888		if (mdb_pwalk("bufctl_history", bufctl_history_callback, &bhc,
2889		    addr) == -1) {
2890			mdb_warn("unable to walk bufctl_history");
2891			return (DCMD_ERR);
2892		}
2893
2894		if (bhc.bhc_ret == DCMD_OK && !(flags & DCMD_PIPE_OUT))
2895			mdb_printf("\n");
2896
2897		return (bhc.bhc_ret);
2898	}
2899
2900	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
2901		if (verbose) {
2902			mdb_printf("%16s %16s %16s %16s\n"
2903			    "%<u>%16s %16s %16s %16s%</u>\n",
2904			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD",
2905			    "", "CACHE", "LASTLOG", "CONTENTS");
2906		} else {
2907			mdb_printf("%<u>%-?s %-?s %-12s %-?s %s%</u>\n",
2908			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD", "CALLER");
2909		}
2910	}
2911
2912	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2913		mdb_warn("couldn't read bufctl at %p", addr);
2914		return (DCMD_ERR);
2915	}
2916
2917	/*
2918	 * Guard against bogus bc_depth in case the bufctl is corrupt or
2919	 * the address does not really refer to a bufctl.
2920	 */
2921	depth = MIN(bc.bc_depth, KMEM_STACK_DEPTH);
2922
2923	if (caller != NULL) {
2924		laddr = caller;
2925		haddr = caller + sizeof (caller);
2926
2927		if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c, sizeof (c),
2928		    &sym) != -1 && caller == (uintptr_t)sym.st_value) {
2929			/*
2930			 * We were provided an exact symbol value; any
2931			 * address in the function is valid.
2932			 */
2933			laddr = (uintptr_t)sym.st_value;
2934			haddr = (uintptr_t)sym.st_value + sym.st_size;
2935		}
2936
2937		for (i = 0; i < depth; i++)
2938			if (bc.bc_stack[i] >= laddr && bc.bc_stack[i] < haddr)
2939				break;
2940
2941		if (i == depth)
2942			return (DCMD_OK);
2943	}
2944
2945	if (thread != NULL && (uintptr_t)bc.bc_thread != thread)
2946		return (DCMD_OK);
2947
2948	if (earliest != 0 && bc.bc_timestamp < earliest)
2949		return (DCMD_OK);
2950
2951	if (latest != 0 && bc.bc_timestamp > latest)
2952		return (DCMD_OK);
2953
2954	if (baddr != 0 && (uintptr_t)bc.bc_addr != baddr)
2955		return (DCMD_OK);
2956
2957	if (flags & DCMD_PIPE_OUT) {
2958		mdb_printf("%#lr\n", addr);
2959		return (DCMD_OK);
2960	}
2961
2962	if (verbose) {
2963		mdb_printf(
2964		    "%<b>%16p%</b> %16p %16llx %16p\n"
2965		    "%16s %16p %16p %16p\n",
2966		    addr, bc.bc_addr, bc.bc_timestamp, bc.bc_thread,
2967		    "", bc.bc_cache, bc.bc_lastlog, bc.bc_contents);
2968
2969		mdb_inc_indent(17);
2970		for (i = 0; i < depth; i++)
2971			mdb_printf("%a\n", bc.bc_stack[i]);
2972		mdb_dec_indent(17);
2973		mdb_printf("\n");
2974	} else {
2975		mdb_printf("%0?p %0?p %12llx %0?p", addr, bc.bc_addr,
2976		    bc.bc_timestamp, bc.bc_thread);
2977
2978		for (i = 0; i < depth; i++) {
2979			if (mdb_lookup_by_addr(bc.bc_stack[i],
2980			    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2981				continue;
2982			if (strncmp(c, "kmem_", 5) == 0)
2983				continue;
2984			mdb_printf(" %a\n", bc.bc_stack[i]);
2985			break;
2986		}
2987
2988		if (i >= depth)
2989			mdb_printf("\n");
2990	}
2991
2992	return (DCMD_OK);
2993}
2994
2995typedef struct kmem_verify {
2996	uint64_t *kmv_buf;		/* buffer to read cache contents into */
2997	size_t kmv_size;		/* number of bytes in kmv_buf */
2998	int kmv_corruption;		/* > 0 if corruption found. */
2999	int kmv_besilent;		/* report actual corruption sites */
3000	struct kmem_cache kmv_cache;	/* the cache we're operating on */
3001} kmem_verify_t;
3002
3003/*
3004 * verify_pattern()
3005 * 	verify that buf is filled with the pattern pat.
3006 */
3007static int64_t
3008verify_pattern(uint64_t *buf_arg, size_t size, uint64_t pat)
3009{
3010	/*LINTED*/
3011	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
3012	uint64_t *buf;
3013
3014	for (buf = buf_arg; buf < bufend; buf++)
3015		if (*buf != pat)
3016			return ((uintptr_t)buf - (uintptr_t)buf_arg);
3017	return (-1);
3018}
3019
3020/*
3021 * verify_buftag()
3022 *	verify that btp->bt_bxstat == (bcp ^ pat)
3023 */
3024static int
3025verify_buftag(kmem_buftag_t *btp, uintptr_t pat)
3026{
3027	return (btp->bt_bxstat == ((intptr_t)btp->bt_bufctl ^ pat) ? 0 : -1);
3028}
3029
3030/*
3031 * verify_free()
3032 * 	verify the integrity of a free block of memory by checking
3033 * 	that it is filled with 0xdeadbeef and that its buftag is sane.
3034 */
3035/*ARGSUSED1*/
3036static int
3037verify_free(uintptr_t addr, const void *data, void *private)
3038{
3039	kmem_verify_t *kmv = (kmem_verify_t *)private;
3040	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3041	int64_t corrupt;		/* corruption offset */
3042	kmem_buftag_t *buftagp;		/* ptr to buftag */
3043	kmem_cache_t *cp = &kmv->kmv_cache;
3044	int besilent = kmv->kmv_besilent;
3045
3046	/*LINTED*/
3047	buftagp = KMEM_BUFTAG(cp, buf);
3048
3049	/*
3050	 * Read the buffer to check.
3051	 */
3052	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3053		if (!besilent)
3054			mdb_warn("couldn't read %p", addr);
3055		return (WALK_NEXT);
3056	}
3057
3058	if ((corrupt = verify_pattern(buf, cp->cache_verify,
3059	    KMEM_FREE_PATTERN)) >= 0) {
3060		if (!besilent)
3061			mdb_printf("buffer %p (free) seems corrupted, at %p\n",
3062			    addr, (uintptr_t)addr + corrupt);
3063		goto corrupt;
3064	}
3065	/*
3066	 * When KMF_LITE is set, buftagp->bt_redzone is used to hold
3067	 * the first bytes of the buffer, hence we cannot check for red
3068	 * zone corruption.
3069	 */
3070	if ((cp->cache_flags & (KMF_HASH | KMF_LITE)) == KMF_HASH &&
3071	    buftagp->bt_redzone != KMEM_REDZONE_PATTERN) {
3072		if (!besilent)
3073			mdb_printf("buffer %p (free) seems to "
3074			    "have a corrupt redzone pattern\n", addr);
3075		goto corrupt;
3076	}
3077
3078	/*
3079	 * confirm bufctl pointer integrity.
3080	 */
3081	if (verify_buftag(buftagp, KMEM_BUFTAG_FREE) == -1) {
3082		if (!besilent)
3083			mdb_printf("buffer %p (free) has a corrupt "
3084			    "buftag\n", addr);
3085		goto corrupt;
3086	}
3087
3088	return (WALK_NEXT);
3089corrupt:
3090	kmv->kmv_corruption++;
3091	return (WALK_NEXT);
3092}
3093
3094/*
3095 * verify_alloc()
3096 * 	Verify that the buftag of an allocated buffer makes sense with respect
3097 * 	to the buffer.
3098 */
3099/*ARGSUSED1*/
3100static int
3101verify_alloc(uintptr_t addr, const void *data, void *private)
3102{
3103	kmem_verify_t *kmv = (kmem_verify_t *)private;
3104	kmem_cache_t *cp = &kmv->kmv_cache;
3105	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3106	/*LINTED*/
3107	kmem_buftag_t *buftagp = KMEM_BUFTAG(cp, buf);
3108	uint32_t *ip = (uint32_t *)buftagp;
3109	uint8_t *bp = (uint8_t *)buf;
3110	int looks_ok = 0, size_ok = 1;	/* flags for finding corruption */
3111	int besilent = kmv->kmv_besilent;
3112
3113	/*
3114	 * Read the buffer to check.
3115	 */
3116	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3117		if (!besilent)
3118			mdb_warn("couldn't read %p", addr);
3119		return (WALK_NEXT);
3120	}
3121
3122	/*
3123	 * There are two cases to handle:
3124	 * 1. If the buf was alloc'd using kmem_cache_alloc, it will have
3125	 *    0xfeedfacefeedface at the end of it
3126	 * 2. If the buf was alloc'd using kmem_alloc, it will have
3127	 *    0xbb just past the end of the region in use.  At the buftag,
3128	 *    it will have 0xfeedface (or, if the whole buffer is in use,
3129	 *    0xfeedface & bb000000 or 0xfeedfacf & 000000bb depending on
3130	 *    endianness), followed by 32 bits containing the offset of the
3131	 *    0xbb byte in the buffer.
3132	 *
3133	 * Finally, the two 32-bit words that comprise the second half of the
3134	 * buftag should xor to KMEM_BUFTAG_ALLOC
3135	 */
3136
3137	if (buftagp->bt_redzone == KMEM_REDZONE_PATTERN)
3138		looks_ok = 1;
3139	else if (!KMEM_SIZE_VALID(ip[1]))
3140		size_ok = 0;
3141	else if (bp[KMEM_SIZE_DECODE(ip[1])] == KMEM_REDZONE_BYTE)
3142		looks_ok = 1;
3143	else
3144		size_ok = 0;
3145
3146	if (!size_ok) {
3147		if (!besilent)
3148			mdb_printf("buffer %p (allocated) has a corrupt "
3149			    "redzone size encoding\n", addr);
3150		goto corrupt;
3151	}
3152
3153	if (!looks_ok) {
3154		if (!besilent)
3155			mdb_printf("buffer %p (allocated) has a corrupt "
3156			    "redzone signature\n", addr);
3157		goto corrupt;
3158	}
3159
3160	if (verify_buftag(buftagp, KMEM_BUFTAG_ALLOC) == -1) {
3161		if (!besilent)
3162			mdb_printf("buffer %p (allocated) has a "
3163			    "corrupt buftag\n", addr);
3164		goto corrupt;
3165	}
3166
3167	return (WALK_NEXT);
3168corrupt:
3169	kmv->kmv_corruption++;
3170	return (WALK_NEXT);
3171}
3172
3173/*ARGSUSED2*/
3174int
3175kmem_verify(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3176{
3177	if (flags & DCMD_ADDRSPEC) {
3178		int check_alloc = 0, check_free = 0;
3179		kmem_verify_t kmv;
3180
3181		if (mdb_vread(&kmv.kmv_cache, sizeof (kmv.kmv_cache),
3182		    addr) == -1) {
3183			mdb_warn("couldn't read kmem_cache %p", addr);
3184			return (DCMD_ERR);
3185		}
3186
3187		kmv.kmv_size = kmv.kmv_cache.cache_buftag +
3188		    sizeof (kmem_buftag_t);
3189		kmv.kmv_buf = mdb_alloc(kmv.kmv_size, UM_SLEEP | UM_GC);
3190		kmv.kmv_corruption = 0;
3191
3192		if ((kmv.kmv_cache.cache_flags & KMF_REDZONE)) {
3193			check_alloc = 1;
3194			if (kmv.kmv_cache.cache_flags & KMF_DEADBEEF)
3195				check_free = 1;
3196		} else {
3197			if (!(flags & DCMD_LOOP)) {
3198				mdb_warn("cache %p (%s) does not have "
3199				    "redzone checking enabled\n", addr,
3200				    kmv.kmv_cache.cache_name);
3201			}
3202			return (DCMD_ERR);
3203		}
3204
3205		if (flags & DCMD_LOOP) {
3206			/*
3207			 * table mode, don't print out every corrupt buffer
3208			 */
3209			kmv.kmv_besilent = 1;
3210		} else {
3211			mdb_printf("Summary for cache '%s'\n",
3212			    kmv.kmv_cache.cache_name);
3213			mdb_inc_indent(2);
3214			kmv.kmv_besilent = 0;
3215		}
3216
3217		if (check_alloc)
3218			(void) mdb_pwalk("kmem", verify_alloc, &kmv, addr);
3219		if (check_free)
3220			(void) mdb_pwalk("freemem", verify_free, &kmv, addr);
3221
3222		if (flags & DCMD_LOOP) {
3223			if (kmv.kmv_corruption == 0) {
3224				mdb_printf("%-*s %?p clean\n",
3225				    KMEM_CACHE_NAMELEN,
3226				    kmv.kmv_cache.cache_name, addr);
3227			} else {
3228				char *s = "";	/* optional s in "buffer[s]" */
3229				if (kmv.kmv_corruption > 1)
3230					s = "s";
3231
3232				mdb_printf("%-*s %?p %d corrupt buffer%s\n",
3233				    KMEM_CACHE_NAMELEN,
3234				    kmv.kmv_cache.cache_name, addr,
3235				    kmv.kmv_corruption, s);
3236			}
3237		} else {
3238			/*
3239			 * typed addr::kmem_verify.  If the cache was clean,
3240			 * nothing will have been printed yet, so say something.
3241			 * nothing will have yet been printed. So say something.
3242			 */
3243			if (kmv.kmv_corruption == 0)
3244				mdb_printf("clean\n");
3245
3246			mdb_dec_indent(2);
3247		}
3248	} else {
3249		/*
3250		 * If the user didn't specify a cache to verify, we'll walk all
3251		 * kmem_cache's, specifying ourself as a callback for each...
3252		 * this is the equivalent of '::walk kmem_cache .::kmem_verify'
3253		 */
3254		mdb_printf("%<u>%-*s %-?s %-20s%</u>\n", KMEM_CACHE_NAMELEN,
3255		    "Cache Name", "Addr", "Cache Integrity");
3256		(void) (mdb_walk_dcmd("kmem_cache", "kmem_verify", 0, NULL));
3257	}
3258
3259	return (DCMD_OK);
3260}
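
/*
 * Example usage (hypothetical address): with no address, every cache with
 * redzone checking enabled is verified and summarized one line per cache;
 * with a cache address, a detailed report is printed for that cache alone:
 *
 *	> ::kmem_verify
 *	> ffffff01c0024008::kmem_verify
 */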
3261
3262typedef struct vmem_node {
3263	struct vmem_node *vn_next;
3264	struct vmem_node *vn_parent;
3265	struct vmem_node *vn_sibling;
3266	struct vmem_node *vn_children;
3267	uintptr_t vn_addr;
3268	int vn_marked;
3269	vmem_t vn_vmem;
3270} vmem_node_t;
3271
3272typedef struct vmem_walk {
3273	vmem_node_t *vw_root;
3274	vmem_node_t *vw_current;
3275} vmem_walk_t;
3276
3277int
3278vmem_walk_init(mdb_walk_state_t *wsp)
3279{
3280	uintptr_t vaddr, paddr;
3281	vmem_node_t *head = NULL, *root = NULL, *current = NULL, *parent, *vp;
3282	vmem_walk_t *vw;
3283
3284	if (mdb_readvar(&vaddr, "vmem_list") == -1) {
3285		mdb_warn("couldn't read 'vmem_list'");
3286		return (WALK_ERR);
3287	}
3288
3289	while (vaddr != NULL) {
3290		vp = mdb_zalloc(sizeof (vmem_node_t), UM_SLEEP);
3291		vp->vn_addr = vaddr;
3292		vp->vn_next = head;
3293		head = vp;
3294
3295		if (vaddr == wsp->walk_addr)
3296			current = vp;
3297
3298		if (mdb_vread(&vp->vn_vmem, sizeof (vmem_t), vaddr) == -1) {
3299			mdb_warn("couldn't read vmem_t at %p", vaddr);
3300			goto err;
3301		}
3302
3303		vaddr = (uintptr_t)vp->vn_vmem.vm_next;
3304	}
3305
3306	for (vp = head; vp != NULL; vp = vp->vn_next) {
3307
3308		if ((paddr = (uintptr_t)vp->vn_vmem.vm_source) == NULL) {
3309			vp->vn_sibling = root;
3310			root = vp;
3311			continue;
3312		}
3313
3314		for (parent = head; parent != NULL; parent = parent->vn_next) {
3315			if (parent->vn_addr != paddr)
3316				continue;
3317			vp->vn_sibling = parent->vn_children;
3318			parent->vn_children = vp;
3319			vp->vn_parent = parent;
3320			break;
3321		}
3322
3323		if (parent == NULL) {
3324			mdb_warn("couldn't find %p's parent (%p)\n",
3325			    vp->vn_addr, paddr);
3326			goto err;
3327		}
3328	}
3329
3330	vw = mdb_zalloc(sizeof (vmem_walk_t), UM_SLEEP);
3331	vw->vw_root = root;
3332
3333	if (current != NULL)
3334		vw->vw_current = current;
3335	else
3336		vw->vw_current = root;
3337
3338	wsp->walk_data = vw;
3339	return (WALK_NEXT);
3340err:
3341	for (vp = head; head != NULL; vp = head) {
3342		head = vp->vn_next;
3343		mdb_free(vp, sizeof (vmem_node_t));
3344	}
3345
3346	return (WALK_ERR);
3347}
3348
3349int
3350vmem_walk_step(mdb_walk_state_t *wsp)
3351{
3352	vmem_walk_t *vw = wsp->walk_data;
3353	vmem_node_t *vp;
3354	int rval;
3355
3356	if ((vp = vw->vw_current) == NULL)
3357		return (WALK_DONE);
3358
3359	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3360
3361	if (vp->vn_children != NULL) {
3362		vw->vw_current = vp->vn_children;
3363		return (rval);
3364	}
3365
3366	do {
3367		vw->vw_current = vp->vn_sibling;
3368		vp = vp->vn_parent;
3369	} while (vw->vw_current == NULL && vp != NULL);
3370
3371	return (rval);
3372}
3373
3374/*
3375 * The "vmem_postfix" walk walks the vmem arenas in post-fix order; all
3376 * children are visited before their parent.  We perform the postfix walk
3377 * iteratively (rather than recursively) to allow mdb to regain control
3378 * after each callback.
3379 */
3380int
3381vmem_postfix_walk_step(mdb_walk_state_t *wsp)
3382{
3383	vmem_walk_t *vw = wsp->walk_data;
3384	vmem_node_t *vp = vw->vw_current;
3385	int rval;
3386
3387	/*
3388	 * If this node is marked, then we know that we have already visited
3389	 * all of its children.  If the node has any siblings, they need to
3390	 * be visited next; otherwise, we need to visit the parent.  Note
3391	 * that vp->vn_marked will only be zero on the first invocation of
3392	 * the step function.
3393	 */
3394	if (vp->vn_marked) {
3395		if (vp->vn_sibling != NULL)
3396			vp = vp->vn_sibling;
3397		else if (vp->vn_parent != NULL)
3398			vp = vp->vn_parent;
3399		else {
3400			/*
3401			 * We have neither a parent, nor a sibling, and we
3402			 * have already been visited; we're done.
3403			 */
3404			return (WALK_DONE);
3405		}
3406	}
3407
3408	/*
3409	 * Before we visit this node, visit its children.
3410	 */
3411	while (vp->vn_children != NULL && !vp->vn_children->vn_marked)
3412		vp = vp->vn_children;
3413
3414	vp->vn_marked = 1;
3415	vw->vw_current = vp;
3416	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3417
3418	return (rval);
3419}
3420
3421void
3422vmem_walk_fini(mdb_walk_state_t *wsp)
3423{
3424	vmem_walk_t *vw = wsp->walk_data;
3425	vmem_node_t *root = vw->vw_root;
3426	int done;
3427
3428	if (root == NULL)
3429		return;
3430
3431	if ((vw->vw_root = root->vn_children) != NULL)
3432		vmem_walk_fini(wsp);
3433
3434	vw->vw_root = root->vn_sibling;
3435	done = (root->vn_sibling == NULL && root->vn_parent == NULL);
3436	mdb_free(root, sizeof (vmem_node_t));
3437
3438	if (done) {
3439		mdb_free(vw, sizeof (vmem_walk_t));
3440	} else {
3441		vmem_walk_fini(wsp);
3442	}
3443}
3444
3445typedef struct vmem_seg_walk {
3446	uint8_t vsw_type;
3447	uintptr_t vsw_start;
3448	uintptr_t vsw_current;
3449} vmem_seg_walk_t;
3450
3451/*ARGSUSED*/
3452int
3453vmem_seg_walk_common_init(mdb_walk_state_t *wsp, uint8_t type, char *name)
3454{
3455	vmem_seg_walk_t *vsw;
3456
3457	if (wsp->walk_addr == NULL) {
3458		mdb_warn("vmem_%s does not support global walks\n", name);
3459		return (WALK_ERR);
3460	}
3461
3462	wsp->walk_data = vsw = mdb_alloc(sizeof (vmem_seg_walk_t), UM_SLEEP);
3463
3464	vsw->vsw_type = type;
3465	vsw->vsw_start = wsp->walk_addr + offsetof(vmem_t, vm_seg0);
3466	vsw->vsw_current = vsw->vsw_start;
3467
3468	return (WALK_NEXT);
3469}
3470
3471/*
3472 * vmem segments can't have type 0 (this should be added to vmem_impl.h).
3473 */
3474#define	VMEM_NONE	0
3475
3476int
3477vmem_alloc_walk_init(mdb_walk_state_t *wsp)
3478{
3479	return (vmem_seg_walk_common_init(wsp, VMEM_ALLOC, "alloc"));
3480}
3481
3482int
3483vmem_free_walk_init(mdb_walk_state_t *wsp)
3484{
3485	return (vmem_seg_walk_common_init(wsp, VMEM_FREE, "free"));
3486}
3487
3488int
3489vmem_span_walk_init(mdb_walk_state_t *wsp)
3490{
3491	return (vmem_seg_walk_common_init(wsp, VMEM_SPAN, "span"));
3492}
3493
3494int
3495vmem_seg_walk_init(mdb_walk_state_t *wsp)
3496{
3497	return (vmem_seg_walk_common_init(wsp, VMEM_NONE, "seg"));
3498}
3499
3500int
3501vmem_seg_walk_step(mdb_walk_state_t *wsp)
3502{
3503	vmem_seg_t seg;
3504	vmem_seg_walk_t *vsw = wsp->walk_data;
3505	uintptr_t addr = vsw->vsw_current;
3506	static size_t seg_size = 0;
3507	int rval;
3508
3509	if (!seg_size) {
3510		if (mdb_readvar(&seg_size, "vmem_seg_size") == -1) {
3511			mdb_warn("failed to read 'vmem_seg_size'");
3512			seg_size = sizeof (vmem_seg_t);
3513		}
3514	}
3515
3516	if (seg_size < sizeof (seg))
3517		bzero((caddr_t)&seg + seg_size, sizeof (seg) - seg_size);
3518
3519	if (mdb_vread(&seg, seg_size, addr) == -1) {
3520		mdb_warn("couldn't read vmem_seg at %p", addr);
3521		return (WALK_ERR);
3522	}
3523
3524	vsw->vsw_current = (uintptr_t)seg.vs_anext;
3525	if (vsw->vsw_type != VMEM_NONE && seg.vs_type != vsw->vsw_type) {
3526		rval = WALK_NEXT;
3527	} else {
3528		rval = wsp->walk_callback(addr, &seg, wsp->walk_cbdata);
3529	}
3530
3531	if (vsw->vsw_current == vsw->vsw_start)
3532		return (WALK_DONE);
3533
3534	return (rval);
3535}
3536
3537void
3538vmem_seg_walk_fini(mdb_walk_state_t *wsp)
3539{
3540	vmem_seg_walk_t *vsw = wsp->walk_data;
3541
3542	mdb_free(vsw, sizeof (vmem_seg_walk_t));
3543}
3544
3545#define	VMEM_NAMEWIDTH	22
3546
3547int
3548vmem(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3549{
3550	vmem_t v, parent;
3551	vmem_kstat_t *vkp = &v.vm_kstat;
3552	uintptr_t paddr;
3553	int ident = 0;
3554	char c[VMEM_NAMEWIDTH];
3555
3556	if (!(flags & DCMD_ADDRSPEC)) {
3557		if (mdb_walk_dcmd("vmem", "vmem", argc, argv) == -1) {
3558			mdb_warn("can't walk vmem");
3559			return (DCMD_ERR);
3560		}
3561		return (DCMD_OK);
3562	}
3563
3564	if (DCMD_HDRSPEC(flags))
3565		mdb_printf("%-?s %-*s %10s %12s %9s %5s\n",
3566		    "ADDR", VMEM_NAMEWIDTH, "NAME", "INUSE",
3567		    "TOTAL", "SUCCEED", "FAIL");
3568
3569	if (mdb_vread(&v, sizeof (v), addr) == -1) {
3570		mdb_warn("couldn't read vmem at %p", addr);
3571		return (DCMD_ERR);
3572	}
3573
3574	for (paddr = (uintptr_t)v.vm_source; paddr != NULL; ident += 2) {
3575		if (mdb_vread(&parent, sizeof (parent), paddr) == -1) {
3576			mdb_warn("couldn't trace %p's ancestry", addr);
3577			ident = 0;
3578			break;
3579		}
3580		paddr = (uintptr_t)parent.vm_source;
3581	}
3582
3583	(void) mdb_snprintf(c, VMEM_NAMEWIDTH, "%*s%s", ident, "", v.vm_name);
3584
3585	mdb_printf("%0?p %-*s %10llu %12llu %9llu %5llu\n",
3586	    addr, VMEM_NAMEWIDTH, c,
3587	    vkp->vk_mem_inuse.value.ui64, vkp->vk_mem_total.value.ui64,
3588	    vkp->vk_alloc.value.ui64, vkp->vk_fail.value.ui64);
3589
3590	return (DCMD_OK);
3591}
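
/*
 * Example usage (hypothetical address): with no address, every arena is
 * printed, indented beneath its source arena; with an address, only that
 * arena is shown:
 *
 *	> ::vmem
 *	> ffffff01c0002000::vmem
 */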
3592
3593void
3594vmem_seg_help(void)
3595{
3596	mdb_printf("%s",
3597"Display the contents of vmem_seg_ts, with optional filtering.\n\n"
3598"\n"
3599"A vmem_seg_t represents a range of addresses (or arbitrary numbers),\n"
3600"representing a single chunk of data.  Only ALLOC segments have debugging\n"
3601"information.\n");
3602	mdb_dec_indent(2);
3603	mdb_printf("%<b>OPTIONS%</b>\n");
3604	mdb_inc_indent(2);
3605	mdb_printf("%s",
3606"  -v    Display the full content of the vmem_seg, including its stack trace\n"
3607"  -s    report the size of the segment, instead of the end address\n"
3608"  -c caller\n"
3609"        filter out segments without the function/PC in their stack trace\n"
3610"  -e earliest\n"
3611"        filter out segments timestamped before earliest\n"
3612"  -l latest\n"
3613"        filter out segments timestamped after latest\n"
3614"  -m minsize\n"
3615"        filter out segments smaller than minsize\n"
3616"  -M maxsize\n"
3617"        filter out segments larger than maxsize\n"
3618"  -t thread\n"
3619"        filter out segments not involving thread\n"
3620"  -T type\n"
3621"        filter out segments not of type 'type'\n"
3622"        type is one of: ALLOC/FREE/SPAN/ROTOR/WALKER\n");
3623}
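
/*
 * Example usage (hypothetical address): vmem_seg addresses usually come
 * from one of the vmem segment walkers, and can then be filtered, e.g.:
 *
 *	> ffffff01c0002000::walk vmem_alloc | ::vmem_seg -v
 *	> ffffff01c0002000::walk vmem_seg | ::vmem_seg -T SPAN -s
 */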
3624
3625/*ARGSUSED*/
3626int
3627vmem_seg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3628{
3629	vmem_seg_t vs;
3630	pc_t *stk = vs.vs_stack;
3631	uintptr_t sz;
3632	uint8_t t;
3633	const char *type = NULL;
3634	GElf_Sym sym;
3635	char c[MDB_SYM_NAMLEN];
3636	int no_debug;
3637	int i;
3638	int depth;
3639	uintptr_t laddr, haddr;
3640
3641	uintptr_t caller = NULL, thread = NULL;
3642	uintptr_t minsize = 0, maxsize = 0;
3643
3644	hrtime_t earliest = 0, latest = 0;
3645
3646	uint_t size = 0;
3647	uint_t verbose = 0;
3648
3649	if (!(flags & DCMD_ADDRSPEC))
3650		return (DCMD_USAGE);
3651
3652	if (mdb_getopts(argc, argv,
3653	    'c', MDB_OPT_UINTPTR, &caller,
3654	    'e', MDB_OPT_UINT64, &earliest,
3655	    'l', MDB_OPT_UINT64, &latest,
3656	    's', MDB_OPT_SETBITS, TRUE, &size,
3657	    'm', MDB_OPT_UINTPTR, &minsize,
3658	    'M', MDB_OPT_UINTPTR, &maxsize,
3659	    't', MDB_OPT_UINTPTR, &thread,
3660	    'T', MDB_OPT_STR, &type,
3661	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
3662	    NULL) != argc)
3663		return (DCMD_USAGE);
3664
3665	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
3666		if (verbose) {
3667			mdb_printf("%16s %4s %16s %16s %16s\n"
3668			    "%<u>%16s %4s %16s %16s %16s%</u>\n",
3669			    "ADDR", "TYPE", "START", "END", "SIZE",
3670			    "", "", "THREAD", "TIMESTAMP", "");
3671		} else {
3672			mdb_printf("%?s %4s %?s %?s %s\n", "ADDR", "TYPE",
3673			    "START", size? "SIZE" : "END", "WHO");
3674		}
3675	}
3676
3677	if (mdb_vread(&vs, sizeof (vs), addr) == -1) {
3678		mdb_warn("couldn't read vmem_seg at %p", addr);
3679		return (DCMD_ERR);
3680	}
3681
3682	if (type != NULL) {
3683		if (strcmp(type, "ALLC") == 0 || strcmp(type, "ALLOC") == 0)
3684			t = VMEM_ALLOC;
3685		else if (strcmp(type, "FREE") == 0)
3686			t = VMEM_FREE;
3687		else if (strcmp(type, "SPAN") == 0)
3688			t = VMEM_SPAN;
3689		else if (strcmp(type, "ROTR") == 0 ||
3690		    strcmp(type, "ROTOR") == 0)
3691			t = VMEM_ROTOR;
3692		else if (strcmp(type, "WLKR") == 0 ||
3693		    strcmp(type, "WALKER") == 0)
3694			t = VMEM_WALKER;
3695		else {
3696			mdb_warn("\"%s\" is not a recognized vmem_seg type\n",
3697			    type);
3698			return (DCMD_ERR);
3699		}
3700
3701		if (vs.vs_type != t)
3702			return (DCMD_OK);
3703	}
3704
3705	sz = vs.vs_end - vs.vs_start;
3706
3707	if (minsize != 0 && sz < minsize)
3708		return (DCMD_OK);
3709
3710	if (maxsize != 0 && sz > maxsize)
3711		return (DCMD_OK);
3712
3713	t = vs.vs_type;
3714	depth = vs.vs_depth;
3715
3716	/*
3717	 * debug info, when present, is only accurate for VMEM_ALLOC segments
3718	 */
3719	no_debug = (t != VMEM_ALLOC) ||
3720	    (depth == 0 || depth > VMEM_STACK_DEPTH);
3721
3722	if (no_debug) {
3723		if (caller != NULL || thread != NULL || earliest != 0 ||
3724		    latest != 0)
3725			return (DCMD_OK);		/* not enough info */
3726	} else {
3727		if (caller != NULL) {
3728			laddr = caller;
3729			haddr = caller + sizeof (caller);
3730
3731			if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c,
3732			    sizeof (c), &sym) != -1 &&
3733			    caller == (uintptr_t)sym.st_value) {
3734				/*
3735				 * We were provided an exact symbol value; any
3736				 * address in the function is valid.
3737				 */
3738				laddr = (uintptr_t)sym.st_value;
3739				haddr = (uintptr_t)sym.st_value + sym.st_size;
3740			}
3741
3742			for (i = 0; i < depth; i++)
3743				if (vs.vs_stack[i] >= laddr &&
3744				    vs.vs_stack[i] < haddr)
3745					break;
3746
3747			if (i == depth)
3748				return (DCMD_OK);
3749		}
3750
3751		if (thread != NULL && (uintptr_t)vs.vs_thread != thread)
3752			return (DCMD_OK);
3753
3754		if (earliest != 0 && vs.vs_timestamp < earliest)
3755			return (DCMD_OK);
3756
3757		if (latest != 0 && vs.vs_timestamp > latest)
3758			return (DCMD_OK);
3759	}
3760
3761	type = (t == VMEM_ALLOC ? "ALLC" :
3762	    t == VMEM_FREE ? "FREE" :
3763	    t == VMEM_SPAN ? "SPAN" :
3764	    t == VMEM_ROTOR ? "ROTR" :
3765	    t == VMEM_WALKER ? "WLKR" :
3766	    "????");
3767
3768	if (flags & DCMD_PIPE_OUT) {
3769		mdb_printf("%#lr\n", addr);
3770		return (DCMD_OK);
3771	}
3772
3773	if (verbose) {
3774		mdb_printf("%<b>%16p%</b> %4s %16p %16p %16d\n",
3775		    addr, type, vs.vs_start, vs.vs_end, sz);
3776
3777		if (no_debug)
3778			return (DCMD_OK);
3779
3780		mdb_printf("%16s %4s %16p %16llx\n",
3781		    "", "", vs.vs_thread, vs.vs_timestamp);
3782
3783		mdb_inc_indent(17);
3784		for (i = 0; i < depth; i++) {
3785			mdb_printf("%a\n", stk[i]);
3786		}
3787		mdb_dec_indent(17);
3788		mdb_printf("\n");
3789	} else {
3790		mdb_printf("%0?p %4s %0?p %0?p", addr, type,
3791		    vs.vs_start, size? sz : vs.vs_end);
3792
3793		if (no_debug) {
3794			mdb_printf("\n");
3795			return (DCMD_OK);
3796		}
3797
3798		for (i = 0; i < depth; i++) {
3799			if (mdb_lookup_by_addr(stk[i], MDB_SYM_FUZZY,
3800			    c, sizeof (c), &sym) == -1)
3801				continue;
3802			if (strncmp(c, "vmem_", 5) == 0)
3803				continue;
3804			break;
3805		}
3806		mdb_printf(" %a\n", stk[i]);
3807	}
3808	return (DCMD_OK);
3809}
3810
3811typedef struct kmalog_data {
3812	uintptr_t	kma_addr;
3813	hrtime_t	kma_newest;
3814} kmalog_data_t;
3815
3816/*ARGSUSED*/
3817static int
3818showbc(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmalog_data_t *kma)
3819{
3820	char name[KMEM_CACHE_NAMELEN + 1];
3821	hrtime_t delta;
3822	int i, depth;
3823	size_t bufsize;
3824
3825	if (bcp->bc_timestamp == 0)
3826		return (WALK_DONE);
3827
3828	if (kma->kma_newest == 0)
3829		kma->kma_newest = bcp->bc_timestamp;
3830
3831	if (kma->kma_addr) {
3832		if (mdb_vread(&bufsize, sizeof (bufsize),
3833		    (uintptr_t)&bcp->bc_cache->cache_bufsize) == -1) {
3834			mdb_warn(
3835			    "failed to read cache_bufsize for cache at %p",
3836			    bcp->bc_cache);
3837			return (WALK_ERR);
3838		}
3839
3840		if (kma->kma_addr < (uintptr_t)bcp->bc_addr ||
3841		    kma->kma_addr >= (uintptr_t)bcp->bc_addr + bufsize)
3842			return (WALK_NEXT);
3843	}
3844
3845	delta = kma->kma_newest - bcp->bc_timestamp;
3846	depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
3847
3848	if (mdb_readstr(name, sizeof (name), (uintptr_t)
3849	    &bcp->bc_cache->cache_name) <= 0)
3850		(void) mdb_snprintf(name, sizeof (name), "%a", bcp->bc_cache);
3851
3852	mdb_printf("\nT-%lld.%09lld  addr=%p  %s\n",
3853	    delta / NANOSEC, delta % NANOSEC, bcp->bc_addr, name);
3854
3855	for (i = 0; i < depth; i++)
3856		mdb_printf("\t %a\n", bcp->bc_stack[i]);
3857
3858	return (WALK_NEXT);
3859}
3860
3861int
3862kmalog(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3863{
3864	const char *logname = "kmem_transaction_log";
3865	kmalog_data_t kma;
3866
3867	if (argc > 1)
3868		return (DCMD_USAGE);
3869
3870	kma.kma_newest = 0;
3871	if (flags & DCMD_ADDRSPEC)
3872		kma.kma_addr = addr;
3873	else
3874		kma.kma_addr = NULL;
3875
3876	if (argc > 0) {
3877		if (argv->a_type != MDB_TYPE_STRING)
3878			return (DCMD_USAGE);
3879		if (strcmp(argv->a_un.a_str, "fail") == 0)
3880			logname = "kmem_failure_log";
3881		else if (strcmp(argv->a_un.a_str, "slab") == 0)
3882			logname = "kmem_slab_log";
3883		else
3884			return (DCMD_USAGE);
3885	}
3886
3887	if (mdb_readvar(&addr, logname) == -1) {
3888		mdb_warn("failed to read %s log header pointer", logname);
3889		return (DCMD_ERR);
3890	}
3891
3892	if (mdb_pwalk("kmem_log", (mdb_walk_cb_t)showbc, &kma, addr) == -1) {
3893		mdb_warn("failed to walk kmem log");
3894		return (DCMD_ERR);
3895	}
3896
3897	return (DCMD_OK);
3898}
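
/*
 * Example usage (hypothetical address): '::kmalog' displays transaction log
 * events as T-minus offsets from the newest event; 'fail' and 'slab' select
 * the failure and slab logs instead, and a leading buffer address restricts
 * the output to events involving that buffer:
 *
 *	> ::kmalog fail
 *	> ffffff01d2a38000::kmalog
 */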
3899
3900/*
3901 * As the final lure for die-hard crash(1M) users, we provide ::kmausers here.
3902 * The first piece is a structure which we use to accumulate kmem_cache_t
3903 * addresses of interest.  The kmc_add is used as a callback for the kmem_cache
3904 * walker; we either add all caches, or ones named explicitly as arguments.
3905 */
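
/*
 * Example usage (cache names are illustrative): '::kmausers' summarizes
 * allocations in all caches by size and stack trace; naming caches
 * restricts the summary, and '-f' additionally prints each matching bufctl:
 *
 *	> ::kmausers kmem_alloc_1152
 *	> ::kmausers -f process_cache
 */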
3906
3907typedef struct kmclist {
3908	const char *kmc_name;			/* Name to match (or NULL) */
3909	uintptr_t *kmc_caches;			/* List of kmem_cache_t addrs */
3910	int kmc_nelems;				/* Num entries in kmc_caches */
3911	int kmc_size;				/* Size of kmc_caches array */
3912} kmclist_t;
3913
3914static int
3915kmc_add(uintptr_t addr, const kmem_cache_t *cp, kmclist_t *kmc)
3916{
3917	void *p;
3918	int s;
3919
3920	if (kmc->kmc_name == NULL ||
3921	    strcmp(cp->cache_name, kmc->kmc_name) == 0) {
3922		/*
3923		 * If we have a match, grow our array (if necessary), and then
3924		 * add the virtual address of the matching cache to our list.
3925		 */
3926		if (kmc->kmc_nelems >= kmc->kmc_size) {
3927			s = kmc->kmc_size ? kmc->kmc_size * 2 : 256;
3928			p = mdb_alloc(sizeof (uintptr_t) * s, UM_SLEEP | UM_GC);
3929
3930			bcopy(kmc->kmc_caches, p,
3931			    sizeof (uintptr_t) * kmc->kmc_size);
3932
3933			kmc->kmc_caches = p;
3934			kmc->kmc_size = s;
3935		}
3936
3937		kmc->kmc_caches[kmc->kmc_nelems++] = addr;
3938		return (kmc->kmc_name ? WALK_DONE : WALK_NEXT);
3939	}
3940
3941	return (WALK_NEXT);
3942}
3943
3944/*
3945 * The second piece of ::kmausers is a hash table of allocations.  Each
3946 * allocation owner is identified by its stack trace and data_size.  We then
3947 * track the total bytes of all such allocations, and the number of allocations
3948 * to report at the end.  Once we have a list of caches, we walk through the
3949 * allocated bufctls of each, and update our hash table accordingly.
3950 */
3951
3952typedef struct kmowner {
3953	struct kmowner *kmo_head;		/* First hash elt in bucket */
3954	struct kmowner *kmo_next;		/* Next hash elt in chain */
3955	size_t kmo_signature;			/* Hash table signature */
3956	uint_t kmo_num;				/* Number of allocations */
3957	size_t kmo_data_size;			/* Size of each allocation */
3958	size_t kmo_total_size;			/* Total bytes of allocation */
3959	int kmo_depth;				/* Depth of stack trace */
3960	uintptr_t kmo_stack[KMEM_STACK_DEPTH];	/* Stack trace */
3961} kmowner_t;
3962
3963typedef struct kmusers {
3964	uintptr_t kmu_addr;			/* address of interest */
3965	const kmem_cache_t *kmu_cache;		/* Current kmem cache */
3966	kmowner_t *kmu_hash;			/* Hash table of owners */
3967	int kmu_nelems;				/* Number of entries in use */
3968	int kmu_size;				/* Total number of entries */
3969} kmusers_t;
3970
3971static void
3972kmu_add(kmusers_t *kmu, const kmem_bufctl_audit_t *bcp,
3973    size_t size, size_t data_size)
3974{
3975	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
3976	size_t bucket, signature = data_size;
3977	kmowner_t *kmo, *kmoend;
3978
3979	/*
3980	 * If the hash table is full, double its size and rehash everything.
3981	 */
3982	if (kmu->kmu_nelems >= kmu->kmu_size) {
3983		int s = kmu->kmu_size ? kmu->kmu_size * 2 : 1024;
3984
3985		kmo = mdb_alloc(sizeof (kmowner_t) * s, UM_SLEEP | UM_GC);
3986		bcopy(kmu->kmu_hash, kmo, sizeof (kmowner_t) * kmu->kmu_size);
3987		kmu->kmu_hash = kmo;
3988		kmu->kmu_size = s;
3989
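		/*
		 * The copied entries still contain kmo_head/kmo_next
		 * pointers into the old array, so clear every bucket head
		 * and rebuild the chains against the new table.
		 */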
3990		kmoend = kmu->kmu_hash + kmu->kmu_size;
3991		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++)
3992			kmo->kmo_head = NULL;
3993
3994		kmoend = kmu->kmu_hash + kmu->kmu_nelems;
3995		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++) {
3996			bucket = kmo->kmo_signature & (kmu->kmu_size - 1);
3997			kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
3998			kmu->kmu_hash[bucket].kmo_head = kmo;
3999		}
4000	}
4001
4002	/*
4003	 * Finish computing the hash signature from the stack trace, and then
4004	 * see if the owner is in the hash table.  If so, update our stats.
4005	 */
4006	for (i = 0; i < depth; i++)
4007		signature += bcp->bc_stack[i];
4008
4009	bucket = signature & (kmu->kmu_size - 1);
4010
4011	for (kmo = kmu->kmu_hash[bucket].kmo_head; kmo; kmo = kmo->kmo_next) {
4012		if (kmo->kmo_signature == signature) {
4013			size_t difference = 0;
4014
4015			difference |= kmo->kmo_data_size - data_size;
4016			difference |= kmo->kmo_depth - depth;
4017
4018			for (i = 0; i < depth; i++) {
4019				difference |= kmo->kmo_stack[i] -
4020				    bcp->bc_stack[i];
4021			}
4022
4023			if (difference == 0) {
4024				kmo->kmo_total_size += size;
4025				kmo->kmo_num++;
4026				return;
4027			}
4028		}
4029	}
4030
4031	/*
4032	 * If the owner is not yet hashed, grab the next element and fill it
4033	 * in based on the allocation information.
4034	 */
4035	kmo = &kmu->kmu_hash[kmu->kmu_nelems++];
4036	kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4037	kmu->kmu_hash[bucket].kmo_head = kmo;
4038
4039	kmo->kmo_signature = signature;
4040	kmo->kmo_num = 1;
4041	kmo->kmo_data_size = data_size;
4042	kmo->kmo_total_size = size;
4043	kmo->kmo_depth = depth;
4044
4045	for (i = 0; i < depth; i++)
4046		kmo->kmo_stack[i] = bcp->bc_stack[i];
4047}
4048
4049/*
4050 * When ::kmausers is invoked without the -f flag, we simply update our hash
4051 * table with the information from each allocated bufctl.
4052 */
4053/*ARGSUSED*/
4054static int
4055kmause1(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4056{
4057	const kmem_cache_t *cp = kmu->kmu_cache;
4058
4059	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4060	return (WALK_NEXT);
4061}
4062
4063/*
4064 * When ::kmausers is invoked with the -f flag, we print out the information
4065 * for each bufctl as well as updating the hash table.
4066 */
4067static int
4068kmause2(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4069{
4070	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4071	const kmem_cache_t *cp = kmu->kmu_cache;
4072	kmem_bufctl_t bufctl;
4073
4074	if (kmu->kmu_addr) {
		if (mdb_vread(&bufctl, sizeof (bufctl), addr) == -1)
4076			mdb_warn("couldn't read bufctl at %p", addr);
4077		else if (kmu->kmu_addr < (uintptr_t)bufctl.bc_addr ||
4078		    kmu->kmu_addr >= (uintptr_t)bufctl.bc_addr +
4079		    cp->cache_bufsize)
4080			return (WALK_NEXT);
4081	}
4082
4083	mdb_printf("size %d, addr %p, thread %p, cache %s\n",
4084	    cp->cache_bufsize, addr, bcp->bc_thread, cp->cache_name);
4085
4086	for (i = 0; i < depth; i++)
4087		mdb_printf("\t %a\n", bcp->bc_stack[i]);
4088
4089	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4090	return (WALK_NEXT);
4091}
4092
/*
 * We sort our results in descending order of total allocation size before
 * printing them.
 */
static int
kmownercmp(const void *lp, const void *rp)
{
	const kmowner_t *lhs = lp;
	const kmowner_t *rhs = rp;

	/*
	 * Compare explicitly rather than subtracting: the size_t difference
	 * can be truncated or change sign when converted to an int.
	 */
	if (rhs->kmo_total_size > lhs->kmo_total_size)
		return (1);
	if (rhs->kmo_total_size < lhs->kmo_total_size)
		return (-1);
	return (0);
}
4104
4105/*
4106 * The main engine of ::kmausers is relatively straightforward: First we
4107 * accumulate our list of kmem_cache_t addresses into the kmclist_t. Next we
4108 * iterate over the allocated bufctls of each cache in the list.  Finally,
4109 * we sort and print our results.
4110 */
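
/*
 * Illustrative invocations (the cache name is hypothetical):
 *
 *	> ::kmausers			! summarize all KMF_AUDIT caches
 *	> ::kmausers -e kmem_alloc_256	! include "small" users of one cache
 *	> ::kmausers -f			! also print each allocation
 *	> addr::kmausers		! allocations containing addr
 *					! (implies -f)
 */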
4111/*ARGSUSED*/
4112int
4113kmausers(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4114{
4115	int mem_threshold = 8192;	/* Minimum # bytes for printing */
4116	int cnt_threshold = 100;	/* Minimum # blocks for printing */
4117	int audited_caches = 0;		/* Number of KMF_AUDIT caches found */
4118	int do_all_caches = 1;		/* Do all caches (no arguments) */
4119	int opt_e = FALSE;		/* Include "small" users */
4120	int opt_f = FALSE;		/* Print stack traces */
4121
4122	mdb_walk_cb_t callback = (mdb_walk_cb_t)kmause1;
4123	kmowner_t *kmo, *kmoend;
4124	int i, oelems;
4125
4126	kmclist_t kmc;
4127	kmusers_t kmu;
4128
4129	bzero(&kmc, sizeof (kmc));
4130	bzero(&kmu, sizeof (kmu));
4131
4132	while ((i = mdb_getopts(argc, argv,
4133	    'e', MDB_OPT_SETBITS, TRUE, &opt_e,
4134	    'f', MDB_OPT_SETBITS, TRUE, &opt_f, NULL)) != argc) {
4135
4136		argv += i;	/* skip past options we just processed */
4137		argc -= i;	/* adjust argc */
4138
4139		if (argv->a_type != MDB_TYPE_STRING || *argv->a_un.a_str == '-')
4140			return (DCMD_USAGE);
4141
4142		oelems = kmc.kmc_nelems;
4143		kmc.kmc_name = argv->a_un.a_str;
4144		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4145
4146		if (kmc.kmc_nelems == oelems) {
4147			mdb_warn("unknown kmem cache: %s\n", kmc.kmc_name);
4148			return (DCMD_ERR);
4149		}
4150
4151		do_all_caches = 0;
4152		argv++;
4153		argc--;
4154	}
4155
4156	if (flags & DCMD_ADDRSPEC) {
4157		opt_f = TRUE;
4158		kmu.kmu_addr = addr;
4159	} else {
		kmu.kmu_addr = 0;
4161	}
4162
4163	if (opt_e)
4164		mem_threshold = cnt_threshold = 0;
4165
4166	if (opt_f)
4167		callback = (mdb_walk_cb_t)kmause2;
4168
4169	if (do_all_caches) {
4170		kmc.kmc_name = NULL; /* match all cache names */
4171		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4172	}
4173
4174	for (i = 0; i < kmc.kmc_nelems; i++) {
4175		uintptr_t cp = kmc.kmc_caches[i];
4176		kmem_cache_t c;
4177
4178		if (mdb_vread(&c, sizeof (c), cp) == -1) {
4179			mdb_warn("failed to read cache at %p", cp);
4180			continue;
4181		}
4182
4183		if (!(c.cache_flags & KMF_AUDIT)) {
4184			if (!do_all_caches) {
4185				mdb_warn("KMF_AUDIT is not enabled for %s\n",
4186				    c.cache_name);
4187			}
4188			continue;
4189		}
4190
4191		kmu.kmu_cache = &c;
4192		(void) mdb_pwalk("bufctl", callback, &kmu, cp);
4193		audited_caches++;
4194	}
4195
4196	if (audited_caches == 0 && do_all_caches) {
4197		mdb_warn("KMF_AUDIT is not enabled for any caches\n");
4198		return (DCMD_ERR);
4199	}
4200
4201	qsort(kmu.kmu_hash, kmu.kmu_nelems, sizeof (kmowner_t), kmownercmp);
4202	kmoend = kmu.kmu_hash + kmu.kmu_nelems;
4203
4204	for (kmo = kmu.kmu_hash; kmo < kmoend; kmo++) {
4205		if (kmo->kmo_total_size < mem_threshold &&
4206		    kmo->kmo_num < cnt_threshold)
4207			continue;
4208		mdb_printf("%lu bytes for %u allocations with data size %lu:\n",
4209		    kmo->kmo_total_size, kmo->kmo_num, kmo->kmo_data_size);
4210		for (i = 0; i < kmo->kmo_depth; i++)
4211			mdb_printf("\t %a\n", kmo->kmo_stack[i]);
4212	}
4213
4214	return (DCMD_OK);
4215}
4216
4217void
4218kmausers_help(void)
4219{
4220	mdb_printf(
	    "Displays the largest users of the kmem allocator, sorted by\n"
	    "total allocation size and grouped by stack trace.  If one or\n"
	    "more caches are specified, only those caches will be searched;\n"
	    "by default, all caches are searched.  If an address is\n"
	    "specified, then only those allocations which include the given\n"
	    "address are displayed.  Specifying an address implies -f.\n"
4227	    "\n"
4228	    "\t-e\tInclude all users, not just the largest\n"
4229	    "\t-f\tDisplay individual allocations.  By default, users are\n"
4230	    "\t\tgrouped by stack\n");
4231}
4232
4233static int
4234kmem_ready_check(void)
4235{
4236	int ready;
4237
4238	if (mdb_readvar(&ready, "kmem_ready") < 0)
4239		return (-1); /* errno is set for us */
4240
4241	return (ready);
4242}
4243
4244void
4245kmem_statechange(void)
4246{
4247	static int been_ready = 0;
4248
4249	if (been_ready)
4250		return;
4251
4252	if (kmem_ready_check() <= 0)
4253		return;
4254
4255	been_ready = 1;
4256	(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_init_walkers, NULL);
4257}
4258
4259void
4260kmem_init(void)
4261{
4262	mdb_walker_t w = {
4263		"kmem_cache", "walk list of kmem caches", kmem_cache_walk_init,
4264		list_walk_step, list_walk_fini
4265	};
4266
	/*
	 * If kmem is ready, we need to invoke the kmem_cache walker
	 * immediately.  Walkers in the linkage structure won't be ready
	 * until _mdb_init returns, so we add this one manually and use it
	 * (via kmem_statechange()) to initialize the per-cache walkers.
	 * If kmem isn't ready yet, we register a callback that defers
	 * cache walking until it is.
	 */
4275	if (mdb_add_walker(&w) != 0) {
4276		mdb_warn("failed to add kmem_cache walker");
4277		return;
4278	}
4279
4280	kmem_statechange();
4281
4282	/* register our ::whatis handlers */
4283	mdb_whatis_register("modules", whatis_run_modules, NULL,
4284	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4285	mdb_whatis_register("threads", whatis_run_threads, NULL,
4286	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4287	mdb_whatis_register("pages", whatis_run_pages, NULL,
4288	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4289	mdb_whatis_register("kmem", whatis_run_kmem, NULL,
4290	    WHATIS_PRIO_ALLOCATOR, 0);
4291	mdb_whatis_register("vmem", whatis_run_vmem, NULL,
4292	    WHATIS_PRIO_ALLOCATOR, 0);
4293}
4294
4295typedef struct whatthread {
4296	uintptr_t	wt_target;
4297	int		wt_verbose;
4298} whatthread_t;
4299
4300static int
4301whatthread_walk_thread(uintptr_t addr, const kthread_t *t, whatthread_t *w)
4302{
4303	uintptr_t current, data;
4304
4305	if (t->t_stkbase == NULL)
4306		return (WALK_NEXT);
4307
4308	/*
4309	 * Warn about swapped out threads, but drive on anyway
4310	 */
4311	if (!(t->t_schedflag & TS_LOAD)) {
4312		mdb_warn("thread %p's stack swapped out\n", addr);
4313		return (WALK_NEXT);
4314	}
4315
4316	/*
4317	 * Search the thread's stack for the given pointer.  Note that it would
4318	 * be more efficient to follow ::kgrep's lead and read in page-sized
4319	 * chunks, but this routine is already fast and simple.
4320	 */
4321	for (current = (uintptr_t)t->t_stkbase; current < (uintptr_t)t->t_stk;
4322	    current += sizeof (uintptr_t)) {
4323		if (mdb_vread(&data, sizeof (data), current) == -1) {
4324			mdb_warn("couldn't read thread %p's stack at %p",
4325			    addr, current);
4326			return (WALK_ERR);
4327		}
4328
4329		if (data == w->wt_target) {
4330			if (w->wt_verbose) {
4331				mdb_printf("%p in thread %p's stack%s\n",
4332				    current, addr, stack_active(t, current));
4333			} else {
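				/*
				 * In the non-verbose case, print the
				 * matching thread once and stop scanning
				 * its stack.
				 */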
4334				mdb_printf("%#lr\n", addr);
4335				return (WALK_NEXT);
4336			}
4337		}
4338	}
4339
4340	return (WALK_NEXT);
4341}
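
/*
 * A sketch (not used here) of the page-sized-chunk alternative mentioned in
 * whatthread_walk_thread() above; the chunk size and local variables are
 * illustrative assumptions:
 *
 *	uintptr_t buf[1024];
 *	uintptr_t cur = (uintptr_t)t->t_stkbase;
 *	uintptr_t end = (uintptr_t)t->t_stk;
 *	size_t nbytes, i;
 *
 *	while (cur < end) {
 *		nbytes = MIN(sizeof (buf), end - cur);
 *		if (mdb_vread(buf, nbytes, cur) == -1)
 *			return (WALK_ERR);
 *		for (i = 0; i < nbytes / sizeof (uintptr_t); i++) {
 *			if (buf[i] == w->wt_target)
 *				mdb_printf("%p in thread %p's stack\n",
 *				    cur + i * sizeof (uintptr_t), addr);
 *		}
 *		cur += nbytes;
 *	}
 */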
4342
4343int
4344whatthread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4345{
4346	whatthread_t w;
4347
4348	if (!(flags & DCMD_ADDRSPEC))
4349		return (DCMD_USAGE);
4350
4351	w.wt_verbose = FALSE;
4352	w.wt_target = addr;
4353
4354	if (mdb_getopts(argc, argv,
4355	    'v', MDB_OPT_SETBITS, TRUE, &w.wt_verbose, NULL) != argc)
4356		return (DCMD_USAGE);
4357
4358	if (mdb_walk("thread", (mdb_walk_cb_t)whatthread_walk_thread, &w)
4359	    == -1) {
4360		mdb_warn("couldn't walk threads");
4361		return (DCMD_ERR);
4362	}
4363
4364	return (DCMD_OK);
4365}
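
/*
 * Illustrative usage of ::whatthread:
 *
 *	> addr::whatthread		! print each thread whose stack
 *					! contains addr
 *	> addr::whatthread -v		! also show where on the stack the
 *					! value was found
 */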
4366