1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <stdlib.h>
30#include <strings.h>
31#include <errno.h>
32#include <unistd.h>
33#include <dt_impl.h>
34#include <assert.h>
35#if defined(sun)
36#include <alloca.h>
37#else
38#include <sys/sysctl.h>
39#endif
40#include <limits.h>
41
42#define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
43
44/*
45 * Because qsort(3C) does not allow an argument to be passed to a comparison
46 * function, the variables that affect comparison must regrettably be global;
47 * they are protected by a global static lock, dt_qsort_lock.
48 */
49static pthread_mutex_t dt_qsort_lock = PTHREAD_MUTEX_INITIALIZER;
50
51static int dt_revsort;
52static int dt_keysort;
53static int dt_keypos;
54
55#define	DT_LESSTHAN	(dt_revsort == 0 ? -1 : 1)
56#define	DT_GREATERTHAN	(dt_revsort == 0 ? 1 : -1)
57
/*
 * Aggregating function for count(), sum(), avg(), stddev() and
 * quantize(): merge "new" into "existing" by element-wise 64-bit
 * addition.  "size" is the value record's size in bytes.
 */
static void
dt_aggregate_count(int64_t *existing, int64_t *new, size_t size)
{
	size_t i, nvals = size / sizeof (int64_t);

	for (i = 0; i < nvals; i++)
		existing[i] += new[i];
}
66
67static int
68dt_aggregate_countcmp(int64_t *lhs, int64_t *rhs)
69{
70	int64_t lvar = *lhs;
71	int64_t rvar = *rhs;
72
73	if (lvar < rvar)
74		return (DT_LESSTHAN);
75
76	if (lvar > rvar)
77		return (DT_GREATERTHAN);
78
79	return (0);
80}
81
/*ARGSUSED*/
/*
 * Aggregating function for min(): keep the smaller of the two 64-bit
 * values.  "size" is unused (the value is always a single int64_t).
 */
static void
dt_aggregate_min(int64_t *existing, int64_t *new, size_t size)
{
	int64_t candidate = *new;

	if (candidate < *existing)
		*existing = candidate;
}
89
/*ARGSUSED*/
/*
 * Aggregating function for max(): keep the larger of the two 64-bit
 * values.  "size" is unused (the value is always a single int64_t).
 */
static void
dt_aggregate_max(int64_t *existing, int64_t *new, size_t size)
{
	int64_t candidate = *new;

	if (candidate > *existing)
		*existing = candidate;
}
97
98static int
99dt_aggregate_averagecmp(int64_t *lhs, int64_t *rhs)
100{
101	int64_t lavg = lhs[0] ? (lhs[1] / lhs[0]) : 0;
102	int64_t ravg = rhs[0] ? (rhs[1] / rhs[0]) : 0;
103
104	if (lavg < ravg)
105		return (DT_LESSTHAN);
106
107	if (lavg > ravg)
108		return (DT_GREATERTHAN);
109
110	return (0);
111}
112
113static int
114dt_aggregate_stddevcmp(int64_t *lhs, int64_t *rhs)
115{
116	uint64_t lsd = dt_stddev((uint64_t *)lhs, 1);
117	uint64_t rsd = dt_stddev((uint64_t *)rhs, 1);
118
119	if (lsd < rsd)
120		return (DT_LESSTHAN);
121
122	if (lsd > rsd)
123		return (DT_GREATERTHAN);
124
125	return (0);
126}
127
/*ARGSUSED*/
/*
 * Aggregating function for lquantize().  The value's first 64 bits are
 * the encoded lquantize() parameters; the levels + 2 buckets (including
 * the under- and overflow buckets) follow and are merged by addition.
 * "size" is unused.
 */
static void
dt_aggregate_lquantize(int64_t *existing, int64_t *new, size_t size)
{
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(existing[0]);
	int i;

	for (i = 1; i <= levels + 2; i++)
		existing[i] += new[i];
}
139
/*
 * Compute the weighted total of an lquantize() value: each bucket's
 * count times a representative value for that bucket.  The underflow
 * bucket is weighted at base - 1 and the overflow bucket at one more
 * than the topmost bucket value.
 */
static long double
dt_aggregate_lquantizedsum(int64_t *lquanta)
{
	int64_t arg = lquanta[0];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t i;
	long double total;

	/* Underflow bucket, weighted just below the base. */
	total = (long double)lquanta[1] * (long double)(base - 1);

	/* Interior buckets, each weighted at its own base value. */
	for (i = 0; i < levels; i++) {
		total += (long double)lquanta[i + 2] * (long double)base;
		base += step;
	}

	/* Overflow bucket, weighted just above the final base. */
	return (total + (long double)lquanta[levels + 2] *
	    (long double)(base + 1));
}
155
/*
 * Return the count held in the lquantize() bucket whose value is zero,
 * or 0 if zero does not fall within the quantization's range.
 */
static int64_t
dt_aggregate_lquantizedzero(int64_t *lquanta)
{
	int64_t arg = lquanta[0];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t i;

	/* Zero lies just below the base: it's in the underflow bucket. */
	if (base == 1)
		return (lquanta[1]);

	/* Scan the interior buckets for the one based at zero. */
	for (i = 0; i < levels; i++) {
		if (base == 0)
			return (lquanta[i + 2]);
		base += step;
	}

	/* Zero lies just above the topmost bucket: overflow bucket. */
	if (base == -1)
		return (lquanta[levels + 2]);

	return (0);
}
179
180static int
181dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs)
182{
183	long double lsum = dt_aggregate_lquantizedsum(lhs);
184	long double rsum = dt_aggregate_lquantizedsum(rhs);
185	int64_t lzero, rzero;
186
187	if (lsum < rsum)
188		return (DT_LESSTHAN);
189
190	if (lsum > rsum)
191		return (DT_GREATERTHAN);
192
193	/*
194	 * If they're both equal, then we will compare based on the weights at
195	 * zero.  If the weights at zero are equal (or if zero is not within
196	 * the range of the linear quantization), then this will be judged a
197	 * tie and will be resolved based on the key comparison.
198	 */
199	lzero = dt_aggregate_lquantizedzero(lhs);
200	rzero = dt_aggregate_lquantizedzero(rhs);
201
202	if (lzero < rzero)
203		return (DT_LESSTHAN);
204
205	if (lzero > rzero)
206		return (DT_GREATERTHAN);
207
208	return (0);
209}
210
211static int
212dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs)
213{
214	int nbuckets = DTRACE_QUANTIZE_NBUCKETS;
215	long double ltotal = 0, rtotal = 0;
216	int64_t lzero, rzero;
217	uint_t i;
218
219	for (i = 0; i < nbuckets; i++) {
220		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
221
222		if (bucketval == 0) {
223			lzero = lhs[i];
224			rzero = rhs[i];
225		}
226
227		ltotal += (long double)bucketval * (long double)lhs[i];
228		rtotal += (long double)bucketval * (long double)rhs[i];
229	}
230
231	if (ltotal < rtotal)
232		return (DT_LESSTHAN);
233
234	if (ltotal > rtotal)
235		return (DT_GREATERTHAN);
236
237	/*
238	 * If they're both equal, then we will compare based on the weights at
239	 * zero.  If the weights at zero are equal, then this will be judged a
240	 * tie and will be resolved based on the key comparison.
241	 */
242	if (lzero < rzero)
243		return (DT_LESSTHAN);
244
245	if (lzero > rzero)
246		return (DT_GREATERTHAN);
247
248	return (0);
249}
250
/*
 * Normalize a usym() aggregation key in place: data[0] holds the pid and
 * data[1] the user-land PC (per the disabled code below); the PC is to be
 * replaced with the start address of its containing symbol so that all
 * PCs within one function aggregate to a single key.
 *
 * NOTE(review): stubbed out in this port pending libproc support -- the
 * key is left unmodified and a diagnostic is printed instead.
 */
static void
dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
{
#if 0	/* XXX TBD needs libproc */
	uint64_t pid = data[0];
	uint64_t *pc = &data[1];
	struct ps_prochandle *P;
	GElf_Sym sym;

	/* Vectored (e.g. postmortem) opens have no live process to grab. */
	if (dtp->dt_vector != NULL)
		return;

	/* Grab the target process read-only to resolve the symbol. */
	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
		return;

	dt_proc_lock(dtp, P);

#if defined(sun)
	if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
#else
	if (proc_addr2sym(P, *pc, NULL, 0, &sym) == 0)
#endif
		*pc = sym.st_value;

	dt_proc_unlock(dtp, P);
	dt_proc_release(dtp, P);
#else
	printf("XXX %s not implemented\n", __func__);
#endif
}
281
/*
 * Normalize a umod() aggregation key in place: data[0] holds the pid and
 * data[1] the user-land PC (per the disabled code below); the PC is to be
 * replaced with the base address of its containing mapping so that all
 * PCs within one module aggregate to a single key.
 *
 * NOTE(review): stubbed out in this port pending libproc support -- the
 * key is left unmodified and a diagnostic is printed instead.
 */
static void
dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
{
#if 0	/* XXX TBD needs libproc */
	uint64_t pid = data[0];
	uint64_t *pc = &data[1];
	struct ps_prochandle *P;
	const prmap_t *map;

	/* Vectored (e.g. postmortem) opens have no live process to grab. */
	if (dtp->dt_vector != NULL)
		return;

	/* Grab the target process read-only to resolve the mapping. */
	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
		return;

	dt_proc_lock(dtp, P);

#if defined(sun)
	if ((map = Paddr_to_map(P, *pc)) != NULL)
#else
	if ((map = proc_addr2map(P, *pc)) != NULL)
#endif
		*pc = map->pr_vaddr;

	dt_proc_unlock(dtp, P);
	dt_proc_release(dtp, P);
#else
	printf("XXX %s not implemented\n", __func__);
#endif
}
312
313static void
314dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
315{
316	GElf_Sym sym;
317	uint64_t *pc = data;
318
319	if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
320		*pc = sym.st_value;
321}
322
323static void
324dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
325{
326	uint64_t *pc = data;
327	dt_module_t *dmp;
328
329	if (dtp->dt_vector != NULL) {
330		/*
331		 * We don't have a way of just getting the module for a
332		 * vectored open, and it doesn't seem to be worth defining
333		 * one.  This means that use of mod() won't get true
334		 * aggregation in the postmortem case (some modules may
335		 * appear more than once in aggregation output).  It seems
336		 * unlikely that anyone will ever notice or care...
337		 */
338		return;
339	}
340
341	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
342	    dmp = dt_list_next(dmp)) {
343		if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
344			*pc = dmp->dm_text_va;
345			return;
346		}
347	}
348}
349
350static dtrace_aggvarid_t
351dt_aggregate_aggvarid(dt_ahashent_t *ent)
352{
353	dtrace_aggdesc_t *agg = ent->dtahe_data.dtada_desc;
354	caddr_t data = ent->dtahe_data.dtada_data;
355	dtrace_recdesc_t *rec = agg->dtagd_rec;
356
357	/*
358	 * First, we'll check the variable ID in the aggdesc.  If it's valid,
359	 * we'll return it.  If not, we'll use the compiler-generated ID
360	 * present as the first record.
361	 */
362	if (agg->dtagd_varid != DTRACE_AGGVARIDNONE)
363		return (agg->dtagd_varid);
364
365	agg->dtagd_varid = *((dtrace_aggvarid_t *)(uintptr_t)(data +
366	    rec->dtrd_offset));
367
368	return (agg->dtagd_varid);
369}
370
371
/*
 * Snapshot the aggregation buffer for a single CPU and merge its records
 * into the consumer-side aggregation hash.  Returns 0 on success
 * (including the benign case of an unconfigured CPU) or -1 with the
 * dtrace errno set on failure.
 */
static int
dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
{
	dtrace_epid_t id;
	uint64_t hashval;
	size_t offs, roffs, size, ndx;
	int i, j, rval;
	caddr_t addr, data;
	dtrace_recdesc_t *rec;
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dtrace_aggdesc_t *agg;
	dt_ahash_t *hash = &agp->dtat_hash;
	dt_ahashent_t *h;
	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
	dtrace_aggdata_t *aggdata;
	int flags = agp->dtat_flags;

	buf->dtbd_cpu = cpu;

#if defined(sun)
	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
#else
	/*
	 * NOTE(review): the non-Sun path passes &buf (a pointer to the
	 * pointer) rather than buf -- presumably to match the FreeBSD
	 * ioctl contract; confirm against the port's dt_ioctl.
	 */
	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, &buf) == -1) {
#endif
		if (errno == ENOENT) {
			/*
			 * If that failed with ENOENT, it may be because the
			 * CPU was unconfigured.  This is okay; we'll just
			 * do nothing but return success.
			 */
			return (0);
		}

		return (dt_set_errno(dtp, errno));
	}

	/* Report any aggregation drops on this CPU to the consumer. */
	if (buf->dtbd_drops != 0) {
		if (dt_handle_cpudrop(dtp, cpu,
		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1)
			return (-1);
	}

	if (buf->dtbd_size == 0)
		return (0);

	if (hash->dtah_hash == NULL) {
		size_t size;

		/* First snapshot: lazily allocate the hash table. */
		hash->dtah_size = DTRACE_AHASHSIZE;
		size = hash->dtah_size * sizeof (dt_ahashent_t *);

		if ((hash->dtah_hash = malloc(size)) == NULL)
			return (dt_set_errno(dtp, EDT_NOMEM));

		bzero(hash->dtah_hash, size);
	}

	for (offs = 0; offs < buf->dtbd_size; ) {
		/*
		 * We're guaranteed to have an ID.
		 */
		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
		    (uintptr_t)offs));

		if (id == DTRACE_AGGIDNONE) {
			/*
			 * This is filler to assure proper alignment of the
			 * next record; we simply ignore it.
			 */
			offs += sizeof (id);
			continue;
		}

		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
			return (rval);

		addr = buf->dtbd_data + offs;
		size = agg->dtagd_size;
		hashval = 0;

		/*
		 * Walk the key records (all but the last, which holds the
		 * aggregated value), normalizing symbol/module addresses in
		 * place and accumulating a byte-wise hash of the key.
		 */
		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
			rec = &agg->dtagd_rec[j];
			roffs = rec->dtrd_offset;

			switch (rec->dtrd_action) {
			case DTRACEACT_USYM:
				dt_aggregate_usym(dtp,
				    /* LINTED - alignment */
				    (uint64_t *)&addr[roffs]);
				break;

			case DTRACEACT_UMOD:
				dt_aggregate_umod(dtp,
				    /* LINTED - alignment */
				    (uint64_t *)&addr[roffs]);
				break;

			case DTRACEACT_SYM:
				/* LINTED - alignment */
				dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
				break;

			case DTRACEACT_MOD:
				/* LINTED - alignment */
				dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
				break;

			default:
				break;
			}

			for (i = 0; i < rec->dtrd_size; i++)
				hashval += addr[roffs + i];
		}

		ndx = hashval % hash->dtah_size;

		/*
		 * Probe the hash chain for an entry with an identical key.
		 */
		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
			if (h->dtahe_hashval != hashval)
				continue;

			if (h->dtahe_size != size)
				continue;

			aggdata = &h->dtahe_data;
			data = aggdata->dtada_data;

			/* Hash and size match; verify the key byte-wise. */
			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
				rec = &agg->dtagd_rec[j];
				roffs = rec->dtrd_offset;

				for (i = 0; i < rec->dtrd_size; i++)
					if (addr[roffs + i] != data[roffs + i])
						goto hashnext;
			}

			/*
			 * We found it.  Now we need to apply the aggregating
			 * action on the data here.
			 */
			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
			roffs = rec->dtrd_offset;
			/* LINTED - alignment */
			h->dtahe_aggregate((int64_t *)&data[roffs],
			    /* LINTED - alignment */
			    (int64_t *)&addr[roffs], rec->dtrd_size);

			/*
			 * If we're keeping per CPU data, apply the aggregating
			 * action there as well.
			 */
			if (aggdata->dtada_percpu != NULL) {
				data = aggdata->dtada_percpu[cpu];

				/* LINTED - alignment */
				h->dtahe_aggregate((int64_t *)data,
				    /* LINTED - alignment */
				    (int64_t *)&addr[roffs], rec->dtrd_size);
			}

			goto bufnext;
hashnext:
			continue;
		}

		/*
		 * If we're here, we couldn't find an entry for this record.
		 */
		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
			return (dt_set_errno(dtp, EDT_NOMEM));
		bzero(h, sizeof (dt_ahashent_t));
		aggdata = &h->dtahe_data;

		if ((aggdata->dtada_data = malloc(size)) == NULL) {
			free(h);
			return (dt_set_errno(dtp, EDT_NOMEM));
		}

		bcopy(addr, aggdata->dtada_data, size);
		aggdata->dtada_size = size;
		aggdata->dtada_desc = agg;
		aggdata->dtada_handle = dtp;
		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
		aggdata->dtada_normal = 1;

		h->dtahe_hashval = hashval;
		h->dtahe_size = size;
		/* Cache the aggregation variable ID in the aggdesc. */
		(void) dt_aggregate_aggvarid(h);

		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];

		if (flags & DTRACE_A_PERCPU) {
			int max_cpus = agp->dtat_maxcpu;
			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));

			if (percpu == NULL) {
				free(aggdata->dtada_data);
				free(h);
				return (dt_set_errno(dtp, EDT_NOMEM));
			}

			for (j = 0; j < max_cpus; j++) {
				percpu[j] = malloc(rec->dtrd_size);

				if (percpu[j] == NULL) {
					/* Unwind the buffers allocated so far. */
					while (--j >= 0)
						free(percpu[j]);

					free(aggdata->dtada_data);
					free(h);
					return (dt_set_errno(dtp, EDT_NOMEM));
				}

				if (j == cpu) {
					/* This CPU contributed the data. */
					bcopy(&addr[rec->dtrd_offset],
					    percpu[j], rec->dtrd_size);
				} else {
					bzero(percpu[j], rec->dtrd_size);
				}
			}

			aggdata->dtada_percpu = percpu;
		}

		/*
		 * Select the aggregating function used to merge subsequent
		 * snapshots into this entry's value record.
		 */
		switch (rec->dtrd_action) {
		case DTRACEAGG_MIN:
			h->dtahe_aggregate = dt_aggregate_min;
			break;

		case DTRACEAGG_MAX:
			h->dtahe_aggregate = dt_aggregate_max;
			break;

		case DTRACEAGG_LQUANTIZE:
			h->dtahe_aggregate = dt_aggregate_lquantize;
			break;

		case DTRACEAGG_COUNT:
		case DTRACEAGG_SUM:
		case DTRACEAGG_AVG:
		case DTRACEAGG_STDDEV:
		case DTRACEAGG_QUANTIZE:
			/* All of these merge by element-wise addition. */
			h->dtahe_aggregate = dt_aggregate_count;
			break;

		default:
			/*
			 * NOTE(review): h and aggdata->dtada_data (and any
			 * per-CPU buffers) leak on this path; minor, since
			 * EDT_BADAGG indicates corrupt metadata.
			 */
			return (dt_set_errno(dtp, EDT_BADAGG));
		}

		/*
		 * Link the new entry onto its hash chain and onto the list
		 * of all entries.
		 */
		if (hash->dtah_hash[ndx] != NULL)
			hash->dtah_hash[ndx]->dtahe_prev = h;

		h->dtahe_next = hash->dtah_hash[ndx];
		hash->dtah_hash[ndx] = h;

		if (hash->dtah_all != NULL)
			hash->dtah_all->dtahe_prevall = h;

		h->dtahe_nextall = hash->dtah_all;
		hash->dtah_all = h;
bufnext:
		offs += agg->dtagd_size;
	}

	return (0);
}
639
640int
641dtrace_aggregate_snap(dtrace_hdl_t *dtp)
642{
643	int i, rval;
644	dt_aggregate_t *agp = &dtp->dt_aggregate;
645	hrtime_t now = gethrtime();
646	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
647
648	if (dtp->dt_lastagg != 0) {
649		if (now - dtp->dt_lastagg < interval)
650			return (0);
651
652		dtp->dt_lastagg += interval;
653	} else {
654		dtp->dt_lastagg = now;
655	}
656
657	if (!dtp->dt_active)
658		return (dt_set_errno(dtp, EINVAL));
659
660	if (agp->dtat_buf.dtbd_size == 0)
661		return (0);
662
663	for (i = 0; i < agp->dtat_ncpus; i++) {
664		if ((rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i])))
665			return (rval);
666	}
667
668	return (0);
669}
670
671static int
672dt_aggregate_hashcmp(const void *lhs, const void *rhs)
673{
674	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
675	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
676	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
677	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
678
679	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
680		return (DT_LESSTHAN);
681
682	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
683		return (DT_GREATERTHAN);
684
685	return (0);
686}
687
688static int
689dt_aggregate_varcmp(const void *lhs, const void *rhs)
690{
691	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
692	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
693	dtrace_aggvarid_t lid, rid;
694
695	lid = dt_aggregate_aggvarid(lh);
696	rid = dt_aggregate_aggvarid(rh);
697
698	if (lid < rid)
699		return (DT_LESSTHAN);
700
701	if (lid > rid)
702		return (DT_GREATERTHAN);
703
704	return (0);
705}
706
/*
 * Compare two aggregation elements by key.  The key consists of every
 * record except the last (which holds the aggregated value); records are
 * compared starting at the position selected by the "aggsortkeypos"
 * option (dt_keypos), wrapping around through the remaining records.
 */
static int
dt_aggregate_keycmp(const void *lhs, const void *rhs)
{
	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
	dtrace_recdesc_t *lrec, *rrec;
	char *ldata, *rdata;
	int rval, i, j, keypos, nrecs;

	/* Elements with differing record counts order by count alone. */
	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
		return (rval);

	nrecs = lagg->dtagd_nrecs - 1;
	assert(nrecs == ragg->dtagd_nrecs - 1);

	/* An out-of-range requested key position falls back to zero. */
	keypos = dt_keypos + 1 >= nrecs ? 0 : dt_keypos;

	for (i = 1; i < nrecs; i++) {
		/* NOTE: lval/rval here shadow the function-scope rval. */
		uint64_t lval, rval;
		int ndx = i + keypos;

		if (ndx >= nrecs)
			ndx = ndx - nrecs + 1;

		lrec = &lagg->dtagd_rec[ndx];
		rrec = &ragg->dtagd_rec[ndx];

		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;

		/* Records of different sizes order by size. */
		if (lrec->dtrd_size < rrec->dtrd_size)
			return (DT_LESSTHAN);

		if (lrec->dtrd_size > rrec->dtrd_size)
			return (DT_GREATERTHAN);

		/* Fixed-width records compare as unsigned integers. */
		switch (lrec->dtrd_size) {
		case sizeof (uint64_t):
			/* LINTED - alignment */
			lval = *((uint64_t *)ldata);
			/* LINTED - alignment */
			rval = *((uint64_t *)rdata);
			break;

		case sizeof (uint32_t):
			/* LINTED - alignment */
			lval = *((uint32_t *)ldata);
			/* LINTED - alignment */
			rval = *((uint32_t *)rdata);
			break;

		case sizeof (uint16_t):
			/* LINTED - alignment */
			lval = *((uint16_t *)ldata);
			/* LINTED - alignment */
			rval = *((uint16_t *)rdata);
			break;

		case sizeof (uint8_t):
			lval = *((uint8_t *)ldata);
			rval = *((uint8_t *)rdata);
			break;

		default:
			switch (lrec->dtrd_action) {
			case DTRACEACT_UMOD:
			case DTRACEACT_UADDR:
			case DTRACEACT_USYM:
				/*
				 * User symbol/module keys compare only the
				 * first two 64-bit words -- pid and address
				 * (see dt_aggregate_usym()).
				 */
				for (j = 0; j < 2; j++) {
					/* LINTED - alignment */
					lval = ((uint64_t *)ldata)[j];
					/* LINTED - alignment */
					rval = ((uint64_t *)rdata)[j];

					if (lval < rval)
						return (DT_LESSTHAN);

					if (lval > rval)
						return (DT_GREATERTHAN);
				}

				break;

			default:
				/* Arbitrary-size keys compare byte-wise. */
				for (j = 0; j < lrec->dtrd_size; j++) {
					lval = ((uint8_t *)ldata)[j];
					rval = ((uint8_t *)rdata)[j];

					if (lval < rval)
						return (DT_LESSTHAN);

					if (lval > rval)
						return (DT_GREATERTHAN);
				}
			}

			continue;
		}

		if (lval < rval)
			return (DT_LESSTHAN);

		if (lval > rval)
			return (DT_GREATERTHAN);
	}

	return (0);
}
817
/*
 * Compare two aggregation elements by their aggregated values,
 * dispatching on the aggregating action to the appropriate value
 * comparator above.
 */
static int
dt_aggregate_valcmp(const void *lhs, const void *rhs)
{
	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
	caddr_t ldata = lh->dtahe_data.dtada_data;
	caddr_t rdata = rh->dtahe_data.dtada_data;
	dtrace_recdesc_t *lrec, *rrec;
	int64_t *laddr, *raddr;
	int rval, i;

	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
		return (rval);

	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
		return (DT_GREATERTHAN);

	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
		return (DT_LESSTHAN);

	/*
	 * Records must agree in offset and action for the values to be
	 * directly comparable; differing layouts order arbitrarily but
	 * consistently.
	 */
	for (i = 0; i < lagg->dtagd_nrecs; i++) {
		lrec = &lagg->dtagd_rec[i];
		rrec = &ragg->dtagd_rec[i];

		if (lrec->dtrd_offset < rrec->dtrd_offset)
			return (DT_LESSTHAN);

		if (lrec->dtrd_offset > rrec->dtrd_offset)
			return (DT_GREATERTHAN);

		if (lrec->dtrd_action < rrec->dtrd_action)
			return (DT_LESSTHAN);

		if (lrec->dtrd_action > rrec->dtrd_action)
			return (DT_GREATERTHAN);
	}

	/*
	 * On loop exit, lrec/rrec refer to the final record -- the
	 * aggregated value itself.  NOTE(review): this relies on
	 * dtagd_nrecs being at least 1; lrec would be uninitialized
	 * otherwise.
	 */
	laddr = (int64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
	raddr = (int64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);

	switch (lrec->dtrd_action) {
	case DTRACEAGG_AVG:
		rval = dt_aggregate_averagecmp(laddr, raddr);
		break;

	case DTRACEAGG_STDDEV:
		rval = dt_aggregate_stddevcmp(laddr, raddr);
		break;

	case DTRACEAGG_QUANTIZE:
		rval = dt_aggregate_quantizedcmp(laddr, raddr);
		break;

	case DTRACEAGG_LQUANTIZE:
		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
		break;

	case DTRACEAGG_COUNT:
	case DTRACEAGG_SUM:
	case DTRACEAGG_MIN:
	case DTRACEAGG_MAX:
		/* All single 64-bit values: plain integer comparison. */
		rval = dt_aggregate_countcmp(laddr, raddr);
		break;

	default:
		/*
		 * Unknown aggregating action: the metadata is corrupt.
		 * NOTE(review): under NDEBUG this falls through with rval
		 * unset.
		 */
		assert(0);
	}

	return (rval);
}
890
/*
 * Compare by aggregated value, breaking ties on the key.  The key
 * layouts are known to be identical once the values compare equal.
 */
static int
dt_aggregate_valkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valcmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_keycmp(lhs, rhs));
}
906
/*
 * Compare by key, breaking ties on the aggregation variable ID.
 */
static int
dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_keycmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_varcmp(lhs, rhs));
}
917
/*
 * Compare by aggregation variable ID, breaking ties on the key.
 */
static int
dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_keycmp(lhs, rhs));
}
928
/*
 * Compare by value (with key tie-break), then by aggregation variable.
 */
static int
dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valkeycmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_varcmp(lhs, rhs));
}
939
/*
 * Compare by aggregation variable, then by value (with key tie-break).
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval != 0 ? rval : dt_aggregate_valkeycmp(lhs, rhs));
}
950
/*
 * Reversed sense of dt_aggregate_keyvarcmp(): swap the operands.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_keyvarcmp(rhs, lhs));
}
956
/*
 * Reversed sense of dt_aggregate_varkeycmp(): swap the operands.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_varkeycmp(rhs, lhs));
}
962
/*
 * Reversed sense of dt_aggregate_valvarcmp(): swap the operands.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_valvarcmp(rhs, lhs));
}
968
/*
 * Reversed sense of dt_aggregate_varvalcmp(): swap the operands.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_varvalcmp(rhs, lhs));
}
974
/*
 * Compare two NULL-terminated bundles of aggregation elements.  Per the
 * comment below, a bundle is laid out as the value elements followed by
 * a representative key element.  When sorting by key ("aggsortkey"),
 * the representative keys are compared first with the values as
 * tie-breaker; otherwise the values are compared first with the
 * representative key as tie-breaker.
 */
static int
dt_aggregate_bundlecmp(const void *lhs, const void *rhs)
{
	dt_ahashent_t **lh = *((dt_ahashent_t ***)lhs);
	dt_ahashent_t **rh = *((dt_ahashent_t ***)rhs);
	int i, rval;

	if (dt_keysort) {
		/*
		 * If we're sorting on keys, we need to scan until we find the
		 * last entry -- that's the representative key.  (The order of
		 * the bundle is values followed by key to accommodate the
		 * default behavior of sorting by value.)  If the keys are
		 * equal, we'll fall into the value comparison loop, below.
		 */
		for (i = 0; lh[i + 1] != NULL; i++)
			continue;

		assert(i != 0);
		assert(rh[i + 1] == NULL);

		if ((rval = dt_aggregate_keycmp(&lh[i], &rh[i])) != 0)
			return (rval);
	}

	for (i = 0; ; i++) {
		if (lh[i + 1] == NULL) {
			/*
			 * All of the values are equal; if we're sorting on
			 * keys, then we're only here because the keys were
			 * found to be equal and these records are therefore
			 * equal.  If we're not sorting on keys, we'll use the
			 * key comparison from the representative key as the
			 * tie-breaker.
			 */
			if (dt_keysort)
				return (0);

			assert(i != 0);
			assert(rh[i + 1] == NULL);
			return (dt_aggregate_keycmp(&lh[i], &rh[i]));
		} else {
			if ((rval = dt_aggregate_valcmp(&lh[i], &rh[i])) != 0)
				return (rval);
		}
	}
}
1022
1023int
1024dt_aggregate_go(dtrace_hdl_t *dtp)
1025{
1026	dt_aggregate_t *agp = &dtp->dt_aggregate;
1027	dtrace_optval_t size, cpu;
1028	dtrace_bufdesc_t *buf = &agp->dtat_buf;
1029	int rval, i;
1030
1031	assert(agp->dtat_maxcpu == 0);
1032	assert(agp->dtat_ncpu == 0);
1033	assert(agp->dtat_cpus == NULL);
1034
1035	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
1036	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
1037	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
1038
1039	if (agp->dtat_cpus == NULL)
1040		return (dt_set_errno(dtp, EDT_NOMEM));
1041
1042	/*
1043	 * Use the aggregation buffer size as reloaded from the kernel.
1044	 */
1045	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
1046
1047	rval = dtrace_getopt(dtp, "aggsize", &size);
1048	assert(rval == 0);
1049
1050	if (size == 0 || size == DTRACEOPT_UNSET)
1051		return (0);
1052
1053	buf = &agp->dtat_buf;
1054	buf->dtbd_size = size;
1055
1056	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
1057		return (dt_set_errno(dtp, EDT_NOMEM));
1058
1059	/*
1060	 * Now query for the CPUs enabled.
1061	 */
1062	rval = dtrace_getopt(dtp, "cpu", &cpu);
1063	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
1064
1065	if (cpu != DTRACE_CPUALL) {
1066		assert(cpu < agp->dtat_ncpu);
1067		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
1068
1069		return (0);
1070	}
1071
1072	agp->dtat_ncpus = 0;
1073	for (i = 0; i < agp->dtat_maxcpu; i++) {
1074		if (dt_status(dtp, i) == -1)
1075			continue;
1076
1077		agp->dtat_cpus[agp->dtat_ncpus++] = i;
1078	}
1079
1080	return (0);
1081}
1082
/*
 * Apply the disposition returned by an aggregation walker callback
 * (DTRACE_AGGWALK_*) to the given hash entry: advance, clear, remove,
 * (de)normalize, or abort.  Returns 0 on success or -1 with the dtrace
 * errno set.
 */
static int
dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dtrace_aggdata_t *data;
	dtrace_aggdesc_t *aggdesc;
	dtrace_recdesc_t *rec;
	int i;

	switch (rval) {
	case DTRACE_AGGWALK_NEXT:
		break;

	case DTRACE_AGGWALK_CLEAR: {
		uint32_t size, offs = 0;

		aggdesc = h->dtahe_data.dtada_desc;
		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
		size = rec->dtrd_size;
		data = &h->dtahe_data;

		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
			/*
			 * The first 64 bits of an lquantize() value encode
			 * its parameters (see DTRACE_LQUANTIZE_* use above);
			 * preserve them and clear only the buckets.
			 */
			offs = sizeof (uint64_t);
			size -= sizeof (uint64_t);
		}

		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);

		if (data->dtada_percpu == NULL)
			break;

		/* Clear the per-CPU copies of the value as well. */
		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
			bzero(data->dtada_percpu[i] + offs, size);
		break;
	}

	case DTRACE_AGGWALK_ERROR:
		/*
		 * We assume that errno is already set in this case.
		 */
		return (dt_set_errno(dtp, errno));

	case DTRACE_AGGWALK_ABORT:
		return (dt_set_errno(dtp, EDT_DIRABORT));

	case DTRACE_AGGWALK_DENORMALIZE:
		/* A normal of 1 denotes no normalization. */
		h->dtahe_data.dtada_normal = 1;
		return (0);

	case DTRACE_AGGWALK_NORMALIZE:
		if (h->dtahe_data.dtada_normal == 0) {
			/*
			 * NOTE(review): a NORMALIZE disposition with no
			 * normal set appears to be a caller error -- reset
			 * to 1 and fail.
			 */
			h->dtahe_data.dtada_normal = 1;
			return (dt_set_errno(dtp, EDT_BADRVAL));
		}

		return (0);

	case DTRACE_AGGWALK_REMOVE: {
		dtrace_aggdata_t *aggdata = &h->dtahe_data;
		int max_cpus = agp->dtat_maxcpu;

		/*
		 * First, remove this hash entry from its hash chain.
		 */
		if (h->dtahe_prev != NULL) {
			h->dtahe_prev->dtahe_next = h->dtahe_next;
		} else {
			/* Head of its chain: update the hash bucket. */
			dt_ahash_t *hash = &agp->dtat_hash;
			size_t ndx = h->dtahe_hashval % hash->dtah_size;

			assert(hash->dtah_hash[ndx] == h);
			hash->dtah_hash[ndx] = h->dtahe_next;
		}

		if (h->dtahe_next != NULL)
			h->dtahe_next->dtahe_prev = h->dtahe_prev;

		/*
		 * Now remove it from the list of all hash entries.
		 */
		if (h->dtahe_prevall != NULL) {
			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
		} else {
			dt_ahash_t *hash = &agp->dtat_hash;

			assert(hash->dtah_all == h);
			hash->dtah_all = h->dtahe_nextall;
		}

		if (h->dtahe_nextall != NULL)
			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;

		/*
		 * We're unlinked.  We can safely destroy the data.
		 */
		if (aggdata->dtada_percpu != NULL) {
			for (i = 0; i < max_cpus; i++)
				free(aggdata->dtada_percpu[i]);
			free(aggdata->dtada_percpu);
		}

		free(aggdata->dtada_data);
		free(h);

		return (0);
	}

	default:
		return (dt_set_errno(dtp, EDT_BADRVAL));
	}

	return (0);
}
1196
1197void
1198dt_aggregate_qsort(dtrace_hdl_t *dtp, void *base, size_t nel, size_t width,
1199    int (*compar)(const void *, const void *))
1200{
1201	int rev = dt_revsort, key = dt_keysort, keypos = dt_keypos;
1202	dtrace_optval_t keyposopt = dtp->dt_options[DTRACEOPT_AGGSORTKEYPOS];
1203
1204	dt_revsort = (dtp->dt_options[DTRACEOPT_AGGSORTREV] != DTRACEOPT_UNSET);
1205	dt_keysort = (dtp->dt_options[DTRACEOPT_AGGSORTKEY] != DTRACEOPT_UNSET);
1206
1207	if (keyposopt != DTRACEOPT_UNSET && keyposopt <= INT_MAX) {
1208		dt_keypos = (int)keyposopt;
1209	} else {
1210		dt_keypos = 0;
1211	}
1212
1213	if (compar == NULL) {
1214		if (!dt_keysort) {
1215			compar = dt_aggregate_varvalcmp;
1216		} else {
1217			compar = dt_aggregate_varkeycmp;
1218		}
1219	}
1220
1221	qsort(base, nel, width, compar);
1222
1223	dt_revsort = rev;
1224	dt_keysort = key;
1225	dt_keypos = keypos;
1226}
1227
1228int
1229dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
1230{
1231	dt_ahashent_t *h, *next;
1232	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
1233
1234	for (h = hash->dtah_all; h != NULL; h = next) {
1235		/*
1236		 * dt_aggwalk_rval() can potentially remove the current hash
1237		 * entry; we need to load the next hash entry before calling
1238		 * into it.
1239		 */
1240		next = h->dtahe_nextall;
1241
1242		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1243			return (-1);
1244	}
1245
1246	return (0);
1247}
1248
/*
 * Walk every aggregation element in an order determined by sfunc (or,
 * when sfunc is NULL, by the aggsort* option settings via
 * dt_aggregate_qsort()), invoking func on each.  Returns 0 on success,
 * -1 on failure.
 */
static int
dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg,
    int (*sfunc)(const void *, const void *))
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dt_ahashent_t *h, **sorted;
	dt_ahash_t *hash = &agp->dtat_hash;
	size_t i, nentries = 0;

	/* Count the entries so we can size the scratch array. */
	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
		nentries++;

	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));

	if (sorted == NULL)
		return (-1);

	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
		sorted[i++] = h;

	/*
	 * The comparison functions consult global sort-configuration
	 * variables; dt_qsort_lock serializes their use across threads.
	 */
	(void) pthread_mutex_lock(&dt_qsort_lock);

	if (sfunc == NULL) {
		dt_aggregate_qsort(dtp, sorted, nentries,
		    sizeof (dt_ahashent_t *), NULL);
	} else {
		/*
		 * If we've been explicitly passed a sorting function,
		 * we'll use that -- ignoring the values of the "aggsortrev",
		 * "aggsortkey" and "aggsortkeypos" options.
		 */
		qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
	}

	(void) pthread_mutex_unlock(&dt_qsort_lock);

	for (i = 0; i < nentries; i++) {
		h = sorted[i];

		/*
		 * dt_aggwalk_rval() may free the current entry; the scratch
		 * array still holds the remainder of the ordering.
		 */
		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1) {
			dt_free(dtp, sorted);
			return (-1);
		}
	}

	dt_free(dtp, sorted);
	return (0);
}
1298
/*
 * Walk the aggregations sorted according to the "aggsort" family of
 * options (by value unless "aggsortkey" is set; see dt_aggregate_qsort()).
 */
int
dtrace_aggregate_walk_sorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func, arg, NULL));
}
1305
/*
 * Walk the aggregations sorted by tuple key (ignoring the sort options).
 */
int
dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_varkeycmp));
}
1313
/*
 * Walk the aggregations sorted by value (ignoring the sort options).
 */
int
dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_varvalcmp));
}
1321
/*
 * Walk the aggregations sorted primarily by key, secondarily by
 * aggregation variable (ignoring the sort options).
 */
int
dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_keyvarcmp));
}
1329
/*
 * Walk the aggregations sorted primarily by value, secondarily by
 * aggregation variable (ignoring the sort options).
 */
int
dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_valvarcmp));
}
1337
/*
 * Walk the aggregations reverse-sorted by key (ignoring the sort options).
 */
int
dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_varkeyrevcmp));
}
1345
/*
 * Walk the aggregations reverse-sorted by value (ignoring the sort
 * options).
 */
int
dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_varvalrevcmp));
}
1353
/*
 * Walk the aggregations reverse-sorted primarily by key, secondarily by
 * aggregation variable (ignoring the sort options).
 */
int
dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_keyvarrevcmp));
}
1361
/*
 * Walk the aggregations reverse-sorted primarily by value, secondarily
 * by aggregation variable (ignoring the sort options).
 */
int
dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
    dtrace_aggregate_f *func, void *arg)
{
	return (dt_aggregate_walk_sorted(dtp, func,
	    arg, dt_aggregate_valvarrevcmp));
}
1369
/*
 * Walk the aggregation data for the specified aggregation variable IDs
 * jointly:  entries sharing the same tuple key are gathered into one
 * "bundle", and func is called once per bundle with an array of
 * (naggvars + 1) aggdata pointers -- data[0] is a representative of the
 * key, and data[1 .. naggvars] correspond (in aggvars order) to each
 * requested variable, with zero-filled prototype data substituted for
 * variables that have no value for that key.  Bundles are sorted
 * according to the "aggsortpos"/"aggsortrev"/"aggsortkey"/
 * "aggsortkeypos" options.  Returns 0 on success, -1 on failure (with
 * the dtrace errno set by the failing subroutine).
 */
int
dtrace_aggregate_walk_joined(dtrace_hdl_t *dtp, dtrace_aggvarid_t *aggvars,
    int naggvars, dtrace_aggregate_walk_joined_f *func, void *arg)
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dt_ahashent_t *h, **sorted = NULL, ***bundle, **nbundle;
	const dtrace_aggdata_t **data;
	dt_ahashent_t *zaggdata = NULL;
	dt_ahash_t *hash = &agp->dtat_hash;
	size_t nentries = 0, nbundles = 0, start, zsize = 0, bundlesize;
	dtrace_aggvarid_t max = 0, aggvar;
	int rval = -1, *map, *remap = NULL;
	int i, j;
	dtrace_optval_t sortpos = dtp->dt_options[DTRACEOPT_AGGSORTPOS];

	/*
	 * If the sorting position is greater than the number of aggregation
	 * variable IDs, we silently set it to 0.
	 */
	if (sortpos == DTRACEOPT_UNSET || sortpos >= naggvars)
		sortpos = 0;

	/*
	 * First we need to translate the specified aggregation variable IDs
	 * into a linear map that will allow us to translate an aggregation
	 * variable ID into its position in the specified aggvars.
	 */
	for (i = 0; i < naggvars; i++) {
		if (aggvars[i] == DTRACE_AGGVARIDNONE || aggvars[i] < 0)
			return (dt_set_errno(dtp, EDT_BADAGGVAR));

		if (aggvars[i] > max)
			max = aggvars[i];
	}

	/*
	 * map[id] is the one-based, sort-adjusted position of variable "id"
	 * in aggvars; zero denotes a variable that was not requested.
	 */
	if ((map = dt_zalloc(dtp, (max + 1) * sizeof (int))) == NULL)
		return (-1);

	/* zaggdata[i] will hold prototype (zero-valued) data per variable. */
	zaggdata = dt_zalloc(dtp, naggvars * sizeof (dt_ahashent_t));

	if (zaggdata == NULL)
		goto out;

	for (i = 0; i < naggvars; i++) {
		int ndx = i + sortpos;

		if (ndx >= naggvars)
			ndx -= naggvars;

		aggvar = aggvars[ndx];
		assert(aggvar <= max);

		if (map[aggvar]) {
			/*
			 * We have an aggregation variable that is present
			 * more than once in the array of aggregation
			 * variables.  While it's unclear why one might want
			 * to do this, it's legal.  To support this construct,
			 * we will allocate a remap that will indicate the
			 * position from which this aggregation variable
			 * should be pulled.  (That is, where the remap will
			 * map from one position to another.)
			 */
			if (remap == NULL) {
				remap = dt_zalloc(dtp, naggvars * sizeof (int));

				if (remap == NULL)
					goto out;
			}

			/*
			 * Given that the variable is already present, assert
			 * that following through the mapping and adjusting
			 * for the sort position yields the same aggregation
			 * variable ID.
			 */
			assert(aggvars[(map[aggvar] - 1 + sortpos) %
			    naggvars] == aggvars[ndx]);

			remap[i] = map[aggvar];
			continue;
		}

		map[aggvar] = i + 1;
	}

	/*
	 * We need to take two passes over the data to size our allocation, so
	 * we'll use the first pass to also fill in the zero-filled data to be
	 * used to properly format a zero-valued aggregation.
	 */
	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
		dtrace_aggvarid_t id;
		int ndx;

		/* Skip entries for variables that were not requested. */
		if ((id = dt_aggregate_aggvarid(h)) > max || !(ndx = map[id]))
			continue;

		if (zaggdata[ndx - 1].dtahe_size == 0) {
			zaggdata[ndx - 1].dtahe_size = h->dtahe_size;
			zaggdata[ndx - 1].dtahe_data = h->dtahe_data;
		}

		nentries++;
	}

	if (nentries == 0) {
		/*
		 * We couldn't find any entries; there is nothing else to do.
		 */
		rval = 0;
		goto out;
	}

	/*
	 * Before we sort the data, we're going to look for any holes in our
	 * zero-filled data.  This will occur if an aggregation variable that
	 * we are being asked to print has not yet been assigned the result of
	 * any aggregating action for _any_ tuple.  The issue becomes that we
	 * would like a zero value to be printed for all columns for this
	 * aggregation, but without any record description, we don't know the
	 * aggregating action that corresponds to the aggregation variable.  To
	 * try to find a match, we're simply going to lookup aggregation IDs
	 * (which are guaranteed to be contiguous and to start from 1), looking
	 * for the specified aggregation variable ID.  If we find a match,
	 * we'll use that.  If we iterate over all aggregation IDs and don't
	 * find a match, then we must be an anonymous enabling.  (Anonymous
	 * enablings can't currently derive either aggregation variable IDs or
	 * aggregation variable names given only an aggregation ID.)  In this
	 * obscure case (anonymous enabling, multiple aggregation printa() with
	 * some aggregations not represented for any tuple), our defined
	 * behavior is that the zero will be printed in the format of the first
	 * aggregation variable that contains any non-zero value.
	 */
	for (i = 0; i < naggvars; i++) {
		if (zaggdata[i].dtahe_size == 0) {
			dtrace_aggvarid_t aggvar;

			aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
			assert(zaggdata[i].dtahe_data.dtada_data == NULL);

			for (j = DTRACE_AGGIDNONE + 1; ; j++) {
				dtrace_aggdesc_t *agg;
				dtrace_aggdata_t *aggdata;

				if (dt_aggid_lookup(dtp, j, &agg) != 0)
					break;

				if (agg->dtagd_varid != aggvar)
					continue;

				/*
				 * We have our description -- now we need to
				 * cons up the zaggdata entry for it.
				 */
				aggdata = &zaggdata[i].dtahe_data;
				aggdata->dtada_size = agg->dtagd_size;
				aggdata->dtada_desc = agg;
				aggdata->dtada_handle = dtp;
				(void) dt_epid_lookup(dtp, agg->dtagd_epid,
				    &aggdata->dtada_edesc,
				    &aggdata->dtada_pdesc);
				aggdata->dtada_normal = 1;
				zaggdata[i].dtahe_hashval = 0;
				zaggdata[i].dtahe_size = agg->dtagd_size;
				break;
			}

			if (zaggdata[i].dtahe_size == 0) {
				caddr_t data;

				/*
				 * We couldn't find this aggregation, meaning
				 * that we have never seen it before for any
				 * tuple _and_ this is an anonymous enabling.
				 * That is, we're in the obscure case outlined
				 * above.  In this case, our defined behavior
				 * is to format the data in the format of the
				 * first non-zero aggregation -- of which, of
				 * course, we know there to be at least one
				 * (or nentries would have been zero).
				 */
				for (j = 0; j < naggvars; j++) {
					if (zaggdata[j].dtahe_size != 0)
						break;
				}

				assert(j < naggvars);
				zaggdata[i] = zaggdata[j];

				data = zaggdata[i].dtahe_data.dtada_data;
				assert(data != NULL);
			}
		}
	}

	/*
	 * Now we need to allocate our zero-filled data for use for
	 * aggregations that don't have a value corresponding to a given key.
	 */
	for (i = 0; i < naggvars; i++) {
		dtrace_aggdata_t *aggdata = &zaggdata[i].dtahe_data;
		dtrace_aggdesc_t *aggdesc = aggdata->dtada_desc;
		dtrace_recdesc_t *rec;
		uint64_t larg;
		caddr_t zdata;

		zsize = zaggdata[i].dtahe_size;
		assert(zsize != 0);

		if ((zdata = dt_zalloc(dtp, zsize)) == NULL) {
			/*
			 * If we failed to allocated some zero-filled data, we
			 * need to zero out the remaining dtada_data pointers
			 * to prevent the wrong data from being freed below.
			 */
			for (j = i; j < naggvars; j++)
				zaggdata[j].dtahe_data.dtada_data = NULL;
			goto out;
		}

		aggvar = aggvars[(i - sortpos + naggvars) % naggvars];

		/*
		 * First, the easy bit.  To maintain compatibility with
		 * consumers that pull the compiler-generated ID out of the
		 * data, we put that ID at the top of the zero-filled data.
		 */
		rec = &aggdesc->dtagd_rec[0];
		/* LINTED - alignment */
		*((dtrace_aggvarid_t *)(zdata + rec->dtrd_offset)) = aggvar;

		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];

		/*
		 * Now for the more complicated part.  If (and only if) this
		 * is an lquantize() aggregating action, zero-filled data is
		 * not equivalent to an empty record:  we must also get the
		 * parameters for the lquantize().
		 */
		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
			if (aggdata->dtada_data != NULL) {
				/*
				 * The easier case here is if we actually have
				 * some prototype data -- in which case we
				 * manually dig it out of the aggregation
				 * record.
				 */
				/* LINTED - alignment */
				larg = *((uint64_t *)(aggdata->dtada_data +
				    rec->dtrd_offset));
			} else {
				/*
				 * We don't have any prototype data.  As a
				 * result, we know that we _do_ have the
				 * compiler-generated information.  (If this
				 * were an anonymous enabling, all of our
				 * zero-filled data would have prototype data
				 * -- either directly or indirectly.) So as
				 * gross as it is, we'll grovel around in the
				 * compiler-generated information to find the
				 * lquantize() parameters.
				 */
				dtrace_stmtdesc_t *sdp;
				dt_ident_t *aid;
				dt_idsig_t *isp;

				sdp = (dtrace_stmtdesc_t *)(uintptr_t)
				    aggdesc->dtagd_rec[0].dtrd_uarg;
				aid = sdp->dtsd_aggdata;
				isp = (dt_idsig_t *)aid->di_data;
				assert(isp->dis_auxinfo != 0);
				larg = isp->dis_auxinfo;
			}

			/* LINTED - alignment */
			*((uint64_t *)(zdata + rec->dtrd_offset)) = larg;
		}

		aggdata->dtada_data = zdata;
	}

	/*
	 * Now that we've dealt with setting up our zero-filled data, we can
	 * allocate our sorted array, and take another pass over the data to
	 * fill it.
	 */
	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));

	if (sorted == NULL)
		goto out;

	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall) {
		dtrace_aggvarid_t id;

		if ((id = dt_aggregate_aggvarid(h)) > max || !map[id])
			continue;

		sorted[i++] = h;
	}

	assert(i == nentries);

	/*
	 * We've loaded our array; now we need to sort by value to allow us
	 * to create bundles of like value.  We're going to acquire the
	 * dt_qsort_lock here, and hold it across all of our subsequent
	 * comparison and sorting.
	 */
	(void) pthread_mutex_lock(&dt_qsort_lock);

	qsort(sorted, nentries, sizeof (dt_ahashent_t *),
	    dt_aggregate_keyvarcmp);

	/*
	 * Now we need to go through and create bundles.  Because the number
	 * of bundles is bounded by the size of the sorted array, we're going
	 * to reuse the underlying storage.  And note that "bundle" is an
	 * array of pointers to arrays of pointers to dt_ahashent_t -- making
	 * its type (regrettably) "dt_ahashent_t ***".  (Regrettable because
	 * '*' -- like '_' and 'X' -- should never appear in triplicate in
	 * an ideal world.)
	 */
	bundle = (dt_ahashent_t ***)sorted;

	for (i = 1, start = 0; i <= nentries; i++) {
		if (i < nentries &&
		    dt_aggregate_keycmp(&sorted[i], &sorted[i - 1]) == 0)
			continue;

		/*
		 * We have a bundle boundary.  Everything from start to
		 * (i - 1) belongs in one bundle.
		 */
		assert(i - start <= naggvars);
		bundlesize = (naggvars + 2) * sizeof (dt_ahashent_t *);

		if ((nbundle = dt_zalloc(dtp, bundlesize)) == NULL) {
			(void) pthread_mutex_unlock(&dt_qsort_lock);
			goto out;
		}

		for (j = start; j < i; j++) {
			dtrace_aggvarid_t id = dt_aggregate_aggvarid(sorted[j]);

			assert(id <= max);
			assert(map[id] != 0);
			assert(map[id] - 1 < naggvars);
			assert(nbundle[map[id] - 1] == NULL);
			nbundle[map[id] - 1] = sorted[j];

			/* Slot naggvars holds the key representative. */
			if (nbundle[naggvars] == NULL)
				nbundle[naggvars] = sorted[j];
		}

		for (j = 0; j < naggvars; j++) {
			if (nbundle[j] != NULL)
				continue;

			/*
			 * Before we assume that this aggregation variable
			 * isn't present (and fall back to using the
			 * zero-filled data allocated earlier), check the
			 * remap.  If we have a remapping, we'll drop it in
			 * here.  Note that we might be remapping an
			 * aggregation variable that isn't present for this
			 * key; in this case, the aggregation data that we
			 * copy will point to the zeroed data.
			 */
			if (remap != NULL && remap[j]) {
				assert(remap[j] - 1 < j);
				assert(nbundle[remap[j] - 1] != NULL);
				nbundle[j] = nbundle[remap[j] - 1];
			} else {
				nbundle[j] = &zaggdata[j];
			}
		}

		bundle[nbundles++] = nbundle;
		start = i;
	}

	/*
	 * Now we need to re-sort based on the first value.
	 */
	dt_aggregate_qsort(dtp, bundle, nbundles, sizeof (dt_ahashent_t **),
	    dt_aggregate_bundlecmp);

	(void) pthread_mutex_unlock(&dt_qsort_lock);

	/*
	 * We're done!  Now we just need to go back over the sorted bundles,
	 * calling the function.
	 */
	data = alloca((naggvars + 1) * sizeof (dtrace_aggdata_t *));

	for (i = 0; i < nbundles; i++) {
		for (j = 0; j < naggvars; j++)
			data[j + 1] = NULL;

		for (j = 0; j < naggvars; j++) {
			int ndx = j - sortpos;

			if (ndx < 0)
				ndx += naggvars;

			assert(bundle[i][ndx] != NULL);
			data[j + 1] = &bundle[i][ndx]->dtahe_data;
		}

		for (j = 0; j < naggvars; j++)
			assert(data[j + 1] != NULL);

		/*
		 * The representative key is the last element in the bundle.
		 * Assert that we have one, and then set it to be the first
		 * element of data.
		 */
		assert(bundle[i][j] != NULL);
		data[0] = &bundle[i][j]->dtahe_data;

		if ((rval = func(data, naggvars + 1, arg)) == -1)
			goto out;
	}

	rval = 0;
out:
	/*
	 * Common cleanup path.  Note that nbundles is zero on any failure
	 * that occurs before bundle construction, so the bundle loop is
	 * safe even though "bundle" aliases "sorted".
	 */
	for (i = 0; i < nbundles; i++)
		dt_free(dtp, bundle[i]);

	if (zaggdata != NULL) {
		for (i = 0; i < naggvars; i++)
			dt_free(dtp, zaggdata[i].dtahe_data.dtada_data);
	}

	dt_free(dtp, zaggdata);
	dt_free(dtp, sorted);
	dt_free(dtp, remap);
	dt_free(dtp, map);

	return (rval);
}
1812
1813int
1814dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
1815    dtrace_aggregate_walk_f *func)
1816{
1817	dt_print_aggdata_t pd;
1818
1819	pd.dtpa_dtp = dtp;
1820	pd.dtpa_fp = fp;
1821	pd.dtpa_allunprint = 1;
1822
1823	if (func == NULL)
1824		func = dtrace_aggregate_walk_sorted;
1825
1826	if ((*func)(dtp, dt_print_agg, &pd) == -1)
1827		return (dt_set_errno(dtp, dtp->dt_errno));
1828
1829	return (0);
1830}
1831
1832void
1833dtrace_aggregate_clear(dtrace_hdl_t *dtp)
1834{
1835	dt_aggregate_t *agp = &dtp->dt_aggregate;
1836	dt_ahash_t *hash = &agp->dtat_hash;
1837	dt_ahashent_t *h;
1838	dtrace_aggdata_t *data;
1839	dtrace_aggdesc_t *aggdesc;
1840	dtrace_recdesc_t *rec;
1841	int i, max_cpus = agp->dtat_maxcpu;
1842
1843	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1844		aggdesc = h->dtahe_data.dtada_desc;
1845		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1846		data = &h->dtahe_data;
1847
1848		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
1849
1850		if (data->dtada_percpu == NULL)
1851			continue;
1852
1853		for (i = 0; i < max_cpus; i++)
1854			bzero(data->dtada_percpu[i], rec->dtrd_size);
1855	}
1856}
1857
1858void
1859dt_aggregate_destroy(dtrace_hdl_t *dtp)
1860{
1861	dt_aggregate_t *agp = &dtp->dt_aggregate;
1862	dt_ahash_t *hash = &agp->dtat_hash;
1863	dt_ahashent_t *h, *next;
1864	dtrace_aggdata_t *aggdata;
1865	int i, max_cpus = agp->dtat_maxcpu;
1866
1867	if (hash->dtah_hash == NULL) {
1868		assert(hash->dtah_all == NULL);
1869	} else {
1870		free(hash->dtah_hash);
1871
1872		for (h = hash->dtah_all; h != NULL; h = next) {
1873			next = h->dtahe_nextall;
1874
1875			aggdata = &h->dtahe_data;
1876
1877			if (aggdata->dtada_percpu != NULL) {
1878				for (i = 0; i < max_cpus; i++)
1879					free(aggdata->dtada_percpu[i]);
1880				free(aggdata->dtada_percpu);
1881			}
1882
1883			free(aggdata->dtada_data);
1884			free(h);
1885		}
1886
1887		hash->dtah_hash = NULL;
1888		hash->dtah_all = NULL;
1889		hash->dtah_size = 0;
1890	}
1891
1892	free(agp->dtat_buf.dtbd_data);
1893	free(agp->dtat_cpus);
1894}
1895