xref: /netbsd-src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <unistd.h>
33 #include <dt_impl.h>
34 #include <assert.h>
35 #if defined(sun)
36 #include <alloca.h>
37 #else
38 #include <sys/sysctl.h>
39 #endif
40 #include <limits.h>
41 
42 #define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
43 
44 /*
45  * Because qsort(3C) does not allow an argument to be passed to a comparison
46  * function, the variables that affect comparison must regrettably be global;
47  * they are protected by a global static lock, dt_qsort_lock.
48  */
49 static pthread_mutex_t dt_qsort_lock = PTHREAD_MUTEX_INITIALIZER;
50 
51 static int dt_revsort;
52 static int dt_keysort;
53 static int dt_keypos;
54 
55 #define	DT_LESSTHAN	(dt_revsort == 0 ? -1 : 1)
56 #define	DT_GREATERTHAN	(dt_revsort == 0 ? 1 : -1)
57 
/*
 * Aggregating function for count()/sum()/avg()/stddev()/quantize():  fold
 * the incoming 64-bit values into the existing data, element by element.
 * size is the total size of the data in bytes.
 */
static void
dt_aggregate_count(int64_t *existing, int64_t *new, size_t size)
{
	size_t nvals = size / sizeof (int64_t);
	size_t i;

	for (i = 0; i < nvals; i++)
		existing[i] += new[i];
}
66 
67 static int
68 dt_aggregate_countcmp(int64_t *lhs, int64_t *rhs)
69 {
70 	int64_t lvar = *lhs;
71 	int64_t rvar = *rhs;
72 
73 	if (lvar < rvar)
74 		return (DT_LESSTHAN);
75 
76 	if (lvar > rvar)
77 		return (DT_GREATERTHAN);
78 
79 	return (0);
80 }
81 
/*ARGSUSED*/
/*
 * Aggregating function for min():  keep the smaller of the existing and
 * incoming values.  (size is unused; min() data is a single int64_t.)
 */
static void
dt_aggregate_min(int64_t *existing, int64_t *new, size_t size)
{
	if (*existing > *new)
		*existing = *new;
}
89 
/*ARGSUSED*/
/*
 * Aggregating function for max():  keep the larger of the existing and
 * incoming values.  (size is unused; max() data is a single int64_t.)
 */
static void
dt_aggregate_max(int64_t *existing, int64_t *new, size_t size)
{
	if (*existing < *new)
		*existing = *new;
}
97 
98 static int
99 dt_aggregate_averagecmp(int64_t *lhs, int64_t *rhs)
100 {
101 	int64_t lavg = lhs[0] ? (lhs[1] / lhs[0]) : 0;
102 	int64_t ravg = rhs[0] ? (rhs[1] / rhs[0]) : 0;
103 
104 	if (lavg < ravg)
105 		return (DT_LESSTHAN);
106 
107 	if (lavg > ravg)
108 		return (DT_GREATERTHAN);
109 
110 	return (0);
111 }
112 
113 static int
114 dt_aggregate_stddevcmp(int64_t *lhs, int64_t *rhs)
115 {
116 	uint64_t lsd = dt_stddev((uint64_t *)lhs, 1);
117 	uint64_t rsd = dt_stddev((uint64_t *)rhs, 1);
118 
119 	if (lsd < rsd)
120 		return (DT_LESSTHAN);
121 
122 	if (lsd > rsd)
123 		return (DT_GREATERTHAN);
124 
125 	return (0);
126 }
127 
128 /*ARGSUSED*/
129 static void
130 dt_aggregate_lquantize(int64_t *existing, int64_t *new, size_t size)
131 {
132 	int64_t arg = *existing++;
133 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
134 	int i;
135 
136 	for (i = 0; i <= levels + 1; i++)
137 		existing[i] = existing[i] + new[i + 1];
138 }
139 
140 static long double
141 dt_aggregate_lquantizedsum(int64_t *lquanta)
142 {
143 	int64_t arg = *lquanta++;
144 	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
145 	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
146 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg), i;
147 	long double total = (long double)lquanta[0] * (long double)(base - 1);
148 
149 	for (i = 0; i < levels; base += step, i++)
150 		total += (long double)lquanta[i + 1] * (long double)base;
151 
152 	return (total + (long double)lquanta[levels + 1] *
153 	    (long double)(base + 1));
154 }
155 
156 static int64_t
157 dt_aggregate_lquantizedzero(int64_t *lquanta)
158 {
159 	int64_t arg = *lquanta++;
160 	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
161 	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
162 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg), i;
163 
164 	if (base - 1 == 0)
165 		return (lquanta[0]);
166 
167 	for (i = 0; i < levels; base += step, i++) {
168 		if (base != 0)
169 			continue;
170 
171 		return (lquanta[i + 1]);
172 	}
173 
174 	if (base + 1 == 0)
175 		return (lquanta[levels + 1]);
176 
177 	return (0);
178 }
179 
180 static int
181 dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs)
182 {
183 	long double lsum = dt_aggregate_lquantizedsum(lhs);
184 	long double rsum = dt_aggregate_lquantizedsum(rhs);
185 	int64_t lzero, rzero;
186 
187 	if (lsum < rsum)
188 		return (DT_LESSTHAN);
189 
190 	if (lsum > rsum)
191 		return (DT_GREATERTHAN);
192 
193 	/*
194 	 * If they're both equal, then we will compare based on the weights at
195 	 * zero.  If the weights at zero are equal (or if zero is not within
196 	 * the range of the linear quantization), then this will be judged a
197 	 * tie and will be resolved based on the key comparison.
198 	 */
199 	lzero = dt_aggregate_lquantizedzero(lhs);
200 	rzero = dt_aggregate_lquantizedzero(rhs);
201 
202 	if (lzero < rzero)
203 		return (DT_LESSTHAN);
204 
205 	if (lzero > rzero)
206 		return (DT_GREATERTHAN);
207 
208 	return (0);
209 }
210 
211 static int
212 dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs)
213 {
214 	int nbuckets = DTRACE_QUANTIZE_NBUCKETS;
215 	long double ltotal = 0, rtotal = 0;
216 	int64_t lzero, rzero;
217 	uint_t i;
218 
219 	for (i = 0; i < nbuckets; i++) {
220 		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
221 
222 		if (bucketval == 0) {
223 			lzero = lhs[i];
224 			rzero = rhs[i];
225 		}
226 
227 		ltotal += (long double)bucketval * (long double)lhs[i];
228 		rtotal += (long double)bucketval * (long double)rhs[i];
229 	}
230 
231 	if (ltotal < rtotal)
232 		return (DT_LESSTHAN);
233 
234 	if (ltotal > rtotal)
235 		return (DT_GREATERTHAN);
236 
237 	/*
238 	 * If they're both equal, then we will compare based on the weights at
239 	 * zero.  If the weights at zero are equal, then this will be judged a
240 	 * tie and will be resolved based on the key comparison.
241 	 */
242 	if (lzero < rzero)
243 		return (DT_LESSTHAN);
244 
245 	if (lzero > rzero)
246 		return (DT_GREATERTHAN);
247 
248 	return (0);
249 }
250 
/*
 * Normalize a user-level PC in an aggregation key:  data[0] is the pid and
 * data[1] the PC, which is replaced with the start address of its
 * containing symbol so equivalent addresses aggregate together.
 *
 * NOTE(review): the real implementation requires libproc and is compiled
 * out (#if 0) on this platform; as built, this function only prints a
 * not-implemented message and leaves the data untouched.
 */
static void
dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
{
#if 0	/* XXX TBD needs libproc */
	uint64_t pid = data[0];
	uint64_t *pc = &data[1];
	struct ps_prochandle *P;
	GElf_Sym sym;

	/* Can't grab processes through a vectored (postmortem) open. */
	if (dtp->dt_vector != NULL)
		return;

	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
		return;

	dt_proc_lock(dtp, P);

	/* Replace the PC with its symbol's start address on a successful lookup. */
#if defined(sun)
	if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
#else
	if (proc_addr2sym(P, *pc, NULL, 0, &sym) == 0)
#endif
		*pc = sym.st_value;

	dt_proc_unlock(dtp, P);
	dt_proc_release(dtp, P);
#else
	printf("XXX %s not implemented\n", __func__);
#endif
}
281 
/*
 * Normalize a user-level PC in an aggregation key:  data[0] is the pid and
 * data[1] the PC, which is replaced with the base address of its containing
 * mapping so that addresses within one module aggregate together.
 *
 * NOTE(review): the real implementation requires libproc and is compiled
 * out (#if 0) on this platform; as built, this function only prints a
 * not-implemented message and leaves the data untouched.
 */
static void
dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
{
#if 0	/* XXX TBD needs libproc */
	uint64_t pid = data[0];
	uint64_t *pc = &data[1];
	struct ps_prochandle *P;
	const prmap_t *map;

	/* Can't grab processes through a vectored (postmortem) open. */
	if (dtp->dt_vector != NULL)
		return;

	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
		return;

	dt_proc_lock(dtp, P);

	/* Replace the PC with the mapping's base address on a successful lookup. */
#if defined(sun)
	if ((map = Paddr_to_map(P, *pc)) != NULL)
#else
	if ((map = proc_addr2map(P, *pc)) != NULL)
#endif
		*pc = map->pr_vaddr;

	dt_proc_unlock(dtp, P);
	dt_proc_release(dtp, P);
#else
	printf("XXX %s not implemented\n", __func__);
#endif
}
312 
313 static void
314 dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
315 {
316 	GElf_Sym sym;
317 	uint64_t *pc = data;
318 
319 	if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
320 		*pc = sym.st_value;
321 }
322 
323 static void
324 dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
325 {
326 	uint64_t *pc = data;
327 	dt_module_t *dmp;
328 
329 	if (dtp->dt_vector != NULL) {
330 		/*
331 		 * We don't have a way of just getting the module for a
332 		 * vectored open, and it doesn't seem to be worth defining
333 		 * one.  This means that use of mod() won't get true
334 		 * aggregation in the postmortem case (some modules may
335 		 * appear more than once in aggregation output).  It seems
336 		 * unlikely that anyone will ever notice or care...
337 		 */
338 		return;
339 	}
340 
341 	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
342 	    dmp = dt_list_next(dmp)) {
343 		if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
344 			*pc = dmp->dm_text_va;
345 			return;
346 		}
347 	}
348 }
349 
350 static dtrace_aggvarid_t
351 dt_aggregate_aggvarid(dt_ahashent_t *ent)
352 {
353 	dtrace_aggdesc_t *agg = ent->dtahe_data.dtada_desc;
354 	caddr_t data = ent->dtahe_data.dtada_data;
355 	dtrace_recdesc_t *rec = agg->dtagd_rec;
356 
357 	/*
358 	 * First, we'll check the variable ID in the aggdesc.  If it's valid,
359 	 * we'll return it.  If not, we'll use the compiler-generated ID
360 	 * present as the first record.
361 	 */
362 	if (agg->dtagd_varid != DTRACE_AGGVARIDNONE)
363 		return (agg->dtagd_varid);
364 
365 	agg->dtagd_varid = *((dtrace_aggvarid_t *)(uintptr_t)(data +
366 	    rec->dtrd_offset));
367 
368 	return (agg->dtagd_varid);
369 }
370 
371 
/*
 * Snapshot the kernel aggregation buffer for a single CPU and fold each
 * record into the consumer-side aggregation hash, either merging with an
 * existing entry whose key matches or creating a new entry.  Returns 0 on
 * success (including the no-data and unconfigured-CPU cases) or -1 with
 * the dtrace errno set on failure.
 */
static int
dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
{
	dtrace_epid_t id;
	uint64_t hashval;
	size_t offs, roffs, size, ndx;
	int i, j, rval;
	caddr_t addr, data;
	dtrace_recdesc_t *rec;
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dtrace_aggdesc_t *agg;
	dt_ahash_t *hash = &agp->dtat_hash;
	dt_ahashent_t *h;
	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
	dtrace_aggdata_t *aggdata;
	int flags = agp->dtat_flags;

	buf->dtbd_cpu = cpu;

#if defined(sun)
	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
#else
	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, &buf) == -1) {
#endif
		if (errno == ENOENT) {
			/*
			 * If that failed with ENOENT, it may be because the
			 * CPU was unconfigured.  This is okay; we'll just
			 * do nothing but return success.
			 */
			return (0);
		}

		return (dt_set_errno(dtp, errno));
	}

	if (buf->dtbd_drops != 0) {
		/*
		 * Report any aggregation drops before processing the data.
		 */
		if (dt_handle_cpudrop(dtp, cpu,
		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1)
			return (-1);
	}

	if (buf->dtbd_size == 0)
		return (0);

	if (hash->dtah_hash == NULL) {
		size_t size;

		/*
		 * Lazily allocate and zero the hash table on first use.
		 */
		hash->dtah_size = DTRACE_AHASHSIZE;
		size = hash->dtah_size * sizeof (dt_ahashent_t *);

		if ((hash->dtah_hash = malloc(size)) == NULL)
			return (dt_set_errno(dtp, EDT_NOMEM));

		bzero(hash->dtah_hash, size);
	}

	for (offs = 0; offs < buf->dtbd_size; ) {
		/*
		 * We're guaranteed to have an ID.
		 */
		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
		    (uintptr_t)offs));

		if (id == DTRACE_AGGIDNONE) {
			/*
			 * This is filler to assure proper alignment of the
			 * next record; we simply ignore it.
			 */
			offs += sizeof (id);
			continue;
		}

		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
			return (rval);

		addr = buf->dtbd_data + offs;
		size = agg->dtagd_size;
		hashval = 0;

		/*
		 * Walk the key records (all but the final, aggregating
		 * record), normalizing symbol/module addresses so that
		 * equivalent keys hash and compare identically, and
		 * accumulate the hash value over the raw key bytes.
		 */
		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
			rec = &agg->dtagd_rec[j];
			roffs = rec->dtrd_offset;

			switch (rec->dtrd_action) {
			case DTRACEACT_USYM:
				dt_aggregate_usym(dtp,
				    /* LINTED - alignment */
				    (uint64_t *)&addr[roffs]);
				break;

			case DTRACEACT_UMOD:
				dt_aggregate_umod(dtp,
				    /* LINTED - alignment */
				    (uint64_t *)&addr[roffs]);
				break;

			case DTRACEACT_SYM:
				/* LINTED - alignment */
				dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
				break;

			case DTRACEACT_MOD:
				/* LINTED - alignment */
				dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
				break;

			default:
				break;
			}

			for (i = 0; i < rec->dtrd_size; i++)
				hashval += addr[roffs + i];
		}

		ndx = hashval % hash->dtah_size;

		/*
		 * Search the hash chain for an entry whose key bytes match
		 * exactly.
		 */
		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
			if (h->dtahe_hashval != hashval)
				continue;

			if (h->dtahe_size != size)
				continue;

			aggdata = &h->dtahe_data;
			data = aggdata->dtada_data;

			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
				rec = &agg->dtagd_rec[j];
				roffs = rec->dtrd_offset;

				for (i = 0; i < rec->dtrd_size; i++)
					if (addr[roffs + i] != data[roffs + i])
						goto hashnext;
			}

			/*
			 * We found it.  Now we need to apply the aggregating
			 * action on the data here.
			 */
			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
			roffs = rec->dtrd_offset;
			/* LINTED - alignment */
			h->dtahe_aggregate((int64_t *)&data[roffs],
			    /* LINTED - alignment */
			    (int64_t *)&addr[roffs], rec->dtrd_size);

			/*
			 * If we're keeping per CPU data, apply the aggregating
			 * action there as well.
			 */
			if (aggdata->dtada_percpu != NULL) {
				data = aggdata->dtada_percpu[cpu];

				/* LINTED - alignment */
				h->dtahe_aggregate((int64_t *)data,
				    /* LINTED - alignment */
				    (int64_t *)&addr[roffs], rec->dtrd_size);
			}

			goto bufnext;
hashnext:
			continue;
		}

		/*
		 * If we're here, we couldn't find an entry for this record.
		 */
		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
			return (dt_set_errno(dtp, EDT_NOMEM));
		bzero(h, sizeof (dt_ahashent_t));
		aggdata = &h->dtahe_data;

		if ((aggdata->dtada_data = malloc(size)) == NULL) {
			free(h);
			return (dt_set_errno(dtp, EDT_NOMEM));
		}

		bcopy(addr, aggdata->dtada_data, size);
		aggdata->dtada_size = size;
		aggdata->dtada_desc = agg;
		aggdata->dtada_handle = dtp;
		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
		aggdata->dtada_normal = 1;

		h->dtahe_hashval = hashval;
		h->dtahe_size = size;
		/* Force the variable ID to be resolved and cached now. */
		(void) dt_aggregate_aggvarid(h);

		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];

		if (flags & DTRACE_A_PERCPU) {
			int max_cpus = agp->dtat_maxcpu;
			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));

			if (percpu == NULL) {
				free(aggdata->dtada_data);
				free(h);
				return (dt_set_errno(dtp, EDT_NOMEM));
			}

			/*
			 * Allocate a per-CPU copy of the aggregated data,
			 * seeding the slot for this CPU with the snapshot
			 * and zeroing all others.
			 */
			for (j = 0; j < max_cpus; j++) {
				percpu[j] = malloc(rec->dtrd_size);

				if (percpu[j] == NULL) {
					while (--j >= 0)
						free(percpu[j]);

					free(aggdata->dtada_data);
					free(h);
					return (dt_set_errno(dtp, EDT_NOMEM));
				}

				if (j == cpu) {
					bcopy(&addr[rec->dtrd_offset],
					    percpu[j], rec->dtrd_size);
				} else {
					bzero(percpu[j], rec->dtrd_size);
				}
			}

			aggdata->dtada_percpu = percpu;
		}

		/*
		 * Select the aggregating function applied when subsequent
		 * records with this key are merged in.
		 */
		switch (rec->dtrd_action) {
		case DTRACEAGG_MIN:
			h->dtahe_aggregate = dt_aggregate_min;
			break;

		case DTRACEAGG_MAX:
			h->dtahe_aggregate = dt_aggregate_max;
			break;

		case DTRACEAGG_LQUANTIZE:
			h->dtahe_aggregate = dt_aggregate_lquantize;
			break;

		case DTRACEAGG_COUNT:
		case DTRACEAGG_SUM:
		case DTRACEAGG_AVG:
		case DTRACEAGG_STDDEV:
		case DTRACEAGG_QUANTIZE:
			h->dtahe_aggregate = dt_aggregate_count;
			break;

		default:
			return (dt_set_errno(dtp, EDT_BADAGG));
		}

		/*
		 * Link the new entry onto the front of its hash chain and
		 * onto the list of all entries.
		 */
		if (hash->dtah_hash[ndx] != NULL)
			hash->dtah_hash[ndx]->dtahe_prev = h;

		h->dtahe_next = hash->dtah_hash[ndx];
		hash->dtah_hash[ndx] = h;

		if (hash->dtah_all != NULL)
			hash->dtah_all->dtahe_prevall = h;

		h->dtahe_nextall = hash->dtah_all;
		hash->dtah_all = h;
bufnext:
		offs += agg->dtagd_size;
	}

	return (0);
}
639 
640 int
641 dtrace_aggregate_snap(dtrace_hdl_t *dtp)
642 {
643 	int i, rval;
644 	dt_aggregate_t *agp = &dtp->dt_aggregate;
645 	hrtime_t now = gethrtime();
646 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
647 
648 	if (dtp->dt_lastagg != 0) {
649 		if (now - dtp->dt_lastagg < interval)
650 			return (0);
651 
652 		dtp->dt_lastagg += interval;
653 	} else {
654 		dtp->dt_lastagg = now;
655 	}
656 
657 	if (!dtp->dt_active)
658 		return (dt_set_errno(dtp, EINVAL));
659 
660 	if (agp->dtat_buf.dtbd_size == 0)
661 		return (0);
662 
663 	for (i = 0; i < agp->dtat_ncpus; i++) {
664 		if ((rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i])))
665 			return (rval);
666 	}
667 
668 	return (0);
669 }
670 
671 static int
672 dt_aggregate_hashcmp(const void *lhs, const void *rhs)
673 {
674 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
675 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
676 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
677 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
678 
679 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
680 		return (DT_LESSTHAN);
681 
682 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
683 		return (DT_GREATERTHAN);
684 
685 	return (0);
686 }
687 
688 static int
689 dt_aggregate_varcmp(const void *lhs, const void *rhs)
690 {
691 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
692 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
693 	dtrace_aggvarid_t lid, rid;
694 
695 	lid = dt_aggregate_aggvarid(lh);
696 	rid = dt_aggregate_aggvarid(rh);
697 
698 	if (lid < rid)
699 		return (DT_LESSTHAN);
700 
701 	if (lid > rid)
702 		return (DT_GREATERTHAN);
703 
704 	return (0);
705 }
706 
/*
 * Compare the keys of two aggregation elements.  The key consists of every
 * record except the final (aggregating) one; comparison begins at the
 * record selected by the dt_keypos global (the "aggsortkeypos" option) and
 * wraps around through the remaining key records.  Returns
 * DT_LESSTHAN/DT_GREATERTHAN/0 per the dt_revsort convention.
 */
static int
dt_aggregate_keycmp(const void *lhs, const void *rhs)
{
	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
	dtrace_recdesc_t *lrec, *rrec;
	char *ldata, *rdata;
	int rval, i, j, keypos, nrecs;

	/*
	 * Entries with different record counts can't be compared
	 * record-by-record; order them by record count instead.
	 */
	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
		return (rval);

	nrecs = lagg->dtagd_nrecs - 1;
	assert(nrecs == ragg->dtagd_nrecs - 1);

	/*
	 * If the requested key position is out of range, start from the
	 * first key record.
	 */
	keypos = dt_keypos + 1 >= nrecs ? 0 : dt_keypos;

	for (i = 1; i < nrecs; i++) {
		uint64_t lval, rval;	/* NOTE: shadows the outer (int) rval */
		int ndx = i + keypos;

		/* Wrap around past the last key record. */
		if (ndx >= nrecs)
			ndx = ndx - nrecs + 1;

		lrec = &lagg->dtagd_rec[ndx];
		rrec = &ragg->dtagd_rec[ndx];

		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;

		if (lrec->dtrd_size < rrec->dtrd_size)
			return (DT_LESSTHAN);

		if (lrec->dtrd_size > rrec->dtrd_size)
			return (DT_GREATERTHAN);

		switch (lrec->dtrd_size) {
		case sizeof (uint64_t):
			/* LINTED - alignment */
			lval = *((uint64_t *)ldata);
			/* LINTED - alignment */
			rval = *((uint64_t *)rdata);
			break;

		case sizeof (uint32_t):
			/* LINTED - alignment */
			lval = *((uint32_t *)ldata);
			/* LINTED - alignment */
			rval = *((uint32_t *)rdata);
			break;

		case sizeof (uint16_t):
			/* LINTED - alignment */
			lval = *((uint16_t *)ldata);
			/* LINTED - alignment */
			rval = *((uint16_t *)rdata);
			break;

		case sizeof (uint8_t):
			lval = *((uint8_t *)ldata);
			rval = *((uint8_t *)rdata);
			break;

		default:
			/*
			 * Not a simple integer record; compare element-wise.
			 */
			switch (lrec->dtrd_action) {
			case DTRACEACT_UMOD:
			case DTRACEACT_UADDR:
			case DTRACEACT_USYM:
				/*
				 * User-level addresses are a pair of 64-bit
				 * values (pid, address); compare both.
				 */
				for (j = 0; j < 2; j++) {
					/* LINTED - alignment */
					lval = ((uint64_t *)ldata)[j];
					/* LINTED - alignment */
					rval = ((uint64_t *)rdata)[j];

					if (lval < rval)
						return (DT_LESSTHAN);

					if (lval > rval)
						return (DT_GREATERTHAN);
				}

				break;

			default:
				/*
				 * Any other wide record is compared
				 * byte-by-byte.
				 */
				for (j = 0; j < lrec->dtrd_size; j++) {
					lval = ((uint8_t *)ldata)[j];
					rval = ((uint8_t *)rdata)[j];

					if (lval < rval)
						return (DT_LESSTHAN);

					if (lval > rval)
						return (DT_GREATERTHAN);
				}
			}

			continue;
		}

		if (lval < rval)
			return (DT_LESSTHAN);

		if (lval > rval)
			return (DT_GREATERTHAN);
	}

	return (0);
}
817 
/*
 * Compare two aggregation elements by their aggregated value, dispatching
 * to the comparison routine appropriate for the aggregating action.  If
 * the elements' record layouts differ, they are ordered by layout instead
 * so that unlike elements never interleave.
 */
static int
dt_aggregate_valcmp(const void *lhs, const void *rhs)
{
	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
	caddr_t ldata = lh->dtahe_data.dtada_data;
	caddr_t rdata = rh->dtahe_data.dtada_data;
	dtrace_recdesc_t *lrec, *rrec;
	int64_t *laddr, *raddr;
	int rval, i;

	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
		return (rval);

	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
		return (DT_GREATERTHAN);

	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
		return (DT_LESSTHAN);

	/*
	 * Verify that the record layouts agree.  On normal exit from this
	 * loop, lrec and rrec are left pointing at the final (aggregating)
	 * record, which is what's compared below.
	 */
	for (i = 0; i < lagg->dtagd_nrecs; i++) {
		lrec = &lagg->dtagd_rec[i];
		rrec = &ragg->dtagd_rec[i];

		if (lrec->dtrd_offset < rrec->dtrd_offset)
			return (DT_LESSTHAN);

		if (lrec->dtrd_offset > rrec->dtrd_offset)
			return (DT_GREATERTHAN);

		if (lrec->dtrd_action < rrec->dtrd_action)
			return (DT_LESSTHAN);

		if (lrec->dtrd_action > rrec->dtrd_action)
			return (DT_GREATERTHAN);
	}

	laddr = (int64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
	raddr = (int64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);

	switch (lrec->dtrd_action) {
	case DTRACEAGG_AVG:
		rval = dt_aggregate_averagecmp(laddr, raddr);
		break;

	case DTRACEAGG_STDDEV:
		rval = dt_aggregate_stddevcmp(laddr, raddr);
		break;

	case DTRACEAGG_QUANTIZE:
		rval = dt_aggregate_quantizedcmp(laddr, raddr);
		break;

	case DTRACEAGG_LQUANTIZE:
		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
		break;

	case DTRACEAGG_COUNT:
	case DTRACEAGG_SUM:
	case DTRACEAGG_MIN:
	case DTRACEAGG_MAX:
		rval = dt_aggregate_countcmp(laddr, raddr);
		break;

	default:
		assert(0);
	}

	return (rval);
}
890 
/*
 * Compare by value, breaking ties by comparing the keys.
 */
static int
dt_aggregate_valkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valcmp(lhs, rhs);

	/*
	 * If the values for the two aggregation elements are equal, we
	 * already know the key layout is the same for both; compare the
	 * keys themselves as a tie-breaker.
	 */
	if (rval == 0)
		rval = dt_aggregate_keycmp(lhs, rhs);

	return (rval);
}
906 
/*
 * Compare by key, breaking ties by the aggregation variable ID.
 */
static int
dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_keycmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval);
}
917 
/*
 * Compare by aggregation variable ID, breaking ties by key.
 */
static int
dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_keycmp(lhs, rhs);

	return (rval);
}
928 
/*
 * Compare by value (with key tie-break), breaking remaining ties by the
 * aggregation variable ID.
 */
static int
dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valkeycmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval);
}
939 
/*
 * Compare by aggregation variable ID, breaking ties by value (with key
 * tie-break).  This is the default sort when keys are not requested.
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_valkeycmp(lhs, rhs);

	return (rval);
}
950 
/*
 * Reversed key/variable comparison:  delegate with the operands swapped.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_keyvarcmp(rhs, lhs));
}
956 
/*
 * Reversed variable/key comparison:  delegate with the operands swapped.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_varkeycmp(rhs, lhs));
}
962 
/*
 * Reversed value/variable comparison:  delegate with the operands swapped.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_valvarcmp(rhs, lhs));
}
968 
/*
 * Reversed variable/value comparison:  delegate with the operands swapped.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
	return (dt_aggregate_varvalcmp(rhs, lhs));
}
974 
/*
 * Compare two "bundles":  NULL-terminated arrays of aggregation elements,
 * laid out as the value elements followed by a representative key element.
 */
static int
dt_aggregate_bundlecmp(const void *lhs, const void *rhs)
{
	dt_ahashent_t **lh = *((dt_ahashent_t ***)lhs);
	dt_ahashent_t **rh = *((dt_ahashent_t ***)rhs);
	int i, rval;

	if (dt_keysort) {
		/*
		 * If we're sorting on keys, we need to scan until we find the
		 * last entry -- that's the representative key.  (The order of
		 * the bundle is values followed by key to accommodate the
		 * default behavior of sorting by value.)  If the keys are
		 * equal, we'll fall into the value comparison loop, below.
		 */
		for (i = 0; lh[i + 1] != NULL; i++)
			continue;

		assert(i != 0);
		assert(rh[i + 1] == NULL);

		if ((rval = dt_aggregate_keycmp(&lh[i], &rh[i])) != 0)
			return (rval);
	}

	/*
	 * Compare the value elements in order, stopping at the first that
	 * differs.
	 */
	for (i = 0; ; i++) {
		if (lh[i + 1] == NULL) {
			/*
			 * All of the values are equal; if we're sorting on
			 * keys, then we're only here because the keys were
			 * found to be equal and these records are therefore
			 * equal.  If we're not sorting on keys, we'll use the
			 * key comparison from the representative key as the
			 * tie-breaker.
			 */
			if (dt_keysort)
				return (0);

			assert(i != 0);
			assert(rh[i + 1] == NULL);
			return (dt_aggregate_keycmp(&lh[i], &rh[i]));
		} else {
			if ((rval = dt_aggregate_valcmp(&lh[i], &rh[i])) != 0)
				return (rval);
		}
	}
}
1022 
1023 int
1024 dt_aggregate_go(dtrace_hdl_t *dtp)
1025 {
1026 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1027 	dtrace_optval_t size, cpu;
1028 	dtrace_bufdesc_t *buf = &agp->dtat_buf;
1029 	int rval, i;
1030 
1031 	assert(agp->dtat_maxcpu == 0);
1032 	assert(agp->dtat_ncpu == 0);
1033 	assert(agp->dtat_cpus == NULL);
1034 
1035 	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
1036 	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
1037 	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
1038 
1039 	if (agp->dtat_cpus == NULL)
1040 		return (dt_set_errno(dtp, EDT_NOMEM));
1041 
1042 	/*
1043 	 * Use the aggregation buffer size as reloaded from the kernel.
1044 	 */
1045 	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
1046 
1047 	rval = dtrace_getopt(dtp, "aggsize", &size);
1048 	assert(rval == 0);
1049 
1050 	if (size == 0 || size == DTRACEOPT_UNSET)
1051 		return (0);
1052 
1053 	buf = &agp->dtat_buf;
1054 	buf->dtbd_size = size;
1055 
1056 	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
1057 		return (dt_set_errno(dtp, EDT_NOMEM));
1058 
1059 	/*
1060 	 * Now query for the CPUs enabled.
1061 	 */
1062 	rval = dtrace_getopt(dtp, "cpu", &cpu);
1063 	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
1064 
1065 	if (cpu != DTRACE_CPUALL) {
1066 		assert(cpu < agp->dtat_ncpu);
1067 		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
1068 
1069 		return (0);
1070 	}
1071 
1072 	agp->dtat_ncpus = 0;
1073 	for (i = 0; i < agp->dtat_maxcpu; i++) {
1074 		if (dt_status(dtp, i) == -1)
1075 			continue;
1076 
1077 		agp->dtat_cpus[agp->dtat_ncpus++] = i;
1078 	}
1079 
1080 	return (0);
1081 }
1082 
/*
 * Apply the disposition returned by an aggregation-walk callback to the
 * hash entry it was invoked on:  advance (NEXT), zero the aggregated data
 * (CLEAR), toggle normalization (NORMALIZE/DENORMALIZE), unlink and free
 * the entry (REMOVE), or fail (ERROR/ABORT/anything else).  Returns 0 on
 * success or -1 with the dtrace errno set.
 */
static int
dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dtrace_aggdata_t *data;
	dtrace_aggdesc_t *aggdesc;
	dtrace_recdesc_t *rec;
	int i;

	switch (rval) {
	case DTRACE_AGGWALK_NEXT:
		break;

	case DTRACE_AGGWALK_CLEAR: {
		uint32_t size, offs = 0;

		aggdesc = h->dtahe_data.dtada_desc;
		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
		size = rec->dtrd_size;
		data = &h->dtahe_data;

		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
			/*
			 * For lquantize(), the first 64 bits of the data
			 * encode the quantization parameters; skip them so
			 * only the bucket counts are cleared.
			 */
			offs = sizeof (uint64_t);
			size -= sizeof (uint64_t);
		}

		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);

		if (data->dtada_percpu == NULL)
			break;

		/* Clear any per-CPU copies as well. */
		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
			bzero(data->dtada_percpu[i] + offs, size);
		break;
	}

	case DTRACE_AGGWALK_ERROR:
		/*
		 * We assume that errno is already set in this case.
		 */
		return (dt_set_errno(dtp, errno));

	case DTRACE_AGGWALK_ABORT:
		return (dt_set_errno(dtp, EDT_DIRABORT));

	case DTRACE_AGGWALK_DENORMALIZE:
		/* Reset to the default normalization of 1. */
		h->dtahe_data.dtada_normal = 1;
		return (0);

	case DTRACE_AGGWALK_NORMALIZE:
		if (h->dtahe_data.dtada_normal == 0) {
			/*
			 * A normalization of zero is invalid; restore the
			 * default and report a bad return value.
			 */
			h->dtahe_data.dtada_normal = 1;
			return (dt_set_errno(dtp, EDT_BADRVAL));
		}

		return (0);

	case DTRACE_AGGWALK_REMOVE: {
		dtrace_aggdata_t *aggdata = &h->dtahe_data;
		int max_cpus = agp->dtat_maxcpu;

		/*
		 * First, remove this hash entry from its hash chain.
		 */
		if (h->dtahe_prev != NULL) {
			h->dtahe_prev->dtahe_next = h->dtahe_next;
		} else {
			dt_ahash_t *hash = &agp->dtat_hash;
			size_t ndx = h->dtahe_hashval % hash->dtah_size;

			assert(hash->dtah_hash[ndx] == h);
			hash->dtah_hash[ndx] = h->dtahe_next;
		}

		if (h->dtahe_next != NULL)
			h->dtahe_next->dtahe_prev = h->dtahe_prev;

		/*
		 * Now remove it from the list of all hash entries.
		 */
		if (h->dtahe_prevall != NULL) {
			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
		} else {
			dt_ahash_t *hash = &agp->dtat_hash;

			assert(hash->dtah_all == h);
			hash->dtah_all = h->dtahe_nextall;
		}

		if (h->dtahe_nextall != NULL)
			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;

		/*
		 * We're unlinked.  We can safely destroy the data.
		 */
		if (aggdata->dtada_percpu != NULL) {
			for (i = 0; i < max_cpus; i++)
				free(aggdata->dtada_percpu[i]);
			free(aggdata->dtada_percpu);
		}

		free(aggdata->dtada_data);
		free(h);

		return (0);
	}

	default:
		return (dt_set_errno(dtp, EDT_BADRVAL));
	}

	return (0);
}
1196 
1197 void
1198 dt_aggregate_qsort(dtrace_hdl_t *dtp, void *base, size_t nel, size_t width,
1199     int (*compar)(const void *, const void *))
1200 {
1201 	int rev = dt_revsort, key = dt_keysort, keypos = dt_keypos;
1202 	dtrace_optval_t keyposopt = dtp->dt_options[DTRACEOPT_AGGSORTKEYPOS];
1203 
1204 	dt_revsort = (dtp->dt_options[DTRACEOPT_AGGSORTREV] != DTRACEOPT_UNSET);
1205 	dt_keysort = (dtp->dt_options[DTRACEOPT_AGGSORTKEY] != DTRACEOPT_UNSET);
1206 
1207 	if (keyposopt != DTRACEOPT_UNSET && keyposopt <= INT_MAX) {
1208 		dt_keypos = (int)keyposopt;
1209 	} else {
1210 		dt_keypos = 0;
1211 	}
1212 
1213 	if (compar == NULL) {
1214 		if (!dt_keysort) {
1215 			compar = dt_aggregate_varvalcmp;
1216 		} else {
1217 			compar = dt_aggregate_varkeycmp;
1218 		}
1219 	}
1220 
1221 	qsort(base, nel, width, compar);
1222 
1223 	dt_revsort = rev;
1224 	dt_keysort = key;
1225 	dt_keypos = keypos;
1226 }
1227 
1228 int
1229 dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
1230 {
1231 	dt_ahashent_t *h, *next;
1232 	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
1233 
1234 	for (h = hash->dtah_all; h != NULL; h = next) {
1235 		/*
1236 		 * dt_aggwalk_rval() can potentially remove the current hash
1237 		 * entry; we need to load the next hash entry before calling
1238 		 * into it.
1239 		 */
1240 		next = h->dtahe_nextall;
1241 
1242 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1243 			return (-1);
1244 	}
1245 
1246 	return (0);
1247 }
1248 
1249 static int
1250 dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1251     dtrace_aggregate_f *func, void *arg,
1252     int (*sfunc)(const void *, const void *))
1253 {
1254 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1255 	dt_ahashent_t *h, **sorted;
1256 	dt_ahash_t *hash = &agp->dtat_hash;
1257 	size_t i, nentries = 0;
1258 
1259 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
1260 		nentries++;
1261 
1262 	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));
1263 
1264 	if (sorted == NULL)
1265 		return (-1);
1266 
1267 	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
1268 		sorted[i++] = h;
1269 
1270 	(void) pthread_mutex_lock(&dt_qsort_lock);
1271 
1272 	if (sfunc == NULL) {
1273 		dt_aggregate_qsort(dtp, sorted, nentries,
1274 		    sizeof (dt_ahashent_t *), NULL);
1275 	} else {
1276 		/*
1277 		 * If we've been explicitly passed a sorting function,
1278 		 * we'll use that -- ignoring the values of the "aggsortrev",
1279 		 * "aggsortkey" and "aggsortkeypos" options.
1280 		 */
1281 		qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
1282 	}
1283 
1284 	(void) pthread_mutex_unlock(&dt_qsort_lock);
1285 
1286 	for (i = 0; i < nentries; i++) {
1287 		h = sorted[i];
1288 
1289 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1) {
1290 			dt_free(dtp, sorted);
1291 			return (-1);
1292 		}
1293 	}
1294 
1295 	dt_free(dtp, sorted);
1296 	return (0);
1297 }
1298 
1299 int
1300 dtrace_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1301     dtrace_aggregate_f *func, void *arg)
1302 {
1303 	return (dt_aggregate_walk_sorted(dtp, func, arg, NULL));
1304 }
1305 
1306 int
1307 dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
1308     dtrace_aggregate_f *func, void *arg)
1309 {
1310 	return (dt_aggregate_walk_sorted(dtp, func,
1311 	    arg, dt_aggregate_varkeycmp));
1312 }
1313 
1314 int
1315 dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
1316     dtrace_aggregate_f *func, void *arg)
1317 {
1318 	return (dt_aggregate_walk_sorted(dtp, func,
1319 	    arg, dt_aggregate_varvalcmp));
1320 }
1321 
1322 int
1323 dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
1324     dtrace_aggregate_f *func, void *arg)
1325 {
1326 	return (dt_aggregate_walk_sorted(dtp, func,
1327 	    arg, dt_aggregate_keyvarcmp));
1328 }
1329 
1330 int
1331 dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
1332     dtrace_aggregate_f *func, void *arg)
1333 {
1334 	return (dt_aggregate_walk_sorted(dtp, func,
1335 	    arg, dt_aggregate_valvarcmp));
1336 }
1337 
1338 int
1339 dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
1340     dtrace_aggregate_f *func, void *arg)
1341 {
1342 	return (dt_aggregate_walk_sorted(dtp, func,
1343 	    arg, dt_aggregate_varkeyrevcmp));
1344 }
1345 
1346 int
1347 dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
1348     dtrace_aggregate_f *func, void *arg)
1349 {
1350 	return (dt_aggregate_walk_sorted(dtp, func,
1351 	    arg, dt_aggregate_varvalrevcmp));
1352 }
1353 
1354 int
1355 dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
1356     dtrace_aggregate_f *func, void *arg)
1357 {
1358 	return (dt_aggregate_walk_sorted(dtp, func,
1359 	    arg, dt_aggregate_keyvarrevcmp));
1360 }
1361 
1362 int
1363 dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
1364     dtrace_aggregate_f *func, void *arg)
1365 {
1366 	return (dt_aggregate_walk_sorted(dtp, func,
1367 	    arg, dt_aggregate_valvarrevcmp));
1368 }
1369 
/*
 * Walk the aggregate with multiple aggregation variables "joined" by key:
 * for each distinct tuple, func() is invoked with an array of naggvars + 1
 * aggdata pointers -- element 0 being a representative entry for the key,
 * and elements 1 .. naggvars the data for each requested aggregation
 * variable in aggvars[] order (zero-filled data is substituted for
 * variables that have no value for that tuple).  Returns 0 on success or
 * -1 on failure (bad aggregation variable ID, allocation failure, or a -1
 * return from func()).
 */
int
dtrace_aggregate_walk_joined(dtrace_hdl_t *dtp, dtrace_aggvarid_t *aggvars,
    int naggvars, dtrace_aggregate_walk_joined_f *func, void *arg)
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dt_ahashent_t *h, **sorted = NULL, ***bundle, **nbundle;
	const dtrace_aggdata_t **data;
	dt_ahashent_t *zaggdata = NULL;
	dt_ahash_t *hash = &agp->dtat_hash;
	size_t nentries = 0, nbundles = 0, start, zsize = 0, bundlesize;
	dtrace_aggvarid_t max = 0, aggvar;
	int rval = -1, *map, *remap = NULL;
	int i, j;
	dtrace_optval_t sortpos = dtp->dt_options[DTRACEOPT_AGGSORTPOS];

	/*
	 * If the sorting position is greater than the number of aggregation
	 * variable IDs, we silently set it to 0.
	 */
	if (sortpos == DTRACEOPT_UNSET || sortpos >= naggvars)
		sortpos = 0;

	/*
	 * First we need to translate the specified aggregation variable IDs
	 * into a linear map that will allow us to translate an aggregation
	 * variable ID into its position in the specified aggvars.
	 */
	for (i = 0; i < naggvars; i++) {
		if (aggvars[i] == DTRACE_AGGVARIDNONE || aggvars[i] < 0)
			return (dt_set_errno(dtp, EDT_BADAGGVAR));

		if (aggvars[i] > max)
			max = aggvars[i];
	}

	if ((map = dt_zalloc(dtp, (max + 1) * sizeof (int))) == NULL)
		return (-1);

	zaggdata = dt_zalloc(dtp, naggvars * sizeof (dt_ahashent_t));

	if (zaggdata == NULL)
		goto out;

	/*
	 * Build the map in presentation order:  each index is rotated by the
	 * sort position, so that map[id] - 1 yields the bundle slot for
	 * aggregation variable "id".
	 */
	for (i = 0; i < naggvars; i++) {
		int ndx = i + sortpos;

		if (ndx >= naggvars)
			ndx -= naggvars;

		aggvar = aggvars[ndx];
		assert(aggvar <= max);

		if (map[aggvar]) {
			/*
			 * We have an aggregation variable that is present
			 * more than once in the array of aggregation
			 * variables.  While it's unclear why one might want
			 * to do this, it's legal.  To support this construct,
			 * we will allocate a remap that will indicate the
			 * position from which this aggregation variable
			 * should be pulled.  (That is, where the remap will
			 * map from one position to another.)
			 */
			if (remap == NULL) {
				remap = dt_zalloc(dtp, naggvars * sizeof (int));

				if (remap == NULL)
					goto out;
			}

			/*
			 * Given that the variable is already present, assert
			 * that following through the mapping and adjusting
			 * for the sort position yields the same aggregation
			 * variable ID.
			 */
			assert(aggvars[(map[aggvar] - 1 + sortpos) %
			    naggvars] == aggvars[ndx]);

			remap[i] = map[aggvar];
			continue;
		}

		map[aggvar] = i + 1;
	}

	/*
	 * We need to take two passes over the data to size our allocation, so
	 * we'll use the first pass to also fill in the zero-filled data to be
	 * used to properly format a zero-valued aggregation.
	 */
	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
		dtrace_aggvarid_t id;
		int ndx;

		if ((id = dt_aggregate_aggvarid(h)) > max || !(ndx = map[id]))
			continue;

		if (zaggdata[ndx - 1].dtahe_size == 0) {
			zaggdata[ndx - 1].dtahe_size = h->dtahe_size;
			zaggdata[ndx - 1].dtahe_data = h->dtahe_data;
		}

		nentries++;
	}

	if (nentries == 0) {
		/*
		 * We couldn't find any entries; there is nothing else to do.
		 */
		rval = 0;
		goto out;
	}

	/*
	 * Before we sort the data, we're going to look for any holes in our
	 * zero-filled data.  This will occur if an aggregation variable that
	 * we are being asked to print has not yet been assigned the result of
	 * any aggregating action for _any_ tuple.  The issue becomes that we
	 * would like a zero value to be printed for all columns for this
	 * aggregation, but without any record description, we don't know the
	 * aggregating action that corresponds to the aggregation variable.  To
	 * try to find a match, we're simply going to lookup aggregation IDs
	 * (which are guaranteed to be contiguous and to start from 1), looking
	 * for the specified aggregation variable ID.  If we find a match,
	 * we'll use that.  If we iterate over all aggregation IDs and don't
	 * find a match, then we must be an anonymous enabling.  (Anonymous
	 * enablings can't currently derive either aggregation variable IDs or
	 * aggregation variable names given only an aggregation ID.)  In this
	 * obscure case (anonymous enabling, multiple aggregation printa() with
	 * some aggregations not represented for any tuple), our defined
	 * behavior is that the zero will be printed in the format of the first
	 * aggregation variable that contains any non-zero value.
	 */
	for (i = 0; i < naggvars; i++) {
		if (zaggdata[i].dtahe_size == 0) {
			dtrace_aggvarid_t aggvar;

			aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
			assert(zaggdata[i].dtahe_data.dtada_data == NULL);

			for (j = DTRACE_AGGIDNONE + 1; ; j++) {
				dtrace_aggdesc_t *agg;
				dtrace_aggdata_t *aggdata;

				if (dt_aggid_lookup(dtp, j, &agg) != 0)
					break;

				if (agg->dtagd_varid != aggvar)
					continue;

				/*
				 * We have our description -- now we need to
				 * cons up the zaggdata entry for it.
				 */
				aggdata = &zaggdata[i].dtahe_data;
				aggdata->dtada_size = agg->dtagd_size;
				aggdata->dtada_desc = agg;
				aggdata->dtada_handle = dtp;
				(void) dt_epid_lookup(dtp, agg->dtagd_epid,
				    &aggdata->dtada_edesc,
				    &aggdata->dtada_pdesc);
				aggdata->dtada_normal = 1;
				zaggdata[i].dtahe_hashval = 0;
				zaggdata[i].dtahe_size = agg->dtagd_size;
				break;
			}

			if (zaggdata[i].dtahe_size == 0) {
				caddr_t data;

				/*
				 * We couldn't find this aggregation, meaning
				 * that we have never seen it before for any
				 * tuple _and_ this is an anonymous enabling.
				 * That is, we're in the obscure case outlined
				 * above.  In this case, our defined behavior
				 * is to format the data in the format of the
				 * first non-zero aggregation -- of which, of
				 * course, we know there to be at least one
				 * (or nentries would have been zero).
				 */
				for (j = 0; j < naggvars; j++) {
					if (zaggdata[j].dtahe_size != 0)
						break;
				}

				assert(j < naggvars);
				zaggdata[i] = zaggdata[j];

				data = zaggdata[i].dtahe_data.dtada_data;
				assert(data != NULL);
			}
		}
	}

	/*
	 * Now we need to allocate our zero-filled data for use for
	 * aggregations that don't have a value corresponding to a given key.
	 */
	for (i = 0; i < naggvars; i++) {
		dtrace_aggdata_t *aggdata = &zaggdata[i].dtahe_data;
		dtrace_aggdesc_t *aggdesc = aggdata->dtada_desc;
		dtrace_recdesc_t *rec;
		uint64_t larg;
		caddr_t zdata;

		zsize = zaggdata[i].dtahe_size;
		assert(zsize != 0);

		if ((zdata = dt_zalloc(dtp, zsize)) == NULL) {
			/*
			 * If we failed to allocated some zero-filled data, we
			 * need to zero out the remaining dtada_data pointers
			 * to prevent the wrong data from being freed below.
			 */
			for (j = i; j < naggvars; j++)
				zaggdata[j].dtahe_data.dtada_data = NULL;
			goto out;
		}

		aggvar = aggvars[(i - sortpos + naggvars) % naggvars];

		/*
		 * First, the easy bit.  To maintain compatibility with
		 * consumers that pull the compiler-generated ID out of the
		 * data, we put that ID at the top of the zero-filled data.
		 */
		rec = &aggdesc->dtagd_rec[0];
		/* LINTED - alignment */
		*((dtrace_aggvarid_t *)(zdata + rec->dtrd_offset)) = aggvar;

		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];

		/*
		 * Now for the more complicated part.  If (and only if) this
		 * is an lquantize() aggregating action, zero-filled data is
		 * not equivalent to an empty record:  we must also get the
		 * parameters for the lquantize().
		 */
		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
			if (aggdata->dtada_data != NULL) {
				/*
				 * The easier case here is if we actually have
				 * some prototype data -- in which case we
				 * manually dig it out of the aggregation
				 * record.
				 */
				/* LINTED - alignment */
				larg = *((uint64_t *)(aggdata->dtada_data +
				    rec->dtrd_offset));
			} else {
				/*
				 * We don't have any prototype data.  As a
				 * result, we know that we _do_ have the
				 * compiler-generated information.  (If this
				 * were an anonymous enabling, all of our
				 * zero-filled data would have prototype data
				 * -- either directly or indirectly.) So as
				 * gross as it is, we'll grovel around in the
				 * compiler-generated information to find the
				 * lquantize() parameters.
				 */
				dtrace_stmtdesc_t *sdp;
				dt_ident_t *aid;
				dt_idsig_t *isp;

				sdp = (dtrace_stmtdesc_t *)(uintptr_t)
				    aggdesc->dtagd_rec[0].dtrd_uarg;
				aid = sdp->dtsd_aggdata;
				isp = (dt_idsig_t *)aid->di_data;
				assert(isp->dis_auxinfo != 0);
				larg = isp->dis_auxinfo;
			}

			/* LINTED - alignment */
			*((uint64_t *)(zdata + rec->dtrd_offset)) = larg;
		}

		aggdata->dtada_data = zdata;
	}

	/*
	 * Now that we've dealt with setting up our zero-filled data, we can
	 * allocate our sorted array, and take another pass over the data to
	 * fill it.
	 */
	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));

	if (sorted == NULL)
		goto out;

	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall) {
		dtrace_aggvarid_t id;

		if ((id = dt_aggregate_aggvarid(h)) > max || !map[id])
			continue;

		sorted[i++] = h;
	}

	assert(i == nentries);

	/*
	 * We've loaded our array; now we need to sort by value to allow us
	 * to create bundles of like value.  We're going to acquire the
	 * dt_qsort_lock here, and hold it across all of our subsequent
	 * comparison and sorting.
	 */
	(void) pthread_mutex_lock(&dt_qsort_lock);

	qsort(sorted, nentries, sizeof (dt_ahashent_t *),
	    dt_aggregate_keyvarcmp);

	/*
	 * Now we need to go through and create bundles.  Because the number
	 * of bundles is bounded by the size of the sorted array, we're going
	 * to reuse the underlying storage.  And note that "bundle" is an
	 * array of pointers to arrays of pointers to dt_ahashent_t -- making
	 * its type (regrettably) "dt_ahashent_t ***".  (Regrettable because
	 * '*' -- like '_' and 'X' -- should never appear in triplicate in
	 * an ideal world.)
	 */
	bundle = (dt_ahashent_t ***)sorted;

	for (i = 1, start = 0; i <= nentries; i++) {
		if (i < nentries &&
		    dt_aggregate_keycmp(&sorted[i], &sorted[i - 1]) == 0)
			continue;

		/*
		 * We have a bundle boundary.  Everything from start to
		 * (i - 1) belongs in one bundle.
		 */
		assert(i - start <= naggvars);
		bundlesize = (naggvars + 2) * sizeof (dt_ahashent_t *);

		if ((nbundle = dt_zalloc(dtp, bundlesize)) == NULL) {
			(void) pthread_mutex_unlock(&dt_qsort_lock);
			goto out;
		}

		for (j = start; j < i; j++) {
			dtrace_aggvarid_t id = dt_aggregate_aggvarid(sorted[j]);

			assert(id <= max);
			assert(map[id] != 0);
			assert(map[id] - 1 < naggvars);
			assert(nbundle[map[id] - 1] == NULL);
			nbundle[map[id] - 1] = sorted[j];

			if (nbundle[naggvars] == NULL)
				nbundle[naggvars] = sorted[j];
		}

		for (j = 0; j < naggvars; j++) {
			if (nbundle[j] != NULL)
				continue;

			/*
			 * Before we assume that this aggregation variable
			 * isn't present (and fall back to using the
			 * zero-filled data allocated earlier), check the
			 * remap.  If we have a remapping, we'll drop it in
			 * here.  Note that we might be remapping an
			 * aggregation variable that isn't present for this
			 * key; in this case, the aggregation data that we
			 * copy will point to the zeroed data.
			 */
			if (remap != NULL && remap[j]) {
				assert(remap[j] - 1 < j);
				assert(nbundle[remap[j] - 1] != NULL);
				nbundle[j] = nbundle[remap[j] - 1];
			} else {
				nbundle[j] = &zaggdata[j];
			}
		}

		bundle[nbundles++] = nbundle;
		start = i;
	}

	/*
	 * Now we need to re-sort based on the first value.
	 */
	dt_aggregate_qsort(dtp, bundle, nbundles, sizeof (dt_ahashent_t **),
	    dt_aggregate_bundlecmp);

	(void) pthread_mutex_unlock(&dt_qsort_lock);

	/*
	 * We're done!  Now we just need to go back over the sorted bundles,
	 * calling the function.
	 */
	/*
	 * NB: alloca() has no failure indication; this assumes naggvars is
	 * small enough for a safe stack allocation.
	 */
	data = alloca((naggvars + 1) * sizeof (dtrace_aggdata_t *));

	for (i = 0; i < nbundles; i++) {
		for (j = 0; j < naggvars; j++)
			data[j + 1] = NULL;

		for (j = 0; j < naggvars; j++) {
			int ndx = j - sortpos;

			if (ndx < 0)
				ndx += naggvars;

			assert(bundle[i][ndx] != NULL);
			data[j + 1] = &bundle[i][ndx]->dtahe_data;
		}

		for (j = 0; j < naggvars; j++)
			assert(data[j + 1] != NULL);

		/*
		 * The representative key is the last element in the bundle.
		 * Assert that we have one, and then set it to be the first
		 * element of data.
		 */
		assert(bundle[i][j] != NULL);
		data[0] = &bundle[i][j]->dtahe_data;

		if ((rval = func(data, naggvars + 1, arg)) == -1)
			goto out;
	}

	rval = 0;
out:
	/*
	 * Unwind from any partial state:  remap, sorted and zaggdata may
	 * still be NULL here, and nbundles is 0 unless bundles were built.
	 */
	for (i = 0; i < nbundles; i++)
		dt_free(dtp, bundle[i]);

	if (zaggdata != NULL) {
		for (i = 0; i < naggvars; i++)
			dt_free(dtp, zaggdata[i].dtahe_data.dtada_data);
	}

	dt_free(dtp, zaggdata);
	dt_free(dtp, sorted);
	dt_free(dtp, remap);
	dt_free(dtp, map);

	return (rval);
}
1812 
1813 int
1814 dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
1815     dtrace_aggregate_walk_f *func)
1816 {
1817 	dt_print_aggdata_t pd;
1818 
1819 	pd.dtpa_dtp = dtp;
1820 	pd.dtpa_fp = fp;
1821 	pd.dtpa_allunprint = 1;
1822 
1823 	if (func == NULL)
1824 		func = dtrace_aggregate_walk_sorted;
1825 
1826 	if ((*func)(dtp, dt_print_agg, &pd) == -1)
1827 		return (dt_set_errno(dtp, dtp->dt_errno));
1828 
1829 	return (0);
1830 }
1831 
1832 void
1833 dtrace_aggregate_clear(dtrace_hdl_t *dtp)
1834 {
1835 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1836 	dt_ahash_t *hash = &agp->dtat_hash;
1837 	dt_ahashent_t *h;
1838 	dtrace_aggdata_t *data;
1839 	dtrace_aggdesc_t *aggdesc;
1840 	dtrace_recdesc_t *rec;
1841 	int i, max_cpus = agp->dtat_maxcpu;
1842 
1843 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1844 		aggdesc = h->dtahe_data.dtada_desc;
1845 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1846 		data = &h->dtahe_data;
1847 
1848 		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
1849 
1850 		if (data->dtada_percpu == NULL)
1851 			continue;
1852 
1853 		for (i = 0; i < max_cpus; i++)
1854 			bzero(data->dtada_percpu[i], rec->dtrd_size);
1855 	}
1856 }
1857 
1858 void
1859 dt_aggregate_destroy(dtrace_hdl_t *dtp)
1860 {
1861 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1862 	dt_ahash_t *hash = &agp->dtat_hash;
1863 	dt_ahashent_t *h, *next;
1864 	dtrace_aggdata_t *aggdata;
1865 	int i, max_cpus = agp->dtat_maxcpu;
1866 
1867 	if (hash->dtah_hash == NULL) {
1868 		assert(hash->dtah_all == NULL);
1869 	} else {
1870 		free(hash->dtah_hash);
1871 
1872 		for (h = hash->dtah_all; h != NULL; h = next) {
1873 			next = h->dtahe_nextall;
1874 
1875 			aggdata = &h->dtahe_data;
1876 
1877 			if (aggdata->dtada_percpu != NULL) {
1878 				for (i = 0; i < max_cpus; i++)
1879 					free(aggdata->dtada_percpu[i]);
1880 				free(aggdata->dtada_percpu);
1881 			}
1882 
1883 			free(aggdata->dtada_data);
1884 			free(h);
1885 		}
1886 
1887 		hash->dtah_hash = NULL;
1888 		hash->dtah_all = NULL;
1889 		hash->dtah_size = 0;
1890 	}
1891 
1892 	free(agp->dtat_buf.dtbd_data);
1893 	free(agp->dtat_cpus);
1894 }
1895