xref: /netbsd-src/sys/dev/tprof/tprof.c (revision 867d70fc718005c0918b8b8b2f9d7f2d52d0a0db)
1 /*	$NetBSD: tprof.c,v 1.22 2022/12/16 17:38:56 ryo Exp $	*/
2 
3 /*-
4  * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.22 2022/12/16 17:38:56 ryo Exp $");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 
36 #include <sys/callout.h>
37 #include <sys/conf.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/module.h>
41 #include <sys/percpu.h>
42 #include <sys/poll.h>
43 #include <sys/proc.h>
44 #include <sys/queue.h>
45 #include <sys/select.h>
46 #include <sys/workqueue.h>
47 #include <sys/xcall.h>
48 
49 #include <dev/tprof/tprof.h>
50 #include <dev/tprof/tprof_ioctl.h>
51 
52 #include "ioconf.h"
53 
/*
 * Profiling rate.  Used below to derive the per-counter overflow
 * interval (estimated frequency / TPROF_HZ) and the per-buffer sample
 * capacity.  May be overridden at build time.
 */
#if !defined(TPROF_HZ)
#define TPROF_HZ	10000
#endif
57 
58 /*
59  * locking order:
60  *	tprof_reader_lock -> tprof_lock
61  *	tprof_startstop_lock -> tprof_lock
62  */
63 
64 /*
65  * protected by:
66  *	L: tprof_lock
67  *	R: tprof_reader_lock
68  *	S: tprof_startstop_lock
69  *	s: writer should hold tprof_startstop_lock and tprof_lock
70  *	   reader should hold tprof_startstop_lock or tprof_lock
71  */
72 
73 typedef struct tprof_buf {
74 	u_int b_used;
75 	u_int b_size;
76 	u_int b_overflow;
77 	u_int b_unused;
78 	STAILQ_ENTRY(tprof_buf) b_list;
79 	tprof_sample_t b_data[];
80 } tprof_buf_t;
81 #define	TPROF_BUF_BYTESIZE(sz) \
82 	(sizeof(tprof_buf_t) + (sz) * sizeof(tprof_sample_t))
83 #define	TPROF_MAX_SAMPLES_PER_BUF	TPROF_HZ
84 
85 typedef struct {
86 	tprof_buf_t *c_buf;
87 	uint32_t c_cpuid;
88 	struct work c_work;
89 	callout_t c_callout;
90 } __aligned(CACHE_LINE_SIZE) tprof_cpu_t;
91 
92 typedef struct tprof_backend {
93 	/*
94 	 * tprof_backend_softc_t must be passed as an argument to the interrupt
95 	 * handler, but since this is difficult to implement in armv7/v8. Then,
96 	 * tprof_backend is exposed. Additionally, softc must be placed at the
97 	 * beginning of struct tprof_backend.
98 	 */
99 	tprof_backend_softc_t tb_softc;
100 
101 	const char *tb_name;
102 	const tprof_backend_ops_t *tb_ops;
103 	LIST_ENTRY(tprof_backend) tb_list;
104 } tprof_backend_t;
105 
106 static kmutex_t tprof_lock;
107 static u_int tprof_nworker;		/* L: # of running worker LWPs */
108 static lwp_t *tprof_owner;
109 static STAILQ_HEAD(, tprof_buf) tprof_list; /* L: global buffer list */
110 static u_int tprof_nbuf_on_list;	/* L: # of buffers on tprof_list */
111 static struct workqueue *tprof_wq;
112 static struct percpu *tprof_cpus __read_mostly;	/* tprof_cpu_t * */
113 static u_int tprof_samples_per_buf;
114 static u_int tprof_max_buf;
115 
116 tprof_backend_t *tprof_backend;	/* S: */
117 static LIST_HEAD(, tprof_backend) tprof_backends =
118     LIST_HEAD_INITIALIZER(tprof_backend); /* S: */
119 
120 static kmutex_t tprof_reader_lock;
121 static kcondvar_t tprof_reader_cv;	/* L: */
122 static off_t tprof_reader_offset;	/* R: */
123 
124 static kmutex_t tprof_startstop_lock;
125 static kcondvar_t tprof_cv;		/* L: */
126 static struct selinfo tprof_selp;	/* L: */
127 
128 static struct tprof_stat tprof_stat;	/* L: */
129 
130 static tprof_cpu_t *
131 tprof_cpu_direct(struct cpu_info *ci)
132 {
133 	tprof_cpu_t **cp;
134 
135 	cp = percpu_getptr_remote(tprof_cpus, ci);
136 	return *cp;
137 }
138 
139 static tprof_cpu_t *
140 tprof_cpu(struct cpu_info *ci)
141 {
142 	tprof_cpu_t *c;
143 
144 	/*
145 	 * As long as xcalls are blocked -- e.g., by kpreempt_disable
146 	 * -- the percpu object will not be swapped and destroyed.  We
147 	 * can't write to it, because the data may have already been
148 	 * moved to a new buffer, but we can safely read from it.
149 	 */
150 	kpreempt_disable();
151 	c = tprof_cpu_direct(ci);
152 	kpreempt_enable();
153 
154 	return c;
155 }
156 
157 static tprof_cpu_t *
158 tprof_curcpu(void)
159 {
160 
161 	return tprof_cpu(curcpu());
162 }
163 
164 static tprof_buf_t *
165 tprof_buf_alloc(void)
166 {
167 	tprof_buf_t *new;
168 	u_int size = tprof_samples_per_buf;
169 
170 	new = kmem_alloc(TPROF_BUF_BYTESIZE(size), KM_SLEEP);
171 	new->b_used = 0;
172 	new->b_size = size;
173 	new->b_overflow = 0;
174 	return new;
175 }
176 
177 static void
178 tprof_buf_free(tprof_buf_t *buf)
179 {
180 
181 	kmem_free(buf, TPROF_BUF_BYTESIZE(buf->b_size));
182 }
183 
184 static tprof_buf_t *
185 tprof_buf_switch(tprof_cpu_t *c, tprof_buf_t *new)
186 {
187 	tprof_buf_t *old;
188 
189 	old = c->c_buf;
190 	c->c_buf = new;
191 	return old;
192 }
193 
194 static tprof_buf_t *
195 tprof_buf_refresh(void)
196 {
197 	tprof_cpu_t * const c = tprof_curcpu();
198 	tprof_buf_t *new;
199 
200 	new = tprof_buf_alloc();
201 	return tprof_buf_switch(c, new);
202 }
203 
204 static void
205 tprof_worker(struct work *wk, void *dummy)
206 {
207 	tprof_cpu_t * const c = tprof_curcpu();
208 	tprof_buf_t *buf;
209 	tprof_backend_t *tb;
210 	bool shouldstop;
211 
212 	KASSERT(wk == &c->c_work);
213 	KASSERT(dummy == NULL);
214 
215 	/*
216 	 * get a per cpu buffer.
217 	 */
218 	buf = tprof_buf_refresh();
219 
220 	/*
221 	 * and put it on the global list for read(2).
222 	 */
223 	mutex_enter(&tprof_lock);
224 	tb = tprof_backend;
225 	shouldstop = (tb == NULL || tb->tb_softc.sc_ctr_running_mask == 0);
226 	if (shouldstop) {
227 		KASSERT(tprof_nworker > 0);
228 		tprof_nworker--;
229 		cv_broadcast(&tprof_cv);
230 		cv_broadcast(&tprof_reader_cv);
231 	}
232 	if (buf->b_used == 0) {
233 		tprof_stat.ts_emptybuf++;
234 	} else if (tprof_nbuf_on_list < tprof_max_buf) {
235 		tprof_stat.ts_sample += buf->b_used;
236 		tprof_stat.ts_overflow += buf->b_overflow;
237 		tprof_stat.ts_buf++;
238 		STAILQ_INSERT_TAIL(&tprof_list, buf, b_list);
239 		tprof_nbuf_on_list++;
240 		buf = NULL;
241 		selnotify(&tprof_selp, 0, NOTE_SUBMIT);
242 		cv_broadcast(&tprof_reader_cv);
243 	} else {
244 		tprof_stat.ts_dropbuf_sample += buf->b_used;
245 		tprof_stat.ts_dropbuf++;
246 	}
247 	mutex_exit(&tprof_lock);
248 	if (buf) {
249 		tprof_buf_free(buf);
250 	}
251 	if (!shouldstop) {
252 		callout_schedule(&c->c_callout, hz / 8);
253 	}
254 }
255 
256 static void
257 tprof_kick(void *vp)
258 {
259 	struct cpu_info * const ci = vp;
260 	tprof_cpu_t * const c = tprof_cpu(ci);
261 
262 	workqueue_enqueue(tprof_wq, &c->c_work, ci);
263 }
264 
265 static void
266 tprof_stop1(void)
267 {
268 	CPU_INFO_ITERATOR cii;
269 	struct cpu_info *ci;
270 
271 	KASSERT(mutex_owned(&tprof_startstop_lock));
272 	KASSERT(tprof_nworker == 0);
273 
274 	for (CPU_INFO_FOREACH(cii, ci)) {
275 		tprof_cpu_t * const c = tprof_cpu(ci);
276 		tprof_buf_t *old;
277 
278 		old = tprof_buf_switch(c, NULL);
279 		if (old != NULL) {
280 			tprof_buf_free(old);
281 		}
282 		callout_destroy(&c->c_callout);
283 	}
284 	workqueue_destroy(tprof_wq);
285 }
286 
287 static void
288 tprof_getinfo(struct tprof_info *info)
289 {
290 	tprof_backend_t *tb;
291 
292 	KASSERT(mutex_owned(&tprof_startstop_lock));
293 
294 	memset(info, 0, sizeof(*info));
295 	info->ti_version = TPROF_VERSION;
296 	if ((tb = tprof_backend) != NULL) {
297 		info->ti_ident = tb->tb_ops->tbo_ident();
298 	}
299 }
300 
301 static int
302 tprof_getncounters(u_int *ncounters)
303 {
304 	tprof_backend_t *tb;
305 
306 	tb = tprof_backend;
307 	if (tb == NULL)
308 		return ENOENT;
309 
310 	*ncounters = tb->tb_ops->tbo_ncounters();
311 	return 0;
312 }
313 
314 static void
315 tprof_start_cpu(void *arg1, void *arg2)
316 {
317 	tprof_backend_t *tb = arg1;
318 	tprof_countermask_t runmask = (uintptr_t)arg2;
319 
320 	tb->tb_ops->tbo_start(runmask);
321 }
322 
323 static void
324 tprof_stop_cpu(void *arg1, void *arg2)
325 {
326 	tprof_backend_t *tb = arg1;
327 	tprof_countermask_t stopmask = (uintptr_t)arg2;
328 
329 	tb->tb_ops->tbo_stop(stopmask);
330 }
331 
332 static int
333 tprof_start(tprof_countermask_t runmask)
334 {
335 	CPU_INFO_ITERATOR cii;
336 	struct cpu_info *ci;
337 	tprof_backend_t *tb;
338 	uint64_t xc;
339 	int error;
340 	bool firstrun;
341 
342 	KASSERT(mutex_owned(&tprof_startstop_lock));
343 
344 	tb = tprof_backend;
345 	if (tb == NULL) {
346 		error = ENOENT;
347 		goto done;
348 	}
349 
350 	runmask &= ~tb->tb_softc.sc_ctr_running_mask;
351 	runmask &= tb->tb_softc.sc_ctr_configured_mask;
352 	if (runmask == 0) {
353 		/*
354 		 * targets are already running.
355 		 * unconfigured counters are ignored.
356 		 */
357 		error = 0;
358 		goto done;
359 	}
360 
361 	firstrun = (tb->tb_softc.sc_ctr_running_mask == 0);
362 	if (firstrun) {
363 		if (tb->tb_ops->tbo_establish != NULL) {
364 			error = tb->tb_ops->tbo_establish(&tb->tb_softc);
365 			if (error != 0)
366 				goto done;
367 		}
368 
369 		tprof_samples_per_buf = TPROF_MAX_SAMPLES_PER_BUF;
370 		tprof_max_buf = ncpu * 3;
371 		error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker,
372 		    NULL, PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
373 		if (error != 0) {
374 			if (tb->tb_ops->tbo_disestablish != NULL)
375 				tb->tb_ops->tbo_disestablish(&tb->tb_softc);
376 			goto done;
377 		}
378 
379 		for (CPU_INFO_FOREACH(cii, ci)) {
380 			tprof_cpu_t * const c = tprof_cpu(ci);
381 			tprof_buf_t *new;
382 			tprof_buf_t *old;
383 
384 			new = tprof_buf_alloc();
385 			old = tprof_buf_switch(c, new);
386 			if (old != NULL) {
387 				tprof_buf_free(old);
388 			}
389 			callout_init(&c->c_callout, CALLOUT_MPSAFE);
390 			callout_setfunc(&c->c_callout, tprof_kick, ci);
391 		}
392 	}
393 
394 	runmask &= tb->tb_softc.sc_ctr_configured_mask;
395 	xc = xc_broadcast(0, tprof_start_cpu, tb, (void *)(uintptr_t)runmask);
396 	xc_wait(xc);
397 	mutex_enter(&tprof_lock);
398 	tb->tb_softc.sc_ctr_running_mask |= runmask;
399 	mutex_exit(&tprof_lock);
400 
401 	if (firstrun) {
402 		for (CPU_INFO_FOREACH(cii, ci)) {
403 			tprof_cpu_t * const c = tprof_cpu(ci);
404 
405 			mutex_enter(&tprof_lock);
406 			tprof_nworker++;
407 			mutex_exit(&tprof_lock);
408 			workqueue_enqueue(tprof_wq, &c->c_work, ci);
409 		}
410 	}
411 	error = 0;
412 
413 done:
414 	return error;
415 }
416 
417 static void
418 tprof_stop(tprof_countermask_t stopmask)
419 {
420 	tprof_backend_t *tb;
421 	uint64_t xc;
422 
423 	tb = tprof_backend;
424 	if (tb == NULL)
425 		return;
426 
427 	KASSERT(mutex_owned(&tprof_startstop_lock));
428 	stopmask &= tb->tb_softc.sc_ctr_running_mask;
429 	if (stopmask == 0) {
430 		/* targets are not running */
431 		goto done;
432 	}
433 
434 	xc = xc_broadcast(0, tprof_stop_cpu, tb, (void *)(uintptr_t)stopmask);
435 	xc_wait(xc);
436 	mutex_enter(&tprof_lock);
437 	tb->tb_softc.sc_ctr_running_mask &= ~stopmask;
438 	mutex_exit(&tprof_lock);
439 
440 	/* all counters have stopped? */
441 	if (tb->tb_softc.sc_ctr_running_mask == 0) {
442 		mutex_enter(&tprof_lock);
443 		cv_broadcast(&tprof_reader_cv);
444 		while (tprof_nworker > 0) {
445 			cv_wait(&tprof_cv, &tprof_lock);
446 		}
447 		mutex_exit(&tprof_lock);
448 
449 		tprof_stop1();
450 		if (tb->tb_ops->tbo_disestablish != NULL)
451 			tb->tb_ops->tbo_disestablish(&tb->tb_softc);
452 	}
453 done:
454 	;
455 }
456 
457 static void
458 tprof_init_percpu_counters_offset(void *vp, void *vp2, struct cpu_info *ci)
459 {
460 	uint64_t *counters_offset = vp;
461 	u_int counter = (uintptr_t)vp2;
462 
463 	tprof_backend_t *tb = tprof_backend;
464 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
465 	counters_offset[counter] = param->p_value;
466 }
467 
468 static void
469 tprof_configure_event_cpu(void *arg1, void *arg2)
470 {
471 	tprof_backend_t *tb = arg1;
472 	u_int counter = (uintptr_t)arg2;
473 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
474 
475 	tb->tb_ops->tbo_configure_event(counter, param);
476 }
477 
478 static int
479 tprof_configure_event(const tprof_param_t *param)
480 {
481 	tprof_backend_t *tb;
482 	tprof_backend_softc_t *sc;
483 	tprof_param_t *sc_param;
484 	uint64_t xc;
485 	int c, error;
486 
487 	if ((param->p_flags & (TPROF_PARAM_USER | TPROF_PARAM_KERN)) == 0) {
488 		error = EINVAL;
489 		goto done;
490 	}
491 
492 	tb = tprof_backend;
493 	if (tb == NULL) {
494 		error = ENOENT;
495 		goto done;
496 	}
497 	sc = &tb->tb_softc;
498 
499 	c = param->p_counter;
500 	if (c >= tb->tb_softc.sc_ncounters) {
501 		error = EINVAL;
502 		goto done;
503 	}
504 
505 	if (tb->tb_ops->tbo_valid_event != NULL) {
506 		error = tb->tb_ops->tbo_valid_event(param->p_counter, param);
507 		if (error != 0)
508 			goto done;
509 	}
510 
511 	/* if already running, stop the counter */
512 	if (ISSET(c, tb->tb_softc.sc_ctr_running_mask))
513 		tprof_stop(__BIT(c));
514 
515 	sc->sc_count[c].ctr_bitwidth =
516 	    tb->tb_ops->tbo_counter_bitwidth(param->p_counter);
517 
518 	sc_param = &sc->sc_count[c].ctr_param;
519 	memcpy(sc_param, param, sizeof(*sc_param));	/* save copy of param */
520 
521 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
522 		uint64_t freq, inum, dnum;
523 
524 		freq = tb->tb_ops->tbo_counter_estimate_freq(c);
525 		sc->sc_count[c].ctr_counter_val = freq / TPROF_HZ;
526 		if (sc->sc_count[c].ctr_counter_val == 0) {
527 			printf("%s: counter#%d frequency (%"PRIu64") is"
528 			    " very low relative to TPROF_HZ (%u)\n", __func__,
529 			    c, freq, TPROF_HZ);
530 			sc->sc_count[c].ctr_counter_val =
531 			    4000000000ULL / TPROF_HZ;
532 		}
533 
534 		switch (param->p_flags & TPROF_PARAM_VALUE2_MASK) {
535 		case TPROF_PARAM_VALUE2_SCALE:
536 			if (sc_param->p_value2 == 0)
537 				break;
538 			/*
539 			 * p_value2 is 64-bit fixed-point
540 			 * upper 32 bits are the integer part
541 			 * lower 32 bits are the decimal part
542 			 */
543 			inum = sc_param->p_value2 >> 32;
544 			dnum = sc_param->p_value2 & __BITS(31, 0);
545 			sc->sc_count[c].ctr_counter_val =
546 			    sc->sc_count[c].ctr_counter_val * inum +
547 			    (sc->sc_count[c].ctr_counter_val * dnum >> 32);
548 			if (sc->sc_count[c].ctr_counter_val == 0)
549 				sc->sc_count[c].ctr_counter_val = 1;
550 			break;
551 		case TPROF_PARAM_VALUE2_TRIGGERCOUNT:
552 			if (sc_param->p_value2 == 0)
553 				sc_param->p_value2 = 1;
554 			if (sc_param->p_value2 >
555 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0)) {
556 				sc_param->p_value2 =
557 				    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
558 			}
559 			sc->sc_count[c].ctr_counter_val = sc_param->p_value2;
560 			break;
561 		default:
562 			break;
563 		}
564 		sc->sc_count[c].ctr_counter_reset_val =
565 		    -sc->sc_count[c].ctr_counter_val;
566 		sc->sc_count[c].ctr_counter_reset_val &=
567 		    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
568 	} else {
569 		sc->sc_count[c].ctr_counter_val = 0;
570 		sc->sc_count[c].ctr_counter_reset_val = 0;
571 	}
572 
573 	/* At this point, p_value is used as an initial value */
574 	percpu_foreach(tb->tb_softc.sc_ctr_offset_percpu,
575 	    tprof_init_percpu_counters_offset, (void *)(uintptr_t)c);
576 	/* On the backend side, p_value is used as the reset value */
577 	sc_param->p_value = tb->tb_softc.sc_count[c].ctr_counter_reset_val;
578 
579 	xc = xc_broadcast(0, tprof_configure_event_cpu,
580 	    tb, (void *)(uintptr_t)c);
581 	xc_wait(xc);
582 
583 	mutex_enter(&tprof_lock);
584 	/* update counters bitmasks */
585 	SET(tb->tb_softc.sc_ctr_configured_mask, __BIT(c));
586 	CLR(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
587 	CLR(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
588 	/* profiled counter requires overflow handling */
589 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
590 		SET(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
591 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
592 	}
593 	/* counters with less than 64bits also require overflow handling */
594 	if (sc->sc_count[c].ctr_bitwidth != 64)
595 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
596 	mutex_exit(&tprof_lock);
597 
598 	error = 0;
599 
600  done:
601 	return error;
602 }
603 
604 static void
605 tprof_getcounts_cpu(void *arg1, void *arg2)
606 {
607 	tprof_backend_t *tb = arg1;
608 	tprof_backend_softc_t *sc = &tb->tb_softc;
609 	uint64_t *counters = arg2;
610 	uint64_t *counters_offset;
611 	unsigned int c;
612 
613 	tprof_countermask_t configmask = sc->sc_ctr_configured_mask;
614 	counters_offset = percpu_getref(sc->sc_ctr_offset_percpu);
615 	for (c = 0; c < sc->sc_ncounters; c++) {
616 		if (ISSET(configmask, __BIT(c))) {
617 			uint64_t ctr = tb->tb_ops->tbo_counter_read(c);
618 			counters[c] = counters_offset[c] +
619 			    ((ctr - sc->sc_count[c].ctr_counter_reset_val) &
620 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0));
621 		} else {
622 			counters[c] = 0;
623 		}
624 	}
625 	percpu_putref(sc->sc_ctr_offset_percpu);
626 }
627 
628 static int
629 tprof_getcounts(tprof_counts_t *counts)
630 {
631 	struct cpu_info *ci;
632 	tprof_backend_t *tb;
633 	uint64_t xc;
634 
635 	tb = tprof_backend;
636 	if (tb == NULL)
637 		return ENOENT;
638 
639 	if (counts->c_cpu >= ncpu)
640 		return ESRCH;
641 	ci = cpu_lookup(counts->c_cpu);
642 	if (ci == NULL)
643 		return ESRCH;
644 
645 	xc = xc_unicast(0, tprof_getcounts_cpu, tb, counts->c_count, ci);
646 	xc_wait(xc);
647 
648 	counts->c_ncounters = tb->tb_softc.sc_ncounters;
649 	counts->c_runningmask = tb->tb_softc.sc_ctr_running_mask;
650 	return 0;
651 }
652 
653 /*
654  * tprof_clear: drain unread samples.
655  */
656 
657 static void
658 tprof_clear(void)
659 {
660 	tprof_buf_t *buf;
661 
662 	mutex_enter(&tprof_reader_lock);
663 	mutex_enter(&tprof_lock);
664 	while ((buf = STAILQ_FIRST(&tprof_list)) != NULL) {
665 		if (buf != NULL) {
666 			STAILQ_REMOVE_HEAD(&tprof_list, b_list);
667 			KASSERT(tprof_nbuf_on_list > 0);
668 			tprof_nbuf_on_list--;
669 			mutex_exit(&tprof_lock);
670 			tprof_buf_free(buf);
671 			mutex_enter(&tprof_lock);
672 		}
673 	}
674 	KASSERT(tprof_nbuf_on_list == 0);
675 	mutex_exit(&tprof_lock);
676 	tprof_reader_offset = 0;
677 	mutex_exit(&tprof_reader_lock);
678 
679 	memset(&tprof_stat, 0, sizeof(tprof_stat));
680 }
681 
682 static tprof_backend_t *
683 tprof_backend_lookup(const char *name)
684 {
685 	tprof_backend_t *tb;
686 
687 	KASSERT(mutex_owned(&tprof_startstop_lock));
688 
689 	LIST_FOREACH(tb, &tprof_backends, tb_list) {
690 		if (!strcmp(tb->tb_name, name)) {
691 			return tb;
692 		}
693 	}
694 	return NULL;
695 }
696 
697 /* -------------------- backend interfaces */
698 
699 /*
700  * tprof_sample: record a sample on the per-cpu buffer.
701  *
702  * be careful; can be called in NMI context.
703  * we are bluntly assuming the followings are safe.
704  *	curcpu()
705  *	curlwp->l_lid
706  *	curlwp->l_proc->p_pid
707  */
708 
709 void
710 tprof_sample(void *unused, const tprof_frame_info_t *tfi)
711 {
712 	tprof_cpu_t * const c = tprof_cpu_direct(curcpu());
713 	tprof_buf_t * const buf = c->c_buf;
714 	tprof_sample_t *sp;
715 	const uintptr_t pc = tfi->tfi_pc;
716 	const lwp_t * const l = curlwp;
717 	u_int idx;
718 
719 	idx = buf->b_used;
720 	if (__predict_false(idx >= buf->b_size)) {
721 		buf->b_overflow++;
722 		return;
723 	}
724 	sp = &buf->b_data[idx];
725 	sp->s_pid = l->l_proc->p_pid;
726 	sp->s_lwpid = l->l_lid;
727 	sp->s_cpuid = c->c_cpuid;
728 	sp->s_flags = ((tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0) |
729 	    __SHIFTIN(tfi->tfi_counter, TPROF_SAMPLE_COUNTER_MASK);
730 	sp->s_pc = pc;
731 	buf->b_used = idx + 1;
732 }
733 
734 /*
735  * tprof_backend_register:
736  */
737 
738 int
739 tprof_backend_register(const char *name, const tprof_backend_ops_t *ops,
740     int vers)
741 {
742 	tprof_backend_t *tb;
743 
744 	if (vers != TPROF_BACKEND_VERSION) {
745 		return EINVAL;
746 	}
747 
748 	mutex_enter(&tprof_startstop_lock);
749 	tb = tprof_backend_lookup(name);
750 	if (tb != NULL) {
751 		mutex_exit(&tprof_startstop_lock);
752 		return EEXIST;
753 	}
754 #if 1 /* XXX for now */
755 	if (!LIST_EMPTY(&tprof_backends)) {
756 		mutex_exit(&tprof_startstop_lock);
757 		return ENOTSUP;
758 	}
759 #endif
760 	tb = kmem_zalloc(sizeof(*tb), KM_SLEEP);
761 	tb->tb_name = name;
762 	tb->tb_ops = ops;
763 	LIST_INSERT_HEAD(&tprof_backends, tb, tb_list);
764 #if 1 /* XXX for now */
765 	if (tprof_backend == NULL) {
766 		tprof_backend = tb;
767 	}
768 #endif
769 	mutex_exit(&tprof_startstop_lock);
770 
771 	/* init backend softc */
772 	tb->tb_softc.sc_ncounters = tb->tb_ops->tbo_ncounters();
773 	tb->tb_softc.sc_ctr_offset_percpu_size =
774 	    sizeof(uint64_t) * tb->tb_softc.sc_ncounters;
775 	tb->tb_softc.sc_ctr_offset_percpu =
776 	    percpu_alloc(tb->tb_softc.sc_ctr_offset_percpu_size);
777 
778 	return 0;
779 }
780 
781 /*
782  * tprof_backend_unregister:
783  */
784 
785 int
786 tprof_backend_unregister(const char *name)
787 {
788 	tprof_backend_t *tb;
789 
790 	mutex_enter(&tprof_startstop_lock);
791 	tb = tprof_backend_lookup(name);
792 #if defined(DIAGNOSTIC)
793 	if (tb == NULL) {
794 		mutex_exit(&tprof_startstop_lock);
795 		panic("%s: not found '%s'", __func__, name);
796 	}
797 #endif /* defined(DIAGNOSTIC) */
798 	if (tb->tb_softc.sc_ctr_running_mask != 0) {
799 		mutex_exit(&tprof_startstop_lock);
800 		return EBUSY;
801 	}
802 #if 1 /* XXX for now */
803 	if (tprof_backend == tb) {
804 		tprof_backend = NULL;
805 	}
806 #endif
807 	LIST_REMOVE(tb, tb_list);
808 	mutex_exit(&tprof_startstop_lock);
809 
810 	/* fini backend softc */
811 	percpu_free(tb->tb_softc.sc_ctr_offset_percpu,
812 	    tb->tb_softc.sc_ctr_offset_percpu_size);
813 
814 	/* free backend */
815 	kmem_free(tb, sizeof(*tb));
816 
817 	return 0;
818 }
819 
820 /* -------------------- cdevsw interfaces */
821 
822 static int
823 tprof_open(dev_t dev, int flags, int type, struct lwp *l)
824 {
825 
826 	if (minor(dev) != 0) {
827 		return EXDEV;
828 	}
829 	mutex_enter(&tprof_lock);
830 	if (tprof_owner != NULL) {
831 		mutex_exit(&tprof_lock);
832 		return  EBUSY;
833 	}
834 	tprof_owner = curlwp;
835 	mutex_exit(&tprof_lock);
836 
837 	return 0;
838 }
839 
840 static int
841 tprof_close(dev_t dev, int flags, int type, struct lwp *l)
842 {
843 
844 	KASSERT(minor(dev) == 0);
845 
846 	mutex_enter(&tprof_startstop_lock);
847 	mutex_enter(&tprof_lock);
848 	tprof_owner = NULL;
849 	mutex_exit(&tprof_lock);
850 	tprof_stop(TPROF_COUNTERMASK_ALL);
851 	tprof_clear();
852 
853 	tprof_backend_t *tb = tprof_backend;
854 	if (tb != NULL) {
855 		KASSERT(tb->tb_softc.sc_ctr_running_mask == 0);
856 		tb->tb_softc.sc_ctr_configured_mask = 0;
857 		tb->tb_softc.sc_ctr_prof_mask = 0;
858 		tb->tb_softc.sc_ctr_ovf_mask = 0;
859 	}
860 
861 	mutex_exit(&tprof_startstop_lock);
862 
863 	return 0;
864 }
865 
866 static int
867 tprof_poll(dev_t dev, int events, struct lwp *l)
868 {
869 	int revents;
870 
871 	revents = events & (POLLIN | POLLRDNORM);
872 	if (revents == 0)
873 		return 0;
874 
875 	mutex_enter(&tprof_lock);
876 	if (STAILQ_EMPTY(&tprof_list)) {
877 		revents = 0;
878 		selrecord(l, &tprof_selp);
879 	}
880 	mutex_exit(&tprof_lock);
881 
882 	return revents;
883 }
884 
885 static void
886 filt_tprof_read_detach(struct knote *kn)
887 {
888 	mutex_enter(&tprof_lock);
889 	selremove_knote(&tprof_selp, kn);
890 	mutex_exit(&tprof_lock);
891 }
892 
893 static int
894 filt_tprof_read_event(struct knote *kn, long hint)
895 {
896 	int rv = 0;
897 
898 	if ((hint & NOTE_SUBMIT) == 0)
899 		mutex_enter(&tprof_lock);
900 
901 	if (!STAILQ_EMPTY(&tprof_list)) {
902 		tprof_buf_t *buf;
903 		int64_t n = 0;
904 
905 		STAILQ_FOREACH(buf, &tprof_list, b_list) {
906 			n += buf->b_used;
907 		}
908 		kn->kn_data = n * sizeof(tprof_sample_t);
909 
910 		rv = 1;
911 	}
912 
913 	if ((hint & NOTE_SUBMIT) == 0)
914 		mutex_exit(&tprof_lock);
915 
916 	return rv;
917 }
918 
919 static const struct filterops tprof_read_filtops = {
920 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
921 	.f_attach = NULL,
922 	.f_detach = filt_tprof_read_detach,
923 	.f_event = filt_tprof_read_event,
924 };
925 
926 static int
927 tprof_kqfilter(dev_t dev, struct knote *kn)
928 {
929 	switch (kn->kn_filter) {
930 	case EVFILT_READ:
931 		kn->kn_fop = &tprof_read_filtops;
932 		mutex_enter(&tprof_lock);
933 		selrecord_knote(&tprof_selp, kn);
934 		mutex_exit(&tprof_lock);
935 		break;
936 	default:
937 		return EINVAL;
938 	}
939 
940 	return 0;
941 }
942 
943 static int
944 tprof_read(dev_t dev, struct uio *uio, int flags)
945 {
946 	tprof_buf_t *buf;
947 	size_t bytes;
948 	size_t resid;
949 	size_t done = 0;
950 	int error = 0;
951 
952 	KASSERT(minor(dev) == 0);
953 	mutex_enter(&tprof_reader_lock);
954 	while (uio->uio_resid > 0 && error == 0) {
955 		/*
956 		 * take the first buffer from the list.
957 		 */
958 		mutex_enter(&tprof_lock);
959 		buf = STAILQ_FIRST(&tprof_list);
960 		if (buf == NULL) {
961 			if (tprof_nworker == 0 || done != 0) {
962 				mutex_exit(&tprof_lock);
963 				error = 0;
964 				break;
965 			}
966 			mutex_exit(&tprof_reader_lock);
967 			error = cv_wait_sig(&tprof_reader_cv, &tprof_lock);
968 			mutex_exit(&tprof_lock);
969 			mutex_enter(&tprof_reader_lock);
970 			continue;
971 		}
972 		STAILQ_REMOVE_HEAD(&tprof_list, b_list);
973 		KASSERT(tprof_nbuf_on_list > 0);
974 		tprof_nbuf_on_list--;
975 		mutex_exit(&tprof_lock);
976 
977 		/*
978 		 * copy it out.
979 		 */
980 		bytes = MIN(buf->b_used * sizeof(tprof_sample_t) -
981 		    tprof_reader_offset, uio->uio_resid);
982 		resid = uio->uio_resid;
983 		error = uiomove((char *)buf->b_data + tprof_reader_offset,
984 		    bytes, uio);
985 		done = resid - uio->uio_resid;
986 		tprof_reader_offset += done;
987 
988 		/*
989 		 * if we didn't consume the whole buffer,
990 		 * put it back to the list.
991 		 */
992 		if (tprof_reader_offset <
993 		    buf->b_used * sizeof(tprof_sample_t)) {
994 			mutex_enter(&tprof_lock);
995 			STAILQ_INSERT_HEAD(&tprof_list, buf, b_list);
996 			tprof_nbuf_on_list++;
997 			cv_broadcast(&tprof_reader_cv);
998 			mutex_exit(&tprof_lock);
999 		} else {
1000 			tprof_buf_free(buf);
1001 			tprof_reader_offset = 0;
1002 		}
1003 	}
1004 	mutex_exit(&tprof_reader_lock);
1005 
1006 	return error;
1007 }
1008 
1009 static int
1010 tprof_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
1011 {
1012 	const tprof_param_t *param;
1013 	tprof_counts_t *counts;
1014 	int error = 0;
1015 
1016 	KASSERT(minor(dev) == 0);
1017 
1018 	switch (cmd) {
1019 	case TPROF_IOC_GETINFO:
1020 		mutex_enter(&tprof_startstop_lock);
1021 		tprof_getinfo(data);
1022 		mutex_exit(&tprof_startstop_lock);
1023 		break;
1024 	case TPROF_IOC_GETNCOUNTERS:
1025 		mutex_enter(&tprof_lock);
1026 		error = tprof_getncounters((u_int *)data);
1027 		mutex_exit(&tprof_lock);
1028 		break;
1029 	case TPROF_IOC_START:
1030 		mutex_enter(&tprof_startstop_lock);
1031 		error = tprof_start(*(tprof_countermask_t *)data);
1032 		mutex_exit(&tprof_startstop_lock);
1033 		break;
1034 	case TPROF_IOC_STOP:
1035 		mutex_enter(&tprof_startstop_lock);
1036 		tprof_stop(*(tprof_countermask_t *)data);
1037 		mutex_exit(&tprof_startstop_lock);
1038 		break;
1039 	case TPROF_IOC_GETSTAT:
1040 		mutex_enter(&tprof_lock);
1041 		memcpy(data, &tprof_stat, sizeof(tprof_stat));
1042 		mutex_exit(&tprof_lock);
1043 		break;
1044 	case TPROF_IOC_CONFIGURE_EVENT:
1045 		param = data;
1046 		mutex_enter(&tprof_startstop_lock);
1047 		error = tprof_configure_event(param);
1048 		mutex_exit(&tprof_startstop_lock);
1049 		break;
1050 	case TPROF_IOC_GETCOUNTS:
1051 		counts = data;
1052 		mutex_enter(&tprof_startstop_lock);
1053 		error = tprof_getcounts(counts);
1054 		mutex_exit(&tprof_startstop_lock);
1055 		break;
1056 	default:
1057 		error = EINVAL;
1058 		break;
1059 	}
1060 
1061 	return error;
1062 }
1063 
1064 const struct cdevsw tprof_cdevsw = {
1065 	.d_open = tprof_open,
1066 	.d_close = tprof_close,
1067 	.d_read = tprof_read,
1068 	.d_write = nowrite,
1069 	.d_ioctl = tprof_ioctl,
1070 	.d_stop = nostop,
1071 	.d_tty = notty,
1072 	.d_poll = tprof_poll,
1073 	.d_mmap = nommap,
1074 	.d_kqfilter = tprof_kqfilter,
1075 	.d_discard = nodiscard,
1076 	.d_flag = D_OTHER | D_MPSAFE
1077 };
1078 
/*
 * tprofattach: attach entry point; intentionally empty.  Set-up is done
 * from tprof_modcmd().
 */
void
tprofattach(int nunits)
{

	/* nothing to do */
}

/* Declare the "tprof" driver module; it has no required modules. */
MODULE(MODULE_CLASS_DRIVER, tprof, NULL);
1087 
1088 static void
1089 tprof_cpu_init(void *vcp, void *vcookie, struct cpu_info *ci)
1090 {
1091 	tprof_cpu_t **cp = vcp, *c;
1092 
1093 	c = kmem_zalloc(sizeof(*c), KM_SLEEP);
1094 	c->c_buf = NULL;
1095 	c->c_cpuid = cpu_index(ci);
1096 	*cp = c;
1097 }
1098 
1099 static void
1100 tprof_cpu_fini(void *vcp, void *vcookie, struct cpu_info *ci)
1101 {
1102 	tprof_cpu_t **cp = vcp, *c;
1103 
1104 	c = *cp;
1105 	KASSERT(c->c_cpuid == cpu_index(ci));
1106 	KASSERT(c->c_buf == NULL);
1107 	kmem_free(c, sizeof(*c));
1108 	*cp = NULL;
1109 }
1110 
1111 static void
1112 tprof_driver_init(void)
1113 {
1114 
1115 	mutex_init(&tprof_lock, MUTEX_DEFAULT, IPL_NONE);
1116 	mutex_init(&tprof_reader_lock, MUTEX_DEFAULT, IPL_NONE);
1117 	mutex_init(&tprof_startstop_lock, MUTEX_DEFAULT, IPL_NONE);
1118 	selinit(&tprof_selp);
1119 	cv_init(&tprof_cv, "tprof");
1120 	cv_init(&tprof_reader_cv, "tprof_rd");
1121 	STAILQ_INIT(&tprof_list);
1122 	tprof_cpus = percpu_create(sizeof(tprof_cpu_t *),
1123 	    tprof_cpu_init, tprof_cpu_fini, NULL);
1124 }
1125 
1126 static void
1127 tprof_driver_fini(void)
1128 {
1129 
1130 	percpu_free(tprof_cpus, sizeof(tprof_cpu_t *));
1131 	mutex_destroy(&tprof_lock);
1132 	mutex_destroy(&tprof_reader_lock);
1133 	mutex_destroy(&tprof_startstop_lock);
1134 	seldestroy(&tprof_selp);
1135 	cv_destroy(&tprof_cv);
1136 	cv_destroy(&tprof_reader_cv);
1137 }
1138 
1139 static int
1140 tprof_modcmd(modcmd_t cmd, void *arg)
1141 {
1142 
1143 	switch (cmd) {
1144 	case MODULE_CMD_INIT:
1145 		tprof_driver_init();
1146 #if defined(_MODULE)
1147 		{
1148 			devmajor_t bmajor = NODEVMAJOR;
1149 			devmajor_t cmajor = NODEVMAJOR;
1150 			int error;
1151 
1152 			error = devsw_attach("tprof", NULL, &bmajor,
1153 			    &tprof_cdevsw, &cmajor);
1154 			if (error) {
1155 				tprof_driver_fini();
1156 				return error;
1157 			}
1158 		}
1159 #endif /* defined(_MODULE) */
1160 		return 0;
1161 
1162 	case MODULE_CMD_FINI:
1163 #if defined(_MODULE)
1164 		devsw_detach(NULL, &tprof_cdevsw);
1165 #endif /* defined(_MODULE) */
1166 		tprof_driver_fini();
1167 		return 0;
1168 
1169 	default:
1170 		return ENOTTY;
1171 	}
1172 }
1173