xref: /netbsd-src/sys/dev/tprof/tprof.c (revision 183ce831afd8041589560bf846730d6193878392)
1 /*	$NetBSD: tprof.c,v 1.23 2023/04/11 10:07:12 msaitoh Exp $	*/
2 
3 /*-
4  * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.23 2023/04/11 10:07:12 msaitoh Exp $");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 
36 #include <sys/callout.h>
37 #include <sys/conf.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/module.h>
41 #include <sys/percpu.h>
42 #include <sys/poll.h>
43 #include <sys/proc.h>
44 #include <sys/queue.h>
45 #include <sys/select.h>
46 #include <sys/workqueue.h>
47 #include <sys/xcall.h>
48 
49 #include <dev/tprof/tprof.h>
50 #include <dev/tprof/tprof_ioctl.h>
51 
52 #include "ioconf.h"
53 
/*
 * TPROF_HZ divides a counter's estimated frequency to obtain the default
 * profiling period, and it also sizes the per-CPU sample buffers.  It may
 * be overridden at build time.
 */
#if !defined(TPROF_HZ)
#define TPROF_HZ	10000
#endif /* !defined(TPROF_HZ) */
57 
58 /*
59  * locking order:
60  *	tprof_reader_lock -> tprof_lock
61  *	tprof_startstop_lock -> tprof_lock
62  */
63 
64 /*
65  * protected by:
66  *	L: tprof_lock
67  *	R: tprof_reader_lock
68  *	S: tprof_startstop_lock
69  *	s: writer should hold tprof_startstop_lock and tprof_lock
70  *	   reader should hold tprof_startstop_lock or tprof_lock
71  */
72 
73 typedef struct tprof_buf {
74 	u_int b_used;
75 	u_int b_size;
76 	u_int b_overflow;
77 	u_int b_unused;
78 	STAILQ_ENTRY(tprof_buf) b_list;
79 	tprof_sample_t b_data[];
80 } tprof_buf_t;
81 #define	TPROF_BUF_BYTESIZE(sz) \
82 	(sizeof(tprof_buf_t) + (sz) * sizeof(tprof_sample_t))
83 #define	TPROF_MAX_SAMPLES_PER_BUF	TPROF_HZ
84 
85 typedef struct {
86 	tprof_buf_t *c_buf;
87 	uint32_t c_cpuid;
88 	struct work c_work;
89 	callout_t c_callout;
90 } __aligned(CACHE_LINE_SIZE) tprof_cpu_t;
91 
92 typedef struct tprof_backend {
93 	/*
94 	 * tprof_backend_softc_t must be passed as an argument to the interrupt
95 	 * handler, but since this is difficult to implement in armv7/v8. Then,
96 	 * tprof_backend is exposed. Additionally, softc must be placed at the
97 	 * beginning of struct tprof_backend.
98 	 */
99 	tprof_backend_softc_t tb_softc;
100 
101 	const char *tb_name;
102 	const tprof_backend_ops_t *tb_ops;
103 	LIST_ENTRY(tprof_backend) tb_list;
104 } tprof_backend_t;
105 
106 static kmutex_t tprof_lock;
107 static u_int tprof_nworker;		/* L: # of running worker LWPs */
108 static lwp_t *tprof_owner;
109 static STAILQ_HEAD(, tprof_buf) tprof_list; /* L: global buffer list */
110 static u_int tprof_nbuf_on_list;	/* L: # of buffers on tprof_list */
111 static struct workqueue *tprof_wq;
112 static struct percpu *tprof_cpus __read_mostly;	/* tprof_cpu_t * */
113 static u_int tprof_samples_per_buf;
114 static u_int tprof_max_buf;
115 
116 tprof_backend_t *tprof_backend;	/* S: */
117 static LIST_HEAD(, tprof_backend) tprof_backends =
118     LIST_HEAD_INITIALIZER(tprof_backend); /* S: */
119 
120 static kmutex_t tprof_reader_lock;
121 static kcondvar_t tprof_reader_cv;	/* L: */
122 static off_t tprof_reader_offset;	/* R: */
123 
124 static kmutex_t tprof_startstop_lock;
125 static kcondvar_t tprof_cv;		/* L: */
126 static struct selinfo tprof_selp;	/* L: */
127 
128 static struct tprof_stat tprof_stat;	/* L: */
129 
130 static tprof_cpu_t *
tprof_cpu_direct(struct cpu_info * ci)131 tprof_cpu_direct(struct cpu_info *ci)
132 {
133 	tprof_cpu_t **cp;
134 
135 	cp = percpu_getptr_remote(tprof_cpus, ci);
136 	return *cp;
137 }
138 
139 static tprof_cpu_t *
tprof_cpu(struct cpu_info * ci)140 tprof_cpu(struct cpu_info *ci)
141 {
142 	tprof_cpu_t *c;
143 
144 	/*
145 	 * As long as xcalls are blocked -- e.g., by kpreempt_disable
146 	 * -- the percpu object will not be swapped and destroyed.  We
147 	 * can't write to it, because the data may have already been
148 	 * moved to a new buffer, but we can safely read from it.
149 	 */
150 	kpreempt_disable();
151 	c = tprof_cpu_direct(ci);
152 	kpreempt_enable();
153 
154 	return c;
155 }
156 
157 static tprof_cpu_t *
tprof_curcpu(void)158 tprof_curcpu(void)
159 {
160 
161 	return tprof_cpu(curcpu());
162 }
163 
164 static tprof_buf_t *
tprof_buf_alloc(void)165 tprof_buf_alloc(void)
166 {
167 	tprof_buf_t *new;
168 	u_int size = tprof_samples_per_buf;
169 
170 	new = kmem_alloc(TPROF_BUF_BYTESIZE(size), KM_SLEEP);
171 	new->b_used = 0;
172 	new->b_size = size;
173 	new->b_overflow = 0;
174 	return new;
175 }
176 
177 static void
tprof_buf_free(tprof_buf_t * buf)178 tprof_buf_free(tprof_buf_t *buf)
179 {
180 
181 	kmem_free(buf, TPROF_BUF_BYTESIZE(buf->b_size));
182 }
183 
184 static tprof_buf_t *
tprof_buf_switch(tprof_cpu_t * c,tprof_buf_t * new)185 tprof_buf_switch(tprof_cpu_t *c, tprof_buf_t *new)
186 {
187 	tprof_buf_t *old;
188 
189 	old = c->c_buf;
190 	c->c_buf = new;
191 	return old;
192 }
193 
194 static tprof_buf_t *
tprof_buf_refresh(void)195 tprof_buf_refresh(void)
196 {
197 	tprof_cpu_t * const c = tprof_curcpu();
198 	tprof_buf_t *new;
199 
200 	new = tprof_buf_alloc();
201 	return tprof_buf_switch(c, new);
202 }
203 
204 static void
tprof_worker(struct work * wk,void * dummy)205 tprof_worker(struct work *wk, void *dummy)
206 {
207 	tprof_cpu_t * const c = tprof_curcpu();
208 	tprof_buf_t *buf;
209 	tprof_backend_t *tb;
210 	bool shouldstop;
211 
212 	KASSERT(wk == &c->c_work);
213 	KASSERT(dummy == NULL);
214 
215 	/*
216 	 * Get a per cpu buffer.
217 	 */
218 	buf = tprof_buf_refresh();
219 
220 	/*
221 	 * and put it on the global list for read(2).
222 	 */
223 	mutex_enter(&tprof_lock);
224 	tb = tprof_backend;
225 	shouldstop = (tb == NULL || tb->tb_softc.sc_ctr_running_mask == 0);
226 	if (shouldstop) {
227 		KASSERT(tprof_nworker > 0);
228 		tprof_nworker--;
229 		cv_broadcast(&tprof_cv);
230 		cv_broadcast(&tprof_reader_cv);
231 	}
232 	if (buf->b_used == 0) {
233 		tprof_stat.ts_emptybuf++;
234 	} else if (tprof_nbuf_on_list < tprof_max_buf) {
235 		tprof_stat.ts_sample += buf->b_used;
236 		tprof_stat.ts_overflow += buf->b_overflow;
237 		tprof_stat.ts_buf++;
238 		STAILQ_INSERT_TAIL(&tprof_list, buf, b_list);
239 		tprof_nbuf_on_list++;
240 		buf = NULL;
241 		selnotify(&tprof_selp, 0, NOTE_SUBMIT);
242 		cv_broadcast(&tprof_reader_cv);
243 	} else {
244 		tprof_stat.ts_dropbuf_sample += buf->b_used;
245 		tprof_stat.ts_dropbuf++;
246 	}
247 	mutex_exit(&tprof_lock);
248 	if (buf)
249 		tprof_buf_free(buf);
250 
251 	if (!shouldstop)
252 		callout_schedule(&c->c_callout, hz / 8);
253 }
254 
255 static void
tprof_kick(void * vp)256 tprof_kick(void *vp)
257 {
258 	struct cpu_info * const ci = vp;
259 	tprof_cpu_t * const c = tprof_cpu(ci);
260 
261 	workqueue_enqueue(tprof_wq, &c->c_work, ci);
262 }
263 
264 static void
tprof_stop1(void)265 tprof_stop1(void)
266 {
267 	CPU_INFO_ITERATOR cii;
268 	struct cpu_info *ci;
269 
270 	KASSERT(mutex_owned(&tprof_startstop_lock));
271 	KASSERT(tprof_nworker == 0);
272 
273 	for (CPU_INFO_FOREACH(cii, ci)) {
274 		tprof_cpu_t * const c = tprof_cpu(ci);
275 		tprof_buf_t *old;
276 
277 		old = tprof_buf_switch(c, NULL);
278 		if (old != NULL)
279 			tprof_buf_free(old);
280 
281 		callout_destroy(&c->c_callout);
282 	}
283 	workqueue_destroy(tprof_wq);
284 }
285 
286 static void
tprof_getinfo(struct tprof_info * info)287 tprof_getinfo(struct tprof_info *info)
288 {
289 	tprof_backend_t *tb;
290 
291 	KASSERT(mutex_owned(&tprof_startstop_lock));
292 
293 	memset(info, 0, sizeof(*info));
294 	info->ti_version = TPROF_VERSION;
295 	if ((tb = tprof_backend) != NULL)
296 		info->ti_ident = tb->tb_ops->tbo_ident();
297 }
298 
299 static int
tprof_getncounters(u_int * ncounters)300 tprof_getncounters(u_int *ncounters)
301 {
302 	tprof_backend_t *tb;
303 
304 	tb = tprof_backend;
305 	if (tb == NULL)
306 		return ENOENT;
307 
308 	*ncounters = tb->tb_ops->tbo_ncounters();
309 	return 0;
310 }
311 
312 static void
tprof_start_cpu(void * arg1,void * arg2)313 tprof_start_cpu(void *arg1, void *arg2)
314 {
315 	tprof_backend_t *tb = arg1;
316 	tprof_countermask_t runmask = (uintptr_t)arg2;
317 
318 	tb->tb_ops->tbo_start(runmask);
319 }
320 
321 static void
tprof_stop_cpu(void * arg1,void * arg2)322 tprof_stop_cpu(void *arg1, void *arg2)
323 {
324 	tprof_backend_t *tb = arg1;
325 	tprof_countermask_t stopmask = (uintptr_t)arg2;
326 
327 	tb->tb_ops->tbo_stop(stopmask);
328 }
329 
330 static int
tprof_start(tprof_countermask_t runmask)331 tprof_start(tprof_countermask_t runmask)
332 {
333 	CPU_INFO_ITERATOR cii;
334 	struct cpu_info *ci;
335 	tprof_backend_t *tb;
336 	uint64_t xc;
337 	int error;
338 	bool firstrun;
339 
340 	KASSERT(mutex_owned(&tprof_startstop_lock));
341 
342 	tb = tprof_backend;
343 	if (tb == NULL) {
344 		error = ENOENT;
345 		goto done;
346 	}
347 
348 	runmask &= ~tb->tb_softc.sc_ctr_running_mask;
349 	runmask &= tb->tb_softc.sc_ctr_configured_mask;
350 	if (runmask == 0) {
351 		/*
352 		 * Targets are already running.
353 		 * Unconfigured counters are ignored.
354 		 */
355 		error = 0;
356 		goto done;
357 	}
358 
359 	firstrun = (tb->tb_softc.sc_ctr_running_mask == 0);
360 	if (firstrun) {
361 		if (tb->tb_ops->tbo_establish != NULL) {
362 			error = tb->tb_ops->tbo_establish(&tb->tb_softc);
363 			if (error != 0)
364 				goto done;
365 		}
366 
367 		tprof_samples_per_buf = TPROF_MAX_SAMPLES_PER_BUF;
368 		tprof_max_buf = ncpu * 3;
369 		error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker,
370 		    NULL, PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
371 		if (error != 0) {
372 			if (tb->tb_ops->tbo_disestablish != NULL)
373 				tb->tb_ops->tbo_disestablish(&tb->tb_softc);
374 			goto done;
375 		}
376 
377 		for (CPU_INFO_FOREACH(cii, ci)) {
378 			tprof_cpu_t * const c = tprof_cpu(ci);
379 			tprof_buf_t *new;
380 			tprof_buf_t *old;
381 
382 			new = tprof_buf_alloc();
383 			old = tprof_buf_switch(c, new);
384 			if (old != NULL) {
385 				tprof_buf_free(old);
386 			}
387 			callout_init(&c->c_callout, CALLOUT_MPSAFE);
388 			callout_setfunc(&c->c_callout, tprof_kick, ci);
389 		}
390 	}
391 
392 	runmask &= tb->tb_softc.sc_ctr_configured_mask;
393 	xc = xc_broadcast(0, tprof_start_cpu, tb, (void *)(uintptr_t)runmask);
394 	xc_wait(xc);
395 	mutex_enter(&tprof_lock);
396 	tb->tb_softc.sc_ctr_running_mask |= runmask;
397 	mutex_exit(&tprof_lock);
398 
399 	if (firstrun) {
400 		for (CPU_INFO_FOREACH(cii, ci)) {
401 			tprof_cpu_t * const c = tprof_cpu(ci);
402 
403 			mutex_enter(&tprof_lock);
404 			tprof_nworker++;
405 			mutex_exit(&tprof_lock);
406 			workqueue_enqueue(tprof_wq, &c->c_work, ci);
407 		}
408 	}
409 	error = 0;
410 
411 done:
412 	return error;
413 }
414 
415 static void
tprof_stop(tprof_countermask_t stopmask)416 tprof_stop(tprof_countermask_t stopmask)
417 {
418 	tprof_backend_t *tb;
419 	uint64_t xc;
420 
421 	tb = tprof_backend;
422 	if (tb == NULL)
423 		return;
424 
425 	KASSERT(mutex_owned(&tprof_startstop_lock));
426 	stopmask &= tb->tb_softc.sc_ctr_running_mask;
427 	if (stopmask == 0) {
428 		/* Targets are not running */
429 		goto done;
430 	}
431 
432 	xc = xc_broadcast(0, tprof_stop_cpu, tb, (void *)(uintptr_t)stopmask);
433 	xc_wait(xc);
434 	mutex_enter(&tprof_lock);
435 	tb->tb_softc.sc_ctr_running_mask &= ~stopmask;
436 	mutex_exit(&tprof_lock);
437 
438 	/* All counters have stopped? */
439 	if (tb->tb_softc.sc_ctr_running_mask == 0) {
440 		mutex_enter(&tprof_lock);
441 		cv_broadcast(&tprof_reader_cv);
442 		while (tprof_nworker > 0)
443 			cv_wait(&tprof_cv, &tprof_lock);
444 
445 		mutex_exit(&tprof_lock);
446 
447 		tprof_stop1();
448 		if (tb->tb_ops->tbo_disestablish != NULL)
449 			tb->tb_ops->tbo_disestablish(&tb->tb_softc);
450 	}
451 done:
452 	;
453 }
454 
455 static void
tprof_init_percpu_counters_offset(void * vp,void * vp2,struct cpu_info * ci)456 tprof_init_percpu_counters_offset(void *vp, void *vp2, struct cpu_info *ci)
457 {
458 	uint64_t *counters_offset = vp;
459 	u_int counter = (uintptr_t)vp2;
460 
461 	tprof_backend_t *tb = tprof_backend;
462 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
463 	counters_offset[counter] = param->p_value;
464 }
465 
466 static void
tprof_configure_event_cpu(void * arg1,void * arg2)467 tprof_configure_event_cpu(void *arg1, void *arg2)
468 {
469 	tprof_backend_t *tb = arg1;
470 	u_int counter = (uintptr_t)arg2;
471 	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
472 
473 	tb->tb_ops->tbo_configure_event(counter, param);
474 }
475 
476 static int
tprof_configure_event(const tprof_param_t * param)477 tprof_configure_event(const tprof_param_t *param)
478 {
479 	tprof_backend_t *tb;
480 	tprof_backend_softc_t *sc;
481 	tprof_param_t *sc_param;
482 	uint64_t xc;
483 	int c, error;
484 
485 	if ((param->p_flags & (TPROF_PARAM_USER | TPROF_PARAM_KERN)) == 0) {
486 		error = EINVAL;
487 		goto done;
488 	}
489 
490 	tb = tprof_backend;
491 	if (tb == NULL) {
492 		error = ENOENT;
493 		goto done;
494 	}
495 	sc = &tb->tb_softc;
496 
497 	c = param->p_counter;
498 	if (c >= tb->tb_softc.sc_ncounters) {
499 		error = EINVAL;
500 		goto done;
501 	}
502 
503 	if (tb->tb_ops->tbo_valid_event != NULL) {
504 		error = tb->tb_ops->tbo_valid_event(param->p_counter, param);
505 		if (error != 0)
506 			goto done;
507 	}
508 
509 	/* if already running, stop the counter */
510 	if (ISSET(c, tb->tb_softc.sc_ctr_running_mask))
511 		tprof_stop(__BIT(c));
512 
513 	sc->sc_count[c].ctr_bitwidth =
514 	    tb->tb_ops->tbo_counter_bitwidth(param->p_counter);
515 
516 	sc_param = &sc->sc_count[c].ctr_param;
517 	memcpy(sc_param, param, sizeof(*sc_param)); /* save copy of param */
518 
519 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
520 		uint64_t freq, inum, dnum;
521 
522 		freq = tb->tb_ops->tbo_counter_estimate_freq(c);
523 		sc->sc_count[c].ctr_counter_val = freq / TPROF_HZ;
524 		if (sc->sc_count[c].ctr_counter_val == 0) {
525 			printf("%s: counter#%d frequency (%"PRIu64") is"
526 			    " very low relative to TPROF_HZ (%u)\n", __func__,
527 			    c, freq, TPROF_HZ);
528 			sc->sc_count[c].ctr_counter_val =
529 			    4000000000ULL / TPROF_HZ;
530 		}
531 
532 		switch (param->p_flags & TPROF_PARAM_VALUE2_MASK) {
533 		case TPROF_PARAM_VALUE2_SCALE:
534 			if (sc_param->p_value2 == 0)
535 				break;
536 			/*
537 			 * p_value2 is 64-bit fixed-point
538 			 * upper 32 bits are the integer part
539 			 * lower 32 bits are the decimal part
540 			 */
541 			inum = sc_param->p_value2 >> 32;
542 			dnum = sc_param->p_value2 & __BITS(31, 0);
543 			sc->sc_count[c].ctr_counter_val =
544 			    sc->sc_count[c].ctr_counter_val * inum +
545 			    (sc->sc_count[c].ctr_counter_val * dnum >> 32);
546 			if (sc->sc_count[c].ctr_counter_val == 0)
547 				sc->sc_count[c].ctr_counter_val = 1;
548 			break;
549 		case TPROF_PARAM_VALUE2_TRIGGERCOUNT:
550 			if (sc_param->p_value2 == 0)
551 				sc_param->p_value2 = 1;
552 			if (sc_param->p_value2 >
553 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0)) {
554 				sc_param->p_value2 =
555 				    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
556 			}
557 			sc->sc_count[c].ctr_counter_val = sc_param->p_value2;
558 			break;
559 		default:
560 			break;
561 		}
562 		sc->sc_count[c].ctr_counter_reset_val =
563 		    -sc->sc_count[c].ctr_counter_val;
564 		sc->sc_count[c].ctr_counter_reset_val &=
565 		    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
566 	} else {
567 		sc->sc_count[c].ctr_counter_val = 0;
568 		sc->sc_count[c].ctr_counter_reset_val = 0;
569 	}
570 
571 	/* At this point, p_value is used as an initial value */
572 	percpu_foreach(tb->tb_softc.sc_ctr_offset_percpu,
573 	    tprof_init_percpu_counters_offset, (void *)(uintptr_t)c);
574 	/* On the backend side, p_value is used as the reset value */
575 	sc_param->p_value = tb->tb_softc.sc_count[c].ctr_counter_reset_val;
576 
577 	xc = xc_broadcast(0, tprof_configure_event_cpu,
578 	    tb, (void *)(uintptr_t)c);
579 	xc_wait(xc);
580 
581 	mutex_enter(&tprof_lock);
582 	/* update counters bitmasks */
583 	SET(tb->tb_softc.sc_ctr_configured_mask, __BIT(c));
584 	CLR(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
585 	CLR(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
586 	/* profiled counter requires overflow handling */
587 	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
588 		SET(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
589 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
590 	}
591 	/* counters with less than 64bits also require overflow handling */
592 	if (sc->sc_count[c].ctr_bitwidth != 64)
593 		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
594 	mutex_exit(&tprof_lock);
595 
596 	error = 0;
597 
598  done:
599 	return error;
600 }
601 
602 static void
tprof_getcounts_cpu(void * arg1,void * arg2)603 tprof_getcounts_cpu(void *arg1, void *arg2)
604 {
605 	tprof_backend_t *tb = arg1;
606 	tprof_backend_softc_t *sc = &tb->tb_softc;
607 	uint64_t *counters = arg2;
608 	uint64_t *counters_offset;
609 	unsigned int c;
610 
611 	tprof_countermask_t configmask = sc->sc_ctr_configured_mask;
612 	counters_offset = percpu_getref(sc->sc_ctr_offset_percpu);
613 	for (c = 0; c < sc->sc_ncounters; c++) {
614 		if (ISSET(configmask, __BIT(c))) {
615 			uint64_t ctr = tb->tb_ops->tbo_counter_read(c);
616 			counters[c] = counters_offset[c] +
617 			    ((ctr - sc->sc_count[c].ctr_counter_reset_val) &
618 			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0));
619 		} else
620 			counters[c] = 0;
621 	}
622 	percpu_putref(sc->sc_ctr_offset_percpu);
623 }
624 
625 static int
tprof_getcounts(tprof_counts_t * counts)626 tprof_getcounts(tprof_counts_t *counts)
627 {
628 	struct cpu_info *ci;
629 	tprof_backend_t *tb;
630 	uint64_t xc;
631 
632 	tb = tprof_backend;
633 	if (tb == NULL)
634 		return ENOENT;
635 
636 	if (counts->c_cpu >= ncpu)
637 		return ESRCH;
638 	ci = cpu_lookup(counts->c_cpu);
639 	if (ci == NULL)
640 		return ESRCH;
641 
642 	xc = xc_unicast(0, tprof_getcounts_cpu, tb, counts->c_count, ci);
643 	xc_wait(xc);
644 
645 	counts->c_ncounters = tb->tb_softc.sc_ncounters;
646 	counts->c_runningmask = tb->tb_softc.sc_ctr_running_mask;
647 	return 0;
648 }
649 
650 /*
651  * tprof_clear: drain unread samples.
652  */
653 
654 static void
tprof_clear(void)655 tprof_clear(void)
656 {
657 	tprof_buf_t *buf;
658 
659 	mutex_enter(&tprof_reader_lock);
660 	mutex_enter(&tprof_lock);
661 	while ((buf = STAILQ_FIRST(&tprof_list)) != NULL) {
662 		if (buf != NULL) {
663 			STAILQ_REMOVE_HEAD(&tprof_list, b_list);
664 			KASSERT(tprof_nbuf_on_list > 0);
665 			tprof_nbuf_on_list--;
666 			mutex_exit(&tprof_lock);
667 			tprof_buf_free(buf);
668 			mutex_enter(&tprof_lock);
669 		}
670 	}
671 	KASSERT(tprof_nbuf_on_list == 0);
672 	mutex_exit(&tprof_lock);
673 	tprof_reader_offset = 0;
674 	mutex_exit(&tprof_reader_lock);
675 
676 	memset(&tprof_stat, 0, sizeof(tprof_stat));
677 }
678 
679 static tprof_backend_t *
tprof_backend_lookup(const char * name)680 tprof_backend_lookup(const char *name)
681 {
682 	tprof_backend_t *tb;
683 
684 	KASSERT(mutex_owned(&tprof_startstop_lock));
685 
686 	LIST_FOREACH(tb, &tprof_backends, tb_list) {
687 		if (!strcmp(tb->tb_name, name)) {
688 			return tb;
689 		}
690 	}
691 	return NULL;
692 }
693 
694 /* -------------------- backend interfaces */
695 
696 /*
697  * tprof_sample: record a sample on the per-cpu buffer.
698  *
699  * be careful; can be called in NMI context.
700  * we are bluntly assuming the followings are safe.
701  *	curcpu()
702  *	curlwp->l_lid
703  *	curlwp->l_proc->p_pid
704  */
705 
706 void
tprof_sample(void * unused,const tprof_frame_info_t * tfi)707 tprof_sample(void *unused, const tprof_frame_info_t *tfi)
708 {
709 	tprof_cpu_t * const c = tprof_cpu_direct(curcpu());
710 	tprof_buf_t * const buf = c->c_buf;
711 	tprof_sample_t *sp;
712 	const uintptr_t pc = tfi->tfi_pc;
713 	const lwp_t * const l = curlwp;
714 	u_int idx;
715 
716 	idx = buf->b_used;
717 	if (__predict_false(idx >= buf->b_size)) {
718 		buf->b_overflow++;
719 		return;
720 	}
721 	sp = &buf->b_data[idx];
722 	sp->s_pid = l->l_proc->p_pid;
723 	sp->s_lwpid = l->l_lid;
724 	sp->s_cpuid = c->c_cpuid;
725 	sp->s_flags = ((tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0) |
726 	    __SHIFTIN(tfi->tfi_counter, TPROF_SAMPLE_COUNTER_MASK);
727 	sp->s_pc = pc;
728 	buf->b_used = idx + 1;
729 }
730 
731 /*
732  * tprof_backend_register:
733  */
734 
735 int
tprof_backend_register(const char * name,const tprof_backend_ops_t * ops,int vers)736 tprof_backend_register(const char *name, const tprof_backend_ops_t *ops,
737     int vers)
738 {
739 	tprof_backend_t *tb;
740 
741 	if (vers != TPROF_BACKEND_VERSION)
742 		return EINVAL;
743 
744 	mutex_enter(&tprof_startstop_lock);
745 	tb = tprof_backend_lookup(name);
746 	if (tb != NULL) {
747 		mutex_exit(&tprof_startstop_lock);
748 		return EEXIST;
749 	}
750 #if 1 /* XXX for now */
751 	if (!LIST_EMPTY(&tprof_backends)) {
752 		mutex_exit(&tprof_startstop_lock);
753 		return ENOTSUP;
754 	}
755 #endif
756 	tb = kmem_zalloc(sizeof(*tb), KM_SLEEP);
757 	tb->tb_name = name;
758 	tb->tb_ops = ops;
759 	LIST_INSERT_HEAD(&tprof_backends, tb, tb_list);
760 #if 1 /* XXX for now */
761 	if (tprof_backend == NULL) {
762 		tprof_backend = tb;
763 	}
764 #endif
765 	mutex_exit(&tprof_startstop_lock);
766 
767 	/* Init backend softc */
768 	tb->tb_softc.sc_ncounters = tb->tb_ops->tbo_ncounters();
769 	tb->tb_softc.sc_ctr_offset_percpu_size =
770 	    sizeof(uint64_t) * tb->tb_softc.sc_ncounters;
771 	tb->tb_softc.sc_ctr_offset_percpu =
772 	    percpu_alloc(tb->tb_softc.sc_ctr_offset_percpu_size);
773 
774 	return 0;
775 }
776 
777 /*
778  * tprof_backend_unregister:
779  */
780 
781 int
tprof_backend_unregister(const char * name)782 tprof_backend_unregister(const char *name)
783 {
784 	tprof_backend_t *tb;
785 
786 	mutex_enter(&tprof_startstop_lock);
787 	tb = tprof_backend_lookup(name);
788 #if defined(DIAGNOSTIC)
789 	if (tb == NULL) {
790 		mutex_exit(&tprof_startstop_lock);
791 		panic("%s: not found '%s'", __func__, name);
792 	}
793 #endif /* defined(DIAGNOSTIC) */
794 	if (tb->tb_softc.sc_ctr_running_mask != 0) {
795 		mutex_exit(&tprof_startstop_lock);
796 		return EBUSY;
797 	}
798 #if 1 /* XXX for now */
799 	if (tprof_backend == tb)
800 		tprof_backend = NULL;
801 #endif
802 	LIST_REMOVE(tb, tb_list);
803 	mutex_exit(&tprof_startstop_lock);
804 
805 	/* fini backend softc */
806 	percpu_free(tb->tb_softc.sc_ctr_offset_percpu,
807 	    tb->tb_softc.sc_ctr_offset_percpu_size);
808 
809 	/* Free backend */
810 	kmem_free(tb, sizeof(*tb));
811 
812 	return 0;
813 }
814 
815 /* -------------------- cdevsw interfaces */
816 
817 static int
tprof_open(dev_t dev,int flags,int type,struct lwp * l)818 tprof_open(dev_t dev, int flags, int type, struct lwp *l)
819 {
820 
821 	if (minor(dev) != 0)
822 		return EXDEV;
823 
824 	mutex_enter(&tprof_lock);
825 	if (tprof_owner != NULL) {
826 		mutex_exit(&tprof_lock);
827 		return  EBUSY;
828 	}
829 	tprof_owner = curlwp;
830 	mutex_exit(&tprof_lock);
831 
832 	return 0;
833 }
834 
835 static int
tprof_close(dev_t dev,int flags,int type,struct lwp * l)836 tprof_close(dev_t dev, int flags, int type, struct lwp *l)
837 {
838 
839 	KASSERT(minor(dev) == 0);
840 
841 	mutex_enter(&tprof_startstop_lock);
842 	mutex_enter(&tprof_lock);
843 	tprof_owner = NULL;
844 	mutex_exit(&tprof_lock);
845 	tprof_stop(TPROF_COUNTERMASK_ALL);
846 	tprof_clear();
847 
848 	tprof_backend_t *tb = tprof_backend;
849 	if (tb != NULL) {
850 		KASSERT(tb->tb_softc.sc_ctr_running_mask == 0);
851 		tb->tb_softc.sc_ctr_configured_mask = 0;
852 		tb->tb_softc.sc_ctr_prof_mask = 0;
853 		tb->tb_softc.sc_ctr_ovf_mask = 0;
854 	}
855 
856 	mutex_exit(&tprof_startstop_lock);
857 
858 	return 0;
859 }
860 
861 static int
tprof_poll(dev_t dev,int events,struct lwp * l)862 tprof_poll(dev_t dev, int events, struct lwp *l)
863 {
864 	int revents;
865 
866 	revents = events & (POLLIN | POLLRDNORM);
867 	if (revents == 0)
868 		return 0;
869 
870 	mutex_enter(&tprof_lock);
871 	if (STAILQ_EMPTY(&tprof_list)) {
872 		revents = 0;
873 		selrecord(l, &tprof_selp);
874 	}
875 	mutex_exit(&tprof_lock);
876 
877 	return revents;
878 }
879 
880 static void
filt_tprof_read_detach(struct knote * kn)881 filt_tprof_read_detach(struct knote *kn)
882 {
883 	mutex_enter(&tprof_lock);
884 	selremove_knote(&tprof_selp, kn);
885 	mutex_exit(&tprof_lock);
886 }
887 
888 static int
filt_tprof_read_event(struct knote * kn,long hint)889 filt_tprof_read_event(struct knote *kn, long hint)
890 {
891 	int rv = 0;
892 
893 	if ((hint & NOTE_SUBMIT) == 0)
894 		mutex_enter(&tprof_lock);
895 
896 	if (!STAILQ_EMPTY(&tprof_list)) {
897 		tprof_buf_t *buf;
898 		int64_t n = 0;
899 
900 		STAILQ_FOREACH(buf, &tprof_list, b_list) {
901 			n += buf->b_used;
902 		}
903 		kn->kn_data = n * sizeof(tprof_sample_t);
904 
905 		rv = 1;
906 	}
907 
908 	if ((hint & NOTE_SUBMIT) == 0)
909 		mutex_exit(&tprof_lock);
910 
911 	return rv;
912 }
913 
914 static const struct filterops tprof_read_filtops = {
915 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
916 	.f_attach = NULL,
917 	.f_detach = filt_tprof_read_detach,
918 	.f_event = filt_tprof_read_event,
919 };
920 
921 static int
tprof_kqfilter(dev_t dev,struct knote * kn)922 tprof_kqfilter(dev_t dev, struct knote *kn)
923 {
924 	switch (kn->kn_filter) {
925 	case EVFILT_READ:
926 		kn->kn_fop = &tprof_read_filtops;
927 		mutex_enter(&tprof_lock);
928 		selrecord_knote(&tprof_selp, kn);
929 		mutex_exit(&tprof_lock);
930 		break;
931 	default:
932 		return EINVAL;
933 	}
934 
935 	return 0;
936 }
937 
938 static int
tprof_read(dev_t dev,struct uio * uio,int flags)939 tprof_read(dev_t dev, struct uio *uio, int flags)
940 {
941 	tprof_buf_t *buf;
942 	size_t bytes;
943 	size_t resid;
944 	size_t done = 0;
945 	int error = 0;
946 
947 	KASSERT(minor(dev) == 0);
948 	mutex_enter(&tprof_reader_lock);
949 	while (uio->uio_resid > 0 && error == 0) {
950 		/*
951 		 * Take the first buffer from the list.
952 		 */
953 		mutex_enter(&tprof_lock);
954 		buf = STAILQ_FIRST(&tprof_list);
955 		if (buf == NULL) {
956 			if (tprof_nworker == 0 || done != 0) {
957 				mutex_exit(&tprof_lock);
958 				error = 0;
959 				break;
960 			}
961 			mutex_exit(&tprof_reader_lock);
962 			error = cv_wait_sig(&tprof_reader_cv, &tprof_lock);
963 			mutex_exit(&tprof_lock);
964 			mutex_enter(&tprof_reader_lock);
965 			continue;
966 		}
967 		STAILQ_REMOVE_HEAD(&tprof_list, b_list);
968 		KASSERT(tprof_nbuf_on_list > 0);
969 		tprof_nbuf_on_list--;
970 		mutex_exit(&tprof_lock);
971 
972 		/*
973 		 * Copy it out.
974 		 */
975 		bytes = MIN(buf->b_used * sizeof(tprof_sample_t) -
976 		    tprof_reader_offset, uio->uio_resid);
977 		resid = uio->uio_resid;
978 		error = uiomove((char *)buf->b_data + tprof_reader_offset,
979 		    bytes, uio);
980 		done = resid - uio->uio_resid;
981 		tprof_reader_offset += done;
982 
983 		/*
984 		 * If we didn't consume the whole buffer,
985 		 * put it back to the list.
986 		 */
987 		if (tprof_reader_offset <
988 		    buf->b_used * sizeof(tprof_sample_t)) {
989 			mutex_enter(&tprof_lock);
990 			STAILQ_INSERT_HEAD(&tprof_list, buf, b_list);
991 			tprof_nbuf_on_list++;
992 			cv_broadcast(&tprof_reader_cv);
993 			mutex_exit(&tprof_lock);
994 		} else {
995 			tprof_buf_free(buf);
996 			tprof_reader_offset = 0;
997 		}
998 	}
999 	mutex_exit(&tprof_reader_lock);
1000 
1001 	return error;
1002 }
1003 
1004 static int
tprof_ioctl(dev_t dev,u_long cmd,void * data,int flags,struct lwp * l)1005 tprof_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
1006 {
1007 	const tprof_param_t *param;
1008 	tprof_counts_t *counts;
1009 	int error = 0;
1010 
1011 	KASSERT(minor(dev) == 0);
1012 
1013 	switch (cmd) {
1014 	case TPROF_IOC_GETINFO:
1015 		mutex_enter(&tprof_startstop_lock);
1016 		tprof_getinfo(data);
1017 		mutex_exit(&tprof_startstop_lock);
1018 		break;
1019 	case TPROF_IOC_GETNCOUNTERS:
1020 		mutex_enter(&tprof_lock);
1021 		error = tprof_getncounters((u_int *)data);
1022 		mutex_exit(&tprof_lock);
1023 		break;
1024 	case TPROF_IOC_START:
1025 		mutex_enter(&tprof_startstop_lock);
1026 		error = tprof_start(*(tprof_countermask_t *)data);
1027 		mutex_exit(&tprof_startstop_lock);
1028 		break;
1029 	case TPROF_IOC_STOP:
1030 		mutex_enter(&tprof_startstop_lock);
1031 		tprof_stop(*(tprof_countermask_t *)data);
1032 		mutex_exit(&tprof_startstop_lock);
1033 		break;
1034 	case TPROF_IOC_GETSTAT:
1035 		mutex_enter(&tprof_lock);
1036 		memcpy(data, &tprof_stat, sizeof(tprof_stat));
1037 		mutex_exit(&tprof_lock);
1038 		break;
1039 	case TPROF_IOC_CONFIGURE_EVENT:
1040 		param = data;
1041 		mutex_enter(&tprof_startstop_lock);
1042 		error = tprof_configure_event(param);
1043 		mutex_exit(&tprof_startstop_lock);
1044 		break;
1045 	case TPROF_IOC_GETCOUNTS:
1046 		counts = data;
1047 		mutex_enter(&tprof_startstop_lock);
1048 		error = tprof_getcounts(counts);
1049 		mutex_exit(&tprof_startstop_lock);
1050 		break;
1051 	default:
1052 		error = EINVAL;
1053 		break;
1054 	}
1055 
1056 	return error;
1057 }
1058 
1059 const struct cdevsw tprof_cdevsw = {
1060 	.d_open = tprof_open,
1061 	.d_close = tprof_close,
1062 	.d_read = tprof_read,
1063 	.d_write = nowrite,
1064 	.d_ioctl = tprof_ioctl,
1065 	.d_stop = nostop,
1066 	.d_tty = notty,
1067 	.d_poll = tprof_poll,
1068 	.d_mmap = nommap,
1069 	.d_kqfilter = tprof_kqfilter,
1070 	.d_discard = nodiscard,
1071 	.d_flag = D_OTHER | D_MPSAFE
1072 };
1073 
/*
 * tprofattach: pseudo-device attach hook.  Nothing to do here; the
 * driver state is set up by tprof_modcmd().
 */
void
tprofattach(int nunits)
{

	/* Nothing */
}

/* Driver-class module named "tprof"; the third argument is NULL. */
MODULE(MODULE_CLASS_DRIVER, tprof, NULL);
1082 
1083 static void
tprof_cpu_init(void * vcp,void * vcookie,struct cpu_info * ci)1084 tprof_cpu_init(void *vcp, void *vcookie, struct cpu_info *ci)
1085 {
1086 	tprof_cpu_t **cp = vcp, *c;
1087 
1088 	c = kmem_zalloc(sizeof(*c), KM_SLEEP);
1089 	c->c_buf = NULL;
1090 	c->c_cpuid = cpu_index(ci);
1091 	*cp = c;
1092 }
1093 
1094 static void
tprof_cpu_fini(void * vcp,void * vcookie,struct cpu_info * ci)1095 tprof_cpu_fini(void *vcp, void *vcookie, struct cpu_info *ci)
1096 {
1097 	tprof_cpu_t **cp = vcp, *c;
1098 
1099 	c = *cp;
1100 	KASSERT(c->c_cpuid == cpu_index(ci));
1101 	KASSERT(c->c_buf == NULL);
1102 	kmem_free(c, sizeof(*c));
1103 	*cp = NULL;
1104 }
1105 
1106 static void
tprof_driver_init(void)1107 tprof_driver_init(void)
1108 {
1109 
1110 	mutex_init(&tprof_lock, MUTEX_DEFAULT, IPL_NONE);
1111 	mutex_init(&tprof_reader_lock, MUTEX_DEFAULT, IPL_NONE);
1112 	mutex_init(&tprof_startstop_lock, MUTEX_DEFAULT, IPL_NONE);
1113 	selinit(&tprof_selp);
1114 	cv_init(&tprof_cv, "tprof");
1115 	cv_init(&tprof_reader_cv, "tprof_rd");
1116 	STAILQ_INIT(&tprof_list);
1117 	tprof_cpus = percpu_create(sizeof(tprof_cpu_t *),
1118 	    tprof_cpu_init, tprof_cpu_fini, NULL);
1119 }
1120 
1121 static void
tprof_driver_fini(void)1122 tprof_driver_fini(void)
1123 {
1124 
1125 	percpu_free(tprof_cpus, sizeof(tprof_cpu_t *));
1126 	mutex_destroy(&tprof_lock);
1127 	mutex_destroy(&tprof_reader_lock);
1128 	mutex_destroy(&tprof_startstop_lock);
1129 	seldestroy(&tprof_selp);
1130 	cv_destroy(&tprof_cv);
1131 	cv_destroy(&tprof_reader_cv);
1132 }
1133 
1134 static int
tprof_modcmd(modcmd_t cmd,void * arg)1135 tprof_modcmd(modcmd_t cmd, void *arg)
1136 {
1137 
1138 	switch (cmd) {
1139 	case MODULE_CMD_INIT:
1140 		tprof_driver_init();
1141 #if defined(_MODULE)
1142 		{
1143 			devmajor_t bmajor = NODEVMAJOR;
1144 			devmajor_t cmajor = NODEVMAJOR;
1145 			int error;
1146 
1147 			error = devsw_attach("tprof", NULL, &bmajor,
1148 			    &tprof_cdevsw, &cmajor);
1149 			if (error) {
1150 				tprof_driver_fini();
1151 				return error;
1152 			}
1153 		}
1154 #endif /* defined(_MODULE) */
1155 		return 0;
1156 
1157 	case MODULE_CMD_FINI:
1158 #if defined(_MODULE)
1159 		devsw_detach(NULL, &tprof_cdevsw);
1160 #endif /* defined(_MODULE) */
1161 		tprof_driver_fini();
1162 		return 0;
1163 
1164 	default:
1165 		return ENOTTY;
1166 	}
1167 }
1168