xref: /dflybsd-src/lib/libevtr/evtr.c (revision 899b08f0481cee99aeea3b0d44cd623a9d6bc2db)
1 /*
2  * Copyright (c) 2009, 2010 Aggelos Economopoulos.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  * 3. Neither the name of The DragonFly Project nor the names of its
15  *    contributors may be used to endorse or promote products derived
16  *    from this software without specific, prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <assert.h>
33 #include <ctype.h>
34 #include <err.h>
35 #include <errno.h>
36 #include <limits.h>
37 #include <stdarg.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <sys/queue.h>
42 #include <sys/stat.h>
43 #include <sys/tree.h>
44 
45 
46 #include "evtr.h"
47 #include "internal.h"
48 
49 unsigned evtr_debug;
50 
51 static
52 void
53 printd_set_flags(const char *str, unsigned int *flags)
54 {
55 	/*
56 	 * This is suboptimal as we don't detect
57 	 * invalid flags.
58 	 */
59 	for (; *str; ++str) {
60 		if ('A' == *str) {
61 			*flags = -1;
62 			return;
63 		}
64 		if (!islower(*str))
65 			errx(2, "invalid debug flag %c", *str);
66 		*flags |= 1 << (*str - 'a');
67 	}
68 }
69 
70 
71 enum {
72 	MAX_EVHDR_SIZE = PATH_MAX + 200,
73 	/* string namespaces */
74 	EVTR_NS_PATH = 0x1,
75 	EVTR_NS_FUNC,
76 	EVTR_NS_DSTR,
77 	EVTR_NS_MAX,
78 	NR_BUCKETS = 1021,	/* prime */
79 	REC_ALIGN = 8,
80 	REC_BOUNDARY = 1 << 14,
81 	FILTF_ID = 0x10,
82 	EVTRF_WR = 0x1,		/* open for writing */
83 	EVTRQF_PENDING = 0x1,
84 };
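/*
 * On-disk framing (as implemented by the dump/load routines below):
 * every record starts with a one-byte type followed by a packed
 * header, each record is padded out to a REC_ALIGN boundary, and a
 * record is never allowed to straddle a REC_BOUNDARY, so a reader can
 * resynchronize at any 16KB boundary.  String namespace ids start at
 * EVTR_NS_PATH (1); in the probe header an id of 0 means "not
 * available".
 */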
85 
86 typedef uint16_t fileid_t;
87 typedef uint16_t funcid_t;
88 typedef uint16_t fmtid_t;
89 
90 struct trace_event_header {
91 	uint8_t type;
92 	uint64_t ts;	/* XXX: this should only be part of probe */
93 } __attribute__((packed));
94 
95 struct probe_event_header {
96 	struct trace_event_header eh;
97 	/*
98 	 * For these fields, 0 implies "not available"
99 	 */
100 	fileid_t file;
101 	funcid_t caller1;
102 	funcid_t caller2;
103 	funcid_t func;
104 	uint16_t line;
105 	fmtid_t fmt;
106 	uint16_t datalen;
107 	uint8_t cpu;	/* -1 if n/a */
108 } __attribute__((packed));
109 
110 struct string_event_header {
111 	struct trace_event_header eh;
112 	uint16_t ns;
113 	uint32_t id;
114 	uint16_t len;
115 } __attribute__((packed));
116 
117 struct fmt_event_header {
118 	struct trace_event_header eh;
119 	uint16_t id;
120 	uint8_t subsys_len;
121 	uint8_t fmt_len;
122 } __attribute__((packed));
123 
124 struct cpuinfo_event_header {
125 	double freq;
126 	uint8_t cpu;
127 } __attribute__((packed));
128 
129 struct hashentry {
130 	uintptr_t key;
131 	uintptr_t val;
132 	struct hashentry *next;
133 };
134 
135 struct hashtab {
136 	struct hashentry *buckets[NR_BUCKETS];
137 	uintptr_t (*hashfunc)(uintptr_t);
138 	uintptr_t (*cmpfunc)(uintptr_t, uintptr_t);
139 };
140 
141 struct symtab {
142 	struct hashtab tab;
143 };
144 
145 struct event_fmt {
146 	const char *subsys;
147 	const char *fmt;
148 };
149 
150 struct event_filter_unresolved {
151 	TAILQ_ENTRY(event_filter_unresolved) link;
152 	evtr_filter_t filt;
153 };
154 
155 struct id_map {
156 	RB_ENTRY(id_map) rb_node;
157 	int id;
158 	const void *data;
159 };
160 
161 RB_HEAD(id_tree, id_map);
162 struct string_map {
163 	struct id_tree root;
164 };
165 
166 struct fmt_map {
167 	struct id_tree root;
168 };
169 
170 RB_HEAD(thread_tree, evtr_thread);
171 
172 struct thread_map {
173 	struct thread_tree root;
174 };
175 
176 struct event_callback {
177 	void (*cb)(evtr_event_t, void *data);
178 	void *data;	/* this field must be malloc()ed */
179 };
180 
181 struct cpu {
182 	struct evtr_thread *td;	/* currently executing thread */
183 	double freq;
184 };
185 
186 struct evtr {
187 	FILE *f;
188 	int flags;
189 	int err;
190 	const char *errmsg;
191 	off_t bytes;
192 	union {
193 		/*
194 		 * When writing, we keep track of the strings we've
195 		 * already dumped so we only dump them once.
196 		 * Paths, function names etc belong to different
197 		 * namespaces.
198 		 */
199 		struct hashtab_str *strings[EVTR_NS_MAX];	/* indexed directly by ns; slot 0 unused */
200 		/*
201 		 * When reading, we build a map from id to string.
202 		 * Every id must be defined at the point of use.
203 		 */
204 		struct string_map maps[EVTR_NS_MAX - 1];
205 	};
206 	union {
207 		/* same as above, but for subsys+fmt pairs */
208 		struct fmt_map fmtmap;
209 		struct hashtab_str *fmts;
210 	};
211 	struct thread_map threads;
212 	struct cpu *cpus;
213 	int ncpus;
214 };
215 
216 struct evtr_query {
217 	evtr_t evtr;
218 	off_t off;
219 	evtr_filter_t filt;
220 	int nfilt;
221 	int nmatched;
222 	int ntried;
223 	void *buf;
224 	int bufsize;
225 	struct symtab *symtab;
226 	int ncbs;
227 	struct event_callback **cbs;
228 	/*
229 	 * Filters that have a format specified and we
230 	 * need to resolve that to an fmtid
231 	 */
232 	TAILQ_HEAD(, event_filter_unresolved) unresolved_filtq;
233 	int err;
234 	const char *errmsg;
235 	int flags;
236 	struct evtr_event pending_event;
237 };
238 
239 void
240 evtr_set_debug(const char *str)
241 {
242 	printd_set_flags(str, &evtr_debug);
243 }
244 
245 static int id_map_cmp(struct id_map *, struct id_map *);
246 RB_PROTOTYPE2(id_tree, id_map, rb_node, id_map_cmp, int);
247 RB_GENERATE2(id_tree, id_map, rb_node, id_map_cmp, int, id);
248 
249 static int thread_cmp(struct evtr_thread *, struct evtr_thread *);
250 RB_PROTOTYPE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *);
251 RB_GENERATE2(thread_tree, evtr_thread, rb_node, thread_cmp, void *, id);
252 
253 static inline
254 void
255 validate_string(const char *str)
256 {
257 	if (!(evtr_debug & MISC))
258 		return;
259 	for (; *str; ++str)
260 		assert(isprint(*str));
261 }
262 
263 static
264 void
265 id_tree_free(struct id_tree *root)
266 {
267 	struct id_map *v, *n;
268 
269 	for (v = RB_MIN(id_tree, root); v; v = n) {
270 		n = RB_NEXT(id_tree, root, v);
271 		RB_REMOVE(id_tree, root, v);
		free(v);
272 	}
273 }
274 
275 static
276 int
277 evtr_register_callback(evtr_query_t q, void (*fn)(evtr_event_t, void *), void *d)
278 {
279 	struct event_callback *cb;
280 	void *cbs;
281 
282 	if (!(cb = malloc(sizeof(*cb)))) {
283 		q->err = ENOMEM;
284 		return !0;
285 	}
286 	cb->cb = fn;
287 	cb->data = d;
288 	if (!(cbs = realloc(q->cbs, (++q->ncbs) * sizeof(cb)))) {
289 		--q->ncbs;
290 		free(cb);
291 		q->err = ENOMEM;
292 		return !0;
293 	}
294 	q->cbs = cbs;
295 	q->cbs[q->ncbs - 1] = cb;
296 	return 0;
297 }
298 
299 static
300 void
301 evtr_deregister_callbacks(evtr_query_t q)
302 {
303 	int i;
304 
305 	for (i = 0; i < q->ncbs; ++i) {
306 		free(q->cbs[i]);
307 	}
308 	free(q->cbs);
309 	q->cbs = NULL;
310 }
311 
312 static
313 void
314 evtr_run_callbacks(evtr_event_t ev, evtr_query_t q)
315 {
316 	struct event_callback *cb;
317 	int i;
318 
319 	for (i = 0; i < q->ncbs; ++i) {
320 		cb = q->cbs[i];
321 		cb->cb(ev, cb->data);
322 	}
323 }
324 
325 static
326 struct cpu *
327 evtr_cpu(evtr_t evtr, int c)
328 {
329 	if ((c < 0) || (c >= evtr->ncpus))
330 		return NULL;
331 	return &evtr->cpus[c];
332 }
333 
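/*
 * Reconstruct the formatted string from ev->fmtdata and scan the
 * values back out with vsscanf().  Only meaningful when @fmt is
 * exactly the event's format string; returns 0 on a format mismatch,
 * otherwise the number of conversions assigned.
 */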
334 static
335 int
336 parse_format_data(evtr_event_t ev, const char *fmt, ...) __attribute__((format (scanf, 2, 3)));
337 static
338 int
339 parse_format_data(evtr_event_t ev, const char *fmt, ...)
340 {
341 	va_list ap;
342 	char buf[2048];
343 
344 	if (strcmp(fmt, ev->fmt))
345 		return 0;
346 	vsnprintf(buf, sizeof(buf), fmt, __DECONST(void *, ev->fmtdata));
347 	printd(MISC, "string is: %s\n", buf);
348 	va_start(ap, fmt);
349 	return vsscanf(buf, fmt, ap);
350 }
351 
352 static
353 void
354 evtr_deregister_filters(evtr_query_t q, evtr_filter_t filt, int nfilt)
355 {
356 	struct event_filter_unresolved *u, *tmp;
357 	int i;
358 	TAILQ_FOREACH_MUTABLE(u, &q->unresolved_filtq, link, tmp) {
359 		for (i = 0; i < nfilt; ++i) {
360 			if (u->filt == &filt[i]) {
361 				TAILQ_REMOVE(&q->unresolved_filtq, u, link);
362 			}
363 		}
364 	}
365 }
366 
367 static
368 int
369 evtr_filter_register(evtr_query_t q, evtr_filter_t filt)
370 {
371 	struct event_filter_unresolved *res;
372 
373 	if (!(res = malloc(sizeof(*res)))) {
374 		q->err = ENOMEM;
375 		return !0;
376 	}
377 	res->filt = filt;
378 	TAILQ_INSERT_TAIL(&q->unresolved_filtq, res, link);
379 	return 0;
380 }
381 
382 static
383 int
384 evtr_query_needs_parsing(evtr_query_t q)
385 {
386 	int i;
387 
388 	for (i = 0; i < q->nfilt; ++i)
389 		if (q->filt[i].ev_type == EVTR_TYPE_STMT)
390 			return !0;
391 	return 0;
392 }
393 
394 void
395 evtr_event_data(evtr_event_t ev, char *buf, size_t len)
396 {
397 	/*
398 	 * XXX: we implicitly trust the format string.
399 	 * We shouldn't.
400 	 */
401 	if (ev->fmtdatalen) {
402 		vsnprintf(buf, len, ev->fmt, __DECONST(void *, ev->fmtdata));
403 	} else {
404 		strlcpy(buf, ev->fmt, len);
405 	}
406 }
407 
408 int
409 evtr_error(evtr_t evtr)
410 {
411 	return evtr->err || (evtr->errmsg != NULL);
412 }
413 
414 const char *
415 evtr_errmsg(evtr_t evtr)
416 {
417 	return evtr->errmsg ? evtr->errmsg : strerror(evtr->err);
418 }
419 
420 int
421 evtr_query_error(evtr_query_t q)
422 {
423 	return q->err || (q->errmsg != NULL) || evtr_error(q->evtr);
424 }
425 
426 const char *
427 evtr_query_errmsg(evtr_query_t q)
428 {
429 	return q->errmsg ? q->errmsg :
430 		(q->err ? strerror(q->err) :
431 		 (evtr_errmsg(q->evtr)));
432 }
433 
434 static
435 int
436 id_map_cmp(struct id_map *a, struct id_map *b)
437 {
438 	return a->id - b->id;
439 }
440 
441 static
442 int
443 thread_cmp(struct evtr_thread *a, struct evtr_thread *b)
444 {
445 	ptrdiff_t d;
446 	d =  a->id - b->id;
447 	if (d < 0)
448 		return -1;
449 	if (!d)
450 		return 0;
451 	return 1;
452 }
453 
454 #define DEFINE_MAP_FIND(prefix, type)		\
455 	static					\
456 	type				\
457 	prefix ## _map_find(struct id_tree *tree, int id)\
458 	{						 \
459 		struct id_map *sid;			 \
460 							\
461 		sid = id_tree_RB_LOOKUP(tree, id);	\
462 		return sid ? sid->data : NULL;		\
463 	}
464 
465 DEFINE_MAP_FIND(string, const char *)
466 DEFINE_MAP_FIND(fmt, const struct event_fmt *)
467 
468 static
469 struct evtr_thread *
470 thread_map_find(struct thread_map *map, void *id)
471 {
472 	return thread_tree_RB_LOOKUP(&map->root, id);
473 }
474 
475 #define DEFINE_MAP_INSERT(prefix, type, _cmp, _dup)	\
476 	static					\
477 	int								\
478 	prefix ## _map_insert(struct id_tree *tree, type data, int id) \
479 	{								\
480 	struct id_map *sid, *osid;					\
481 									\
482 	sid = malloc(sizeof(*sid));					\
483 	if (!sid) {							\
484 		return ENOMEM;						\
485 	}								\
486 	sid->id = id;							\
487 	sid->data = data;						\
488 	if ((osid = id_tree_RB_INSERT(tree, sid))) {			\
489 		free(sid);						\
490 		if (_cmp((type)osid->data, data)) {			\
491 			return EEXIST;					\
492 		}							\
493 		printd(DS, "mapping already exists, skipping\n");		\
494 		/* we're OK with redefinitions of an id to the same string */ \
495 		return 0;						\
496 	}								\
497 	/* only do the strdup if we're inserting a new string */	\
498 	sid->data = _dup(data);		/* XXX: oom */			\
499 	return 0;							\
500 }
501 
502 static
503 void
504 thread_map_insert(struct thread_map *map, struct evtr_thread *td)
505 {
506 	struct evtr_thread *otd;
507 
508 	if ((otd = thread_tree_RB_INSERT(&map->root, td))) {
509 		/*
510 		 * Thread addresses might be reused, we're
511 		 * ok with that.
512 		 * DANGER, Will Robinson: this means the user
513 		 * of the API needs to copy event->td if they
514 		 * want it to remain stable.
515 		 */
516 		free((void *)otd->comm);
517 		otd->comm = td->comm;
518 		free(td);
519 	}
520 }
521 
522 static
523 int
524 event_fmt_cmp(const struct event_fmt *a, const struct event_fmt *b)
525 {
526 	int ret = 0;
527 
528 	if (a->subsys) {
529 		if (b->subsys) {
530 			ret = strcmp(a->subsys, b->subsys);
531 		} else {
532 			ret = strcmp(a->subsys, "");
533 		}
534 	} else if (b->subsys) {
535 			ret = strcmp("", b->subsys);
536 	}
537 	if (ret)
538 		return ret;
539 	return strcmp(a->fmt, b->fmt);
540 }
541 
542 static
543 struct event_fmt *
544 event_fmt_dup(const struct event_fmt *o)
545 {
546 	struct event_fmt *n;
547 
548 	if (!(n = malloc(sizeof(*n)))) {
549 		return n;
550 	}
551 	memcpy(n, o, sizeof(*n));
552 	return n;
553 }
554 
555 DEFINE_MAP_INSERT(string, const char *, strcmp, strdup)
556 DEFINE_MAP_INSERT(fmt, const struct event_fmt *, event_fmt_cmp, event_fmt_dup)
557 
558 int
559 hash_find(const struct hashtab *tab, uintptr_t key, uintptr_t *val)
560 {
561 	struct hashentry *ent;
562 
563 	for(ent = tab->buckets[tab->hashfunc(key)];
564 	    ent && tab->cmpfunc(ent->key, key);
565 	    ent = ent->next);
566 
567 	if (!ent)
568 		return !0;
569 	*val = ent->val;
570 	return 0;
571 }
572 
573 struct hashentry *
574 hash_insert(struct hashtab *tab, uintptr_t key, uintptr_t val)
575 {
576 	struct hashentry *ent;
577 	int hsh;
578 
579 	if (!(ent = malloc(sizeof(*ent)))) {
580 		fprintf(stderr, "out of memory\n");
581 		return NULL;
582 	}
583 	hsh = tab->hashfunc(key);
584 	ent->next = tab->buckets[hsh];
585 	ent->key = key;
586 	ent->val = val;
587 	tab->buckets[hsh] = ent;
588 	return ent;
589 }
590 
591 static
592 uintptr_t
593 cmpfunc_pointer(uintptr_t a, uintptr_t b)
594 {
595 	return b - a;
596 }
597 
598 static
599 uintptr_t
600 hashfunc_pointer(uintptr_t p)
601 {
602 	return p;
603 }
604 
605 struct hashtab *
606 hash_new(void)
607 {
608 	struct hashtab *tab;
609 	if (!(tab = calloc(sizeof(struct hashtab), 1)))
610 		return tab;
611 	tab->hashfunc = &hashfunc_pointer;
612 	tab->cmpfunc = &cmpfunc_pointer;
613 	return tab;
614 }
615 
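/*
 * Write-side string interning: each distinct string is assigned a
 * small non-zero 16-bit id.  The id is what gets embedded in probe
 * records; the string itself is dumped once per namespace.
 */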
616 struct hashtab_str {	/* string -> id map */
617 	struct hashtab tab;
618 	uint16_t id;
619 };
620 
621 static
622 uintptr_t
623 hashfunc_string(uintptr_t p)
624 {
625 	const char *str = (char *)p;
626 	unsigned long hash = 5381;
627 	int c;
628 
629 	while ((c = *str++))
630 		hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
631 	return hash % NR_BUCKETS;
632 }
633 
634 static
635 uintptr_t
636 cmpfunc_string(uintptr_t a, uintptr_t b)
637 {
638 	return strcmp((char *)a, (char *)b);
639 }
640 
641 
642 static
643 struct hashtab_str *
644 strhash_new(void)
645 {
646 	struct hashtab_str *strtab;
647 	if (!(strtab = calloc(sizeof(struct hashtab_str), 1)))
648 		return strtab;
649 	strtab->tab.hashfunc = &hashfunc_string;
650 	strtab->tab.cmpfunc = &cmpfunc_string;
651 	return strtab;
652 }
653 
654 static
655 void
656 strhash_destroy(struct hashtab_str *strtab)
657 {
658 	free(strtab);
659 }
660 
661 static
662 int
663 strhash_find(struct hashtab_str *strtab, const char *str, uint16_t *id)
664 {
665 	uintptr_t val;
666 
667 	if (hash_find(&strtab->tab, (uintptr_t)str, &val))
668 		return !0;
669 	*id = (uint16_t)val;
670 	return 0;
671 }
672 
673 static
674 int
675 strhash_insert(struct hashtab_str *strtab, const char *str, uint16_t *id)
676 {
677 	uintptr_t val;
678 
679 	val = ++strtab->id;
680 	if (strtab->id == 0) {
681 		fprintf(stderr, "too many strings\n");
682 		return ERANGE;
683 	}
684 	str = strdup(str);
685 	if (!str) {
686 		fprintf(stderr, "out of memory\n");
687 		--strtab->id;
688 		return ENOMEM;
689 	}
690 	hash_insert(&strtab->tab, (uintptr_t)str, (uintptr_t)val);
691 	*id = strtab->id;
692 	return 0;
693 }
694 
695 static
696 struct symtab *
697 symtab_new(void)
698 {
699 	struct symtab *symtab;
700 	if (!(symtab = calloc(sizeof(struct symtab), 1)))
701 		return symtab;
702 	symtab->tab.hashfunc = &hashfunc_string;
703 	symtab->tab.cmpfunc = &cmpfunc_string;
704 	return symtab;
705 }
706 
707 static
708 void
709 symtab_destroy(struct symtab *symtab)
710 {
711 	free(symtab);
712 }
713 
714 struct evtr_variable *
715 symtab_find(const struct symtab *symtab, const char *str)
716 {
717 	uintptr_t val;
718 
719 	if (hash_find(&symtab->tab, (uintptr_t)str, &val))
720 		return NULL;
721 	return (struct evtr_variable *)val;
722 }
723 
724 int
725 symtab_insert(struct symtab *symtab, const char *name,
726 	       struct evtr_variable *var)
727 {
728 	name = strdup(name);
729 	if (!name) {
730 		fprintf(stderr, "out of memory\n");
731 		return ENOMEM;
732 	}
733 	hash_insert(&symtab->tab, (uintptr_t)name, (uintptr_t)var);
734 	return 0;
735 }
736 
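/*
 * A filter matches an event if the cpu matches (or the filter cpu is
 * -1) and the event type matches; for probes the format string must
 * also compare equal (a NULL filter fmt matches any), and for
 * statement events the filter's variable is resolved through the
 * query's symbol table and compared against the statement's variable.
 */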
737 static
738 int
739 evtr_filter_match(evtr_query_t q, evtr_filter_t f, evtr_event_t ev)
740 {
741 	if ((f->cpu != -1) && (f->cpu != ev->cpu))
742 		return 0;
743 
744 	assert(!(f->flags & FILTF_ID));
745 	if (ev->type != f->ev_type)
746 		return 0;
747 	if (ev->type == EVTR_TYPE_PROBE) {
748 		if (f->fmt && strcmp(ev->fmt, f->fmt))
749 			return 0;
750 	} else if (ev->type == EVTR_TYPE_STMT) {
751 		struct evtr_variable *var;
752 		/* resolve var */
753 		/* XXX: no need to do that *every* time */
754 		parse_var(f->var, q->symtab, &var);
755 		if (var != ev->stmt.var)
756 			return 0;
757 	}
758 	return !0;
759 }
760 
761 static
762 int
763 evtr_match_filters(struct evtr_query *q, evtr_event_t ev)
764 {
765 	int i;
766 
767 	/* no filters means we're interested in all events */
768 	if (!q->nfilt)
769 		return !0;
770 	++q->ntried;
771 	for (i = 0; i < q->nfilt; ++i) {
772 		if (evtr_filter_match(q, &q->filt[i], ev)) {
773 			++q->nmatched;
774 			return !0;
775 		}
776 	}
777 	return 0;
778 }
779 
780 static
781 void
782 parse_callback(evtr_event_t ev, void *d)
783 {
784 	evtr_query_t q = (evtr_query_t)d;
785 	if (ev->type != EVTR_TYPE_PROBE)
786 		return;
787 	if (!ev->fmt || (ev->fmt[0] != '#'))
788 		return;
789 	/*
790 	 * Copy the event to ->pending_event, then call
791 	 * the parser to convert it into a synthesized
792 	 * EVTR_TYPE_STMT event.
793 	 */
794 	memcpy(&q->pending_event, ev, sizeof(*ev));
795 	parse_string(&q->pending_event, q->symtab, &ev->fmt[1]);
796 	if (!evtr_match_filters(q, &q->pending_event))
797 		return;
798 	/*
799 	 * This will cause us to return ->pending_event next time
800 	 * we're called.
801 	 */
802 	q->flags |= EVTRQF_PENDING;
803 }
804 
805 static
806 void
807 thread_creation_callback(evtr_event_t ev, void *d)
808 {
809 	evtr_query_t q = (evtr_query_t)d;
810 	evtr_t evtr = q->evtr;
811 	struct evtr_thread *td;
812 	void *ktd;
813 	char buf[20];
814 
815 	if (parse_format_data(ev, "new_td %p %s", &ktd, buf) != 2) {
816 		return;
817 	}
818 	buf[19] = '\0';
819 
820 	if (!(td = malloc(sizeof(*td)))) {
821 		q->err = ENOMEM;
822 		return;
823 	}
824 	td->id = ktd;
825 	td->userdata = NULL;
826 	if (!(td->comm = strdup(buf))) {
827 		free(td);
828 		q->err = ENOMEM;
829 		return;
830 	}
831 	printd(DS, "inserting new thread %p: %s\n", td->id, td->comm);
832 	thread_map_insert(&evtr->threads, td);
833 }
834 
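/*
 * Keep track of the thread currently running on each cpu by watching
 * the "sw" context-switch probes.  If we switch to a thread that no
 * "new_td" probe has announced yet, synthesize a creation event for it
 * so the thread map stays consistent.
 */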
835 static
836 void
837 thread_switch_callback(evtr_event_t ev, void *d)
838 {
839 	evtr_t evtr = ((evtr_query_t)d)->evtr;
840 	struct evtr_thread *tdp, *tdn;
841 	void *ktdp, *ktdn;
842 	struct cpu *cpu;
843 	static struct evtr_event tdcr;
844 	static char *fmt = "new_td %p %s";
845 	char tidstr[40];
846 	char fmtdata[sizeof(void *) + sizeof(char *)];
847 
848 	cpu = evtr_cpu(evtr, ev->cpu);
849 	if (!cpu) {
850 		printw("invalid cpu %d\n", ev->cpu);
851 		return;
852 	}
853 	if (parse_format_data(ev, "sw  %p > %p", &ktdp, &ktdn) != 2) {
854 		return;
855 	}
856 	tdp = thread_map_find(&evtr->threads, ktdp);
857 	if (!tdp) {
858 		printd(DS, "switching from unknown thread %p\n", ktdp);
859 	}
860 	tdn = thread_map_find(&evtr->threads, ktdn);
861 	if (!tdn) {
862 		/*
863 		 * Fake a thread creation event for threads we
864 		 * haven't seen before.
865 		 */
866 		tdcr.type = EVTR_TYPE_PROBE;
867 		tdcr.ts = ev->ts;
868 		tdcr.file = NULL;
869 		tdcr.func = NULL;
870 		tdcr.line = 0;
871 		tdcr.fmt = fmt;
872 		tdcr.fmtdata = &fmtdata;
873 		tdcr.fmtdatalen = sizeof(fmtdata);
874 		tdcr.cpu = ev->cpu;
875 		tdcr.td = NULL;
876 		snprintf(tidstr, sizeof(tidstr), "%p", ktdn);
877 		((void **)fmtdata)[0] = ktdn;
878 		((char **)fmtdata)[1] = &tidstr[0];
879 		thread_creation_callback(&tdcr, d);
880 
881 		tdn = thread_map_find(&evtr->threads, ktdn);
882 		assert(tdn != NULL);
883 		printd(DS, "switching to unknown thread %p\n", ktdn);
884 		cpu->td = tdn;
885 		return;
886 	}
887 	printd(DS, "cpu %d: switching to thread %p\n", ev->cpu, ktdn);
888 	cpu->td = tdn;
889 }
890 
891 static
892 void
893 assert_foff_in_sync(evtr_t evtr)
894 {
895 	off_t off;
896 
897 	/*
898 	 * We keep our own offset because we
899 	 * might want to support mmap()
900 	 */
901 	off = ftello(evtr->f);
902 	if (evtr->bytes != off) {
903 		fprintf(stderr, "bytes %jd, off %jd\n", evtr->bytes, off);
904 		abort();
905 	}
906 }
907 
908 static
909 int
910 evtr_write(evtr_t evtr, const void *buf, size_t bytes)
911 {
912 	assert_foff_in_sync(evtr);
913 	if (fwrite(buf, bytes, 1, evtr->f) != 1) {
914 		evtr->err = errno;
915 		evtr->errmsg = strerror(errno);
916 		return !0;
917 	}
918 	evtr->bytes += bytes;
919 	assert_foff_in_sync(evtr);
920 	return 0;
921 }
922 
923 /*
924  * Called after dumping a record to make sure the next
925  * record is REC_ALIGN aligned. This does not make much sense,
926  * as we shouldn't be using packed structs anyway.
927  */
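/*
 * Worked example of the padding arithmetic: with REC_ALIGN == 8, a
 * record ending at byte offset 37 gets 8 - (37 % 8) == 3 bytes of zero
 * padding; a record ending exactly on a multiple of 8 still gets a
 * full 8-byte pad, and evtr_skip_to_record() on the read side skips
 * the same amount, so writer and reader stay in step.
 */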
928 static
929 int
930 evtr_dump_pad(evtr_t evtr)
931 {
932 	size_t pad;
933 	static char buf[REC_ALIGN];
934 
935 	pad = REC_ALIGN - (evtr->bytes % REC_ALIGN);
936 	if (pad > 0) {
937 		return evtr_write(evtr, buf, pad);
938 	}
939 	return 0;
940 }
941 
942 /*
943  * We make sure that there is a new record every REC_BOUNDARY
944  * bytes; this costs next to nothing in space and allows for
945  * fast seeking.
946  */
947 static
948 int
949 evtr_dump_avoid_boundary(evtr_t evtr, size_t bytes)
950 {
951 	unsigned pad, i;
952 	static char buf[256];
953 
954 	pad = REC_BOUNDARY - (evtr->bytes % REC_BOUNDARY);
955 	/* if adding @bytes would cause us to cross a boundary... */
956 	if (bytes > pad) {
957 		/* then pad to the boundary */
958 		for (i = 0; i < (pad / sizeof(buf)); ++i) {
959 			if (evtr_write(evtr, buf, sizeof(buf))) {
960 				return !0;
961 			}
962 		}
963 		i = pad % sizeof(buf);
964 		if (i) {
965 			if (evtr_write(evtr, buf, i)) {
966 				return !0;
967 			}
968 		}
969 	}
970 	return 0;
971 }
972 
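/*
 * Dump a format record (subsystem prefix + format string) the first
 * time it is seen and return the 16-bit id assigned to it.  On error
 * 0 is returned, which a probe header interprets as "no format".
 */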
973 static
974 int
975 evtr_dump_fmt(evtr_t evtr, uint64_t ts, const evtr_event_t ev)
976 {
977 	struct fmt_event_header fmt;
978 	uint16_t id;
979 	int err;
980 	char *subsys = "", buf[1024];
981 
982 	if (strlcpy(buf, subsys, sizeof(buf)) >= sizeof(buf)) {
983 		evtr->errmsg = "name of subsystem is too large";
984 		evtr->err = ERANGE;
985 		return 0;
986 	}
987 	if (strlcat(buf, ev->fmt, sizeof(buf)) >= sizeof(buf)) {
988 		evtr->errmsg = "fmt + name of subsystem is too large";
989 		evtr->err = ERANGE;
990 		return 0;
991 	}
992 
993 	if (!strhash_find(evtr->fmts, buf, &id)) {
994 		return id;
995 	}
996 	if ((err = strhash_insert(evtr->fmts, buf, &id))) {
997 		evtr->err = err;
998 		return 0;
999 	}
1000 
1001 	fmt.eh.type = EVTR_TYPE_FMT;
1002 	fmt.eh.ts = ts;
1003 	fmt.subsys_len = strlen(subsys);
1004 	fmt.fmt_len = strlen(ev->fmt);
1005 	fmt.id = id;
1006 	if (evtr_dump_avoid_boundary(evtr, sizeof(fmt) + fmt.subsys_len +
1007 				     fmt.fmt_len))
1008 		return 0;
1009 	if (evtr_write(evtr, &fmt, sizeof(fmt)))
1010 		return 0;
1011 	if (evtr_write(evtr, subsys, fmt.subsys_len))
1012 		return 0;
1013 	if (evtr_write(evtr, ev->fmt, fmt.fmt_len))
1014 		return 0;
1015 	if (evtr_dump_pad(evtr))
1016 		return 0;
1017 	return fmt.id;
1018 }
1019 
1020 /*
1021  * Replace string pointers or string ids in fmtdata
1022  */
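/*
 * This walks @fmt like a tiny printf() parser: flags, field width,
 * precision and length modifiers are consumed only to learn how many
 * bytes the corresponding argument occupies in @fmtdata.  Each %s
 * argument is run through @replace (pointer -> id when writing,
 * id -> pointer when reading).  Returns the number of strings
 * replaced, or -1 on an unknown conversion specifier.
 */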
1023 static
1024 int
1025 mangle_string_ptrs(const char *fmt, uint8_t *fmtdata,
1026 		   const char *(*replace)(void *, const char *), void *ctx)
1027 {
1028 	const char *f, *p;
1029 	size_t skipsize, intsz;
1030 	int ret = 0;
1031 
1032 	for (f = fmt; f[0] != '\0'; ++f) {
1033 		if (f[0] != '%')
1034 			continue;
1035 		++f;
1036 		skipsize = 0;
1037 		for (p = f; p[0]; ++p) {
1038 			int again = 0;
1039 			/*
1040 			 * Eat flags. Notice this will accept duplicate
1041 			 * flags.
1042 			 */
1043 			switch (p[0]) {
1044 			case '#':
1045 			case '0':
1046 			case '-':
1047 			case ' ':
1048 			case '+':
1049 			case '\'':
1050 				again = !0;
1051 				break;
1052 			}
1053 			if (!again)
1054 				break;
1055 		}
1056 		/* Eat minimum field width, if any */
1057 		for (; isdigit(p[0]); ++p)
1058 			;
1059 		if (p[0] == '.')
1060 			++p;
1061 		/* Eat precision, if any */
1062 		for (; isdigit(p[0]); ++p)
1063 			;
1064 		intsz = 0;
1065 		switch (p[0]) {
1066 		case 'l':
1067 			if (p[1] == 'l') {
1068 				++p;
1069 				intsz = sizeof(long long);
1070 			} else {
1071 				intsz = sizeof(long);
1072 			}
1073 			break;
1074 		case 'j':
1075 			intsz = sizeof(intmax_t);
1076 			break;
1077 		case 't':
1078 			intsz = sizeof(ptrdiff_t);
1079 			break;
1080 		case 'z':
1081 			intsz = sizeof(size_t);
1082 			break;
1083 		default:
1084 			break;
1085 		}
1086 		if (intsz != 0)
1087 			++p;
1088 		else
1089 			intsz = sizeof(int);
1090 
1091 		switch (p[0]) {
1092 		case 'd':
1093 		case 'i':
1094 		case 'o':
1095 		case 'u':
1096 		case 'x':
1097 		case 'X':
1098 		case 'c':
1099 			skipsize = intsz;
1100 			break;
1101 		case 'p':
1102 			skipsize = sizeof(void *);
1103 			break;
1104 		case 'f':
1105 			if (p[-1] == 'l')
1106 				skipsize = sizeof(double);
1107 			else
1108 				skipsize = sizeof(float);
1109 			break;
1110 		case 's':
1111 			((const char **)fmtdata)[0] =
1112 				replace(ctx, ((char **)fmtdata)[0]);
1113 			skipsize = sizeof(char *);
1114 			++ret;
1115 			break;
1116 		default:
1117 			fprintf(stderr, "Unknown conversion specifier %c "
1118 				"in fmt starting with %s", p[0], f - 1);
1119 			return -1;
1120 		}
1121 		fmtdata += skipsize;
1122 	}
1123 	return ret;
1124 }
1125 
1126 /* XXX: do we really want the timestamp? */
1127 static
1128 int
1129 evtr_dump_string(evtr_t evtr, uint64_t ts, const char *str, int ns)
1130 {
1131 	struct string_event_header s;
1132 	int err;
1133 	uint16_t id;
1134 
1135 	assert((0 <= ns) && (ns < EVTR_NS_MAX));
1136 	if (!strhash_find(evtr->strings[ns], str, &id)) {
1137 		return id;
1138 	}
1139 	if ((err = strhash_insert(evtr->strings[ns], str, &id))) {
1140 		evtr->err = err;
1141 		return 0;
1142 	}
1143 
1144 	printd(DS, "hash_insert %s ns %d id %d\n", str, ns, id);
1145 	s.eh.type = EVTR_TYPE_STR;
1146 	s.eh.ts = ts;
1147 	s.ns = ns;
1148 	s.id = id;
1149 	s.len = strnlen(str, PATH_MAX);
1150 
1151 	if (evtr_dump_avoid_boundary(evtr, sizeof(s) + s.len))
1152 		return 0;
1153 	if (evtr_write(evtr, &s, sizeof(s)))
1154 		return 0;
1155 	if (evtr_write(evtr, str, s.len))
1156 		return 0;
1157 	if (evtr_dump_pad(evtr))
1158 		return 0;
1159 	return s.id;
1160 }
1161 
1162 struct replace_ctx {
1163 	evtr_t evtr;
1164 	uint64_t ts;
1165 };
1166 
1167 static
1168 const char *
1169 replace_strptr(void *_ctx, const char *s)
1170 {
1171 	struct replace_ctx *ctx = _ctx;
1172 	return (const char *)(uintptr_t)evtr_dump_string(ctx->evtr, ctx->ts, s,
1173 							 EVTR_NS_DSTR);
1174 }
1175 
1176 static
1177 const char *
1178 replace_strid(void *_ctx, const char *s)
1179 {
1180 	struct replace_ctx *ctx = _ctx;
1181 	const char *ret;
1182 
1183 	ret = string_map_find(&ctx->evtr->maps[EVTR_NS_DSTR - 1].root,
1184 			      (int)(uintptr_t)s);
1185 	if (!ret) {
1186 		fprintf(stderr, "Unknown id for data string\n");
1187 		ctx->evtr->errmsg = "unknown id for data string";
1188 		ctx->evtr->err = !0;
1189 	}
1190 	validate_string(ret);
1191 	printd(DS, "replacing strid %d (ns %d) with string '%s' (or int %#x)\n",
1192 	       (int)(uintptr_t)s, EVTR_NS_DSTR, ret ? ret : "NULL", (int)(uintptr_t)ret);
1193 	return ret;
1194 }
1195 
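/*
 * Serialize one probe: the file, function and format strings are
 * replaced by ids (dumping the corresponding string/format records
 * first if necessary), string pointers inside the format data are
 * rewritten as ids, and the packed header plus data are written out
 * followed by alignment padding.
 */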
1196 static
1197 int
1198 evtr_dump_probe(evtr_t evtr, evtr_event_t ev)
1199 {
1200 	struct probe_event_header kev;
1201 	char buf[1024];
1202 
1203 	memset(&kev, '\0', sizeof(kev));
1204 	kev.eh.type = ev->type;
1205 	kev.eh.ts = ev->ts;
1206 	kev.line = ev->line;
1207 	kev.cpu = ev->cpu;
1208 	if (ev->file) {
1209 		kev.file = evtr_dump_string(evtr, kev.eh.ts, ev->file,
1210 					    EVTR_NS_PATH);
1211 	}
1212 	if (ev->func) {
1213 		kev.func = evtr_dump_string(evtr, kev.eh.ts, ev->func,
1214 					    EVTR_NS_FUNC);
1215 	}
1216 	if (ev->fmt) {
1217 		kev.fmt = evtr_dump_fmt(evtr, kev.eh.ts, ev);
1218 	}
1219 	if (ev->fmtdata) {
1220 		struct replace_ctx replctx = {
1221 			.evtr = evtr,
1222 			.ts = ev->ts,
1223 		};
1224 		assert(ev->fmtdatalen <= (int)sizeof(buf));
1225 		kev.datalen = ev->fmtdatalen;
1226 		/*
1227 		 * Replace all string pointers with string ids before dumping
1228 		 * the data.
1229 		 */
1230 		memcpy(buf, ev->fmtdata, ev->fmtdatalen);
1231 		if (mangle_string_ptrs(ev->fmt, buf,
1232 				       replace_strptr, &replctx) < 0)
1233 			return !0;
1234 		if (evtr->err)
1235 			return evtr->err;
1236 	}
1237 	if (evtr_dump_avoid_boundary(evtr, sizeof(kev) + ev->fmtdatalen))
1238 		return !0;
1239 	if (evtr_write(evtr, &kev, sizeof(kev)))
1240 		return !0;
1241 	if (evtr_write(evtr, buf, ev->fmtdatalen))
1242 		return !0;
1243 	if (evtr_dump_pad(evtr))
1244 		return !0;
1245 	return 0;
1246 }
1247 
1248 static
1249 int
1250 evtr_dump_sysinfo(evtr_t evtr, evtr_event_t ev)
1251 {
1252 	uint8_t type = EVTR_TYPE_SYSINFO;
1253 	uint16_t ncpus = ev->ncpus;
1254 
1255 	if (ncpus == 0) {
1256 		evtr->errmsg = "invalid number of cpus";
1257 		return !0;
1258 	}
1259 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ncpus)))
1260 		return !0;
1261 	if (evtr_write(evtr, &type, sizeof(type))) {
1262 		return !0;
1263 	}
1264 	if (evtr_write(evtr, &ncpus, sizeof(ncpus))) {
1265 		return !0;
1266 	}
1267 	if (evtr_dump_pad(evtr))
1268 		return !0;
1269 	return 0;
1270 }
1271 static
1272 int
1273 evtr_dump_cpuinfo(evtr_t evtr, evtr_event_t ev)
1274 {
1275 	struct cpuinfo_event_header ci;
1276 	uint8_t type;
1277 
1278 	if (evtr_dump_avoid_boundary(evtr, sizeof(type) + sizeof(ci)))
1279 		return !0;
1280 	type = EVTR_TYPE_CPUINFO;
1281 	if (evtr_write(evtr, &type, sizeof(type))) {
1282 		return !0;
1283 	}
1284 	ci.cpu = ev->cpu;
1285 	ci.freq = ev->cpuinfo.freq;
1286 	if (evtr_dump_avoid_boundary(evtr, sizeof(ci)))
1287 		return !0;
1288 	if (evtr_write(evtr, &ci, sizeof(ci))) {
1289 		return !0;
1290 	}
1291 	if (evtr_dump_pad(evtr))
1292 		return !0;
1293 	return 0;
1294 }
1295 
1296 int
1297 evtr_rewind(evtr_t evtr)
1298 {
1299 	assert((evtr->flags & EVTRF_WR) == 0);
1300 	evtr->bytes = 0;
1301 	if (fseek(evtr->f, 0, SEEK_SET)) {
1302 		evtr->err = errno;
1303 		return !0;
1304 	}
1305 	return 0;
1306 }
1307 
1308 int
1309 evtr_dump_event(evtr_t evtr, evtr_event_t ev)
1310 {
1311 	switch (ev->type) {
1312 	case EVTR_TYPE_PROBE:
1313 		return evtr_dump_probe(evtr, ev);
1314 	case EVTR_TYPE_SYSINFO:
1315 		return evtr_dump_sysinfo(evtr, ev);
1316 	case EVTR_TYPE_CPUINFO:
1317 		return evtr_dump_cpuinfo(evtr, ev);
1318 	}
1319 	evtr->errmsg = "unknown event type";
1320 	return !0;
1321 }
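/*
 * Minimal write-side sketch (not part of this file; it assumes the
 * struct evtr_event members used throughout evtr.c -- type, ts, cpu,
 * ncpus, file, func, line, fmt, fmtdata, fmtdatalen -- are declared
 * that way in evtr.h, and needs <stdio.h>, <string.h> and "evtr.h"):
 *
 *	FILE *f = fopen("trace.evtr", "wb");
 *	evtr_t evtr = evtr_open_write(f);
 *	struct evtr_event ev;
 *	int val = 42;
 *
 *	memset(&ev, 0, sizeof(ev));
 *	ev.type = EVTR_TYPE_SYSINFO;
 *	ev.ncpus = 2;
 *	evtr_dump_event(evtr, &ev);
 *
 *	memset(&ev, 0, sizeof(ev));
 *	ev.type = EVTR_TYPE_PROBE;
 *	ev.ts = 1234;
 *	ev.cpu = 0;
 *	ev.file = __FILE__;
 *	ev.func = __func__;
 *	ev.line = __LINE__;
 *	ev.fmt = "hello %d";
 *	ev.fmtdata = &val;
 *	ev.fmtdatalen = sizeof(val);
 *	evtr_dump_event(evtr, &ev);
 *
 *	if (evtr_error(evtr))
 *		fprintf(stderr, "%s\n", evtr_errmsg(evtr));
 *	evtr_close(evtr);
 *	fclose(f);
 */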
1322 
1323 static
1324 evtr_t
1325 evtr_alloc(FILE *f)
1326 {
1327 	evtr_t evtr;
1328 	if (!(evtr = malloc(sizeof(*evtr)))) {
1329 		return NULL;
1330 	}
1331 
1332 	evtr->f = f;
1333 	evtr->err = 0;
1334 	evtr->errmsg = NULL;
1335 	evtr->bytes = 0;
1336 	return evtr;
1337 }
1338 
1339 static int evtr_next_event(evtr_t, evtr_event_t);
1340 
1341 evtr_t
1342 evtr_open_read(FILE *f)
1343 {
1344 	evtr_t evtr;
1345 	struct evtr_event ev;
1346 	int i;
1347 
1348 	if (!(evtr = evtr_alloc(f))) {
1349 		return NULL;
1350 	}
1351 	evtr->flags = 0;
1352 	for (i = 0; i < (EVTR_NS_MAX - 1); ++i) {
1353 		RB_INIT(&evtr->maps[i].root);
1354 	}
1355 	RB_INIT(&evtr->fmtmap.root);
1356 	RB_INIT(&evtr->threads.root);
1357 	evtr->cpus = NULL;
1358 	evtr->ncpus = 0;
1359 	/*
1360 	 * Load the first event so we can pick up any
1361 	 * sysinfo entries.
1362 	 */
1363 	if (evtr_next_event(evtr, &ev)) {
1364 		goto free_evtr;
1365 	}
1366 	if (evtr_rewind(evtr))
1367 		goto free_evtr;
1368 	return evtr;
1369 free_evtr:
1370 	free(evtr);
1371 	return NULL;
1372 }
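/*
 * Minimal read-side sketch: iterate over every probe event in a trace.
 * Assumptions (not guaranteed by this file): the filter struct behind
 * evtr_filter_t is named struct evtr_filter in evtr.h, and its
 * ev_type/fmt/cpu members plus struct evtr_event's ts/cpu members are
 * declared as they are used here; needs <stdio.h>, <stdint.h> and
 * "evtr.h".  A fmt of NULL and a cpu of -1 mean "match any".
 *
 *	FILE *f = fopen("trace.evtr", "rb");
 *	evtr_t evtr = evtr_open_read(f);
 *	struct evtr_filter filt = { .ev_type = EVTR_TYPE_PROBE,
 *				    .fmt = NULL, .cpu = -1 };
 *	evtr_query_t q = evtr_query_init(evtr, &filt, 1);
 *	struct evtr_event ev;
 *	char buf[1024];
 *
 *	while (evtr_query_next(q, &ev) == 0) {
 *		evtr_event_data(&ev, buf, sizeof(buf));
 *		printf("%ju cpu%d %s\n", (uintmax_t)ev.ts, ev.cpu, buf);
 *	}
 *	if (evtr_query_error(q))
 *		fprintf(stderr, "%s\n", evtr_query_errmsg(q));
 *	evtr_query_destroy(q);
 *	evtr_close(evtr);
 *	fclose(f);
 */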
1373 
1374 evtr_t
1375 evtr_open_write(FILE *f)
1376 {
1377 	evtr_t evtr;
1378 	int i, j;
1379 
1380 	if (!(evtr = evtr_alloc(f))) {
1381 		return NULL;
1382 	}
1383 
1384 	evtr->flags = EVTRF_WR;
1385 	if (!(evtr->fmts = strhash_new()))
1386 		goto free_evtr;
1387 	for (i = 0; i < EVTR_NS_MAX; ++i) {
1388 		evtr->strings[i] = strhash_new();
1389 		if (!evtr->strings[i]) {
1390 			for (j = 0; j < i; ++j) {
1391 				strhash_destroy(evtr->strings[j]);
1392 			}
1393 			goto free_fmts;
1394 		}
1395 	}
1396 
1397 	return evtr;
1398 free_fmts:
1399 	strhash_destroy(evtr->fmts);
1400 free_evtr:
1401 	free(evtr);
1402 	return NULL;
1403 }
1404 
1405 static
1406 void
1407 hashtab_destroy(struct hashtab *h)
1408 {
1409 	struct hashentry *ent, *next;
1410 	int i;
1411 	for (i = 0; i < NR_BUCKETS; ++i) {
1412 		for (ent = h->buckets[i]; ent; ent = next) {
1413 			next = ent->next;
1414 			free(ent);
1415 		}
1416 	}
1417 	free(h);
1418 }
1419 
1420 void
1421 evtr_close(evtr_t evtr)
1422 {
1423 	int i;
1424 
1425 	if (evtr->flags & EVTRF_WR) {
1426 		hashtab_destroy(&evtr->fmts->tab);
1427 		for (i = 0; i < EVTR_NS_MAX; ++i)
1428 			hashtab_destroy(&evtr->strings[i]->tab);
1429 	} else {
1430 		id_tree_free(&evtr->fmtmap.root);
1431 		for (i = 0; i < EVTR_NS_MAX - 1; ++i) {
1432 			id_tree_free(&evtr->maps[i].root);
1433 		}
1434 	}
1435 	free(evtr);
1436 }
1437 
1438 static
1439 int
1440 evtr_read(evtr_t evtr, void *buf, size_t size)
1441 {
1442 	assert(size > 0);
1443 	assert_foff_in_sync(evtr);
1444 	printd(IO, "evtr_read at %#jx, %zd bytes\n", evtr->bytes, size);
1445 	if (fread(buf, size, 1, evtr->f) != 1) {
1446 		if (feof(evtr->f)) {
1447 			evtr->errmsg = "incomplete record";
1448 		} else {
1449 			evtr->errmsg = strerror(errno);
1450 		}
1451 		return !0;
1452 	}
1453 	evtr->bytes += size;
1454 	assert_foff_in_sync(evtr);
1455 	return 0;
1456 }
1457 
1458 static
1459 int
1460 evtr_load_fmt(evtr_query_t q, char *buf)
1461 {
1462 	evtr_t evtr = q->evtr;
1463 	struct fmt_event_header *evh = (struct fmt_event_header *)buf;
1464 	struct event_fmt *fmt;
1465 	char *subsys = NULL, *fmtstr;
1466 
1467 	if (!(fmt = malloc(sizeof(*fmt)))) {
1468 		evtr->err = errno;
1469 		return !0;
1470 	}
1471 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1472 		      sizeof(*evh) - sizeof(evh->eh))) {
1473 		goto free_fmt;
1474 	}
1475 	assert(!evh->subsys_len);
1476 	if (evh->subsys_len) {
1477 		if (!(subsys = malloc(evh->subsys_len))) {
1478 			evtr->err = errno;
1479 			goto free_fmt;
1480 		}
1481 		if (evtr_read(evtr, subsys, evh->subsys_len)) {
1482 			goto free_subsys;
1483 		}
1484 		fmt->subsys = subsys;
1485 	} else {
1486 		fmt->subsys = "";
1487 	}
1488 	if (!(fmtstr = malloc(evh->fmt_len + 1))) {
1489 		evtr->err = errno;
1490 		goto free_subsys;
1491 	}
1492 	if (evtr_read(evtr, fmtstr, evh->fmt_len)) {
1493 		goto free_fmtstr;
1494 	}
1495 	fmtstr[evh->fmt_len] = '\0';
1496 	fmt->fmt = fmtstr;
1497 
1498 	printd(DS, "fmt_map_insert (%d, %s)\n", evh->id, fmt->fmt);
1499 	evtr->err = fmt_map_insert(&evtr->fmtmap.root, fmt, evh->id);
1500 	switch (evtr->err) {
1501 	case ENOMEM:
1502 		evtr->errmsg = "out of memory";
1503 		break;
1504 	case EEXIST:
1505 		evtr->errmsg = "redefinition of an id to a "
1506 			"different format (corrupt input)";
1507 		break;
1508 	default:
1509 		;
1510 	}
1511 	return evtr->err;
1512 
1513 free_fmtstr:
1514 	free(fmtstr);
1515 free_subsys:
1516 	if (subsys)
1517 		free(subsys);
1518 free_fmt:
1519 	free(fmt);
1520 	return !0;
1521 }
1522 
1523 static
1524 int
1525 evtr_load_string(evtr_t evtr, char *buf)
1526 {
1527 	char sbuf[PATH_MAX + 1];
1528 	struct string_event_header *evh = (struct string_event_header *)buf;
1529 
1530 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1531 		      sizeof(*evh) - sizeof(evh->eh))) {
1532 		return !0;
1533 	}
1534 	if (evh->len > PATH_MAX) {
1535 		evtr->errmsg = "string too large (corrupt input)";
1536 		return !0;
1537 	}
1538 	if (evh->len && evtr_read(evtr, sbuf, evh->len)) {
1539 		return !0;
1540 	}
1541 	sbuf[evh->len] = 0;
1542 	if ((evh->ns == 0) || (evh->ns >= EVTR_NS_MAX)) {
1543 		evtr->errmsg = "invalid namespace (corrupt input)";
1544 		return !0;
1545 	}
1546 	validate_string(sbuf);
1547 	printd(DS, "evtr_load_string:ns %d id %d : \"%s\"\n", evh->ns, evh->id,
1548 	       sbuf);
1549 	evtr->err = string_map_insert(&evtr->maps[evh->ns - 1].root, sbuf, evh->id);
1550 	switch (evtr->err) {
1551 	case ENOMEM:
1552 		evtr->errmsg = "out of memory";
1553 		break;
1554 	case EEXIST:
1555 		evtr->errmsg = "redefinition of an id to a "
1556 			"different string (corrupt input)";
1557 		break;
1558 	default:
1559 		;
1560 	}
1561 	return 0;
1562 }
1563 
1564 static
1565 int
1566 evtr_skip(evtr_t evtr, off_t bytes)
1567 {
1568 	if (fseek(evtr->f, bytes, SEEK_CUR)) {
1569 		evtr->err = errno;
1570 		evtr->errmsg = strerror(errno);
1571 		return !0;
1572 	}
1573 	evtr->bytes += bytes;
1574 	return 0;
1575 }
1576 
1577 /*
1578  * Make sure q->buf is at least len bytes
1579  */
1580 static
1581 int
1582 evtr_query_reserve_buf(struct evtr_query *q, int len)
1583 {
1584 	void *tmp;
1585 
1586 	if (q->bufsize >= len)
1587 		return 0;
1588 	if (!(tmp = realloc(q->buf, len)))
1589 		return !0;
1590 	q->buf = tmp;
1591 	q->bufsize = len;
1592 	return 0;
1593 }
1594 
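/*
 * Decode a probe record: resolve the file and format ids back to
 * strings, attach the thread currently running on the recorded cpu,
 * convert any string ids inside the format data back into pointers,
 * and hand the event to the registered callbacks.
 */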
1595 static
1596 int
1597 evtr_load_probe(evtr_t evtr, evtr_event_t ev, char *buf, struct evtr_query *q)
1598 {
1599 	struct probe_event_header *evh = (struct probe_event_header *)buf;
1600 	struct cpu *cpu;
1601 
1602 	if (evtr_read(evtr, buf + sizeof(struct trace_event_header),
1603 		      sizeof(*evh) - sizeof(evh->eh)))
1604 		return !0;
1605 	memset(ev, '\0', sizeof(*ev));
1606 	ev->ts = evh->eh.ts;
1607 	ev->type = EVTR_TYPE_PROBE;
1608 	ev->line = evh->line;
1609 	ev->cpu = evh->cpu;
1610 	if ((cpu = evtr_cpu(evtr, evh->cpu))) {
1611 		ev->td = cpu->td;
1612 	} else {
1613 		ev->td = NULL;
1614 	}
1615 	if (evh->file) {
1616 		ev->file = string_map_find(
1617 			&evtr->maps[EVTR_NS_PATH - 1].root,
1618 			evh->file);
1619 		if (!ev->file) {
1620 			evtr->errmsg = "unknown id for file path";
1621 			evtr->err = !0;
1622 			ev->file = "<unknown>";
1623 		} else {
1624 			validate_string(ev->file);
1625 		}
1626 	} else {
1627 		ev->file = "<unknown>";
1628 	}
1629 	if (evh->fmt) {
1630 		const struct event_fmt *fmt;
1631 		if (!(fmt = fmt_map_find(&evtr->fmtmap.root, evh->fmt))) {
1632 			evtr->errmsg = "unknown id for event fmt";
1633 			evtr->err = !0;
1634 			ev->fmt = NULL;
1635 		} else {
1636 			ev->fmt = fmt->fmt;
1637 			validate_string(fmt->fmt);
1638 		}
1639 	}
1640 	if (evh->datalen) {
1641 		if (evtr_query_reserve_buf(q, evh->datalen + 1)) {
1642 			evtr->err = ENOMEM;
1643 		} else if (!evtr_read(evtr, q->buf, evh->datalen)) {
1644 			struct replace_ctx replctx = {
1645 				.evtr = evtr,
1646 				.ts = ev->ts,
1647 			};
1648 			assert(ev->fmt);
1649 
1650 			ev->fmtdata = q->buf;
1651 			/*
1652 			 * If the format specifies any string pointers, there
1653 			 * is a string id stored in the fmtdata. Look it up
1654 			 * and replace it with a string pointer before
1655 			 * returning it to the user.
1656 			 */
1657 			if (mangle_string_ptrs(ev->fmt, __DECONST(uint8_t *,
1658 								  ev->fmtdata),
1659 					       replace_strid, &replctx) < 0)
1660 				return evtr->err;
1661 			if (evtr->err)
1662 				return evtr->err;
1663 			((char *)ev->fmtdata)[evh->datalen] = '\0';
1664 			ev->fmtdatalen = evh->datalen;
1665 		}
1666 	}
1667 	evtr_run_callbacks(ev, q);
1668 	return evtr->err;
1669 }
1670 
1671 static
1672 int
1673 evtr_skip_to_record(evtr_t evtr)
1674 {
1675 	int skip;
1676 
1677 	skip = REC_ALIGN - (evtr->bytes % REC_ALIGN);
1678 	if (skip > 0) {
1679 		if (fseek(evtr->f, skip, SEEK_CUR)) {
1680 			evtr->err = errno;
1681 			evtr->errmsg = strerror(errno);
1682 			return !0;
1683 		}
1684 		evtr->bytes += skip;
1685 	}
1686 	return 0;
1687 }
1688 
1689 static
1690 int
1691 evtr_load_sysinfo(evtr_t evtr)
1692 {
1693 	uint16_t ncpus;
1694 	int i;
1695 
1696 	if (evtr_read(evtr, &ncpus, sizeof(ncpus))) {
1697 		return !0;
1698 	}
1699 	if (evtr->cpus)
1700 		return 0;
1701 	evtr->cpus = malloc(ncpus * sizeof(struct cpu));
1702 	if (!evtr->cpus) {
1703 		evtr->err = ENOMEM;
1704 		return !0;
1705 	}
1706 	evtr->ncpus = ncpus;
1707 	for (i = 0; i < ncpus; ++i) {
1708 		evtr->cpus[i].td = NULL;
1709 		evtr->cpus[i].freq = -1.0;
1710 	}
1711 	return 0;
1712 }
1713 
1714 static
1715 int
1716 evtr_load_cpuinfo(evtr_t evtr)
1717 {
1718 	struct cpuinfo_event_header cih;
1719 	struct cpu *cpu;
1720 
1721 	if (evtr_read(evtr, &cih, sizeof(cih))) {
1722 		return !0;
1723 	}
1724 	if (cih.freq < 0.0) {
1725 		evtr->errmsg = "cpu freq is negative";
1726 		evtr->err = EINVAL;
1727 		return !0;
1728 	}
1729 	/*
1730 	 * Notice that freq is merely a multiplier with
1731 	 * which we convert a timestamp to seconds; if
1732 	 * ts is not in cycles, freq is not the frequency.
1733 	 */
1734 	if (!(cpu = evtr_cpu(evtr, cih.cpu))) {
1735 		evtr->errmsg = "freq for invalid cpu";
1736 		evtr->err = EINVAL;
1737 		return !0;
1738 	}
1739 	cpu->freq = cih.freq;
1740 	return 0;
1741 }
1742 
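/*
 * Core read loop: records are pulled off the stream one at a time;
 * pad, string, format, sysinfo and cpuinfo records are consumed
 * internally, and control only returns to the caller when a probe (or
 * a pending synthesized statement event) passes the query's filters.
 * Returns 0 on a match, -1 at end of file, nonzero on error.
 */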
1743 static
1744 int
1745 _evtr_next_event(evtr_t evtr, evtr_event_t ev, struct evtr_query *q)
1746 {
1747 	char buf[MAX_EVHDR_SIZE];
1748 	int ret, err, ntried, nmatched;
1749 	struct trace_event_header *evhdr = (struct trace_event_header *)buf;
1750 
1751 	for (ret = 0; !ret;) {
1752 		if (q->flags & EVTRQF_PENDING) {
1753 			q->off = evtr->bytes;
1754 			memcpy(ev, &q->pending_event, sizeof(*ev));
1755 			q->flags &= ~EVTRQF_PENDING;
1756 			return 0;
1757 		}
1758 		if (evtr_read(evtr, &evhdr->type, 1)) {
1759 			if (feof(evtr->f)) {
1760 				evtr->errmsg = NULL;
1761 				evtr->err = 0;
1762 				return -1;
1763 			}
1764 			return !0;
1765 		}
1766 		/*
1767 		 * skip pad records -- this will only happen if there's a
1768 		 * variable sized record close to the boundary
1769 		 */
1770 		if (evhdr->type == EVTR_TYPE_PAD) {
1771 			evtr_skip_to_record(evtr);
1772 			continue;
1773 		}
1774 		if (evhdr->type == EVTR_TYPE_SYSINFO) {
1775 			evtr_load_sysinfo(evtr);
1776 			continue;
1777 		} else if (evhdr->type == EVTR_TYPE_CPUINFO) {
1778 			evtr_load_cpuinfo(evtr);
1779 			continue;
1780 		}
1781 		if (evtr_read(evtr, buf + 1, sizeof(*evhdr) - 1))
1782 			return feof(evtr->f) ? -1 : !0;
1783 		switch (evhdr->type) {
1784 		case EVTR_TYPE_PROBE:
1785 			ntried = q->ntried;
1786 			nmatched = q->nmatched;
1787 			if ((err = evtr_load_probe(evtr, ev, buf, q))) {
1788 				if (err == -1) {
1789 					/* no match */
1790 					ret = 0;
1791 				} else {
1792 					return !0;
1793 				}
1794 			} else {
1795 				ret = !0;
1796 			}
1797 			break;
1798 		case EVTR_TYPE_STR:
1799 			if (evtr_load_string(evtr, buf)) {
1800 				return !0;
1801 			}
1802 			break;
1803 		case EVTR_TYPE_FMT:
1804 			if (evtr_load_fmt(q, buf)) {
1805 				return !0;
1806 			}
1807 			break;
1808 		default:
1809 			evtr->err = !0;
1810 			evtr->errmsg = "unknown event type (corrupt input?)";
1811 			return !0;
1812 		}
1813 		evtr_skip_to_record(evtr);
1814 		if (ret) {
1815 			if (!evtr_match_filters(q, ev)) {
1816 				ret = 0;
1817 				continue;
1818 			}
1819 			q->off = evtr->bytes;
1820 			return 0;
1821 		}
1822 	}
1823 	/* can't get here */
1824 	return !0;
1825 }
1826 
1827 static
1828 int
1829 evtr_next_event(evtr_t evtr, evtr_event_t ev)
1830 {
1831 	struct evtr_query *q;
1832 	int ret;
1833 
1834 	if (!(q = evtr_query_init(evtr, NULL, 0))) {
1835 		evtr->err = ENOMEM;
1836 		return !0;
1837 	}
1838 	ret = _evtr_next_event(evtr, ev, q);
1839 	evtr_query_destroy(q);
1840 	return ret;
1841 }
1842 
1843 int
1844 evtr_last_event(evtr_t evtr, evtr_event_t ev)
1845 {
1846 	struct stat st;
1847 	int fd;
1848 	off_t last_boundary;
1849 
1850 	if (evtr_error(evtr))
1851 		return !0;
1852 
1853 	fd = fileno(evtr->f);
1854 	if (fstat(fd, &st))
1855 		return !0;
1856 	/*
1857 	 * This skips pseudo records, so we can't provide
1858 	 * an event with all fields filled in this way.
1859 	 * It's doable, just needs some care. TBD.
1860 	 */
1861 	if (0 && (st.st_mode & S_IFREG)) {
1862 		/*
1863 		 * Skip to last boundary, that's the closest to the EOF
1864 		 * location that we are sure contains a header so we can
1865 		 * pick up the stream.
1866 		 */
1867 		last_boundary = (st.st_size / REC_BOUNDARY) * REC_BOUNDARY;
1868 		/* XXX: ->bytes should be in query */
1869 		assert(evtr->bytes == 0);
1870 		evtr_skip(evtr, last_boundary);
1871 	}
1872 
1873 
1874 	/*
1875 	 * If we can't seek, we need to go through the whole file.
1876 	 * Since you can't seek back, this is pretty useless unless
1877 	 * you really are interested only in the last event.
1878 	 */
1879 	while (!evtr_next_event(evtr, ev))
1880 		;
1881 	if (evtr_error(evtr))
1882 		return !0;
1883 	evtr_rewind(evtr);
1884 	return 0;
1885 }
1886 
1887 struct evtr_query *
1888 evtr_query_init(evtr_t evtr, evtr_filter_t filt, int nfilt)
1889 {
1890 	struct evtr_query *q;
1891 	int i;
1892 
1893 	if (!(q = malloc(sizeof(*q)))) {
1894 		return q;
1895 	}
1896 	q->bufsize = 2;
1897 	if (!(q->buf = malloc(q->bufsize))) {
1898 		goto free_q;
1899 	}
1900 	if (!(q->symtab = symtab_new()))
1901 		goto free_buf;
1902 	q->evtr = evtr;
1903 	q->off = 0;
1904 	q->filt = filt;
1905 	q->nfilt = nfilt;
1906 	TAILQ_INIT(&q->unresolved_filtq);
1907 	q->nmatched = 0;
	q->ntried = 0;
	q->err = 0;
	q->errmsg = NULL;
1908 	q->cbs = NULL;
1909 	q->ncbs = 0;
1910 	q->flags = 0;
1911 	memset(&q->pending_event, '\0', sizeof(q->pending_event));
1912 	if (evtr_register_callback(q, &thread_creation_callback, q)) {
1913 		goto free_symtab;
1914 	}
1915 	if (evtr_register_callback(q, &thread_switch_callback, q)) {
1916 		goto free_cbs;
1917 	}
1918 	if (evtr_query_needs_parsing(q) &&
1919 	    evtr_register_callback(q, &parse_callback, q)) {
1920 		goto free_cbs;
1921 	}
1922 
1923 	for (i = 0; i < nfilt; ++i) {
1924 		filt[i].flags = 0;
1925 		if (filt[i].fmt == NULL)
1926 			continue;
1927 		if (evtr_filter_register(q, &filt[i])) {
1928 			evtr_deregister_filters(q, filt, i);
1929 			goto free_cbs;
1930 		}
1931 	}
1932 
1933 	return q;
1934 free_cbs:
1935 	evtr_deregister_callbacks(q);
1936 free_symtab:
1937 	symtab_destroy(q->symtab);
1938 free_buf:
1939 	free(q->buf);
1940 free_q:
1941 	free(q);
1942 	return NULL;
1943 }
1944 
1945 void
1946 evtr_query_destroy(struct evtr_query *q)
1947 {
1948 	evtr_deregister_filters(q, q->filt, q->nfilt);
	evtr_deregister_callbacks(q);
	symtab_destroy(q->symtab);
1949 
1950 	free(q->buf);
1951 	free(q);
1952 }
1953 
1954 int
1955 evtr_query_next(struct evtr_query *q, evtr_event_t ev)
1956 {
1957 	if (evtr_query_error(q))
1958 		return !0;
1959 	/* we may support that in the future */
1960 	if (q->off != q->evtr->bytes) {
1961 		q->errmsg = "evtr/query offset mismatch";
1962 		return !0;
1963 	}
1964 	return _evtr_next_event(q->evtr, ev, q);
1965 }
1966 
1967 int
1968 evtr_ncpus(evtr_t evtr)
1969 {
1970 	return evtr->ncpus;
1971 }
1972 
1973 int
1974 evtr_cpufreqs(evtr_t evtr, double *freqs)
1975 {
1976 	int i;
1977 
1978 	if (!freqs)
1979 		return EINVAL;
1980 	for (i = 0; i < evtr->ncpus; ++i) {
1981 		freqs[i] = evtr->cpus[i].freq;
1982 	}
1983 	return 0;
1984 }
1985