/* Copyright (C) 2021-2024 Free Software Foundation, Inc.
   Contributed by Oracle.

   This file is part of GNU Binutils.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA.  */

#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#include "hwcdrv.h"

/*---------------------------------------------------------------------------*/
/* macros */
#define IS_GLOBAL /* Mark global symbols */

#include "cpuid.c" /* ftns for identifying a chip */

static hdrv_pcbe_api_t hdrv_pcbe_core_api;
static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
  &hdrv_pcbe_core_api,
  &hdrv_pcbe_opteron_api,
  NULL
};
#include "opteron_pcbe.c" /* CPU-specific code */
#include "core_pcbe.c"    /* CPU-specific code */

extern hwcdrv_api_t hwcdrv_pcl_api;
IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
  &hwcdrv_pcl_api,
  NULL
};

/*---------------------------------------------------------------------------*/

/* utils for drivers */
IS_GLOBAL int
hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
{
  unsigned int pmc_assigned[MAX_PICS];
  unsigned idx;
  for (int ii = 0; ii < MAX_PICS; ii++)
    pmc_assigned[ii] = 0;

  /* assign the HWCs that we already know about */
  for (idx = 0; idx < numctrs; idx++)
    {
      regno_t regno = entries[idx]->reg_num;
      if (regno == REGNO_ANY)
        {
          /* check to see if list of possible registers only contains one entry */
          regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
        }
      if (regno != REGNO_ANY)
        {
          if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
            {
              logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
              return HWCFUNCS_ERROR_HWCARGS;
            }
          TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
          entries[idx]->reg_num = regno; /* assigning back to entries */
          pmc_assigned[regno] = 1;
        }
    }

  /* assign HWCs that are currently REGNO_ANY */
  for (idx = 0; idx < numctrs; idx++)
    {
      if (entries[idx]->reg_num == REGNO_ANY)
        {
          int assigned = 0;
          regno_t *reg_list = entries[idx]->reg_list;
          for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
            {
              regno_t regno = *reg_list;
              if (regno < 0 || regno >= MAX_PICS)
                {
                  logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
                  return HWCFUNCS_ERROR_HWCARGS;
                }
              if (pmc_assigned[regno] == 0)
                {
                  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned: idx=%d, regno=%d\n", idx, regno);
                  entries[idx]->reg_num = regno; /* assigning back to entries */
                  pmc_assigned[regno] = 1;
                  assigned = 1;
                  break;
                }
            }
          if (!assigned)
            {
              logerr (GTXT ("Counter '%s' could not be bound to a register\n"),
                      entries[idx]->name ? entries[idx]->name : "<NULL>");
              return HWCFUNCS_ERROR_HWCARGS;
            }
        }
    }
  return 0;
}

IS_GLOBAL int
hwcdrv_lookup_cpuver (const char * cpcN_cciname)
{
  libcpc2_cpu_lookup_t *plookup;
  static libcpc2_cpu_lookup_t cpu_table[] = {
    LIBCPC2_CPU_LOOKUP_LIST
  };
  if (cpcN_cciname == NULL)
    return CPUVER_UNDEFINED;

  /* search table for name */
  for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
    {
      int n = strlen (plookup->cpc2_cciname);
      if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
        return plookup->cpc2_cpuver;
    }
  /* unknown, but does have a descriptive string */
  TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
            "could not be determined\n",
            cpcN_cciname);
  return CPUVER_GENERIC;
}

/*---------------------------------------------------------------------------*/
/* utils to generate x86 register definitions on Linux */

/*
 * This code is structured as though we're going to initialize the
 * HWC by writing the Intel MSR register directly.  That is, we
 * assume the lowest 16 bits of the event number will have the event
 * and that higher bits will set attributes.
 *
 * While SPARC is different, we can nonetheless use basically the
 * same "x86"-named functions:
 *
 *   - The event code will still be 16 bits.  It will still
 *     be in the lowest 16 bits of the event number.  Though
 *     perf_event_code() on SPARC will expect those bits to be
 *     shifted, hwcdrv_pcl.c can easily perform that shift.
 *
 *   - On SPARC we support only two attributes, "user" and "system",
 *     which hwcdrv_pcl.c already converts to the "exclude_user"
 *     and "exclude_kernel" fields expected by perf_event_open().
 *     "user" and "system" are stored in event bits 16 and 17.
 *     For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
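 *
 *     As a purely illustrative example of this layout (not a real counter
 *     definition): an event number of 0x3002e would encode event 0x002e
 *     with both "user" (bit 16) and "system" (bit 17) set.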
 */

IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0;

static const attr_info_t perfctr_sparc_attrs[] = {
  {NTXT ("user"),   0, 0x01, 16}, //usr
  {NTXT ("system"), 0, 0x01, 17}, //os
  {NULL, 0, 0x00, 0},
};
static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */
  {NTXT ("umask"),  0, 0xff, 8},
  {NTXT ("user"),   0, 0x01, 16}, //usr
  //{NTXT("nouser"), 1, 0x01, 16}, //usr (inverted)
  {NTXT ("system"), 0, 0x01, 17}, //os
  {NTXT ("edge"),   0, 0x01, 18},
  {NTXT ("pc"),     0, 0x01, 19},
  {NTXT ("inv"),    0, 0x01, 23},
  {NTXT ("cmask"),  0, 0xff, 24},
  {NULL, 0, 0x00, 0},
};
const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;

static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
        // (0xff << 0) |   /* event*/
        // (0xff << 8) |   /* umask */
        // (0x01 << 17) |  /* os */
        // (0x01 << 18) |  /* edge */
        // (0x01 << 19) |  /* pc */
        (0x01 << 20) |     /* int */
        // (0x01 << 21) |  /* reserved */
        (0x01 << 22) |     /* enable */
        // (0x01 << 23) |  /* inv */
        // (0xff << 24) |  /* cmask */
        0;

static int
myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
                            eventsel_t *eventsel, eventsel_t *valid_umask,
                            uint_t *pmc_sel)
{
  if (hwcdrv_get_x86_eventnum &&
      !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel))
    return 0;

  /* check for numerically-specified counters */
  char * endptr;
  uint64_t num = strtoull (eventname, &endptr, 0);
  if (*eventname && !*endptr)
    {
      *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
      *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
      *pmc_sel = pmc;
      return 0;
    }

  /* name does not specify a numeric value */
  *eventsel = (eventsel_t) -1;
  *valid_umask = 0x0;
  *pmc_sel = pmc;
  return -1;
}

static int
mask_shift_set (eventsel_t *presult, eventsel_t invalue,
                eventsel_t mask, eventsel_t shift)
{
  if (invalue & ~mask)
    return -1; /* invalue attempts to set bits outside of mask */
  *presult &= ~(mask << shift);   /* clear all the mask bits */
  *presult |= (invalue << shift); /* set bits according to invalue */
  return 0;
}

static int
set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
                   hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
{
  eventsel_t evntsel = *result_mask;
  for (int ii = 0; ii < (int) nattrs; ii++)
    {
      const char *attrname = attrs[ii].ca_name;
      eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
      const char *tmpname;
      int attr_found = 0;
      for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
        {
          if (strcmp (attrname, tmpname) == 0)
            {
              if (strcmp (attrname, "umask") == 0)
                {
                  if (attrval & ~evnt_valid_umask)
                    {
                      logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
                              nameOnly, (long long) evnt_valid_umask);
                      return -1;
                    }
                }
              if (mask_shift_set (&evntsel,
                                  perfctr_attrs_table[jj].is_inverted ?
                                  (attrval^1) : attrval,
                                  perfctr_attrs_table[jj].mask,
                                  perfctr_attrs_table[jj].shift))
                {
                  logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
                          nameOnly, attrname, (long long) attrval);
                  return -1;
                }
              TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
                        nameOnly, attrname, (long long) attrval);
              attr_found = 1;
              break;
            }
        }
      if (!attr_found)
        {
          logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
          return -1;
        }
    }
  *result_mask = evntsel;
  return 0;
}

IS_GLOBAL int
hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name,
                           eventsel_t *return_event, uint_t *return_pmc_sel)
{
  hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
  unsigned nattrs = 0;
  char *nameOnly = NULL;
  eventsel_t evntsel = 0; // event number
  eventsel_t evnt_valid_umask = 0;
  uint_t pmc_sel = 0;
  int rc = -1;
  *return_event = 0;
  *return_pmc_sel = 0;
  void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS,
                                         &nattrs, NULL);
  if (!attr_mem)
    {
      logerr (GTXT ("out of memory, could not parse attributes\n"));
      return -1;
    }
  hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL);
  if (regno == REGNO_ANY)
    {
      logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly);
      goto attr_wrapup;
    }

  /* look up evntsel */
  if (myperfctr_get_x86_eventnum (nameOnly, regno,
                                  &evntsel, &evnt_valid_umask, &pmc_sel))
    {
      logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
      goto attr_wrapup;
    }
  TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
            (long long) evntsel, pmc_sel, nameOnly, nattrs);

  /* determine event attributes */
  eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
  if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
    goto attr_wrapup;
  if (evntsel & evnt_attrs)
    TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
              (long long) evntsel, (long long) evnt_attrs,
              (long long) (evntsel & evnt_attrs));
  *return_event = evntsel | evnt_attrs;
  *return_pmc_sel = pmc_sel;
  rc = 0;

attr_wrapup:
  free (attr_mem);
  free (nameOnly);
  return rc;
}

#ifdef __x86_64__
#define syscall_instr   "syscall"
#define syscall_clobber "rcx", "r11", "memory"
#endif
#ifdef __i386__
#define syscall_instr   "int $0x80"
#define syscall_clobber "memory"
#endif

static inline int
perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
                 int cpu, int group_fd, unsigned long flags)
{
  /* It seems that perf_event_open() sometimes fails spuriously,
   * even while an immediate retry succeeds.
   * So, let's try a few retries if the call fails just to be sure.
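   * (The loop below retries the raw syscall up to 5 times and returns the
   * last result; on failure, errno from the final attempt is left for the
   * caller to inspect.)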
   */
  int rc;
  for (int retry = 0; retry < 5; retry++)
    {
      rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
      if (rc != -1)
        return rc;
    }
  return rc;
}

/*---------------------------------------------------------------------------*/
/* macros & fwd prototypes */

#define HWCDRV_API static /* Mark functions used by hwcdrv API */

HWCDRV_API int hwcdrv_start (void);
HWCDRV_API int hwcdrv_free_counters ();

static pid_t
hwcdrv_gettid (void)
{
#ifndef LIBCOLLECTOR_SRC
  return syscall (__NR_gettid);
#elif defined(intel)
  pid_t r;
  __asm__ __volatile__(syscall_instr
                       : "=a" (r) : "0" (__NR_gettid)
                       : syscall_clobber);
  return r;
#else
  return syscall (__NR_gettid); // FIXUP_XXX_SPARC_LINUX // write gettid in asm
#endif
}

/*---------------------------------------------------------------------------*/
/* types */

#define NPAGES_PER_BUF 1 // number of pages to be used for perf_event samples
                         // must be a power of 2

/*---------------------------------------------------------------------------*/

/* typedefs */

typedef struct
{ // event (hwc) definition
  unsigned int reg_num;               // PMC assignment, potentially for detecting conflicts
  eventsel_t eventsel;                // raw event bits (Intel/AMD)
  uint64_t counter_preload;           // number of HWC events before signal
  struct perf_event_attr hw;          // perf_event definition
  hrtime_t min_time;                  // minimum time we're targeting between events
  char *name;
} perf_event_def_t;

typedef struct
{ // runtime state of perf_event buffer
  void *buf;                          // pointer to mmapped buffer
  size_t pagesz;                      // size of pages
} buffer_state_t;

typedef struct
{ // runtime state of counter values
  uint64_t prev_ena_ts;               // previous perf_event "enabled" time
  uint64_t prev_run_ts;               // previous perf_event "running" time
  uint64_t prev_value;                // previous HWC value
} counter_value_state_t;

typedef struct
{ // per-counter information
  perf_event_def_t *ev_def;           // global HWC definition for one counter
  int fd;                             // perf_event fd
  buffer_state_t buf_state;           // perf_event buffer's state
  counter_value_state_t value_state;  // counter state
  int needs_restart;                  // workaround for dbx failure to preserve si_fd
  uint64_t last_overflow_period;
  hrtime_t last_overflow_time;
} counter_state_t;

typedef struct
{ // per-thread context
  counter_state_t *ctr_list;
  int signal_fd;                      // fd that caused the most recent signal
  pid_t tid;                          // for debugging signal delivery problems
} hdrv_pcl_ctx_t;

/*---------------------------------------------------------------------------*/

/* static variables */
static struct
{
  int library_ok;
  int internal_open_called;
  hwcfuncs_tsd_get_fn_t find_vpc_ctx;
  unsigned hwcdef_cnt;                /* number of *active* hardware counters */
  hwcdrv_get_events_fn_t *get_events;
} hdrv_pcl_state;

static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
static perf_event_def_t global_perf_event_def[MAX_PICS];

#define COUNTERS_ENABLED() (hdrv_pcl_state.hwcdef_cnt)


/* perf_event buffer formatting and handling */
static void
reset_buf (buffer_state_t *bufstate)
{
  TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
  struct perf_event_mmap_page *metadata = bufstate->buf;
  if (metadata)
    metadata->data_tail = metadata->data_head;
}

static int
skip_buf (buffer_state_t *bufstate, size_t sz)
{
  TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
  struct perf_event_mmap_page *metadata = bufstate->buf;
  if (metadata == NULL)
    return -1;
  size_t pgsz = bufstate->pagesz;
  size_t bufsz = NPAGES_PER_BUF * pgsz;
  uint64_t d_tail = metadata->data_tail;
  uint64_t d_head = metadata->data_head;

  // validate request size
  if (sz > d_head - d_tail || sz >= bufsz)
    {
      reset_buf (bufstate);
      return -1;
    }
  metadata->data_tail = d_tail + sz; // advance tail
  return 0;
}

static int
read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
{
  struct perf_event_mmap_page *metadata = bufstate->buf;
  if (metadata == NULL)
    return -1;
  size_t pgsz = bufstate->pagesz;
  size_t bufsz = NPAGES_PER_BUF * pgsz;
  uint64_t d_tail = metadata->data_tail;
  uint64_t d_head = metadata->data_head;

  // validate request size
  if (sz > d_head - d_tail || sz >= bufsz)
    {
      reset_buf (bufstate);
      return -1;
    }
  char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
  uint64_t start_pos = d_tail & (bufsz - 1);   // char offset into data buffer
  size_t nbytes = sz;
  if (start_pos + sz > bufsz)
    {
      // will wrap past end of buffer
      nbytes = bufsz - start_pos;
      memcpy (buf, buf_base + start_pos, nbytes);
      start_pos = 0; // wrap to start
      buf = (void *) (((char *) buf) + nbytes);
      nbytes = sz - nbytes;
    }
  memcpy (buf, buf_base + start_pos, nbytes);
  metadata->data_tail += sz;
  return 0;
}

static int
read_u64 (buffer_state_t *bufstate, uint64_t *value)
{
  return read_buf (bufstate, value, sizeof (uint64_t));
}

static int
read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
             uint64_t *rlost)
{
  // returns count of bytes read
  buffer_state_t *bufstate = &ctr_state->buf_state;
  counter_value_state_t *cntstate = &ctr_state->value_state;
  int readsz = 0;

  // PERF_SAMPLE_IP
  uint64_t ipc = 0;
  int rc = read_u64 (bufstate, &ipc);
  if (rc)
    return -1;
  readsz += sizeof (uint64_t);

  // PERF_SAMPLE_READ: value
  uint64_t value = 0;
  rc = read_u64 (bufstate, &value);
  if (rc)
    return -2;
  readsz += sizeof (uint64_t);

  /* Bug 20806896
   * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
   * running times in the sample data that correspond to the metadata times
   *     metadata->time_enabled
   *     metadata->time_running
   * from the PREVIOUS (not current) sample.  Probably just ignore this bug
   * since it's on old kernels and we only use the enabled and running times
   * to construct loss_estimate.
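   * (Below, the enabled/running deltas are only used to detect multiplexing
   * and, when running < enabled, to scale the raw count by enabled/running.)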
   */
  // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
  uint64_t enabled_time = 0;
  rc = read_u64 (bufstate, &enabled_time);
  if (rc)
    return -3;
  readsz += sizeof (uint64_t);

  // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
  uint64_t running_time = 0;
  rc = read_u64 (bufstate, &running_time);
  if (rc)
    return -4;
  readsz += sizeof (uint64_t);

  uint64_t value_delta = value - cntstate->prev_value;
  uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
  uint64_t running_delta = running_time - cntstate->prev_run_ts;
  cntstate->prev_value = value;
  cntstate->prev_ena_ts = enabled_time;
  cntstate->prev_run_ts = running_time;

  // 24830461 need workaround for Linux anomalous HWC skid overrun
  int set_error_flag = 0;
  if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
    set_error_flag = 1;

  uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
  if (running_delta == enabled_delta)
    {
      // counter was running 100% of time, no multiplexing
    }
  else if (running_delta == 0)
    loss_estimate = 1; // token amount to aid in debugging perfctr oddities
  else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
    {
      // running should be smaller than enabled, can't estimate
      /*
       * 21418391 HWC can have a negative count
       *
       * We've also seen enabled not only be smaller than running
       * but in fact go negative.  Guard against this.
       */
      loss_estimate = 2; // token amount to aid in debugging perfctr oddities
    }
  else
    {
      // counter was running less than 100% of time
      // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
      uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
      value_delta = scaled_delta;
#if 0
      // We should perhaps warn the user that multiplexing is going on,
      // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
      // For now we simply don't report.
      // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
      // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
      collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
                                     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
                                     ctr_list[idx].last_overflow_period, new_period);
#endif
    }
  TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
            "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
            "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
            ctr_state->ev_def->name, (long long) ipc,
            (long long) enabled_delta, (long long) running_delta,
            (long long) value_delta, (long long) value_delta,
            (unsigned long long) loss_estimate,
            loss_estimate ? ", WARNING - SCALED" : "",
            set_error_flag ?
            ", ERRORFLAG" : "");
  if (set_error_flag == 1)
    value_delta |= (1ULL << 63) /* HWCVAL_ERR_FLAG */;
  *rvalue = value_delta;
  *rlost = loss_estimate;
  if (readsz != msgsz)
    {
      TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
      return -5;
    }
  return 0;
}

static void
dump_perf_event_attr (struct perf_event_attr *at)
{
  TprintfT (DBG_LT2, "dump_perf_event_attr: size=%d type=%d sample_period=%lld\n"
            " config=0x%llx config1=0x%llx config2=0x%llx wakeup_events=%lld __reserved_1=%lld\n",
            (int) at->size, (int) at->type, (unsigned long long) at->sample_period,
            (unsigned long long) at->config, (unsigned long long) at->config1,
            (unsigned long long) at->config2, (unsigned long long) at->wakeup_events,
            (unsigned long long) at->__reserved_1);
#define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, " %-10s : %lld\n", #fld, (long long) at->fld)
  DUMP_F (disabled);
  DUMP_F (inherit);
  DUMP_F (pinned);
  DUMP_F (exclusive);
  DUMP_F (exclude_user);
  DUMP_F (exclude_kernel);
  DUMP_F (exclude_hv);
  DUMP_F (exclude_idle);
  // DUMP_F(xmmap);
  DUMP_F (comm);
  DUMP_F (freq);
  DUMP_F (inherit_stat);
  DUMP_F (enable_on_exec);
  DUMP_F (task);
  DUMP_F (watermark);
}

static void
init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period,
                 Hwcentry *hwce)
{
  memset (hw, 0, sizeof (struct perf_event_attr));
  hw->size = sizeof (struct perf_event_attr);
  if (hwce && hwce->use_perf_event_type)
    {
      hw->config = hwce->config;
      hw->type = hwce->type;
    }
  else
    { // backward compatibility.  The old interface had no 'hwce' argument.
      hw->config = event;
      hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
    }
  hw->sample_period = period;
  hw->sample_type = PERF_SAMPLE_IP |
          // PERF_SAMPLE_TID |
          // PERF_SAMPLE_TIME |      // possibly interesting
          // PERF_SAMPLE_ADDR |
          PERF_SAMPLE_READ |         // HWC value
          // PERF_SAMPLE_CALLCHAIN | // interesting
          // PERF_SAMPLE_ID |
          // PERF_SAMPLE_CPU |       // possibly interesting
          // PERF_SAMPLE_PERIOD |
          // PERF_SAMPLE_STREAM_ID |
          // PERF_SAMPLE_RAW |
          0;
  hw->read_format =
          PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled
          PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled
          // PERF_FORMAT_ID |
          // PERF_FORMAT_GROUP |
          0;
  hw->disabled = 1; /* off by default */

  // Note: the following override config.priv bits!
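  // (Event bits 16 and 17 carry the "user" and "system" attributes set up by
  // hwcfuncs_get_x86_eventsel(); a cleared bit becomes the corresponding
  // exclude_* field below.)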
  hw->exclude_user = (event & (1 << 16)) == 0;   /* don't count user */
  hw->exclude_kernel = (event & (1 << 17)) == 0; /* ditto kernel */
  hw->exclude_hv = 1;                            /* ditto hypervisor */
  hw->wakeup_events = 1;                         /* wakeup every n events */
  dump_perf_event_attr (hw);
}

static int
start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
{
  // pe_attr should have been initialized in hwcdrv_create_counters()
  struct perf_event_attr pe_attr;
  memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));

  // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
  pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;

  int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
  if (hwc_fd == -1)
    {
      TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
                error_string, ii, errno);
      return 1;
    }

  size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
  void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
                     PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
  if (buf == MAP_FAILED)
    {
      TprintfT (0, "sz = %ld, pgsz = %ld\n err=%s idx=%d mmap failed: %s\n",
                (long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
      return 1;
    }
  pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def? we never seem to use it
  pctx->ctr_list[ii].fd = hwc_fd;
  pctx->ctr_list[ii].buf_state.buf = buf;
  pctx->ctr_list[ii].buf_state.pagesz = pgsz;
  pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
  pctx->ctr_list[ii].value_state.prev_run_ts = 0;
  pctx->ctr_list[ii].value_state.prev_value = 0;
  pctx->ctr_list[ii].last_overflow_time = gethrtime ();

  /* set async mode */
  long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
  int rc = fcntl (hwc_fd, F_SETFL, flags);
  if (rc == -1)
    {
      TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
      return 1;
    }

  /*
   * set lwp ownership of the fd
   * See BUGS section of "man perf_event_open":
   *     The F_SETOWN_EX option to fcntl(2) is needed to properly get
   *     overflow signals in threads.  This was introduced in Linux 2.6.32.
   * Legacy references:
   *     see http://lkml.org/lkml/2009/8/4/128
   *     google man fcntl F_SETOWN_EX -conflict
   *       "From Linux 2.6.32 onward, use F_SETOWN_EX to target
   *        SIGIO and SIGURG signals at a particular thread."
   *     http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
   *     See 2010 CSCADS presentation by Eranian
   */
  struct f_owner_ex fowner_ex;
  fowner_ex.type = F_OWNER_TID;
  fowner_ex.pid = pctx->tid;
  rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
  if (rc == -1)
    {
      TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
      return 1;
    }

  /* Use sigio so handler can determine FD via siginfo->si_fd.
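   * With F_SETSIG (SIGIO) plus the F_SETOWN_EX above, overflow notifications
   * arrive as a thread-directed SIGIO whose siginfo carries si_code POLL_HUP
   * and si_fd; hwcdrv_overflow() relies on that si_fd to pick the counter.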
   */
  rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
  if (rc == -1)
    {
      TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
      return 1;
    }
  return 0;
}

static int
stop_one_ctr (int ii, counter_state_t *ctr_list)
{
  int hwc_rc = 0;
  if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
    {
      TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
      hwc_rc = HWCFUNCS_ERROR_GENERIC;
    }
  void *buf = ctr_list[ii].buf_state.buf;
  if (buf)
    {
      size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
      ctr_list[ii].buf_state.buf = NULL;
      int tmprc = munmap (buf, bufsz);
      if (tmprc)
        {
          TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
          hwc_rc = HWCFUNCS_ERROR_GENERIC;
        }
    }
  if (-1 == close (ctr_list[ii].fd))
    {
      TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
      hwc_rc = HWCFUNCS_ERROR_GENERIC;
    }
  return hwc_rc;
}

/* HWCDRV_API for thread-specific actions */
HWCDRV_API int
hwcdrv_lwp_init (void)
{
  return hwcdrv_start ();
}

HWCDRV_API void
hwcdrv_lwp_fini (void)
{
  hwcdrv_free_counters (); /* also sets pctx->ctr_list=NULL; */
}

/* open */
static int
hdrv_pcl_internal_open ()
{
  if (hdrv_pcl_state.internal_open_called)
    {
      TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
      return HWCFUNCS_ERROR_ALREADY_CALLED;
    }

  // determine if PCL is available
  perf_event_def_t tmp_event_def;
  memset (&tmp_event_def, 0, sizeof (tmp_event_def));
  struct perf_event_attr *pe_attr = &tmp_event_def.hw;
  init_perf_event (pe_attr, 0, 0, NULL);
  pe_attr->type = PERF_TYPE_HARDWARE;           // specify abstracted HW event
  pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
  int hwc_fd = perf_event_open (pe_attr,
                                0,  // pid/tid, 0 is self
                                -1, // cpu, -1 is per-thread mode
                                -1, // group_fd, -1 is root
                                0); // flags
  if (hwc_fd == -1)
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
                " perf_event_open() failed, errno=%d\n", errno);
      goto internal_open_error;
    }

  /* see if the PCL is new enough to know about F_SETOWN_EX */
  struct f_owner_ex fowner_ex;
  fowner_ex.type = F_OWNER_TID;
  fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
  if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
                "F_SETOWN failed, errno=%d\n", errno);
      close (hwc_fd);
      goto internal_open_error;
    }
  close (hwc_fd);

  hdrv_pcl_state.internal_open_called = 1;
  hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
  hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
  TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
  for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
    {
      hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
      if (!ppcbe->hdrv_pcbe_init ())
        {
          hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
          hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
          if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
            goto internal_open_error;
          hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
          hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
          hdrv_pcl_state.get_events =
            ppcbe->hdrv_pcbe_get_events;
          hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum;
          break;
        }
    }
  if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
    {
      TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
                " reducing number of HWCs from %u to %u on processor '%s'\n",
                hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
      hdrv_pcl_about.cpcN_npics = MAX_PICS;
    }
  TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
            " perf_event cpuver=%d, name='%s'\n",
            hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
  return 0;

internal_open_error:
  hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
  hdrv_pcl_about.cpcN_npics = 0;
  hdrv_pcl_about.cpcN_docref = NULL;
  hdrv_pcl_about.cpcN_cciname = NULL;
  return HWCFUNCS_ERROR_NOT_SUPPORTED;
}

static void *
single_thread_tsd_ftn ()
{
  static hdrv_pcl_ctx_t tsd_context;
  return &tsd_context;
}

/* HWCDRV_API */
HWCDRV_API int
hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
{
  hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
  if (tsd_sz)
    *tsd_sz = sizeof (hdrv_pcl_ctx_t);

  if (hdrv_pcl_state.internal_open_called)
    return HWCFUNCS_ERROR_ALREADY_CALLED;
  return hdrv_pcl_internal_open ();
}

HWCDRV_API void
hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
                 const char **docref, uint64_t *support)
{
  if (cpuver)
    *cpuver = hdrv_pcl_about.cpcN_cpuver;
  if (cciname)
    *cciname = hdrv_pcl_about.cpcN_cciname;
  if (npics)
    *npics = hdrv_pcl_about.cpcN_npics;
  if (docref)
    *docref = hdrv_pcl_about.cpcN_docref;
  if (support)
    *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
}

HWCDRV_API int
hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
{
  if (tsd_ftn)
    hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
  else
    {
      TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
      return HWCFUNCS_ERROR_UNAVAIL;
    }
  return 0;
}

HWCDRV_API int
hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb)
{
  int count = 0;
  if (hwc_cb && hdrv_pcl_state.get_events)
    count = hdrv_pcl_state.get_events (hwc_cb);
  if (attr_cb)
    for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
      attr_cb (perfctr_attrs_table[ii].attrname);
  if (!count)
    return -1;
  return 0;
}

HWCDRV_API int
hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
{
  return hwcdrv_assign_all_regnos (entries, numctrs);
}

static int
internal_hwc_start (int fd)
{
  int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
  if (rc == -1)
    {
      TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
                " PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
      return HWCFUNCS_ERROR_UNAVAIL;
    }
  TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
  return 0;
}

HWCDRV_API int
hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
{
  /* set expired counters to overflow value and all others to 0 */
  /* return 0: OK, counters should be restarted */
  /* return non-zero: eventp not set, counters should not be restarted */
  /* clear return values */
  int ii;
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      eventp->ce_pic[ii] = 0;
      lost_events->ce_pic[ii] = 0;
    }
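
  /* Overall flow: identify the fd from siginfo if possible, drain each
   * counter's mmap ring buffer of pending records, and then re-arm the
   * counters that produced records.  */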
  hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
  eventp->ce_hrt = sig_ts;
  lost_events->ce_hrt = sig_ts;

  /* determine source signal */
  int signal_fd = -1;
  switch (si->si_code)
    {
    case POLL_HUP: /* expected value from pcl */
      /* According to Stephane Eranian:
       *   "expect POLL_HUP instead of POLL_IN because we are
       *    in one-shot mode (IOC_REFRESH)"
       */
      signal_fd = si->si_fd;
      break;
    case SI_TKILL: /* event forwarded by tkill */
      /* DBX can only forward SI_TKILL when it detects POLL_HUP
       * unfortunately, this means that si->si_fd has been lost...
       * We need to process the buffers, but we don't know the fd!
       */
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                " SI_TKILL detected\n", sig_ts);
      break;
    default:
      // "sometimes we see a POLL_IN (1) with very high event rates,"
      // according to eranian(?)
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                " unexpected si_code 0x%x\n", sig_ts, si->si_code);
      return HWCFUNCS_ERROR_GENERIC;
    }

  hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                " tsd context is NULL\n", sig_ts);
      return HWCFUNCS_ERROR_UNEXPECTED;
    }
  counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  if (!ctr_list)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                " ctr_list is NULL\n", sig_ts);
      return HWCFUNCS_ERROR_UNEXPECTED;
    }

  /* clear needs_restart flag */
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    ctr_list[ii].needs_restart = 0;

  /* attempt to identify the counter to read */
  int signal_idx = -1;
  pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
  if (signal_fd != -1)
    {
      for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
        {
          if (ctr_list[ii].fd == signal_fd)
            {
              signal_idx = ii;
              break;
            }
        }
    }

  if (signal_idx < 0)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                " pmc not determined!\n", sig_ts);
      lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
      // note: bogus value may get overwritten in loop below
    }

  /* capture sample(s).  In addition to signal_idx, check other counters.
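   * Each PERF_RECORD_SAMPLE record read by read_sample() is expected to hold
   * four u64 fields, matching the sample_type/read_format set in
   * init_perf_event(): the IP, the counter value, time_enabled, and
   * time_running.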
   */
  struct perf_event_header sheader;
  int idx;
  for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
    {
      int num_recs = 0;
      while (1)
        {
          /* check for samples */
          struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
          if (metadata == NULL)
            break; // empty
          if (metadata->data_tail == metadata->data_head)
            break; // empty

          /* read header */
          if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
            break;
          num_recs++;

          /* check for PERF_RECORD_SAMPLE */
          size_t datasz = sheader.size - sizeof (struct perf_event_header);
          if (sheader.type != PERF_RECORD_SAMPLE)
            {
              TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                        " unexpected recd type=%d\n",
                        sig_ts, sheader.type);
              if (skip_buf (&ctr_list[idx].buf_state, datasz))
                {
                  TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                            " skip recd type=%d failed\n", sig_ts, sheader.type);
                  lost_events->ce_pic[idx] = 4; /* record a bogus value */
                  break; // failed to skip buffer??
                }
              lost_events->ce_pic[idx] = 2; /* record a bogus value */
              continue; // advance to next record
            }

          /* type is PERF_RECORD_SAMPLE */
          uint64_t value, lostv;
          if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
            {
              TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                        " read_sample() failed\n", sig_ts);
              lost_events->ce_pic[idx] = 3; // record a bogus value
              break; // failed to read sample data??
            }
          TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
                    " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
                    idx, (unsigned long long) value, (unsigned long long) lostv);
          if (eventp->ce_pic[idx])
            {
              TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                        " idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
              lost_events->ce_pic[idx] += eventp->ce_pic[idx];
            }
          eventp->ce_pic[idx] = value;
          lost_events->ce_pic[idx] += lostv;
        }

      /* debug output for unexpected (but common) cases */
      if (idx == signal_idx)
        {
          if (num_recs != 1)
            TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                      " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
        }
      else if (num_recs)
        TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                  " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
                  sig_ts, num_recs, idx, signal_idx);

      /* trigger counter restart whenever records were found */
      if (num_recs)
        {
          /* check whether to adapt the overflow interval */
          /* This is the Linux version.
           * The Solaris version is in hwprofile.c collector_update_overflow_counters().
           */
          hrtime_t min_time = global_perf_event_def[idx].min_time;
          if (min_time > 0 // overflow interval is adaptive
              && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
            {
              /* pick a new overflow interval */
              /* roughly doubled, but add funny numbers */
              /* hopefully the result is prime or not a multiple of some # of ops/loop */
              uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
#if 0
              // On Solaris, we report the adjustment to the log file.
              // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
              // For now we simply don't report.
              collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
                                             SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
                                             ctr_list[idx].last_overflow_period, new_period);
#endif
              /* There are a variety of ways of resetting the period on Linux.
               * The most elegant is
               *     ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
               * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
               *   > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
               *   > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
               *     until after the next overflow.
               * So we're kind of stuck shutting the fd down and restarting it with the new period.
               */
              if (stop_one_ctr (idx, ctr_list))
                {
                  // EUGENE figure out what to do on error
                }
              ctr_list[idx].last_overflow_period = new_period;
              if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
                {
                  // EUGENE figure out what to do on error
                }
            }
          ctr_list[idx].last_overflow_time = sig_ts;
#if 0
          ctr_list[idx].needs_restart = 1;
#else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
          internal_hwc_start (ctr_list[idx].fd);
#endif
        }
    }
  return 0; // OK to restart counters
}

HWCDRV_API int
hwcdrv_sighlr_restart (const hwc_event_t *pp)
{
#if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
  hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
      return -1;
    }
  counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  if (!ctr_list)
    {
      TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
      return -1;
    }
  int errors = 0;
  for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      if (ctr_list[ii].needs_restart)
        errors |= internal_hwc_start (ctr_list[ii].fd);
      ctr_list[ii].needs_restart = 0;
    }
  return errors;
#else
  return 0;
#endif
}

/* create counters based on hwcdef[] */
HWCDRV_API int
hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
{
  if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
    {
      logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
      return HWCFUNCS_ERROR_HWCARGS;
    }
  if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
    {
      logerr (GTXT ("Processor not supported\n"));
      return HWCFUNCS_ERROR_HWCARGS;
    }

  /* add counters */
  for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
    {
      perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
      memset (glb_event_def, 0, sizeof (perf_event_def_t));
      unsigned int pmc_sel;
      eventsel_t evntsel;
      if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num,
                                     hwcdef[idx].int_name, &evntsel, &pmc_sel))
        {
          TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
          return HWCFUNCS_ERROR_HWCARGS;
        }
      glb_event_def->reg_num = pmc_sel;
      glb_event_def->eventsel = evntsel;
      glb_event_def->counter_preload = hwcdef[idx].val;
      glb_event_def->min_time = hwcdef[idx].min_time;
      glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
      init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
                       glb_event_def->counter_preload, hwcdef + idx);
      TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
                "(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
                idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
                (long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
                (long long) glb_event_def->eventsel,
                (long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
                (long long) glb_event_def->hw.exclude_user,
                (long long) glb_event_def->hw.exclude_kernel);
    }

  hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
  return 0;
}

HWCDRV_API int
hwcdrv_free_counters () // note: only performs shutdown for this thread
{
  hdrv_pcl_ctx_t * pctx;
  if (!COUNTERS_ENABLED ())
    return 0;
  pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
      return HWCFUNCS_ERROR_GENERIC;
    }
  counter_state_t *ctr_list = pctx->ctr_list;
  if (!ctr_list)
    {
      // fork child: prolog suspends hwcs, then epilog frees them
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
      return 0;
    }
  int hwc_rc = 0;
  for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    if (stop_one_ctr (ii, ctr_list))
      hwc_rc = HWCFUNCS_ERROR_GENERIC;
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", (long) pctx->tid);
  pctx->ctr_list = NULL;
  return hwc_rc;
}

HWCDRV_API int
hwcdrv_start (void) /* must be called from each thread ? */
{
  hdrv_pcl_ctx_t *pctx = NULL;
  if (!COUNTERS_ENABLED ())
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
      return 0;
    }
  if (!hdrv_pcl_state.library_ok)
    {
      TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
      return HWCFUNCS_ERROR_NOT_SUPPORTED;
    }

  /*
   * set up per-thread context
   */
  pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
      return HWCFUNCS_ERROR_UNEXPECTED;
    }
  pctx->tid = hwcdrv_gettid ();
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", (long) pctx->tid);

  /*
   * create per-thread counter list
   */
  counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
                                                          sizeof (counter_state_t));
  if (!ctr_list)
    {
      TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
      return HWCFUNCS_ERROR_MEMORY;
    }
  int ii;
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
  pctx->ctr_list = ctr_list;

  /*
   * bind the counters
   */
  size_t pgsz = sysconf (_SC_PAGESIZE);
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
      if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:"))
        goto hwcdrv_start_cleanup;
    }

  /*
   * start the counters
   */
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      int rc = internal_hwc_start (ctr_list[ii].fd);
      if (rc < 0)
        goto hwcdrv_start_cleanup;
    }
  return 0;

hwcdrv_start_cleanup:
  hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
  return HWCFUNCS_ERROR_UNAVAIL;
}

HWCDRV_API int
hwcdrv_lwp_suspend (void) /* must be called from each thread */
{
  if (!COUNTERS_ENABLED ())
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
      return 0;
    }
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
  return hwcdrv_free_counters ();
}

HWCDRV_API int
hwcdrv_lwp_resume (void) /* must be called from each thread */
{
  if (!COUNTERS_ENABLED ())
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
      return 0;
    }
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
  return hwcdrv_start ();
}

HWCDRV_API int
hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
{
  overflow_data->ce_hrt = 0;
  for (int i = 0; i < MAX_PICS; i++)
    {
      overflow_data->ce_pic[i] = 0;
      if (sampled_data)
        HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
    }
  return 0;
}

/*---------------------------------------------------------------------------*/
/* HWCDRV_API */

hwcdrv_api_t hwcdrv_pcl_api = {
  hwcdrv_init,
  hwcdrv_get_info,
  hwcdrv_enable_mt,
  hwcdrv_get_descriptions,
  hwcdrv_assign_regnos,
  hwcdrv_create_counters,
  hwcdrv_start,
  hwcdrv_overflow,
  hwcdrv_read_events,
  hwcdrv_sighlr_restart,
  hwcdrv_lwp_suspend,
  hwcdrv_lwp_resume,
  hwcdrv_free_counters,
  hwcdrv_lwp_init,
  hwcdrv_lwp_fini,
  -1 // hwcdrv_init_status
};
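
#if 0
/* Illustrative only (not compiled): a rough sketch of the call sequence a
 * consumer of this driver might follow, using only the functions defined
 * above.  Real clients go through the hwcdrv_api_t table; the names
 * example_usage, my_hwctab, and nctrs are hypothetical.
 */
static void
example_usage (Hwcentry *my_hwctab, unsigned nctrs, siginfo_t *si)
{
  int tsd_sz = 0;
  hwc_event_t ev, lost;
  if (hwcdrv_init (NULL, &tsd_sz))               // probe perf_event support
    return;
  if (hwcdrv_create_counters (nctrs, my_hwctab)) // define the counters
    return;
  hwcdrv_start ();                               // per-thread: open fds, mmap, enable
  /* ... later, from the SIGIO handler: */
  if (hwcdrv_overflow (si, &ev, &lost) == 0)     // drain samples into ev/lost
    hwcdrv_sighlr_restart (&ev);                 // counters may be re-armed
  hwcdrv_free_counters ();                       // per-thread shutdown
}
#endif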