/* Copyright (C) 2021 Free Software Foundation, Inc.
   Contributed by Oracle.

   This file is part of GNU Binutils.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA. */

#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#include "hwcdrv.h"

/*---------------------------------------------------------------------------*/
/* macros */
#define IS_GLOBAL /* Mark global symbols */

#include "cpuid.c" /* ftns for identifying a chip */

static hdrv_pcbe_api_t hdrv_pcbe_core_api;
static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
  &hdrv_pcbe_core_api,
  &hdrv_pcbe_opteron_api,
  NULL
};
#include "opteron_pcbe.c" /* CPU-specific code */
#include "core_pcbe.c"    /* CPU-specific code */

extern hwcdrv_api_t hwcdrv_pcl_api;
IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
  &hwcdrv_pcl_api,
  NULL
};

/*---------------------------------------------------------------------------*/

/* utils for drivers */
IS_GLOBAL int
hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
{
  unsigned int pmc_assigned[MAX_PICS];
  unsigned idx;
  for (int ii = 0; ii < MAX_PICS; ii++)
    pmc_assigned[ii] = 0;

  /* assign the HWCs that we already know about */
  for (idx = 0; idx < numctrs; idx++)
    {
      regno_t regno = entries[idx]->reg_num;
      if (regno == REGNO_ANY)
        {
          /* check to see if list of possible registers only contains one entry */
          regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
        }
      if (regno != REGNO_ANY)
        {
          if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
            {
              logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
              return HWCFUNCS_ERROR_HWCARGS;
            }
          TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
          entries[idx]->reg_num = regno; /* assigning back to entries */
          pmc_assigned[regno] = 1;
        }
    }

  /* assign HWCs that are currently REGNO_ANY */
  for (idx = 0; idx < numctrs; idx++)
    {
      if (entries[idx]->reg_num == REGNO_ANY)
        {
          int assigned = 0;
          regno_t *reg_list = entries[idx]->reg_list;
          for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
            {
              regno_t regno = *reg_list;
              if (regno < 0 || regno >= MAX_PICS)
                {
                  logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
                  return HWCFUNCS_ERROR_HWCARGS;
                }
              if (pmc_assigned[regno] == 0)
                {
                  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned: idx=%d, regno=%d\n", idx, regno);
                  entries[idx]->reg_num = regno; /* assigning back to entries */
                  pmc_assigned[regno] = 1;
                  assigned = 1;
                  break;
                }
            }
          if (!assigned)
            {
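              /* None of the registers in this counter's reg_list (if it has
                 one) is still available.  */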
              logerr (GTXT ("Counter '%s' could not be bound to a register\n"),
                      entries[idx]->name ? entries[idx]->name : "<NULL>");
              return HWCFUNCS_ERROR_HWCARGS;
            }
        }
    }
  return 0;
}

IS_GLOBAL int
hwcdrv_lookup_cpuver (const char * cpcN_cciname)
{
  libcpc2_cpu_lookup_t *plookup;
  static libcpc2_cpu_lookup_t cpu_table[] = {
    LIBCPC2_CPU_LOOKUP_LIST
  };
  if (cpcN_cciname == NULL)
    return CPUVER_UNDEFINED;

  /* search table for name */
  for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
    {
      int n = strlen (plookup->cpc2_cciname);
      if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
        return plookup->cpc2_cpuver;
    }
  /* unknown, but does have a descriptive string */
  TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
            "could not be determined\n",
            cpcN_cciname);
  return CPUVER_GENERIC;
}

/*---------------------------------------------------------------------------*/
/* utils to generate x86 register definitions on Linux */

/*
 * This code is structured as though we're going to initialize the
 * HWC by writing the Intel MSR register directly. That is, we
 * assume the lowest 16 bits of the event number will have the event
 * and that higher bits will set attributes.
 *
 * While SPARC is different, we can nonetheless use basically the
 * same "x86"-named functions:
 *
 *   - The event code will still be 16 bits. It will still
 *     be in the lowest 16 bits of the event number. Though
 *     perf_event_code() on SPARC will expect those bits to be
 *     shifted, hwcdrv_pcl.c can easily perform that shift.
 *
 *   - On SPARC we support only two attributes, "user" and "system",
 *     which hwcdrv_pcl.c already converts to the "exclude_user"
 *     and "exclude_kernel" fields expected by perf_event_open().
 *     "user" and "system" are stored in event bits 16 and 17.
 *     For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
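 *
 *   To summarize the resulting event-word layout used below:
 *     bits [15:0]   event code
 *     bit  16       count in user mode ("user")
 *     bit  17       count in system mode ("system")
 *     bits [23:20]  (SPARC M8 only) mask of PICs that can host the event
 *   init_perf_event() below turns bits 16 and 17 into the exclude_user
 *   and exclude_kernel fields of struct perf_event_attr.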
 */

IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0;

static const attr_info_t perfctr_sparc_attrs[] = {
  {NTXT ("user"),   0, 0x01, 16}, //usr
  {NTXT ("system"), 0, 0x01, 17}, //os
  {NULL,            0, 0x00, 0},
};
static const attr_info_t perfctr_x64_attrs[] = { /* ok for Core2 & later */
  {NTXT ("umask"),  0, 0xff, 8},
  {NTXT ("user"),   0, 0x01, 16}, //usr
  //{NTXT("nouser"), 1, 0x01, 16}, //usr (inverted)
  {NTXT ("system"), 0, 0x01, 17}, //os
  {NTXT ("edge"),   0, 0x01, 18},
  {NTXT ("pc"),     0, 0x01, 19},
  {NTXT ("inv"),    0, 0x01, 23},
  {NTXT ("cmask"),  0, 0xff, 24},
  {NULL,            0, 0x00, 0},
};
const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;

static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
    // (0xff <<  0) | /* event */
    // (0xff <<  8) | /* umask */
    // (0x01 << 17) | /* os */
    // (0x01 << 18) | /* edge */
    // (0x01 << 19) | /* pc */
    (0x01 << 20) |    /* int */
    // (0x01 << 21) | /* reserved */
    (0x01 << 22) |    /* enable */
    // (0x01 << 23) | /* inv */
    // (0xff << 24) | /* cmask */
    0;

static int
myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
                            eventsel_t *eventsel, eventsel_t *valid_umask,
                            uint_t *pmc_sel)
{
  if (hwcdrv_get_x86_eventnum &&
      !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel))
    return 0;

  /* check for numerically-specified counters */
  char * endptr;
  uint64_t num = strtoull (eventname, &endptr, 0);
  if (*eventname && !*endptr)
    {
      *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
      *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
      *pmc_sel = pmc;
      return 0;
    }

  /* name does not specify a numeric value */
  *eventsel = (eventsel_t) - 1;
  *valid_umask = 0x0;
  *pmc_sel = pmc;
  return -1;
}

static int
mask_shift_set (eventsel_t *presult, eventsel_t invalue,
                eventsel_t mask, eventsel_t shift)
{
  if (invalue & ~mask)
    return -1; /* invalue attempts to set bits outside of mask */
  *presult &= ~(mask << shift);   /* clear all the mask bits */
  *presult |= (invalue << shift); /* set bits according to invalue */
  return 0;
}

static int
set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
                   hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
{
  eventsel_t evntsel = *result_mask;
  for (int ii = 0; ii < (int) nattrs; ii++)
    {
      const char *attrname = attrs[ii].ca_name;
      eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
      const char *tmpname;
      int attr_found = 0;
      for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
        {
          if (strcmp (attrname, tmpname) == 0)
            {
              if (strcmp (attrname, "umask") == 0)
                {
                  if (attrval & ~evnt_valid_umask)
                    {
                      logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
                              nameOnly, (long long) evnt_valid_umask);
                      return -1;
                    }
                }
              if (mask_shift_set (&evntsel,
                                  perfctr_attrs_table[jj].is_inverted ?
                                  (attrval^1) : attrval,
                                  perfctr_attrs_table[jj].mask,
                                  perfctr_attrs_table[jj].shift))
                {
                  logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
                          nameOnly, attrname, (long long) attrval);
                  return -1;
                }
              TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
                        nameOnly, attrname, (long long) attrval);
              attr_found = 1;
              break;
            }
        }
      if (!attr_found)
        {
          logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
          return -1;
        }
    }
  *result_mask = evntsel;
  return 0;
}

IS_GLOBAL int
hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name,
                           eventsel_t *return_event, uint_t *return_pmc_sel)
{
  hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
  unsigned nattrs = 0;
  char *nameOnly = NULL;
  eventsel_t evntsel = 0; // event number
  eventsel_t evnt_valid_umask = 0;
  uint_t pmc_sel = 0;
  int rc = -1;
  *return_event = 0;
  *return_pmc_sel = 0;
  void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS,
                                         &nattrs, NULL);
  if (!attr_mem)
    {
      logerr (GTXT ("out of memory, could not parse attributes\n"));
      return -1;
    }
  hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL);
  if (regno == REGNO_ANY)
    {
      logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly);
      goto attr_wrapup;
    }

  /* look up evntsel */
  if (myperfctr_get_x86_eventnum (nameOnly, regno,
                                  &evntsel, &evnt_valid_umask, &pmc_sel))
    {
      logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
      goto attr_wrapup;
    }
  TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
            (long long) evntsel, pmc_sel, nameOnly, nattrs);

  /* determine event attributes */
  eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
  if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
    goto attr_wrapup;
  if (evntsel & evnt_attrs)
    TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
              (long long) evntsel, (long long) evnt_attrs,
              (long long) (evntsel & evnt_attrs));
  *return_event = evntsel | evnt_attrs;
  *return_pmc_sel = pmc_sel;
  rc = 0;

attr_wrapup:
  free (attr_mem);
  free (nameOnly);
  return rc;
}

#ifdef __x86_64__
#define syscall_instr   "syscall"
#define syscall_clobber "rcx", "r11", "memory"
#endif
#ifdef __i386__
#define syscall_instr   "int $0x80"
#define syscall_clobber "memory"
#endif

static inline int
perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
                 int cpu, int group_fd, unsigned long flags)
{
  /* It seems that perf_event_open() sometimes fails spuriously,
   * even while an immediate retry succeeds.
   * So, let's try a few retries if the call fails just to be sure.
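   * (The loop below makes up to 5 attempts and returns the result of
   * the last one.)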
   */
  int rc;
  for (int retry = 0; retry < 5; retry++)
    {
      rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
      if (rc != -1)
        return rc;
    }
  return rc;
}

/*---------------------------------------------------------------------------*/
/* macros & fwd prototypes */

#define HWCDRV_API static /* Mark functions used by hwcdrv API */

HWCDRV_API int hwcdrv_start (void);
HWCDRV_API int hwcdrv_free_counters ();

static pid_t
hwcdrv_gettid (void)
{
#ifndef LIBCOLLECTOR_SRC
  return syscall (__NR_gettid);
#elif defined(intel)
  pid_t r;
  __asm__ __volatile__(syscall_instr
                       : "=a" (r) : "0" (__NR_gettid)
                       : syscall_clobber);
  return r;
#else
  return syscall (__NR_gettid); // FIXUP_XXX_SPARC_LINUX // write gettid in asm
#endif
}

/*---------------------------------------------------------------------------*/
/* types */

#define NPAGES_PER_BUF 1 // number of pages to be used for perf_event samples
                         // must be a power of 2

/*---------------------------------------------------------------------------*/

/* typedefs */

typedef struct
{ // event (hwc) definition
  unsigned int reg_num;              // PMC assignment, potentially for detecting conflicts
  eventsel_t eventsel;               // raw event bits (Intel/AMD)
  uint64_t counter_preload;          // number of HWC events before signal
  struct perf_event_attr hw;         // perf_event definition
  hrtime_t min_time;                 // minimum time we're targeting between events
  char *name;
} perf_event_def_t;

typedef struct
{ // runtime state of perf_event buffer
  void *buf;                         // pointer to mmapped buffer
  size_t pagesz;                     // size of pages
} buffer_state_t;

typedef struct
{ // runtime state of counter values
  uint64_t prev_ena_ts;              // previous perf_event "enabled" time
  uint64_t prev_run_ts;              // previous perf_event "running" time
  uint64_t prev_value;               // previous HWC value
} counter_value_state_t;

typedef struct
{ // per-counter information
  perf_event_def_t *ev_def;          // global HWC definition for one counter
  int fd;                            // perf_event fd
  buffer_state_t buf_state;          // perf_event buffer's state
  counter_value_state_t value_state; // counter state
  int needs_restart;                 // workaround for dbx failure to preserve si_fd
  uint64_t last_overflow_period;
  hrtime_t last_overflow_time;
} counter_state_t;

typedef struct
{ // per-thread context
  counter_state_t *ctr_list;
  int signal_fd;                     // fd that caused the most recent signal
  pthread_t tid;                     // for debugging signal delivery problems
} hdrv_pcl_ctx_t;

/*---------------------------------------------------------------------------*/

/* static variables */
static struct
{
  int library_ok;
  int internal_open_called;
  hwcfuncs_tsd_get_fn_t find_vpc_ctx;
  unsigned hwcdef_cnt; /* number of *active* hardware counters */
  hwcdrv_get_events_fn_t *get_events;
} hdrv_pcl_state;

static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
static perf_event_def_t global_perf_event_def[MAX_PICS];

#define COUNTERS_ENABLED() (hdrv_pcl_state.hwcdef_cnt)


/* perf_event buffer formatting and handling */
static void
reset_buf (buffer_state_t *bufstate)
{
  TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
  struct perf_event_mmap_page *metadata = bufstate->buf;
  if (metadata)
    metadata->data_tail = metadata->data_head;
}

static int
skip_buf (buffer_state_t *bufstate, size_t sz)
{
  TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
  struct perf_event_mmap_page *metadata = bufstate->buf;
  if (metadata == NULL)
    return -1;
  size_t pgsz = bufstate->pagesz;
  size_t bufsz = NPAGES_PER_BUF * pgsz;
  uint64_t d_tail = metadata->data_tail;
  uint64_t d_head = metadata->data_head;

  // validate request size
  if (sz > d_head - d_tail || sz >= bufsz)
    {
      reset_buf (bufstate);
      return -1;
    }
  metadata->data_tail = d_tail + sz; // advance tail
  return 0;
}

static int
read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
{
  struct perf_event_mmap_page *metadata = bufstate->buf;
  if (metadata == NULL)
    return -1;
  size_t pgsz = bufstate->pagesz;
  size_t bufsz = NPAGES_PER_BUF * pgsz;
  uint64_t d_tail = metadata->data_tail;
  uint64_t d_head = metadata->data_head;

  // validate request size
  if (sz > d_head - d_tail || sz >= bufsz)
    {
      reset_buf (bufstate);
      return -1;
    }
  char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
  uint64_t start_pos = d_tail & (bufsz - 1);   // char offset into data buffer
  size_t nbytes = sz;
  if (start_pos + sz > bufsz)
    {
      // will wrap past end of buffer
      nbytes = bufsz - start_pos;
      memcpy (buf, buf_base + start_pos, nbytes);
      start_pos = 0; // wrap to start
      buf = (void *) (((char *) buf) + nbytes);
      nbytes = sz - nbytes;
    }
  memcpy (buf, buf_base + start_pos, nbytes);
  metadata->data_tail += sz;
  return 0;
}

static int
read_u64 (buffer_state_t *bufstate, uint64_t *value)
{
  return read_buf (bufstate, value, sizeof (uint64_t));
}

static int
read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
             uint64_t *rlost)
{
  // returns count of bytes read
  buffer_state_t *bufstate = &ctr_state->buf_state;
  counter_value_state_t *cntstate = &ctr_state->value_state;
  int readsz = 0;

  // PERF_SAMPLE_IP
  uint64_t ipc = 0;
  int rc = read_u64 (bufstate, &ipc);
  if (rc)
    return -1;
  readsz += sizeof (uint64_t);

  // PERF_SAMPLE_READ: value
  uint64_t value = 0;
  rc = read_u64 (bufstate, &value);
  if (rc)
    return -2;
  readsz += sizeof (uint64_t);

  /* Bug 20806896
   * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
   * running times in the sample data that correspond to the metadata times
   *     metadata->time_enabled
   *     metadata->time_running
   * from the PREVIOUS (not current) sample.  Probably just ignore this bug
   * since it's on old kernels and we only use the enabled and running times
   * to construct loss_estimate.
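   * (Stale times would at worst skew loss_estimate and the multiplexing
   * scaling of value_delta computed below.)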
   */
  // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
  uint64_t enabled_time = 0;
  rc = read_u64 (bufstate, &enabled_time);
  if (rc)
    return -3;
  readsz += sizeof (uint64_t);

  // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
  uint64_t running_time = 0;
  rc = read_u64 (bufstate, &running_time);
  if (rc)
    return -4;
  readsz += sizeof (uint64_t);

  uint64_t value_delta = value - cntstate->prev_value;
  uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
  uint64_t running_delta = running_time - cntstate->prev_run_ts;
  cntstate->prev_value = value;
  cntstate->prev_ena_ts = enabled_time;
  cntstate->prev_run_ts = running_time;

  // 24830461 need workaround for Linux anomalous HWC skid overrun
  int set_error_flag = 0;
  if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
    set_error_flag = 1;

  uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
  if (running_delta == enabled_delta)
    {
      // counter was running 100% of time, no multiplexing
    }
  else if (running_delta == 0)
    loss_estimate = 1; // token amount to aid in debugging perfctr oddities
  else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
    {
      // running should be smaller than enabled, can't estimate
      /*
       * 21418391 HWC can have a negative count
       *
       * We've also seen enabled not only be smaller than running
       * but in fact go negative.  Guard against this.
       */
      loss_estimate = 2; // token amount to aid in debugging perfctr oddities
    }
  else
    {
      // counter was running less than 100% of time
      // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
      uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
      value_delta = scaled_delta;
#if 0
      // We should perhaps warn the user that multiplexing is going on,
      // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
      // For now we simply don't report.
      // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
      // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
      collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
                                     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
                                     ctr_list[idx].last_overflow_period, new_period);
#endif
    }
  TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
            "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
            "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
            ctr_state->ev_def->name, (long long) ipc,
            (long long) enabled_delta, (long long) running_delta,
            (long long) value_delta, (long long) value_delta,
            (unsigned long long) loss_estimate,
            loss_estimate ? ", WARNING - SCALED" : "",
            set_error_flag ? ", ERRORFLAG" : "");
  if (set_error_flag == 1)
    value_delta |= (1ULL << 63) /* HWCVAL_ERR_FLAG */;
  *rvalue = value_delta;
  *rlost = loss_estimate;
  if (readsz != msgsz)
    {
      TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
      return -5;
    }
  return 0;
}

static void
dump_perf_event_attr (struct perf_event_attr *at)
{
  TprintfT (DBG_LT2, "dump_perf_event_attr: size=%d type=%d sample_period=%lld\n"
            " config=0x%llx config1=0x%llx config2=0x%llx wakeup_events=%lld __reserved_1=%lld\n",
            (int) at->size, (int) at->type, (unsigned long long) at->sample_period,
            (unsigned long long) at->config, (unsigned long long) at->config1,
            (unsigned long long) at->config2, (unsigned long long) at->wakeup_events,
            (unsigned long long) at->__reserved_1);
#define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, " %-10s : %lld\n", #fld, (long long) at->fld)
  DUMP_F (disabled);
  DUMP_F (inherit);
  DUMP_F (pinned);
  DUMP_F (exclusive);
  DUMP_F (exclude_user);
  DUMP_F (exclude_kernel);
  DUMP_F (exclude_hv);
  DUMP_F (exclude_idle);
  // DUMP_F(xmmap);
  DUMP_F (comm);
  DUMP_F (freq);
  DUMP_F (inherit_stat);
  DUMP_F (enable_on_exec);
  DUMP_F (task);
  DUMP_F (watermark);
}

static void
init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period)
{
  memset (hw, 0, sizeof (struct perf_event_attr));
  hw->size = sizeof (struct perf_event_attr); // fwd/bwd compat

#if defined(__i386__) || defined(__x86_64)
  // note: Nehalem/Westmere OFFCORE_RESPONSE in upper 32 bits
  hw->config = event;
  hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
#elif defined(__aarch64__)
  hw->type = (event >> 24) & 7;
  hw->config = event & 0xff;
#elif defined(sparc)
  // SPARC needs to be shifted up 16 bits
  hw->config = (event & 0xFFFF) << 16; // uint64_t event
  uint64_t regs = (event >> 20) & 0xf; // see sparc_pcbe.c
  hw->config |= regs << 4; // for M8, supported PICs need to be placed at bits [7:4]
  hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
#endif

  hw->sample_period = period;
  hw->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ |
      // PERF_SAMPLE_TID |
      // PERF_SAMPLE_TIME |      // possibly interesting
      // PERF_SAMPLE_ADDR |
      PERF_SAMPLE_READ |         // HWC value
      // PERF_SAMPLE_CALLCHAIN | // interesting
      // PERF_SAMPLE_ID |
      // PERF_SAMPLE_CPU |       // possibly interesting
      // PERF_SAMPLE_PERIOD |
      // PERF_SAMPLE_STREAM_ID |
      // PERF_SAMPLE_RAW |
      0;
  hw->read_format =
      PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled
      PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled
      // PERF_FORMAT_ID |
      // PERF_FORMAT_GROUP |
      0;
  hw->disabled = 1; /* off by default */

  // Note: the following override config.priv bits!
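  // Event bit 16 set means "count user mode" and bit 17 set means "count
  // system mode"; each exclude_* flag is raised only when the matching
  // bit is absent from the event word.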
  hw->exclude_user = (event & (1 << 16)) == 0;   /* don't count user */
  hw->exclude_kernel = (event & (1 << 17)) == 0; /* ditto kernel */
  hw->exclude_hv = 1;                            /* ditto hypervisor */
  hw->wakeup_events = 1;                         /* wakeup every n events */
  dump_perf_event_attr (hw);
}

static int
start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
{
  // pe_attr should have been initialized in hwcdrv_create_counters()
  struct perf_event_attr pe_attr;
  memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));

  // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
  pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;

  int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
  if (hwc_fd == -1)
    {
      TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
                error_string, ii, errno);
      return 1;
    }

  size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
  void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
                     PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
  if (buf == MAP_FAILED)
    {
      TprintfT (0, "sz = %ld, pgsz = %ld\n err=%s idx=%d mmap failed: %s\n",
                (long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
      return 1;
    }
  pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def? we never seem to use it
  pctx->ctr_list[ii].fd = hwc_fd;
  pctx->ctr_list[ii].buf_state.buf = buf;
  pctx->ctr_list[ii].buf_state.pagesz = pgsz;
  pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
  pctx->ctr_list[ii].value_state.prev_run_ts = 0;
  pctx->ctr_list[ii].value_state.prev_value = 0;
  pctx->ctr_list[ii].last_overflow_time = gethrtime ();

  /* set async mode */
  long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
  int rc = fcntl (hwc_fd, F_SETFL, flags);
  if (rc == -1)
    {
      TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
      return 1;
    }

  /*
   * set lwp ownership of the fd
   * See BUGS section of "man perf_event_open":
   *     The F_SETOWN_EX option to fcntl(2) is needed to properly get
   *     overflow signals in threads.  This was introduced in Linux 2.6.32.
   * Legacy references:
   *   see http://lkml.org/lkml/2009/8/4/128
   *   google man fcntl F_SETOWN_EX -conflict
   *     "From Linux 2.6.32 onward, use F_SETOWN_EX to target
   *     SIGIO and SIGURG signals at a particular thread."
   *   http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
   *   See 2010 CSCADS presentation by Eranian
   */
  struct f_owner_ex fowner_ex;
  fowner_ex.type = F_OWNER_TID;
  fowner_ex.pid = pctx->tid;
  rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
  if (rc == -1)
    {
      TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
      return 1;
    }

  /* Use sigio so handler can determine FD via siginfo->si_fd.
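   * If the fd is not delivered (see the SI_TKILL case in hwcdrv_overflow()),
   * the handler falls back to scanning every counter's buffer.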
   */
  rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
  if (rc == -1)
    {
      TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
      return 1;
    }
  return 0;
}

static int
stop_one_ctr (int ii, counter_state_t *ctr_list)
{
  int hwc_rc = 0;
  if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
    {
      TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
      hwc_rc = HWCFUNCS_ERROR_GENERIC;
    }
  void *buf = ctr_list[ii].buf_state.buf;
  if (buf)
    {
      size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
      ctr_list[ii].buf_state.buf = NULL;
      int tmprc = munmap (buf, bufsz);
      if (tmprc)
        {
          TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
          hwc_rc = HWCFUNCS_ERROR_GENERIC;
        }
    }
  if (-1 == close (ctr_list[ii].fd))
    {
      TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
      hwc_rc = HWCFUNCS_ERROR_GENERIC;
    }
  return hwc_rc;
}

/* HWCDRV_API for thread-specific actions */
HWCDRV_API int
hwcdrv_lwp_init (void)
{
  return hwcdrv_start ();
}

HWCDRV_API void
hwcdrv_lwp_fini (void)
{
  hwcdrv_free_counters (); /* also sets pctx->ctr_list=NULL; */
}

/* open */
static int
hdrv_pcl_internal_open ()
{
  if (hdrv_pcl_state.internal_open_called)
    {
      TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
      return HWCFUNCS_ERROR_ALREADY_CALLED;
    }

  // determine if PCL is available
  perf_event_def_t tmp_event_def;
  memset (&tmp_event_def, 0, sizeof (tmp_event_def));
  struct perf_event_attr *pe_attr = &tmp_event_def.hw;
  init_perf_event (pe_attr, 0, 0);
  pe_attr->type = PERF_TYPE_HARDWARE;           // specify abstracted HW event
  pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
  int hwc_fd = perf_event_open (pe_attr,
                                0,  // pid/tid, 0 is self
                                -1, // cpu, -1 is per-thread mode
                                -1, // group_fd, -1 is root
                                0); // flags
  if (hwc_fd == -1)
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
                " perf_event_open() failed, errno=%d\n", errno);
      goto internal_open_error;
    }

  /* see if the PCL is new enough to know about F_SETOWN_EX */
  struct f_owner_ex fowner_ex;
  fowner_ex.type = F_OWNER_TID;
  fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
  if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
                "F_SETOWN failed, errno=%d\n", errno);
      close (hwc_fd);
      goto internal_open_error;
    }
  close (hwc_fd);

  hdrv_pcl_state.internal_open_called = 1;
  hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
  hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
  TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
  for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
    {
      hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
      if (!ppcbe->hdrv_pcbe_init ())
        {
          hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
          hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
          if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
            goto internal_open_error;
          hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
          hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
          hdrv_pcl_state.get_events = ppcbe->hdrv_pcbe_get_events;
          hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum;
          break;
        }
    }
  if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
    {
      TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
                " reducing number of HWCs from %u to %u on processor '%s'\n",
                hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
      hdrv_pcl_about.cpcN_npics = MAX_PICS;
    }
  TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
            " perf_event cpuver=%d, name='%s'\n",
            hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
  return 0;

internal_open_error:
  hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
  hdrv_pcl_about.cpcN_npics = 0;
  hdrv_pcl_about.cpcN_docref = NULL;
  hdrv_pcl_about.cpcN_cciname = NULL;
  return HWCFUNCS_ERROR_NOT_SUPPORTED;
}

static void *
single_thread_tsd_ftn ()
{
  static hdrv_pcl_ctx_t tsd_context;
  return &tsd_context;
}

/* HWCDRV_API */
HWCDRV_API int
hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
{
  hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
  if (tsd_sz)
    *tsd_sz = sizeof (hdrv_pcl_ctx_t);

  if (hdrv_pcl_state.internal_open_called)
    return HWCFUNCS_ERROR_ALREADY_CALLED;
  return hdrv_pcl_internal_open ();
}

HWCDRV_API void
hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
                 const char **docref, uint64_t *support)
{
  if (cpuver)
    *cpuver = hdrv_pcl_about.cpcN_cpuver;
  if (cciname)
    *cciname = hdrv_pcl_about.cpcN_cciname;
  if (npics)
    *npics = hdrv_pcl_about.cpcN_npics;
  if (docref)
    *docref = hdrv_pcl_about.cpcN_docref;
  if (support)
    *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
}

HWCDRV_API int
hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
{
  if (tsd_ftn)
    hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
  else
    {
      TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
      return HWCFUNCS_ERROR_UNAVAIL;
    }
  return 0;
}

HWCDRV_API int
hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb)
{
  int count = 0;
  if (hwc_cb && hdrv_pcl_state.get_events)
    count = hdrv_pcl_state.get_events (hwc_cb);
  if (attr_cb)
    for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
      attr_cb (perfctr_attrs_table[ii].attrname);
  if (!count)
    return -1;
  return 0;
}

HWCDRV_API int
hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
{
  return hwcdrv_assign_all_regnos (entries, numctrs);
}

static int
internal_hwc_start (int fd)
{
  int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
  if (rc == -1)
    {
      TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
                " PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
      return HWCFUNCS_ERROR_UNAVAIL;
    }
  TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
  return 0;
}

HWCDRV_API int
hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
{
  /* set expired counters to overflow value and all others to 0 */
  /* return 0: OK, counters should be restarted */
  /* return non-zero: eventp not set, counters should not be restarted */
  /* clear return values */
  int ii;
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      eventp->ce_pic[ii] = 0;
      lost_events->ce_pic[ii] = 0;
    }
  hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
  eventp->ce_hrt = sig_ts;
  lost_events->ce_hrt = sig_ts;

  /* determine source signal */
  int signal_fd = -1;
  switch (si->si_code)
    {
    case POLL_HUP: /* expected value from pcl */
      /* According to Stephane Eranian:
       * "expect POLL_HUP instead of POLL_IN because we are
       * in one-shot mode (IOC_REFRESH)"
       */
      signal_fd = si->si_fd;
      break;
    case SI_TKILL: /* event forwarded by tkill */
      /* DBX can only forward SI_TKILL when it detects POLL_HUP
       * unfortunately, this means that si->si_fd has been lost...
       * We need to process the buffers, but we don't know the fd!
       */
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                " SI_TKILL detected\n", sig_ts);
      break;
    default:
      // "sometimes we see a POLL_IN (1) with very high event rates,"
      // according to eranian(?)
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                " unexpected si_code 0x%x\n", sig_ts, si->si_code);
      return HWCFUNCS_ERROR_GENERIC;
    }

  hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                " tsd context is NULL\n", sig_ts);
      return HWCFUNCS_ERROR_UNEXPECTED;
    }
  counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  if (!ctr_list)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                " ctr_list is NULL\n", sig_ts);
      return HWCFUNCS_ERROR_UNEXPECTED;
    }

  /* clear needs_restart flag */
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    ctr_list[ii].needs_restart = 0;

  /* attempt to identify the counter to read */
  int signal_idx = -1;
  pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
  if (signal_fd != -1)
    {
      for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
        {
          if (ctr_list[ii].fd == signal_fd)
            {
              signal_idx = ii;
              break;
            }
        }
    }

  if (signal_idx < 0)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                " pmc not determined!\n", sig_ts);
      lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
      // note: bogus value may get overwritten in loop below
    }

  /* capture sample(s). In addition to signal_idx, check other counters.
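   * Records may be pending on several fds, and after an SI_TKILL forward
   * we do not know which fd raised the signal at all.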
   */
  struct perf_event_header sheader;
  int idx;
  for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
    {
      int num_recs = 0;
      while (1)
        {
          /* check for samples */
          struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
          if (metadata == NULL)
            break; // empty
          if (metadata->data_tail == metadata->data_head)
            break; // empty

          /* read header */
          if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
            break;
          num_recs++;

          /* check for PERF_RECORD_SAMPLE */
          size_t datasz = sheader.size - sizeof (struct perf_event_header);
          if (sheader.type != PERF_RECORD_SAMPLE)
            {
              TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                        " unexpected recd type=%d\n",
                        sig_ts, sheader.type);
              if (skip_buf (&ctr_list[idx].buf_state, datasz))
                {
                  TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                            " skip recd type=%d failed\n", sig_ts, sheader.type);
                  lost_events->ce_pic[idx] = 4; /* record a bogus value */
                  break; // failed to skip buffer??
                }
              lost_events->ce_pic[idx] = 2; /* record a bogus value */
              continue; // advance to next record
            }

          /* type is PERF_RECORD_SAMPLE */
          uint64_t value, lostv;
          if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
            {
              TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
                        " read_sample() failed\n", sig_ts);
              lost_events->ce_pic[idx] = 3; // record a bogus value
              break; // failed to read sample data??
            }
          TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
                    " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
                    idx, (unsigned long long) value, (unsigned long long) lostv);
          if (eventp->ce_pic[idx])
            {
              TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                        " idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
              lost_events->ce_pic[idx] += eventp->ce_pic[idx];
            }
          eventp->ce_pic[idx] = value;
          lost_events->ce_pic[idx] += lostv;
        }

      /* debug output for unexpected (but common) cases */
      if (idx == signal_idx)
        {
          if (num_recs != 1)
            TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                      " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
        }
      else if (num_recs)
        TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
                  " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
                  sig_ts, num_recs, idx, signal_idx);

      /* trigger counter restart whenever records were found */
      if (num_recs)
        {
          /* check whether to adapt the overflow interval */
          /* This is the Linux version.
           * The Solaris version is in hwprofile.c collector_update_overflow_counters().
           */
          hrtime_t min_time = global_perf_event_def[idx].min_time;
          if (min_time > 0 // overflow interval is adaptive
              && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
            {
              /* pick a new overflow interval */
              /* roughly doubled, but add funny numbers */
              /* hopefully the result is prime or not a multiple of some # of ops/loop */
              uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
#if 0
              // On Solaris, we report the adjustment to the log file.
              // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
              // For now we simply don't report.
              collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
                                             SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
                                             ctr_list[idx].last_overflow_period, new_period);
#endif
              /* There are a variety of ways of resetting the period on Linux.
               * The most elegant is
               *     ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
               * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
               *   > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
               *   > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
               *     until after the next overflow.
               * So we're kind of stuck shutting the fd down and restarting it with the new period.
               */
              if (stop_one_ctr (idx, ctr_list))
                {
                  // EUGENE figure out what to do on error
                }
              ctr_list[idx].last_overflow_period = new_period;
              if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
                {
                  // EUGENE figure out what to do on error
                }
            }
          ctr_list[idx].last_overflow_time = sig_ts;
#if 0
          ctr_list[idx].needs_restart = 1;
#else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
          internal_hwc_start (ctr_list[idx].fd);
#endif
        }
    }
  return 0; // OK to restart counters
}

HWCDRV_API int
hwcdrv_sighlr_restart (const hwc_event_t *pp)
{
#if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
  hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
      return -1;
    }
  counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  if (!ctr_list)
    {
      TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
      return -1;
    }
  int errors = 0;
  for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      if (ctr_list[ii].needs_restart)
        errors |= internal_hwc_start (ctr_list[ii].fd);
      ctr_list[ii].needs_restart = 0;
    }
  return errors;
#else
  return 0;
#endif
}

/* create counters based on hwcdef[] */
HWCDRV_API int
hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
{
  if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
    {
      logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
      return HWCFUNCS_ERROR_HWCARGS;
    }
  if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
    {
      logerr (GTXT ("Processor not supported\n"));
      return HWCFUNCS_ERROR_HWCARGS;
    }

  /* add counters */
  for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
    {
      perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
      memset (glb_event_def, 0, sizeof (perf_event_def_t));
      unsigned int pmc_sel;
      eventsel_t evntsel;
      if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num,
                                     hwcdef[idx].int_name, &evntsel, &pmc_sel))
        {
          TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
          return HWCFUNCS_ERROR_HWCARGS;
        }
      glb_event_def->reg_num = pmc_sel;
      glb_event_def->eventsel = evntsel;
      glb_event_def->counter_preload = hwcdef[idx].val;
      glb_event_def->min_time = hwcdef[idx].min_time;
      glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
      init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
                       glb_event_def->counter_preload);
      TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
                "(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
                idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
                (long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
                (long long) glb_event_def->eventsel,
                (long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
                (long long) glb_event_def->hw.exclude_user,
                (long long) glb_event_def->hw.exclude_kernel);
    }

  hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
  return 0;
}

HWCDRV_API int
hwcdrv_free_counters () // note: only performs shutdown for this thread
{
  hdrv_pcl_ctx_t * pctx;
  if (!COUNTERS_ENABLED ())
    return 0;
  pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
      return HWCFUNCS_ERROR_GENERIC;
    }
  counter_state_t *ctr_list = pctx->ctr_list;
  if (!ctr_list)
    {
      // fork child: prolog suspends hwcs, then epilog frees them
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
      return 0;
    }
  int hwc_rc = 0;
  for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    if (stop_one_ctr (ii, ctr_list))
      hwc_rc = HWCFUNCS_ERROR_GENERIC;
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", pctx->tid);
  pctx->ctr_list = NULL;
  return hwc_rc;
}

HWCDRV_API int
hwcdrv_start (void) /* must be called from each thread ? */
{
  hdrv_pcl_ctx_t *pctx = NULL;
  if (!COUNTERS_ENABLED ())
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
      return 0;
    }
  if (!hdrv_pcl_state.library_ok)
    {
      TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
      return HWCFUNCS_ERROR_NOT_SUPPORTED;
    }

  /*
   * set up per-thread context
   */
  pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
      return HWCFUNCS_ERROR_UNEXPECTED;
    }
  pctx->tid = hwcdrv_gettid ();
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", pctx->tid);

  /*
   * create per-thread counter list
   */
  counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
                                                          sizeof (counter_state_t));
  if (!ctr_list)
    {
      TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
      return HWCFUNCS_ERROR_MEMORY;
    }
  int ii;
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
  pctx->ctr_list = ctr_list;

  /*
   * bind the counters
   */
  size_t pgsz = sysconf (_SC_PAGESIZE);
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
      if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:"))
        goto hwcdrv_start_cleanup;
    }

  /*
   * start the counters
   */
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      int rc = internal_hwc_start (ctr_list[ii].fd);
      if (rc < 0)
        goto hwcdrv_start_cleanup;
    }
  return 0;

hwcdrv_start_cleanup:
  hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
  return HWCFUNCS_ERROR_UNAVAIL;
}

HWCDRV_API int
hwcdrv_lwp_suspend (void) /* must be called from each thread */
{
  if (!COUNTERS_ENABLED ())
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
      return 0;
    }
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
  return hwcdrv_free_counters ();
}

HWCDRV_API int
hwcdrv_lwp_resume (void) /* must be called from each thread */
{
  if (!COUNTERS_ENABLED ())
    {
      TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
      return 0;
    }
  TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
  return hwcdrv_start ();
}

HWCDRV_API int
hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
{
  overflow_data->ce_hrt = 0;
  for (int i = 0; i < MAX_PICS; i++)
    {
      overflow_data->ce_pic[i] = 0;
      if (sampled_data)
        HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
    }
  return 0;
}

/*---------------------------------------------------------------------------*/
/* HWCDRV_API */

hwcdrv_api_t hwcdrv_pcl_api = {
  hwcdrv_init,
  hwcdrv_get_info,
  hwcdrv_enable_mt,
  hwcdrv_get_descriptions,
  hwcdrv_assign_regnos,
  hwcdrv_create_counters,
  hwcdrv_start,
  hwcdrv_overflow,
  hwcdrv_read_events,
  hwcdrv_sighlr_restart,
  hwcdrv_lwp_suspend,
  hwcdrv_lwp_resume,
  hwcdrv_free_counters,
  hwcdrv_lwp_init,
  hwcdrv_lwp_fini,
  -1 // hwcdrv_init_status
};