/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdlib.h>
#include <string.h>

#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>

#include "rte_power_pmd_mgmt.h"

#define EMPTYPOLL_MAX 512

/* store some internal state */
static struct pmd_conf_data {
	/** what do we support? */
	struct rte_cpu_intrinsics intrinsics_support;
	/** pre-calculated tsc diff for 1us */
	uint64_t tsc_per_us;
	/** how many rte_pause can we fit in a microsecond? */
	uint64_t pause_per_us;
} global_data;

/**
 * Possible power management states of an ethdev port.
 */
enum pmd_mgmt_state {
	/** Device power management is disabled. */
	PMD_MGMT_DISABLED = 0,
	/** Device power management is enabled. */
	PMD_MGMT_ENABLED
};

union queue {
	uint32_t val;
	struct {
		uint16_t portid;
		uint16_t qid;
	};
};

struct queue_list_entry {
	TAILQ_ENTRY(queue_list_entry) next;
	union queue queue;
	uint64_t n_empty_polls;
	uint64_t n_sleeps;
	const struct rte_eth_rxtx_callback *cb;
};

struct pmd_core_cfg {
	TAILQ_HEAD(queue_list_head, queue_list_entry) head;
	/**< List of queues associated with this lcore */
	size_t n_queues;
	/**< How many queues are in the list? */
	volatile enum pmd_mgmt_state pwr_mgmt_state;
	/**< State of power management for this lcore */
	enum rte_power_pmd_mgmt_type cb_mode;
	/**< Callback mode for this lcore */
	uint64_t n_queues_ready_to_sleep;
	/**< Number of queues ready to enter power optimized state */
	uint64_t sleep_target;
	/**< Prevent a queue from triggering sleep multiple times */
} __rte_cache_aligned;
static struct pmd_core_cfg lcore_cfgs[RTE_MAX_LCORE];

static inline bool
queue_equal(const union queue *l, const union queue *r)
{
	return l->val == r->val;
}

static inline void
queue_copy(union queue *dst, const union queue *src)
{
	dst->val = src->val;
}

static struct queue_list_entry *
queue_list_find(const struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *cur;

	TAILQ_FOREACH(cur, &cfg->head, next) {
		if (queue_equal(&cur->queue, q))
			return cur;
	}
	return NULL;
}

static int
queue_list_add(struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *qle;

	/* is it already in the list? */
	if (queue_list_find(cfg, q) != NULL)
		return -EEXIST;

	qle = malloc(sizeof(*qle));
	if (qle == NULL)
		return -ENOMEM;
	memset(qle, 0, sizeof(*qle));

	queue_copy(&qle->queue, q);
	TAILQ_INSERT_TAIL(&cfg->head, qle, next);
	cfg->n_queues++;

	return 0;
}
static struct queue_list_entry *
queue_list_take(struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *found;

	found = queue_list_find(cfg, q);
	if (found == NULL)
		return NULL;

	TAILQ_REMOVE(&cfg->head, found, next);
	cfg->n_queues--;

	/* freeing is responsibility of the caller */
	return found;
}

static inline int
get_monitor_addresses(struct pmd_core_cfg *cfg,
		struct rte_power_monitor_cond *pmc, size_t len)
{
	const struct queue_list_entry *qle;
	size_t i = 0;
	int ret;

	TAILQ_FOREACH(qle, &cfg->head, next) {
		const union queue *q = &qle->queue;
		struct rte_power_monitor_cond *cur;

		/* attempted out of bounds access */
		if (i >= len) {
			RTE_LOG(ERR, POWER, "Too many queues being monitored\n");
			return -1;
		}

		cur = &pmc[i++];
		ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
		if (ret < 0)
			return ret;
	}
	return 0;
}

static void
calc_tsc(void)
{
	const uint64_t hz = rte_get_timer_hz();
	const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */

	global_data.tsc_per_us = tsc_per_us;

	/* only do this if we don't have tpause */
	if (!global_data.intrinsics_support.power_pause) {
		const uint64_t start = rte_rdtsc_precise();
		const uint32_t n_pauses = 10000;
		double us, us_per_pause;
		uint64_t end;
		unsigned int i;

		/* estimate number of rte_pause() calls per us */
		for (i = 0; i < n_pauses; i++)
			rte_pause();

		end = rte_rdtsc_precise();
		us = (end - start) / (double)tsc_per_us;
		us_per_pause = us / n_pauses;

		global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
	}
}

static inline void
queue_reset(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
{
	const bool is_ready_to_sleep = qcfg->n_sleeps == cfg->sleep_target;

	/* reset empty poll counter for this queue */
	qcfg->n_empty_polls = 0;
	/* reset the queue sleep counter as well */
	qcfg->n_sleeps = 0;
	/* remove the queue from list of queues ready to sleep */
	if (is_ready_to_sleep)
		cfg->n_queues_ready_to_sleep--;
	/*
	 * no need to change the lcore sleep target counter, because this lcore
	 * will reach the n_sleeps anyway, and the other lcores are already
	 * counted, so there's nothing else to do.
	 */
}

static inline bool
queue_can_sleep(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
{
	/* this function is called - that means we have an empty poll */
	qcfg->n_empty_polls++;

	/* if we haven't reached threshold for empty polls, we can't sleep */
	if (qcfg->n_empty_polls <= EMPTYPOLL_MAX)
		return false;

	/*
	 * we've reached a point where we are able to sleep, but we still need
	 * to check if this queue has already been marked for sleeping.
	 */
	if (qcfg->n_sleeps == cfg->sleep_target)
		return true;

	/* mark this queue as ready for sleep */
	qcfg->n_sleeps = cfg->sleep_target;
	cfg->n_queues_ready_to_sleep++;

	return true;
}

static inline bool
lcore_can_sleep(struct pmd_core_cfg *cfg)
{
	/* are all queues ready to sleep? */
	if (cfg->n_queues_ready_to_sleep != cfg->n_queues)
		return false;

	/* we've reached an iteration where we can sleep, reset sleep counter */
	cfg->n_queues_ready_to_sleep = 0;
	cfg->sleep_target++;
	/*
	 * we do not reset any individual queue empty poll counters, because
	 * we want to keep sleeping on every poll until we actually get traffic.
	 */

	return true;
}
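/*
 * Example of how the counters above interact: assume one lcore polling two
 * queues, with sleep_target initially set to 1.
 *
 * - Each empty poll increments that queue's n_empty_polls. Once the count
 *   exceeds EMPTYPOLL_MAX, queue_can_sleep() sets the queue's n_sleeps to the
 *   current sleep_target and increments n_queues_ready_to_sleep.
 * - When n_queues_ready_to_sleep equals n_queues (both queues are idle),
 *   lcore_can_sleep() returns true, resets the ready counter and increments
 *   sleep_target, so every queue has to re-confirm that it is still idle
 *   before the lcore sleeps again.
 * - Any non-empty poll calls queue_reset(), which zeroes that queue's
 *   counters and removes it from the ready set, so the lcore stops sleeping
 *   until traffic dies down again.
 */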
static uint16_t
clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *arg)
{
	const unsigned int lcore = rte_lcore_id();
	struct queue_list_entry *queue_conf = arg;
	struct pmd_core_cfg *lcore_conf;
	const bool empty = nb_rx == 0;

	lcore_conf = &lcore_cfgs[lcore];

	if (likely(!empty))
		/* early exit */
		queue_reset(lcore_conf, queue_conf);
	else {
		struct rte_power_monitor_cond pmc[lcore_conf->n_queues];
		int ret;

		/* can this queue sleep? */
		if (!queue_can_sleep(lcore_conf, queue_conf))
			return nb_rx;

		/* can this lcore sleep? */
		if (!lcore_can_sleep(lcore_conf))
			return nb_rx;

		/* gather all monitoring conditions */
		ret = get_monitor_addresses(lcore_conf, pmc,
				lcore_conf->n_queues);
		if (ret < 0)
			return nb_rx;

		rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX);
	}

	return nb_rx;
}

static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg)
{
	struct queue_list_entry *queue_conf = arg;

	/* this callback can't do more than one queue, omit multiqueue logic */
	if (unlikely(nb_rx == 0)) {
		queue_conf->n_empty_polls++;
		if (unlikely(queue_conf->n_empty_polls > EMPTYPOLL_MAX)) {
			struct rte_power_monitor_cond pmc;
			int ret;

			/* use monitoring condition to sleep */
			ret = rte_eth_get_monitor_addr(port_id, qidx,
					&pmc);
			if (ret == 0)
				rte_power_monitor(&pmc, UINT64_MAX);
		}
	} else
		queue_conf->n_empty_polls = 0;

	return nb_rx;
}

static uint16_t
clb_pause(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *arg)
{
	const unsigned int lcore = rte_lcore_id();
	struct queue_list_entry *queue_conf = arg;
	struct pmd_core_cfg *lcore_conf;
	const bool empty = nb_rx == 0;

	lcore_conf = &lcore_cfgs[lcore];

	if (likely(!empty))
		/* early exit */
		queue_reset(lcore_conf, queue_conf);
	else {
		/* can this queue sleep? */
		if (!queue_can_sleep(lcore_conf, queue_conf))
			return nb_rx;

		/* can this lcore sleep? */
		if (!lcore_can_sleep(lcore_conf))
			return nb_rx;

		/* sleep for 1 microsecond, use tpause if we have it */
		if (global_data.intrinsics_support.power_pause) {
			const uint64_t cur = rte_rdtsc();
			const uint64_t wait_tsc =
					cur + global_data.tsc_per_us;
			rte_power_pause(wait_tsc);
		} else {
			uint64_t i;
			for (i = 0; i < global_data.pause_per_us; i++)
				rte_pause();
		}
	}

	return nb_rx;
}

static uint16_t
clb_scale_freq(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *arg)
{
	const unsigned int lcore = rte_lcore_id();
	const bool empty = nb_rx == 0;
	struct pmd_core_cfg *lcore_conf = &lcore_cfgs[lcore];
	struct queue_list_entry *queue_conf = arg;

	if (likely(!empty)) {
		/* early exit */
		queue_reset(lcore_conf, queue_conf);

		/* scale up freq immediately */
		rte_power_freq_max(rte_lcore_id());
	} else {
		/* can this queue sleep? */
		if (!queue_can_sleep(lcore_conf, queue_conf))
			return nb_rx;

		/* can this lcore sleep? */
		if (!lcore_can_sleep(lcore_conf))
			return nb_rx;

		rte_power_freq_min(rte_lcore_id());
	}

	return nb_rx;
}

static int
queue_stopped(const uint16_t port_id, const uint16_t queue_id)
{
	struct rte_eth_rxq_info qinfo;

	if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0)
		return -1;

	return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
}

static int
cfg_queues_stopped(struct pmd_core_cfg *queue_cfg)
{
	const struct queue_list_entry *entry;

	TAILQ_FOREACH(entry, &queue_cfg->head, next) {
		const union queue *q = &entry->queue;
		int ret = queue_stopped(q->portid, q->qid);
		if (ret != 1)
			return ret;
	}
	return 1;
}

static int
check_scale(unsigned int lcore)
{
	enum power_management_env env;

	/* only PSTATE and ACPI modes are supported */
	if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
			!rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) {
		RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
		return -ENOTSUP;
	}
	/* ensure we could initialize the power library */
	if (rte_power_init(lcore))
		return -EINVAL;

	/* ensure we initialized the correct env */
	env = rte_power_get_env();
	if (env != PM_ENV_ACPI_CPUFREQ && env != PM_ENV_PSTATE_CPUFREQ) {
		RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
		return -ENOTSUP;
	}

	/* we're done */
	return 0;
}

static int
check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
{
	struct rte_power_monitor_cond dummy;
	bool multimonitor_supported;

	/* check if rte_power_monitor is supported */
	if (!global_data.intrinsics_support.power_monitor) {
		RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
		return -ENOTSUP;
	}
	/* check if multi-monitor is supported */
	multimonitor_supported =
			global_data.intrinsics_support.power_monitor_multi;

	/* if we're adding a new queue, do we support multiple queues? */
	if (cfg->n_queues > 0 && !multimonitor_supported) {
		RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
		return -ENOTSUP;
	}

	/* check if the device supports the necessary PMD API */
	if (rte_eth_get_monitor_addr(qdata->portid, qdata->qid,
			&dummy) == -ENOTSUP) {
		RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
		return -ENOTSUP;
	}

	/* we're done */
	return 0;
}

static inline rte_rx_callback_fn
get_monitor_callback(void)
{
	return global_data.intrinsics_support.power_monitor_multi ?
			clb_multiwait : clb_umwait;
}

int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
	const union queue qdata = {.portid = port_id, .qid = queue_id};
	struct pmd_core_cfg *lcore_cfg;
	struct queue_list_entry *queue_cfg;
	struct rte_eth_dev_info info;
	rte_rx_callback_fn clb;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
		ret = -EINVAL;
		goto end;
	}

	if (rte_eth_dev_info_get(port_id, &info) < 0) {
		ret = -EINVAL;
		goto end;
	}

	/* check if queue id is valid */
	if (queue_id >= info.nb_rx_queues) {
		ret = -EINVAL;
		goto end;
	}

	/* check if the queue is stopped */
	ret = queue_stopped(port_id, queue_id);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		ret = ret < 0 ? -EINVAL : -EBUSY;
		goto end;
	}

	lcore_cfg = &lcore_cfgs[lcore_id];

	/* check if other queues are stopped as well */
	ret = cfg_queues_stopped(lcore_cfg);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		ret = ret < 0 ? -EINVAL : -EBUSY;
		goto end;
	}

	/* if callback was already enabled, check current callback type */
	if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED &&
			lcore_cfg->cb_mode != mode) {
		ret = -EINVAL;
		goto end;
	}

	/* we need this in various places */
	rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);

	switch (mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
		/* check if we can add a new queue */
		ret = check_monitor(lcore_cfg, &qdata);
		if (ret < 0)
			goto end;

		clb = get_monitor_callback();
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		clb = clb_scale_freq;

		/* we only have to check this when enabling first queue */
		if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED)
			break;
		/* check if we can add a new queue */
		ret = check_scale(lcore_id);
		if (ret < 0)
			goto end;
		break;
	case RTE_POWER_MGMT_TYPE_PAUSE:
		/* figure out various time-to-tsc conversions */
		if (global_data.tsc_per_us == 0)
			calc_tsc();

		clb = clb_pause;
		break;
	default:
		RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
		ret = -EINVAL;
		goto end;
	}
	/* add this queue to the list */
	ret = queue_list_add(lcore_cfg, &qdata);
	if (ret < 0) {
		RTE_LOG(DEBUG, POWER, "Failed to add queue to list: %s\n",
				strerror(-ret));
		goto end;
	}
	/* new queue is always added last */
	queue_cfg = TAILQ_LAST(&lcore_cfg->head, queue_list_head);

	/* when enabling first queue, ensure sleep target is not 0 */
	if (lcore_cfg->n_queues == 1 && lcore_cfg->sleep_target == 0)
		lcore_cfg->sleep_target = 1;

	/* initialize data before enabling the callback */
	if (lcore_cfg->n_queues == 1) {
		lcore_cfg->cb_mode = mode;
		lcore_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
	}
	queue_cfg->cb = rte_eth_add_rx_callback(port_id, queue_id,
			clb, queue_cfg);

	ret = 0;
end:
	return ret;
}

int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
		uint16_t port_id, uint16_t queue_id)
{
	const union queue qdata = {.portid = port_id, .qid = queue_id};
	struct pmd_core_cfg *lcore_cfg;
	struct queue_list_entry *queue_cfg;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
		return -EINVAL;

	/* check if the queue is stopped */
	ret = queue_stopped(port_id, queue_id);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		return ret < 0 ? -EINVAL : -EBUSY;
	}

	/* no need to check queue id as wrong queue id would not be enabled */
	lcore_cfg = &lcore_cfgs[lcore_id];

	/* check if other queues are stopped as well */
	ret = cfg_queues_stopped(lcore_cfg);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		return ret < 0 ? -EINVAL : -EBUSY;
	}

	if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
		return -EINVAL;

	/*
	 * There is no good/easy way to do this without race conditions, so we
	 * are just going to throw our hands in the air and hope that the user
	 * has read the documentation and has ensured that ports are stopped at
	 * the time we enter the API functions.
	 */
	queue_cfg = queue_list_take(lcore_cfg, &qdata);
	if (queue_cfg == NULL)
		return -ENOENT;

	/* if we've removed all queues from the lists, set state to disabled */
	if (lcore_cfg->n_queues == 0)
		lcore_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;

	switch (lcore_cfg->cb_mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
	case RTE_POWER_MGMT_TYPE_PAUSE:
		rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
		/* disable power library on this lcore if this was last queue */
		if (lcore_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) {
			rte_power_freq_max(lcore_id);
			rte_power_exit(lcore_id);
		}
		break;
	}
	/*
	 * the API doc mandates that the user stops all processing on affected
	 * ports before calling any of these APIs, so we can assume that the
	 * callbacks can be freed. we're intentionally casting away const-ness.
	 */
	rte_free((void *)queue_cfg->cb);
	free(queue_cfg);

	return 0;
}

RTE_INIT(rte_power_ethdev_pmgmt_init) {
	size_t i;

	/* initialize all tailqs */
	for (i = 0; i < RTE_DIM(lcore_cfgs); i++) {
		struct pmd_core_cfg *cfg = &lcore_cfgs[i];
		TAILQ_INIT(&cfg->head);
	}
}
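/*
 * Illustrative usage sketch, compiled out by default: the guard macro and the
 * helper below are not part of this library, only an example of the expected
 * call sequence. Queues have to be in the stopped state when management is
 * enabled or disabled (e.g. between queue setup and rte_eth_dev_start(), or
 * after rte_eth_dev_rx_queue_stop()), and lcore_id has to be the lcore that
 * will poll the queue, since that is where the Rx callback keeps its state.
 */
#ifdef POWER_PMD_MGMT_USAGE_EXAMPLE
static void
example_queue_pmgmt(uint16_t port_id, uint16_t queue_id,
		unsigned int poll_lcore_id)
{
	/* before rte_eth_dev_start(): request monitor-based sleeping */
	if (rte_power_ethdev_pmgmt_queue_enable(poll_lcore_id, port_id,
			queue_id, RTE_POWER_MGMT_TYPE_MONITOR) < 0)
		return;

	/*
	 * ... start the port and run the rte_eth_rx_burst() loop on
	 * poll_lcore_id; the callback added above sleeps on empty polls ...
	 */

	/* on teardown, stop the port/queue first, then remove the callback */
	rte_power_ethdev_pmgmt_queue_disable(poll_lcore_id, port_id, queue_id);
}
#endif /* POWER_PMD_MGMT_USAGE_EXAMPLE */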