/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
 *
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/env.h"
#include "spdk/nvme.h"
#include "spdk/queue.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"
#include "spdk/likely.h"

struct ctrlr_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_transport_id failover_trid;
	enum spdk_nvme_transport_type trtype;
	TAILQ_ENTRY(ctrlr_entry) link;
	char name[1024];
	int num_resets;
};

struct ns_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_ns *ns;

	TAILQ_ENTRY(ns_entry) link;
	uint32_t io_size_blocks;
	uint32_t num_io_requests;
	uint64_t size_in_ios;
	uint32_t block_size;
	uint32_t io_flags;
	char name[1024];
};

struct ns_worker_ctx {
	struct ns_entry *entry;
	uint64_t io_completed;
	uint64_t current_queue_depth;
	uint64_t offset_in_ios;
	bool is_draining;

	int num_qpairs;
	struct spdk_nvme_qpair **qpair;
	int last_qpair;

	TAILQ_ENTRY(ns_worker_ctx) link;
};

struct perf_task {
	struct ns_worker_ctx *ns_ctx;
	struct iovec iov;
	bool is_read;
};

struct worker_thread {
	TAILQ_HEAD(, ns_worker_ctx) ns_ctx;
	TAILQ_ENTRY(worker_thread) link;
	unsigned lcore;
};

/* For basic reset handling. */
static int g_max_ctrlr_resets = 15;

static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers);
static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces);
static int g_num_namespaces = 0;
static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers);
static int g_num_workers = 0;

static uint64_t g_tsc_rate;

static uint32_t g_io_align = 0x200;
static uint32_t g_io_size_bytes;
static uint32_t g_max_io_size_blocks;
static int g_rw_percentage;
static int g_is_random;
static int g_queue_depth;
static int g_time_in_sec;
static uint32_t g_max_completions;
static int g_dpdk_mem;
static bool g_warn;
static uint32_t g_keep_alive_timeout_in_ms = 0;
static uint8_t g_transport_retry_count = 4;
static uint8_t g_transport_ack_timeout = 0; /* disabled */
static bool g_dpdk_mem_single_seg = false;

static const char *g_core_mask;

struct trid_entry {
	struct spdk_nvme_transport_id trid;
	struct spdk_nvme_transport_id failover_trid;
	TAILQ_ENTRY(trid_entry) tailq;
};

static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);

static inline void task_complete(struct perf_task *task);
static void submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth);

static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl);

static void
nvme_setup_payload(struct perf_task *task)
{
	/* Maximum extended LBA format size among all active namespaces;
	 * it is the same as g_io_size_bytes for namespaces without metadata.
	 */
	task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
	task->iov.iov_len = g_io_size_bytes;
	if (task->iov.iov_base == NULL) {
		fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
		exit(1);
	}
}

static int
nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
	       struct ns_entry *entry, uint64_t offset_in_ios)
{
	uint64_t lba;
	int qp_num;

	lba = offset_in_ios * entry->io_size_blocks;

	qp_num = ns_ctx->last_qpair;
	ns_ctx->last_qpair++;
	if (ns_ctx->last_qpair == ns_ctx->num_qpairs) {
		ns_ctx->last_qpair = 0;
	}

	if (task->is_read) {
		return spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair[qp_num],
					     task->iov.iov_base, lba,
					     entry->io_size_blocks, io_complete,
					     task, entry->io_flags);
	}

	return spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair[qp_num],
				      task->iov.iov_base, lba,
				      entry->io_size_blocks, io_complete,
				      task, entry->io_flags);
}

static void
nvme_check_io(struct ns_worker_ctx *ns_ctx)
{
	int i, rc;

	for (i = 0; i < ns_ctx->num_qpairs; i++) {
		rc = spdk_nvme_qpair_process_completions(ns_ctx->qpair[i], g_max_completions);
		/* The transport-level qpair has failed; we need to reconnect it. */
		if (spdk_unlikely(rc == -ENXIO)) {
			rc = spdk_nvme_ctrlr_reconnect_io_qpair(ns_ctx->qpair[i]);
			/* successful reconnect */
			if (rc == 0) {
				continue;
			} else if (rc == -ENXIO) {
				/* The controller itself has failed. Defer to the controller reset to restore the qpair. */
				continue;
			} else {
				/*
				 * We were unable to restore the qpair on this attempt. We don't
				 * really know why. For naive handling, just keep trying.
				 * TODO: add a retry limit, and destroy the qpair after x iterations.
				 */
				fprintf(stderr, "qpair failed and we were unable to recover it.\n");
			}
		} else if (spdk_unlikely(rc < 0)) {
			fprintf(stderr, "Received an unknown error processing completions.\n");
			exit(1);
		}
	}
}

/*
 * TODO: If a controller has multiple namespaces, they could all use the same queue.
 * For now, give each namespace/thread combination its own queue.
 */
static int
nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	struct spdk_nvme_io_qpair_opts opts;
	struct ns_entry *entry = ns_ctx->entry;
	int i;

	ns_ctx->num_qpairs = 1;
	ns_ctx->qpair = calloc(ns_ctx->num_qpairs, sizeof(struct spdk_nvme_qpair *));
	if (!ns_ctx->qpair) {
		return -1;
	}

	spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->ctrlr, &opts, sizeof(opts));
	if (opts.io_queue_requests < entry->num_io_requests) {
		opts.io_queue_requests = entry->num_io_requests;
	}

	for (i = 0; i < ns_ctx->num_qpairs; i++) {
		ns_ctx->qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->ctrlr, &opts,
				   sizeof(opts));
		if (!ns_ctx->qpair[i]) {
			printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
			return -1;
		}
	}

	return 0;
}

static void
nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	int i;

	for (i = 0; i < ns_ctx->num_qpairs; i++) {
		spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair[i]);
	}

	free(ns_ctx->qpair);
}

static void
build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr)
{
	const struct spdk_nvme_transport_id *trid;

	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);

	switch (trid->trtype) {
	case SPDK_NVME_TRANSPORT_RDMA:
		snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
		break;
	case SPDK_NVME_TRANSPORT_TCP:
		snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
		break;
	case SPDK_NVME_TRANSPORT_VFIOUSER:
		snprintf(name, length, "VFIOUSER (%s)", trid->traddr);
		break;
	case SPDK_NVME_TRANSPORT_CUSTOM:
		snprintf(name, length, "CUSTOM (%s)", trid->traddr);
		break;
	default:
		fprintf(stderr, "Unknown transport type %d\n", trid->trtype);
		break;
	}
}

static void
register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
{
	struct ns_entry *entry;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint32_t max_xfer_size, entries, sector_size;
	uint64_t ns_size;
	struct spdk_nvme_io_qpair_opts opts;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!spdk_nvme_ns_is_active(ns)) {
		printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
		       cdata->mn, cdata->sn,
		       spdk_nvme_ns_get_id(ns));
		g_warn = true;
		return;
	}

	ns_size = spdk_nvme_ns_get_size(ns);
	sector_size = spdk_nvme_ns_get_sector_size(ns);

	if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) {
		printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
		       "ns size %" PRIu64 " / block size %u for I/O size %u\n",
		       cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
		       ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
		g_warn = true;
		return;
	}

	max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	/* The NVMe driver may add additional entries based on
	 * stripe size and maximum transfer size; we assume
	 * one more entry is used for the stripe.
	 */
	entries = (g_io_size_bytes - 1) / max_xfer_size + 2;
	if ((g_queue_depth * entries) > opts.io_queue_size) {
		printf("controller IO queue size %u less than required\n",
		       opts.io_queue_size);
		printf("Consider using lower queue depth or small IO size because "
		       "IO requests may be queued at the NVMe driver.\n");
		g_warn = true;
	}
	/* For requests that have child requests, the parent request itself
	 * also occupies one entry.
	 */
	entries += 1;

	entry = calloc(1, sizeof(struct ns_entry));
	if (entry == NULL) {
		perror("ns_entry malloc");
		exit(1);
	}

	entry->ctrlr = ctrlr;
	entry->ns = ns;
	entry->num_io_requests = g_queue_depth * entries;

	entry->size_in_ios = ns_size / g_io_size_bytes;
	entry->io_size_blocks = g_io_size_bytes / sector_size;

	entry->block_size = spdk_nvme_ns_get_sector_size(ns);


	if (g_max_io_size_blocks < entry->io_size_blocks) {
		g_max_io_size_blocks = entry->io_size_blocks;
	}

	build_nvme_name(entry->name, sizeof(entry->name), ctrlr);

	g_num_namespaces++;
	TAILQ_INSERT_TAIL(&g_namespaces, entry, link);
}

static void
unregister_namespaces(void)
{
	struct ns_entry *entry, *tmp;

	TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) {
		TAILQ_REMOVE(&g_namespaces, entry, link);
		free(entry);
	}
}

static void
register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry)
{
	struct spdk_nvme_ns *ns;
	struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry));
	const struct spdk_nvme_transport_id *ctrlr_trid;
	uint32_t nsid;

	if (entry == NULL) {
		perror("ctrlr_entry malloc");
		exit(1);
	}

	ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	assert(ctrlr_trid != NULL);

	/* Each controller needs a unique failover trid. */
	entry->failover_trid = trid_entry->failover_trid;

	/*
	 * Users are allowed to leave the trid subnqn blank or specify a discovery controller subnqn.
	 * In those cases, the controller subnqn will not equal the trid_entry subnqn and, by association,
	 * the failover_trid subnqn.
	 * When we do failover, we want to reconnect to the same nqn, so explicitly set the failover nqn to
	 * the ctrlr nqn here.
	 */
	snprintf(entry->failover_trid.subnqn, SPDK_NVMF_NQN_MAX_LEN + 1, "%s", ctrlr_trid->subnqn);


	build_nvme_name(entry->name, sizeof(entry->name), ctrlr);

	entry->ctrlr = ctrlr;
	entry->trtype = trid_entry->trid.trtype;
	TAILQ_INSERT_TAIL(&g_controllers, entry, link);

	for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
	     nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
		ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
		if (ns == NULL) {
			continue;
		}
		register_ns(ctrlr, ns);
	}
}

static __thread unsigned int seed = 0;

static inline void
submit_single_io(struct perf_task *task)
{
	uint64_t offset_in_ios;
	int rc;
	struct ns_worker_ctx *ns_ctx = task->ns_ctx;
	struct ns_entry *entry = ns_ctx->entry;

	if (g_is_random) {
		offset_in_ios = rand_r(&seed) % entry->size_in_ios;
	} else {
		offset_in_ios = ns_ctx->offset_in_ios++;
		if (ns_ctx->offset_in_ios == entry->size_in_ios) {
			ns_ctx->offset_in_ios = 0;
		}
	}

	if ((g_rw_percentage == 100) ||
	    (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
		task->is_read = true;
	} else {
		task->is_read = false;
	}

	rc = nvme_submit_io(task, ns_ctx, entry, offset_in_ios);

	if (spdk_unlikely(rc != 0)) {
		fprintf(stderr, "starting I/O failed\n");
	} else {
		ns_ctx->current_queue_depth++;
	}
}

static inline void
task_complete(struct perf_task *task)
{
	struct ns_worker_ctx *ns_ctx;

	ns_ctx = task->ns_ctx;
	ns_ctx->current_queue_depth--;
	ns_ctx->io_completed++;

	/*
	 * is_draining indicates when time has expired for the test run
	 * and we are just waiting for the previously submitted I/O
	 * to complete. In this case, do not submit a new I/O to replace
	 * the one just completed.
	 */
	if (spdk_unlikely(ns_ctx->is_draining)) {
		spdk_dma_free(task->iov.iov_base);
		free(task);
	} else {
		submit_single_io(task);
	}
}

static void
io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct perf_task *task = ctx;

	if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
		fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n",
			task->is_read ? "Read" : "Write",
			cpl->status.sct, cpl->status.sc);
	}

	task_complete(task);
}

static void
check_io(struct ns_worker_ctx *ns_ctx)
{
	nvme_check_io(ns_ctx);
}

static struct perf_task *
allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	task = calloc(1, sizeof(*task));
	if (task == NULL) {
		fprintf(stderr, "Out of memory allocating tasks\n");
		exit(1);
	}

	nvme_setup_payload(task);

	task->ns_ctx = ns_ctx;

	return task;
}

static void
submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	while (queue_depth-- > 0) {
		task = allocate_task(ns_ctx, queue_depth);
		submit_single_io(task);
	}
}

static int
work_fn(void *arg)
{
	uint64_t tsc_end;
	struct worker_thread *worker = (struct worker_thread *)arg;
	struct ns_worker_ctx *ns_ctx = NULL;
	uint32_t unfinished_ns_ctx;

	printf("Starting thread on core %u\n", worker->lcore);

	/* Allocate queue pairs for each namespace. */
	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
		if (nvme_init_ns_worker_ctx(ns_ctx) != 0) {
			printf("ERROR: init_ns_worker_ctx() failed\n");
			return 1;
		}
	}

	tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate;

	/* Submit initial I/O for each namespace. */
	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
		submit_io(ns_ctx, g_queue_depth);
	}

	while (1) {
		/*
		 * Check for completed I/O for each controller. A new
		 * I/O will be submitted in the io_complete callback
		 * to replace each I/O that is completed.
		 */
		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
			check_io(ns_ctx);
		}

		if (spdk_get_ticks() > tsc_end) {
			break;
		}
	}

	/* Drain the I/O of each ns_ctx in round robin to ensure fairness. */
	do {
		unfinished_ns_ctx = 0;
		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
			/* Entered only on the first pass; mark the context as draining. */
			if (!ns_ctx->is_draining) {
				ns_ctx->is_draining = true;
			}

			if (ns_ctx->current_queue_depth > 0) {
				check_io(ns_ctx);
				if (ns_ctx->current_queue_depth == 0) {
					nvme_cleanup_ns_worker_ctx(ns_ctx);
				} else {
					unfinished_ns_ctx++;
				}
			}
		}
	} while (unfinished_ns_ctx > 0);

	return 0;
}

static void
usage(char *program_name)
{
	printf("%s options", program_name);
	printf("\n");
	printf("\t[-q io depth]\n");
	printf("\t[-o io size in bytes]\n");
	printf("\t[-w io pattern type, must be one of\n");
	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
	printf("\t[-t time in seconds]\n");
	printf("\t[-c core mask for I/O submission/completion.]\n");
	printf("\t\t(default: 1)\n");
	printf("\t[-r Transport ID for NVMeoF]\n");
	printf("\t Format: 'key:value [key:value] ...'\n");
	printf("\t Keys:\n");
	printf("\t  trtype      Transport type (e.g. RDMA)\n");
	printf("\t  adrfam      Address family (e.g. IPv4, IPv6)\n");
	printf("\t  traddr      Transport address (e.g. 192.168.100.8 for RDMA)\n");
	printf("\t  trsvcid     Transport service identifier (e.g. 4420)\n");
	printf("\t  subnqn      Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
	printf("\t  alt_traddr  (Optional) Alternative Transport address for failover.\n");
	printf("\t Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
	printf("\t[-k keep alive timeout period in millisecond]\n");
	printf("\t[-s DPDK huge memory size in MB.]\n");
	printf("\t[-m max completions per poll]\n");
	printf("\t\t(default: 0 - unlimited)\n");
	printf("\t[-i shared memory group ID]\n");
	printf("\t[-A transport ACK timeout]\n");
	printf("\t[-R transport retry count]\n");
	printf("\t");
	spdk_log_usage(stdout, "-T");
#ifdef DEBUG
	printf("\t[-G enable debug logging]\n");
#else
	printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)]\n");
#endif
}

static void
unregister_trids(void)
{
	struct trid_entry *trid_entry, *tmp;

	TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
		TAILQ_REMOVE(&g_trid_list, trid_entry, tailq);
		free(trid_entry);
	}
}

static int
add_trid(const char *trid_str)
{
	struct trid_entry *trid_entry;
	struct spdk_nvme_transport_id *trid;
	char *alt_traddr;
	int len;

	trid_entry = calloc(1, sizeof(*trid_entry));
	if (trid_entry == NULL) {
		return -1;
	}

	trid = &trid_entry->trid;
	snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);

	if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
		fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
		free(trid_entry);
		return 1;
	}

	trid_entry->failover_trid = trid_entry->trid;

	alt_traddr = strcasestr(trid_str, "alt_traddr:");
	if (alt_traddr) {
		alt_traddr += strlen("alt_traddr:");
		len = strcspn(alt_traddr, " \t\n");
		if (len > SPDK_NVMF_TRADDR_MAX_LEN) {
			fprintf(stderr, "The failover traddr %s is too long.\n", alt_traddr);
			free(trid_entry);
			return -1;
		}
		snprintf(trid_entry->failover_trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1, "%s", alt_traddr);
	}

	TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
	return 0;
}

static int
parse_args(int argc, char **argv)
{
	struct trid_entry *trid_entry, *trid_entry_tmp;
	const char *workload_type;
	int op;
	bool mix_specified = false;
	long int val;
	int rc;

	/* default value */
	g_queue_depth = 0;
	g_io_size_bytes = 0;
	workload_type = NULL;
	g_time_in_sec = 0;
	g_rw_percentage = -1;
	g_core_mask = NULL;
	g_max_completions = 0;

	while ((op = getopt(argc, argv, "c:gm:o:q:r:k:s:t:w:A:GM:R:T:")) != -1) {
		switch (op) {
		case 'm':
		case 'o':
		case 'q':
		case 'k':
		case 's':
		case 't':
		case 'A':
		case 'M':
		case 'R':
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
				fprintf(stderr, "Converting a string to integer failed\n");
				return val;
			}
			switch (op) {
			case 'm':
				g_max_completions = val;
				break;
			case 'o':
				g_io_size_bytes = val;
				break;
			case 'q':
				g_queue_depth = val;
				break;
			case 'k':
				g_keep_alive_timeout_in_ms = val;
				break;
			case 's':
				g_dpdk_mem = val;
				break;
			case 't':
				g_time_in_sec = val;
				break;
			case 'A':
				g_transport_ack_timeout = val;
				break;
			case 'M':
				g_rw_percentage = val;
				mix_specified = true;
				break;
			case 'R':
				g_transport_retry_count = val;
				break;
			}
			break;
		case 'c':
			g_core_mask = optarg;
			break;
		case 'g':
			g_dpdk_mem_single_seg = true;
			break;
		case 'r':
			if (add_trid(optarg)) {
				usage(argv[0]);
				return 1;
			}
			break;
		case 'w':
			workload_type = optarg;
			break;
		case 'G':
#ifndef DEBUG
			fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
				argv[0]);
			usage(argv[0]);
			return 1;
#else
			spdk_log_set_flag("nvme");
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
			break;
#endif
		case 'T':
			rc = spdk_log_set_flag(optarg);
			if (rc < 0) {
				fprintf(stderr, "unknown flag\n");
				usage(argv[0]);
				exit(EXIT_FAILURE);
			}
#ifdef DEBUG
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
#endif
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (!g_queue_depth) {
		usage(argv[0]);
		return 1;
	}
	if (!g_io_size_bytes) {
		usage(argv[0]);
		return 1;
	}
	if (!workload_type) {
		usage(argv[0]);
		return 1;
	}
	if (!g_time_in_sec) {
		usage(argv[0]);
		return 1;
	}

	if (strcmp(workload_type, "read") &&
	    strcmp(workload_type, "write") &&
	    strcmp(workload_type, "randread") &&
	    strcmp(workload_type, "randwrite") &&
	    strcmp(workload_type, "rw") &&
	    strcmp(workload_type, "randrw")) {
		fprintf(stderr,
			"io pattern type must be one of\n"
			"(read, write, randread, randwrite, rw, randrw)\n");
		return 1;
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "randread")) {
		g_rw_percentage = 100;
	}

	if (!strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "randwrite")) {
		g_rw_percentage = 0;
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "randread") ||
	    !strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "randwrite")) {
		if (mix_specified) {
			fprintf(stderr, "Ignoring -M option... Please use -M option"
				" only when using rw or randrw.\n");
		}
	}

	if (!strcmp(workload_type, "rw") ||
	    !strcmp(workload_type, "randrw")) {
		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
			fprintf(stderr,
				"-M must be specified to value from 0 to 100 "
				"for rw or randrw.\n");
			return 1;
		}
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "rw")) {
		g_is_random = 0;
	} else {
		g_is_random = 1;
	}

	if (TAILQ_EMPTY(&g_trid_list)) {
		fprintf(stderr, "You must specify at least one fabrics TRID.\n");
		return -1;
	}

	/* Check whether a local PCIe-type trid was specified and fail if so. */
	TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
		if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
			fprintf(stderr, "This application was not intended to be run on PCIe controllers.\n");
			return 1;
		}
	}

	return 0;
}

static int
register_workers(void)
{
	uint32_t i;
	struct worker_thread *worker;

	SPDK_ENV_FOREACH_CORE(i) {
		worker = calloc(1, sizeof(*worker));
		if (worker == NULL) {
			fprintf(stderr, "Unable to allocate worker\n");
			return -1;
		}

		TAILQ_INIT(&worker->ns_ctx);
		worker->lcore = i;
		TAILQ_INSERT_TAIL(&g_workers, worker, link);
		g_num_workers++;
	}

	return 0;
}

static void
unregister_workers(void)
{
	struct worker_thread *worker, *tmp_worker;
	struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx;

	/* Free namespace context and worker thread */
	TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) {
		TAILQ_REMOVE(&g_workers, worker, link);
		TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) {
			TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link);
			free(ns_ctx);
		}

		free(worker);
	}
}

static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	/* These should have been weeded out earlier. */
	assert(trid->trtype != SPDK_NVME_TRANSPORT_PCIE);

	printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n",
	       trid->traddr, trid->trsvcid,
	       trid->subnqn);

	/* Set io_queue_size to UINT16_MAX, NVMe driver
	 * will then reduce this to MQES to maximize
	 * the io_queue_size as much as possible.
	 */
	opts->io_queue_size = UINT16_MAX;

	opts->keep_alive_timeout_ms = spdk_max(opts->keep_alive_timeout_ms,
					       g_keep_alive_timeout_in_ms);

	opts->transport_retry_count = g_transport_retry_count;
	opts->transport_ack_timeout = g_transport_ack_timeout;

	return true;
}

static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct trid_entry *trid_entry = cb_ctx;

	printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
	       trid->traddr, trid->trsvcid,
	       trid->subnqn);

	register_ctrlr(ctrlr, trid_entry);
}

static int
register_controllers(void)
{
	struct trid_entry *trid_entry;

	printf("Initializing NVMe Controllers\n");

	TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
		if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) {
			fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
				trid_entry->trid.traddr);
			return -1;
		}
	}

	return 0;
}

static void
unregister_controllers(void)
{
	struct ctrlr_entry *entry, *tmp;
	struct spdk_nvme_detach_ctx *detach_ctx = NULL;

	TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) {
		TAILQ_REMOVE(&g_controllers, entry, link);
		spdk_nvme_detach_async(entry->ctrlr, &detach_ctx);
		free(entry);
	}

	if (detach_ctx) {
		spdk_nvme_detach_poll(detach_ctx);
	}
}

static int
associate_workers_with_ns(void)
{
	struct ns_entry *entry = TAILQ_FIRST(&g_namespaces);
	struct worker_thread *worker = TAILQ_FIRST(&g_workers);
	struct ns_worker_ctx *ns_ctx;
	int i, count;

	count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;

	for (i = 0; i < count; i++) {
		if (entry == NULL) {
			break;
		}

		ns_ctx = calloc(1, sizeof(struct ns_worker_ctx));
		if (!ns_ctx) {
			return -1;
		}

		printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
		ns_ctx->entry = entry;

		TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link);

		worker = TAILQ_NEXT(worker, link);
		if (worker == NULL) {
			worker = TAILQ_FIRST(&g_workers);
		}

		entry = TAILQ_NEXT(entry, link);
		if (entry == NULL) {
			entry = TAILQ_FIRST(&g_namespaces);
		}

	}

	return 0;
}

static void *
nvme_poll_ctrlrs(void *arg)
{
	struct ctrlr_entry *entry;
	const struct spdk_nvme_transport_id *old_trid;
	int oldstate;
	int rc;


	spdk_unaffinitize_thread();

	while (true) {
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);

		TAILQ_FOREACH(entry, &g_controllers, link) {
			rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr);
			/* This controller has encountered a failure at the transport level. Reset it. */
			if (rc == -ENXIO) {
				if (entry->num_resets == 0) {
					old_trid = spdk_nvme_ctrlr_get_transport_id(entry->ctrlr);
					fprintf(stderr, "A controller has encountered a failure and is being reset.\n");
					if (spdk_nvme_transport_id_compare(old_trid, &entry->failover_trid)) {
						fprintf(stderr, "Resorting to new failover address %s\n", entry->failover_trid.traddr);
						spdk_nvme_ctrlr_fail(entry->ctrlr);
						rc = spdk_nvme_ctrlr_set_trid(entry->ctrlr, &entry->failover_trid);
						if (rc != 0) {
							fprintf(stderr, "Unable to fail over to back up trid.\n");
						}
					}
				}

				rc = spdk_nvme_ctrlr_reset(entry->ctrlr);
				if (rc != 0) {
					entry->num_resets++;
					fprintf(stderr, "Unable to reset the controller.\n");

					if (entry->num_resets > g_max_ctrlr_resets) {
						fprintf(stderr, "Controller cannot be recovered. Exiting.\n");
						exit(1);
					}
				} else {
					fprintf(stderr, "Controller properly reset.\n");
				}
			}
		}

		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);

		/* This is a pthread cancellation point and cannot be removed. */
		sleep(1);
	}

	return NULL;
}

int
main(int argc, char **argv)
{
	int rc;
	struct worker_thread *worker, *main_worker;
	unsigned main_core;
	struct spdk_env_opts opts;
	pthread_t thread_id = 0;

	rc = parse_args(argc, argv);
	if (rc != 0) {
		return rc;
	}

	spdk_env_opts_init(&opts);
	opts.name = "reconnect";
	if (g_core_mask) {
		opts.core_mask = g_core_mask;
	}

	if (g_dpdk_mem) {
		opts.mem_size = g_dpdk_mem;
	}
	opts.hugepage_single_segments = g_dpdk_mem_single_seg;
	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "Unable to initialize SPDK env\n");
		unregister_trids();
		return 1;
	}

	g_tsc_rate = spdk_get_ticks_hz();

	if (register_workers() != 0) {
		rc = 1;
		goto cleanup;
	}

	if (register_controllers() != 0) {
		rc = 1;
		goto cleanup;
	}

	if (g_warn) {
		printf("WARNING: Some requested NVMe devices were skipped\n");
	}

	if (g_num_namespaces == 0) {
		fprintf(stderr, "No valid NVMe controllers found\n");
		goto cleanup;
	}

	rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
	if (rc != 0) {
		fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");
		goto cleanup;
	}

	if (associate_workers_with_ns() != 0) {
		rc = 1;
		goto cleanup;
	}

	printf("Initialization complete. Launching workers.\n");

	/* Launch all of the secondary workers */
	main_core = spdk_env_get_current_core();
	main_worker = NULL;
	TAILQ_FOREACH(worker, &g_workers, link) {
		if (worker->lcore != main_core) {
			spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
		} else {
			assert(main_worker == NULL);
			main_worker = worker;
		}
	}

	assert(main_worker != NULL);
	rc = work_fn(main_worker);

	spdk_env_thread_wait_all();

cleanup:
	if (thread_id && pthread_cancel(thread_id) == 0) {
		pthread_join(thread_id, NULL);
	}
	unregister_trids();
	unregister_namespaces();
	unregister_controllers();
	unregister_workers();

	spdk_env_fini();

	if (rc != 0) {
		fprintf(stderr, "%s: errors occurred\n", argv[0]);
		/*
		 * return a generic error to the caller. This allows us to
		 * distinguish between a failure in the script and something
		 * like a segfault or an invalid access which causes the program
		 * to crash.
		 */
		rc = 1;
	}

	return rc;
}
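
/*
 * Example invocation (illustrative sketch; the addresses, port, and tuning values
 * below are assumptions, not part of the program): an NVMe-oF RDMA target is assumed
 * reachable at 192.168.100.8:4420 with an alternative path at 192.168.100.9 exporting
 * the same subsystem.
 *
 *   ./reconnect -q 32 -o 4096 -w randrw -M 50 -t 60 \
 *       -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420 alt_traddr:192.168.100.9'
 *
 * On a transport failure, nvme_poll_ctrlrs() resets the controller (switching to the
 * alt_traddr when one was supplied), while each worker reconnects its I/O qpairs in
 * nvme_check_io().
 */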