1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * * Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * * Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * * Neither the name of Intel Corporation nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 */ 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/env.h" 39 #include "spdk/nvme.h" 40 #include "spdk/queue.h" 41 #include "spdk/string.h" 42 #include "spdk/util.h" 43 #include "spdk/log.h" 44 #include "spdk/likely.h" 45 46 struct ctrlr_entry { 47 struct spdk_nvme_ctrlr *ctrlr; 48 struct spdk_nvme_transport_id failover_trid; 49 enum spdk_nvme_transport_type trtype; 50 TAILQ_ENTRY(ctrlr_entry) link; 51 char name[1024]; 52 int num_resets; 53 }; 54 55 struct ns_entry { 56 struct spdk_nvme_ctrlr *ctrlr; 57 struct spdk_nvme_ns *ns; 58 59 TAILQ_ENTRY(ns_entry) link; 60 uint32_t io_size_blocks; 61 uint32_t num_io_requests; 62 uint64_t size_in_ios; 63 uint32_t block_size; 64 uint32_t io_flags; 65 char name[1024]; 66 }; 67 68 struct ns_worker_ctx { 69 struct ns_entry *entry; 70 uint64_t io_completed; 71 uint64_t current_queue_depth; 72 uint64_t offset_in_ios; 73 bool is_draining; 74 75 int num_qpairs; 76 struct spdk_nvme_qpair **qpair; 77 int last_qpair; 78 79 TAILQ_ENTRY(ns_worker_ctx) link; 80 }; 81 82 struct perf_task { 83 struct ns_worker_ctx *ns_ctx; 84 struct iovec iov; 85 bool is_read; 86 }; 87 88 struct worker_thread { 89 TAILQ_HEAD(, ns_worker_ctx) ns_ctx; 90 TAILQ_ENTRY(worker_thread) link; 91 unsigned lcore; 92 }; 93 94 /* For basic reset handling. */ 95 static int g_max_ctrlr_resets = 15; 96 97 static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers); 98 static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces); 99 static int g_num_namespaces = 0; 100 static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers); 101 static int g_num_workers = 0; 102 103 static uint64_t g_tsc_rate; 104 105 static uint32_t g_io_align = 0x200; 106 static uint32_t g_io_size_bytes; 107 static uint32_t g_max_io_size_blocks; 108 static int g_rw_percentage; 109 static int g_is_random; 110 static int g_queue_depth; 111 static int g_time_in_sec; 112 static uint32_t g_max_completions; 113 static int g_dpdk_mem; 114 static bool g_warn; 115 static uint32_t g_keep_alive_timeout_in_ms = 0; 116 static uint8_t g_transport_retry_count = 4; 117 static uint8_t g_transport_ack_timeout = 0; /* disabled */ 118 119 static const char *g_core_mask; 120 121 struct trid_entry { 122 struct spdk_nvme_transport_id trid; 123 struct spdk_nvme_transport_id failover_trid; 124 TAILQ_ENTRY(trid_entry) tailq; 125 }; 126 127 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); 128 129 static inline void 130 task_complete(struct perf_task *task); 131 static void submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth); 132 133 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); 134 135 static void 136 nvme_setup_payload(struct perf_task *task) 137 { 138 /* maximum extended lba format size from all active namespace, 139 * it's same with g_io_size_bytes for namespace without metadata. 140 */ 141 task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 142 task->iov.iov_len = g_io_size_bytes; 143 if (task->iov.iov_base == NULL) { 144 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); 145 exit(1); 146 } 147 } 148 149 static int 150 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 151 struct ns_entry *entry, uint64_t offset_in_ios) 152 { 153 uint64_t lba; 154 int qp_num; 155 156 lba = offset_in_ios * entry->io_size_blocks; 157 158 qp_num = ns_ctx->last_qpair; 159 ns_ctx->last_qpair++; 160 if (ns_ctx->last_qpair == ns_ctx->num_qpairs) { 161 ns_ctx->last_qpair = 0; 162 } 163 164 if (task->is_read) { 165 return spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair[qp_num], 166 task->iov.iov_base, lba, 167 entry->io_size_blocks, io_complete, 168 task, entry->io_flags); 169 } 170 171 return spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair[qp_num], 172 task->iov.iov_base, lba, 173 entry->io_size_blocks, io_complete, 174 task, entry->io_flags); 175 } 176 177 static void 178 nvme_check_io(struct ns_worker_ctx *ns_ctx) 179 { 180 int i, rc; 181 182 for (i = 0; i < ns_ctx->num_qpairs; i++) { 183 rc = spdk_nvme_qpair_process_completions(ns_ctx->qpair[i], g_max_completions); 184 /* The transport level qpair is failed and we need to reconnect it. */ 185 if (spdk_unlikely(rc == -ENXIO)) { 186 rc = spdk_nvme_ctrlr_reconnect_io_qpair(ns_ctx->qpair[i]); 187 /* successful reconnect */ 188 if (rc == 0) { 189 continue; 190 } else if (rc == -ENXIO) { 191 /* This means the controller is failed. Defer to it to restore the qpair. */ 192 continue; 193 } else { 194 /* 195 * We were unable to restore the qpair on this attempt. We don't 196 * really know why. For naive handling, just keep trying. 197 * TODO: add a retry limit, and destroy the qpair after x iterations. 198 */ 199 fprintf(stderr, "qpair failed and we were unable to recover it.\n"); 200 } 201 } else if (spdk_unlikely(rc < 0)) { 202 fprintf(stderr, "Received an unknown error processing completions.\n"); 203 exit(1); 204 } 205 } 206 } 207 208 /* 209 * TODO: If a controller has multiple namespaces, they could all use the same queue. 210 * For now, give each namespace/thread combination its own queue. 211 */ 212 static int 213 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 214 { 215 struct spdk_nvme_io_qpair_opts opts; 216 struct ns_entry *entry = ns_ctx->entry; 217 int i; 218 219 ns_ctx->num_qpairs = 1; 220 ns_ctx->qpair = calloc(ns_ctx->num_qpairs, sizeof(struct spdk_nvme_qpair *)); 221 if (!ns_ctx->qpair) { 222 return -1; 223 } 224 225 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->ctrlr, &opts, sizeof(opts)); 226 if (opts.io_queue_requests < entry->num_io_requests) { 227 opts.io_queue_requests = entry->num_io_requests; 228 } 229 230 for (i = 0; i < ns_ctx->num_qpairs; i++) { 231 ns_ctx->qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->ctrlr, &opts, 232 sizeof(opts)); 233 if (!ns_ctx->qpair[i]) { 234 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); 235 return -1; 236 } 237 } 238 239 return 0; 240 } 241 242 static void 243 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 244 { 245 int i; 246 247 for (i = 0; i < ns_ctx->num_qpairs; i++) { 248 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair[i]); 249 } 250 251 free(ns_ctx->qpair); 252 } 253 254 static void 255 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) 256 { 257 const struct spdk_nvme_transport_id *trid; 258 259 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 260 261 switch (trid->trtype) { 262 case SPDK_NVME_TRANSPORT_RDMA: 263 snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 264 break; 265 case SPDK_NVME_TRANSPORT_TCP: 266 snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 267 break; 268 case SPDK_NVME_TRANSPORT_CUSTOM: 269 snprintf(name, length, "CUSTOM (%s)", trid->traddr); 270 break; 271 default: 272 fprintf(stderr, "Unknown transport type %d\n", trid->trtype); 273 break; 274 } 275 } 276 277 static void 278 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) 279 { 280 struct ns_entry *entry; 281 const struct spdk_nvme_ctrlr_data *cdata; 282 uint32_t max_xfer_size, entries, sector_size; 283 uint64_t ns_size; 284 struct spdk_nvme_io_qpair_opts opts; 285 286 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 287 288 if (!spdk_nvme_ns_is_active(ns)) { 289 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 290 cdata->mn, cdata->sn, 291 spdk_nvme_ns_get_id(ns)); 292 g_warn = true; 293 return; 294 } 295 296 ns_size = spdk_nvme_ns_get_size(ns); 297 sector_size = spdk_nvme_ns_get_sector_size(ns); 298 299 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { 300 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " 301 "ns size %" PRIu64 " / block size %u for I/O size %u\n", 302 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), 303 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); 304 g_warn = true; 305 return; 306 } 307 308 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 309 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 310 /* NVMe driver may add additional entries based on 311 * stripe size and maximum transfer size, we assume 312 * 1 more entry be used for stripe. 313 */ 314 entries = (g_io_size_bytes - 1) / max_xfer_size + 2; 315 if ((g_queue_depth * entries) > opts.io_queue_size) { 316 printf("controller IO queue size %u less than required\n", 317 opts.io_queue_size); 318 printf("Consider using lower queue depth or small IO size because " 319 "IO requests may be queued at the NVMe driver.\n"); 320 g_warn = true; 321 } 322 /* For requests which have children requests, parent request itself 323 * will also occupy 1 entry. 324 */ 325 entries += 1; 326 327 entry = calloc(1, sizeof(struct ns_entry)); 328 if (entry == NULL) { 329 perror("ns_entry malloc"); 330 exit(1); 331 } 332 333 entry->ctrlr = ctrlr; 334 entry->ns = ns; 335 entry->num_io_requests = g_queue_depth * entries; 336 337 entry->size_in_ios = ns_size / g_io_size_bytes; 338 entry->io_size_blocks = g_io_size_bytes / sector_size; 339 340 entry->block_size = spdk_nvme_ns_get_sector_size(ns); 341 342 343 if (g_max_io_size_blocks < entry->io_size_blocks) { 344 g_max_io_size_blocks = entry->io_size_blocks; 345 } 346 347 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 348 349 g_num_namespaces++; 350 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 351 } 352 353 static void 354 unregister_namespaces(void) 355 { 356 struct ns_entry *entry, *tmp; 357 358 TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) { 359 TAILQ_REMOVE(&g_namespaces, entry, link); 360 free(entry); 361 } 362 } 363 364 static void 365 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) 366 { 367 struct spdk_nvme_ns *ns; 368 struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry)); 369 const struct spdk_nvme_transport_id *ctrlr_trid; 370 uint32_t nsid; 371 372 if (entry == NULL) { 373 perror("ctrlr_entry malloc"); 374 exit(1); 375 } 376 377 ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 378 assert(ctrlr_trid != NULL); 379 380 /* each controller needs a unique failover trid. */ 381 entry->failover_trid = trid_entry->failover_trid; 382 383 /* 384 * Users are allowed to leave the trid subnqn blank or specify a discovery controller subnqn. 385 * In those cases, the controller subnqn will not equal the trid_entry subnqn and, by association, 386 * the failover_trid subnqn. 387 * When we do failover, we want to reconnect to the same nqn so explicitly set the failover nqn to 388 * the ctrlr nqn here. 389 */ 390 snprintf(entry->failover_trid.subnqn, SPDK_NVMF_NQN_MAX_LEN + 1, "%s", ctrlr_trid->subnqn); 391 392 393 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 394 395 entry->ctrlr = ctrlr; 396 entry->trtype = trid_entry->trid.trtype; 397 TAILQ_INSERT_TAIL(&g_controllers, entry, link); 398 399 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 400 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 401 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 402 if (ns == NULL) { 403 continue; 404 } 405 register_ns(ctrlr, ns); 406 } 407 } 408 409 static __thread unsigned int seed = 0; 410 411 static inline void 412 submit_single_io(struct perf_task *task) 413 { 414 uint64_t offset_in_ios; 415 int rc; 416 struct ns_worker_ctx *ns_ctx = task->ns_ctx; 417 struct ns_entry *entry = ns_ctx->entry; 418 419 if (g_is_random) { 420 offset_in_ios = rand_r(&seed) % entry->size_in_ios; 421 } else { 422 offset_in_ios = ns_ctx->offset_in_ios++; 423 if (ns_ctx->offset_in_ios == entry->size_in_ios) { 424 ns_ctx->offset_in_ios = 0; 425 } 426 } 427 428 if ((g_rw_percentage == 100) || 429 (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) { 430 task->is_read = true; 431 } else { 432 task->is_read = false; 433 } 434 435 rc = nvme_submit_io(task, ns_ctx, entry, offset_in_ios); 436 437 if (spdk_unlikely(rc != 0)) { 438 fprintf(stderr, "starting I/O failed\n"); 439 } else { 440 ns_ctx->current_queue_depth++; 441 } 442 } 443 444 static inline void 445 task_complete(struct perf_task *task) 446 { 447 struct ns_worker_ctx *ns_ctx; 448 449 ns_ctx = task->ns_ctx; 450 ns_ctx->current_queue_depth--; 451 ns_ctx->io_completed++; 452 453 /* 454 * is_draining indicates when time has expired for the test run 455 * and we are just waiting for the previously submitted I/O 456 * to complete. In this case, do not submit a new I/O to replace 457 * the one just completed. 458 */ 459 if (spdk_unlikely(ns_ctx->is_draining)) { 460 spdk_dma_free(task->iov.iov_base); 461 free(task); 462 } else { 463 submit_single_io(task); 464 } 465 } 466 467 static void 468 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) 469 { 470 struct perf_task *task = ctx; 471 472 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { 473 fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n", 474 task->is_read ? "Read" : "Write", 475 cpl->status.sct, cpl->status.sc); 476 } 477 478 task_complete(task); 479 } 480 481 static void 482 check_io(struct ns_worker_ctx *ns_ctx) 483 { 484 nvme_check_io(ns_ctx); 485 } 486 487 static struct perf_task * 488 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth) 489 { 490 struct perf_task *task; 491 492 task = calloc(1, sizeof(*task)); 493 if (task == NULL) { 494 fprintf(stderr, "Out of memory allocating tasks\n"); 495 exit(1); 496 } 497 498 nvme_setup_payload(task); 499 500 task->ns_ctx = ns_ctx; 501 502 return task; 503 } 504 505 static void 506 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) 507 { 508 struct perf_task *task; 509 510 while (queue_depth-- > 0) { 511 task = allocate_task(ns_ctx, queue_depth); 512 submit_single_io(task); 513 } 514 } 515 516 static int 517 work_fn(void *arg) 518 { 519 uint64_t tsc_end; 520 struct worker_thread *worker = (struct worker_thread *)arg; 521 struct ns_worker_ctx *ns_ctx = NULL; 522 uint32_t unfinished_ns_ctx; 523 524 printf("Starting thread on core %u\n", worker->lcore); 525 526 /* Allocate queue pairs for each namespace. */ 527 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 528 if (nvme_init_ns_worker_ctx(ns_ctx) != 0) { 529 printf("ERROR: init_ns_worker_ctx() failed\n"); 530 return 1; 531 } 532 } 533 534 tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate; 535 536 /* Submit initial I/O for each namespace. */ 537 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 538 submit_io(ns_ctx, g_queue_depth); 539 } 540 541 while (1) { 542 /* 543 * Check for completed I/O for each controller. A new 544 * I/O will be submitted in the io_complete callback 545 * to replace each I/O that is completed. 546 */ 547 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 548 check_io(ns_ctx); 549 } 550 551 if (spdk_get_ticks() > tsc_end) { 552 break; 553 } 554 } 555 556 /* drain the io of each ns_ctx in round robin to make the fairness */ 557 do { 558 unfinished_ns_ctx = 0; 559 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 560 /* first time will enter into this if case */ 561 if (!ns_ctx->is_draining) { 562 ns_ctx->is_draining = true; 563 } 564 565 if (ns_ctx->current_queue_depth > 0) { 566 check_io(ns_ctx); 567 if (ns_ctx->current_queue_depth == 0) { 568 nvme_cleanup_ns_worker_ctx(ns_ctx); 569 } else { 570 unfinished_ns_ctx++; 571 } 572 } 573 } 574 } while (unfinished_ns_ctx > 0); 575 576 return 0; 577 } 578 579 static void usage(char *program_name) 580 { 581 printf("%s options", program_name); 582 printf("\n"); 583 printf("\t[-q io depth]\n"); 584 printf("\t[-o io size in bytes]\n"); 585 printf("\t[-w io pattern type, must be one of\n"); 586 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); 587 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); 588 printf("\t[-t time in seconds]\n"); 589 printf("\t[-c core mask for I/O submission/completion.]\n"); 590 printf("\t\t(default: 1)\n"); 591 printf("\t[-r Transport ID for NVMeoF]\n"); 592 printf("\t Format: 'key:value [key:value] ...'\n"); 593 printf("\t Keys:\n"); 594 printf("\t trtype Transport type (e.g. RDMA)\n"); 595 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n"); 596 printf("\t traddr Transport address (e.g. 192.168.100.8 for RDMA)\n"); 597 printf("\t trsvcid Transport service identifier (e.g. 4420)\n"); 598 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); 599 printf("\t alt_traddr (Optional) Alternative Transport address for failover.\n"); 600 printf("\t Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); 601 printf("\t[-k keep alive timeout period in millisecond]\n"); 602 printf("\t[-s DPDK huge memory size in MB.]\n"); 603 printf("\t[-m max completions per poll]\n"); 604 printf("\t\t(default: 0 - unlimited)\n"); 605 printf("\t[-i shared memory group ID]\n"); 606 printf("\t[-A transport ACK timeout]\n"); 607 printf("\t[-R transport retry count]\n"); 608 printf("\t"); 609 spdk_log_usage(stdout, "-T"); 610 #ifdef DEBUG 611 printf("\t[-G enable debug logging]\n"); 612 #else 613 printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); 614 #endif 615 } 616 617 static void 618 unregister_trids(void) 619 { 620 struct trid_entry *trid_entry, *tmp; 621 622 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { 623 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); 624 free(trid_entry); 625 } 626 } 627 628 static int 629 add_trid(const char *trid_str) 630 { 631 struct trid_entry *trid_entry; 632 struct spdk_nvme_transport_id *trid; 633 char *alt_traddr; 634 int len; 635 636 trid_entry = calloc(1, sizeof(*trid_entry)); 637 if (trid_entry == NULL) { 638 return -1; 639 } 640 641 trid = &trid_entry->trid; 642 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 643 644 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { 645 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); 646 free(trid_entry); 647 return 1; 648 } 649 650 trid_entry->failover_trid = trid_entry->trid; 651 652 alt_traddr = strcasestr(trid_str, "alt_traddr:"); 653 if (alt_traddr) { 654 alt_traddr += strlen("alt_traddr:"); 655 len = strcspn(alt_traddr, " \t\n"); 656 if (len > SPDK_NVMF_TRADDR_MAX_LEN) { 657 fprintf(stderr, "The failover traddr %s is too long.\n", alt_traddr); 658 free(trid_entry); 659 return -1; 660 } 661 snprintf(trid_entry->failover_trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1, "%s", alt_traddr); 662 } 663 664 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); 665 return 0; 666 } 667 668 static int 669 parse_args(int argc, char **argv) 670 { 671 struct trid_entry *trid_entry, *trid_entry_tmp; 672 const char *workload_type; 673 int op; 674 bool mix_specified = false; 675 long int val; 676 int rc; 677 678 /* default value */ 679 g_queue_depth = 0; 680 g_io_size_bytes = 0; 681 workload_type = NULL; 682 g_time_in_sec = 0; 683 g_rw_percentage = -1; 684 g_core_mask = NULL; 685 g_max_completions = 0; 686 687 while ((op = getopt(argc, argv, "c:m:o:q:r:k:s:t:w:A:GM:R:T:")) != -1) { 688 switch (op) { 689 case 'm': 690 case 'o': 691 case 'q': 692 case 'k': 693 case 's': 694 case 't': 695 case 'A': 696 case 'M': 697 case 'R': 698 val = spdk_strtol(optarg, 10); 699 if (val < 0) { 700 fprintf(stderr, "Converting a string to integer failed\n"); 701 return val; 702 } 703 switch (op) { 704 case 'm': 705 g_max_completions = val; 706 break; 707 case 'o': 708 g_io_size_bytes = val; 709 break; 710 case 'q': 711 g_queue_depth = val; 712 break; 713 case 'k': 714 g_keep_alive_timeout_in_ms = val; 715 break; 716 case 's': 717 g_dpdk_mem = val; 718 break; 719 case 't': 720 g_time_in_sec = val; 721 break; 722 case 'A': 723 g_transport_ack_timeout = val; 724 break; 725 case 'M': 726 g_rw_percentage = val; 727 mix_specified = true; 728 break; 729 case 'R': 730 g_transport_retry_count = val; 731 break; 732 } 733 break; 734 case 'c': 735 g_core_mask = optarg; 736 break; 737 case 'r': 738 if (add_trid(optarg)) { 739 usage(argv[0]); 740 return 1; 741 } 742 break; 743 case 'w': 744 workload_type = optarg; 745 break; 746 case 'G': 747 #ifndef DEBUG 748 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n", 749 argv[0]); 750 usage(argv[0]); 751 return 1; 752 #else 753 spdk_log_set_flag("nvme"); 754 spdk_log_set_print_level(SPDK_LOG_DEBUG); 755 break; 756 #endif 757 case 'T': 758 rc = spdk_log_set_flag(optarg); 759 if (rc < 0) { 760 fprintf(stderr, "unknown flag\n"); 761 usage(argv[0]); 762 exit(EXIT_FAILURE); 763 } 764 #ifdef DEBUG 765 spdk_log_set_print_level(SPDK_LOG_DEBUG); 766 #endif 767 break; 768 default: 769 usage(argv[0]); 770 return 1; 771 } 772 } 773 774 if (!g_queue_depth) { 775 usage(argv[0]); 776 return 1; 777 } 778 if (!g_io_size_bytes) { 779 usage(argv[0]); 780 return 1; 781 } 782 if (!workload_type) { 783 usage(argv[0]); 784 return 1; 785 } 786 if (!g_time_in_sec) { 787 usage(argv[0]); 788 return 1; 789 } 790 791 if (strcmp(workload_type, "read") && 792 strcmp(workload_type, "write") && 793 strcmp(workload_type, "randread") && 794 strcmp(workload_type, "randwrite") && 795 strcmp(workload_type, "rw") && 796 strcmp(workload_type, "randrw")) { 797 fprintf(stderr, 798 "io pattern type must be one of\n" 799 "(read, write, randread, randwrite, rw, randrw)\n"); 800 return 1; 801 } 802 803 if (!strcmp(workload_type, "read") || 804 !strcmp(workload_type, "randread")) { 805 g_rw_percentage = 100; 806 } 807 808 if (!strcmp(workload_type, "write") || 809 !strcmp(workload_type, "randwrite")) { 810 g_rw_percentage = 0; 811 } 812 813 if (!strcmp(workload_type, "read") || 814 !strcmp(workload_type, "randread") || 815 !strcmp(workload_type, "write") || 816 !strcmp(workload_type, "randwrite")) { 817 if (mix_specified) { 818 fprintf(stderr, "Ignoring -M option... Please use -M option" 819 " only when using rw or randrw.\n"); 820 } 821 } 822 823 if (!strcmp(workload_type, "rw") || 824 !strcmp(workload_type, "randrw")) { 825 if (g_rw_percentage < 0 || g_rw_percentage > 100) { 826 fprintf(stderr, 827 "-M must be specified to value from 0 to 100 " 828 "for rw or randrw.\n"); 829 return 1; 830 } 831 } 832 833 if (!strcmp(workload_type, "read") || 834 !strcmp(workload_type, "write") || 835 !strcmp(workload_type, "rw")) { 836 g_is_random = 0; 837 } else { 838 g_is_random = 1; 839 } 840 841 if (TAILQ_EMPTY(&g_trid_list)) { 842 fprintf(stderr, "You must specify at least one fabrics TRID.\n"); 843 return -1; 844 } 845 846 /* check whether there is local PCIe type and fail. */ 847 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { 848 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 849 fprintf(stderr, "This application was not intended to be run on PCIe controllers.\n"); 850 return 1; 851 } 852 } 853 854 return 0; 855 } 856 857 static int 858 register_workers(void) 859 { 860 uint32_t i; 861 struct worker_thread *worker; 862 863 SPDK_ENV_FOREACH_CORE(i) { 864 worker = calloc(1, sizeof(*worker)); 865 if (worker == NULL) { 866 fprintf(stderr, "Unable to allocate worker\n"); 867 return -1; 868 } 869 870 TAILQ_INIT(&worker->ns_ctx); 871 worker->lcore = i; 872 TAILQ_INSERT_TAIL(&g_workers, worker, link); 873 g_num_workers++; 874 } 875 876 return 0; 877 } 878 879 static void 880 unregister_workers(void) 881 { 882 struct worker_thread *worker, *tmp_worker; 883 struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx; 884 885 /* Free namespace context and worker thread */ 886 TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) { 887 TAILQ_REMOVE(&g_workers, worker, link); 888 TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) { 889 TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link); 890 free(ns_ctx); 891 } 892 893 free(worker); 894 } 895 } 896 897 static bool 898 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 899 struct spdk_nvme_ctrlr_opts *opts) 900 { 901 /* These should have been weeded out earlier. */ 902 assert(trid->trtype != SPDK_NVME_TRANSPORT_PCIE); 903 904 printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n", 905 trid->traddr, trid->trsvcid, 906 trid->subnqn); 907 908 /* Set io_queue_size to UINT16_MAX, NVMe driver 909 * will then reduce this to MQES to maximize 910 * the io_queue_size as much as possible. 911 */ 912 opts->io_queue_size = UINT16_MAX; 913 914 opts->keep_alive_timeout_ms = spdk_max(opts->keep_alive_timeout_ms, 915 g_keep_alive_timeout_in_ms); 916 917 opts->transport_retry_count = g_transport_retry_count; 918 opts->transport_ack_timeout = g_transport_ack_timeout; 919 920 return true; 921 } 922 923 static void 924 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 925 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 926 { 927 struct trid_entry *trid_entry = cb_ctx; 928 929 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", 930 trid->traddr, trid->trsvcid, 931 trid->subnqn); 932 933 register_ctrlr(ctrlr, trid_entry); 934 } 935 936 static int 937 register_controllers(void) 938 { 939 struct trid_entry *trid_entry; 940 941 printf("Initializing NVMe Controllers\n"); 942 943 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { 944 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { 945 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", 946 trid_entry->trid.traddr); 947 return -1; 948 } 949 } 950 951 return 0; 952 } 953 954 static void 955 unregister_controllers(void) 956 { 957 struct ctrlr_entry *entry, *tmp; 958 959 TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) { 960 TAILQ_REMOVE(&g_controllers, entry, link); 961 spdk_nvme_detach(entry->ctrlr); 962 free(entry); 963 } 964 } 965 966 static int 967 associate_workers_with_ns(void) 968 { 969 struct ns_entry *entry = TAILQ_FIRST(&g_namespaces); 970 struct worker_thread *worker = TAILQ_FIRST(&g_workers); 971 struct ns_worker_ctx *ns_ctx; 972 int i, count; 973 974 count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers; 975 976 for (i = 0; i < count; i++) { 977 if (entry == NULL) { 978 break; 979 } 980 981 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); 982 if (!ns_ctx) { 983 return -1; 984 } 985 986 printf("Associating %s with lcore %d\n", entry->name, worker->lcore); 987 ns_ctx->entry = entry; 988 989 TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link); 990 991 worker = TAILQ_NEXT(worker, link); 992 if (worker == NULL) { 993 worker = TAILQ_FIRST(&g_workers); 994 } 995 996 entry = TAILQ_NEXT(entry, link); 997 if (entry == NULL) { 998 entry = TAILQ_FIRST(&g_namespaces); 999 } 1000 1001 } 1002 1003 return 0; 1004 } 1005 1006 static void * 1007 nvme_poll_ctrlrs(void *arg) 1008 { 1009 struct ctrlr_entry *entry; 1010 const struct spdk_nvme_transport_id *old_trid; 1011 int oldstate; 1012 int rc; 1013 1014 1015 spdk_unaffinitize_thread(); 1016 1017 while (true) { 1018 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); 1019 1020 TAILQ_FOREACH(entry, &g_controllers, link) { 1021 rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); 1022 /* This controller has encountered a failure at the transport level. reset it. */ 1023 if (rc == -ENXIO) { 1024 if (entry->num_resets == 0) { 1025 old_trid = spdk_nvme_ctrlr_get_transport_id(entry->ctrlr); 1026 fprintf(stderr, "A controller has encountered a failure and is being reset.\n"); 1027 if (spdk_nvme_transport_id_compare(old_trid, &entry->failover_trid)) { 1028 fprintf(stderr, "Resorting to new failover address %s\n", entry->failover_trid.traddr); 1029 spdk_nvme_ctrlr_fail(entry->ctrlr); 1030 rc = spdk_nvme_ctrlr_set_trid(entry->ctrlr, &entry->failover_trid); 1031 if (rc != 0) { 1032 fprintf(stderr, "Unable to fail over to back up trid.\n"); 1033 } 1034 } 1035 } 1036 1037 rc = spdk_nvme_ctrlr_reset(entry->ctrlr); 1038 if (rc != 0) { 1039 entry->num_resets++; 1040 fprintf(stderr, "Unable to reset the controller.\n"); 1041 1042 if (entry->num_resets > g_max_ctrlr_resets) { 1043 fprintf(stderr, "Controller cannot be recovered. Exiting.\n"); 1044 exit(1); 1045 } 1046 } else { 1047 fprintf(stderr, "Controller properly reset.\n"); 1048 } 1049 } 1050 } 1051 1052 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); 1053 1054 /* This is a pthread cancellation point and cannot be removed. */ 1055 sleep(1); 1056 } 1057 1058 return NULL; 1059 } 1060 1061 int main(int argc, char **argv) 1062 { 1063 int rc; 1064 struct worker_thread *worker, *master_worker; 1065 unsigned master_core; 1066 struct spdk_env_opts opts; 1067 pthread_t thread_id = 0; 1068 1069 rc = parse_args(argc, argv); 1070 if (rc != 0) { 1071 return rc; 1072 } 1073 1074 spdk_env_opts_init(&opts); 1075 opts.name = "reconnect"; 1076 if (g_core_mask) { 1077 opts.core_mask = g_core_mask; 1078 } 1079 1080 if (g_dpdk_mem) { 1081 opts.mem_size = g_dpdk_mem; 1082 } 1083 if (spdk_env_init(&opts) < 0) { 1084 fprintf(stderr, "Unable to initialize SPDK env\n"); 1085 rc = 1; 1086 goto cleanup; 1087 } 1088 1089 g_tsc_rate = spdk_get_ticks_hz(); 1090 1091 if (register_workers() != 0) { 1092 rc = 1; 1093 goto cleanup; 1094 } 1095 1096 if (register_controllers() != 0) { 1097 rc = 1; 1098 goto cleanup; 1099 } 1100 1101 if (g_warn) { 1102 printf("WARNING: Some requested NVMe devices were skipped\n"); 1103 } 1104 1105 if (g_num_namespaces == 0) { 1106 fprintf(stderr, "No valid NVMe controllers found\n"); 1107 goto cleanup; 1108 } 1109 1110 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); 1111 if (rc != 0) { 1112 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); 1113 goto cleanup; 1114 } 1115 1116 if (associate_workers_with_ns() != 0) { 1117 rc = 1; 1118 goto cleanup; 1119 } 1120 1121 printf("Initialization complete. Launching workers.\n"); 1122 1123 /* Launch all of the slave workers */ 1124 master_core = spdk_env_get_current_core(); 1125 master_worker = NULL; 1126 TAILQ_FOREACH(worker, &g_workers, link) { 1127 if (worker->lcore != master_core) { 1128 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); 1129 } else { 1130 assert(master_worker == NULL); 1131 master_worker = worker; 1132 } 1133 } 1134 1135 assert(master_worker != NULL); 1136 rc = work_fn(master_worker); 1137 1138 spdk_env_thread_wait_all(); 1139 1140 cleanup: 1141 if (thread_id && pthread_cancel(thread_id) == 0) { 1142 pthread_join(thread_id, NULL); 1143 } 1144 unregister_trids(); 1145 unregister_namespaces(); 1146 unregister_controllers(); 1147 unregister_workers(); 1148 1149 if (rc != 0) { 1150 fprintf(stderr, "%s: errors occured\n", argv[0]); 1151 /* 1152 * return a generic error to the caller. This allows us to 1153 * distinguish between a failure in the script and something 1154 * like a segfault or an invalid access which causes the program 1155 * to crash. 1156 */ 1157 rc = 1; 1158 } 1159 1160 return rc; 1161 } 1162