1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * * Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * * Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * * Neither the name of Intel Corporation nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 */ 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/env.h" 39 #include "spdk/nvme.h" 40 #include "spdk/queue.h" 41 #include "spdk/string.h" 42 #include "spdk/util.h" 43 #include "spdk/log.h" 44 #include "spdk/likely.h" 45 46 struct ctrlr_entry { 47 struct spdk_nvme_ctrlr *ctrlr; 48 struct spdk_nvme_transport_id failover_trid; 49 enum spdk_nvme_transport_type trtype; 50 TAILQ_ENTRY(ctrlr_entry) link; 51 char name[1024]; 52 int num_resets; 53 }; 54 55 struct ns_entry { 56 struct spdk_nvme_ctrlr *ctrlr; 57 struct spdk_nvme_ns *ns; 58 59 TAILQ_ENTRY(ns_entry) link; 60 uint32_t io_size_blocks; 61 uint32_t num_io_requests; 62 uint64_t size_in_ios; 63 uint32_t block_size; 64 uint32_t io_flags; 65 char name[1024]; 66 }; 67 68 struct ns_worker_ctx { 69 struct ns_entry *entry; 70 uint64_t io_completed; 71 uint64_t current_queue_depth; 72 uint64_t offset_in_ios; 73 bool is_draining; 74 75 int num_qpairs; 76 struct spdk_nvme_qpair **qpair; 77 int last_qpair; 78 79 TAILQ_ENTRY(ns_worker_ctx) link; 80 }; 81 82 struct perf_task { 83 struct ns_worker_ctx *ns_ctx; 84 struct iovec iov; 85 bool is_read; 86 }; 87 88 struct worker_thread { 89 TAILQ_HEAD(, ns_worker_ctx) ns_ctx; 90 TAILQ_ENTRY(worker_thread) link; 91 unsigned lcore; 92 }; 93 94 /* For basic reset handling. */ 95 static int g_max_ctrlr_resets = 15; 96 97 static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers); 98 static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces); 99 static int g_num_namespaces = 0; 100 static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers); 101 static int g_num_workers = 0; 102 103 static uint64_t g_tsc_rate; 104 105 static uint32_t g_io_align = 0x200; 106 static uint32_t g_io_size_bytes; 107 static uint32_t g_max_io_size_blocks; 108 static int g_rw_percentage; 109 static int g_is_random; 110 static int g_queue_depth; 111 static int g_time_in_sec; 112 static uint32_t g_max_completions; 113 static int g_dpdk_mem; 114 static bool g_warn; 115 static uint32_t g_keep_alive_timeout_in_ms = 0; 116 static uint8_t g_transport_retry_count = 4; 117 static uint8_t g_transport_ack_timeout = 0; /* disabled */ 118 static bool g_dpdk_mem_single_seg = false; 119 120 static const char *g_core_mask; 121 122 struct trid_entry { 123 struct spdk_nvme_transport_id trid; 124 struct spdk_nvme_transport_id failover_trid; 125 TAILQ_ENTRY(trid_entry) tailq; 126 }; 127 128 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); 129 130 static inline void 131 task_complete(struct perf_task *task); 132 static void submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth); 133 134 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); 135 136 static void 137 nvme_setup_payload(struct perf_task *task) 138 { 139 /* maximum extended lba format size from all active namespace, 140 * it's same with g_io_size_bytes for namespace without metadata. 141 */ 142 task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 143 task->iov.iov_len = g_io_size_bytes; 144 if (task->iov.iov_base == NULL) { 145 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); 146 exit(1); 147 } 148 } 149 150 static int 151 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 152 struct ns_entry *entry, uint64_t offset_in_ios) 153 { 154 uint64_t lba; 155 int qp_num; 156 157 lba = offset_in_ios * entry->io_size_blocks; 158 159 qp_num = ns_ctx->last_qpair; 160 ns_ctx->last_qpair++; 161 if (ns_ctx->last_qpair == ns_ctx->num_qpairs) { 162 ns_ctx->last_qpair = 0; 163 } 164 165 if (task->is_read) { 166 return spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair[qp_num], 167 task->iov.iov_base, lba, 168 entry->io_size_blocks, io_complete, 169 task, entry->io_flags); 170 } 171 172 return spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair[qp_num], 173 task->iov.iov_base, lba, 174 entry->io_size_blocks, io_complete, 175 task, entry->io_flags); 176 } 177 178 static void 179 nvme_check_io(struct ns_worker_ctx *ns_ctx) 180 { 181 int i, rc; 182 183 for (i = 0; i < ns_ctx->num_qpairs; i++) { 184 rc = spdk_nvme_qpair_process_completions(ns_ctx->qpair[i], g_max_completions); 185 /* The transport level qpair is failed and we need to reconnect it. */ 186 if (spdk_unlikely(rc == -ENXIO)) { 187 rc = spdk_nvme_ctrlr_reconnect_io_qpair(ns_ctx->qpair[i]); 188 /* successful reconnect */ 189 if (rc == 0) { 190 continue; 191 } else if (rc == -ENXIO) { 192 /* This means the controller is failed. Defer to it to restore the qpair. */ 193 continue; 194 } else { 195 /* 196 * We were unable to restore the qpair on this attempt. We don't 197 * really know why. For naive handling, just keep trying. 198 * TODO: add a retry limit, and destroy the qpair after x iterations. 199 */ 200 fprintf(stderr, "qpair failed and we were unable to recover it.\n"); 201 } 202 } else if (spdk_unlikely(rc < 0)) { 203 fprintf(stderr, "Received an unknown error processing completions.\n"); 204 exit(1); 205 } 206 } 207 } 208 209 /* 210 * TODO: If a controller has multiple namespaces, they could all use the same queue. 211 * For now, give each namespace/thread combination its own queue. 212 */ 213 static int 214 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 215 { 216 struct spdk_nvme_io_qpair_opts opts; 217 struct ns_entry *entry = ns_ctx->entry; 218 int i; 219 220 ns_ctx->num_qpairs = 1; 221 ns_ctx->qpair = calloc(ns_ctx->num_qpairs, sizeof(struct spdk_nvme_qpair *)); 222 if (!ns_ctx->qpair) { 223 return -1; 224 } 225 226 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->ctrlr, &opts, sizeof(opts)); 227 if (opts.io_queue_requests < entry->num_io_requests) { 228 opts.io_queue_requests = entry->num_io_requests; 229 } 230 231 for (i = 0; i < ns_ctx->num_qpairs; i++) { 232 ns_ctx->qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->ctrlr, &opts, 233 sizeof(opts)); 234 if (!ns_ctx->qpair[i]) { 235 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); 236 return -1; 237 } 238 } 239 240 return 0; 241 } 242 243 static void 244 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 245 { 246 int i; 247 248 for (i = 0; i < ns_ctx->num_qpairs; i++) { 249 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair[i]); 250 } 251 252 free(ns_ctx->qpair); 253 } 254 255 static void 256 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) 257 { 258 const struct spdk_nvme_transport_id *trid; 259 260 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 261 262 switch (trid->trtype) { 263 case SPDK_NVME_TRANSPORT_RDMA: 264 snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 265 break; 266 case SPDK_NVME_TRANSPORT_TCP: 267 snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 268 break; 269 case SPDK_NVME_TRANSPORT_CUSTOM: 270 snprintf(name, length, "CUSTOM (%s)", trid->traddr); 271 break; 272 default: 273 fprintf(stderr, "Unknown transport type %d\n", trid->trtype); 274 break; 275 } 276 } 277 278 static void 279 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) 280 { 281 struct ns_entry *entry; 282 const struct spdk_nvme_ctrlr_data *cdata; 283 uint32_t max_xfer_size, entries, sector_size; 284 uint64_t ns_size; 285 struct spdk_nvme_io_qpair_opts opts; 286 287 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 288 289 if (!spdk_nvme_ns_is_active(ns)) { 290 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 291 cdata->mn, cdata->sn, 292 spdk_nvme_ns_get_id(ns)); 293 g_warn = true; 294 return; 295 } 296 297 ns_size = spdk_nvme_ns_get_size(ns); 298 sector_size = spdk_nvme_ns_get_sector_size(ns); 299 300 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { 301 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " 302 "ns size %" PRIu64 " / block size %u for I/O size %u\n", 303 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), 304 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); 305 g_warn = true; 306 return; 307 } 308 309 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 310 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 311 /* NVMe driver may add additional entries based on 312 * stripe size and maximum transfer size, we assume 313 * 1 more entry be used for stripe. 314 */ 315 entries = (g_io_size_bytes - 1) / max_xfer_size + 2; 316 if ((g_queue_depth * entries) > opts.io_queue_size) { 317 printf("controller IO queue size %u less than required\n", 318 opts.io_queue_size); 319 printf("Consider using lower queue depth or small IO size because " 320 "IO requests may be queued at the NVMe driver.\n"); 321 g_warn = true; 322 } 323 /* For requests which have children requests, parent request itself 324 * will also occupy 1 entry. 325 */ 326 entries += 1; 327 328 entry = calloc(1, sizeof(struct ns_entry)); 329 if (entry == NULL) { 330 perror("ns_entry malloc"); 331 exit(1); 332 } 333 334 entry->ctrlr = ctrlr; 335 entry->ns = ns; 336 entry->num_io_requests = g_queue_depth * entries; 337 338 entry->size_in_ios = ns_size / g_io_size_bytes; 339 entry->io_size_blocks = g_io_size_bytes / sector_size; 340 341 entry->block_size = spdk_nvme_ns_get_sector_size(ns); 342 343 344 if (g_max_io_size_blocks < entry->io_size_blocks) { 345 g_max_io_size_blocks = entry->io_size_blocks; 346 } 347 348 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 349 350 g_num_namespaces++; 351 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 352 } 353 354 static void 355 unregister_namespaces(void) 356 { 357 struct ns_entry *entry, *tmp; 358 359 TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) { 360 TAILQ_REMOVE(&g_namespaces, entry, link); 361 free(entry); 362 } 363 } 364 365 static void 366 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) 367 { 368 struct spdk_nvme_ns *ns; 369 struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry)); 370 const struct spdk_nvme_transport_id *ctrlr_trid; 371 uint32_t nsid; 372 373 if (entry == NULL) { 374 perror("ctrlr_entry malloc"); 375 exit(1); 376 } 377 378 ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 379 assert(ctrlr_trid != NULL); 380 381 /* each controller needs a unique failover trid. */ 382 entry->failover_trid = trid_entry->failover_trid; 383 384 /* 385 * Users are allowed to leave the trid subnqn blank or specify a discovery controller subnqn. 386 * In those cases, the controller subnqn will not equal the trid_entry subnqn and, by association, 387 * the failover_trid subnqn. 388 * When we do failover, we want to reconnect to the same nqn so explicitly set the failover nqn to 389 * the ctrlr nqn here. 390 */ 391 snprintf(entry->failover_trid.subnqn, SPDK_NVMF_NQN_MAX_LEN + 1, "%s", ctrlr_trid->subnqn); 392 393 394 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 395 396 entry->ctrlr = ctrlr; 397 entry->trtype = trid_entry->trid.trtype; 398 TAILQ_INSERT_TAIL(&g_controllers, entry, link); 399 400 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 401 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 402 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 403 if (ns == NULL) { 404 continue; 405 } 406 register_ns(ctrlr, ns); 407 } 408 } 409 410 static __thread unsigned int seed = 0; 411 412 static inline void 413 submit_single_io(struct perf_task *task) 414 { 415 uint64_t offset_in_ios; 416 int rc; 417 struct ns_worker_ctx *ns_ctx = task->ns_ctx; 418 struct ns_entry *entry = ns_ctx->entry; 419 420 if (g_is_random) { 421 offset_in_ios = rand_r(&seed) % entry->size_in_ios; 422 } else { 423 offset_in_ios = ns_ctx->offset_in_ios++; 424 if (ns_ctx->offset_in_ios == entry->size_in_ios) { 425 ns_ctx->offset_in_ios = 0; 426 } 427 } 428 429 if ((g_rw_percentage == 100) || 430 (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) { 431 task->is_read = true; 432 } else { 433 task->is_read = false; 434 } 435 436 rc = nvme_submit_io(task, ns_ctx, entry, offset_in_ios); 437 438 if (spdk_unlikely(rc != 0)) { 439 fprintf(stderr, "starting I/O failed\n"); 440 } else { 441 ns_ctx->current_queue_depth++; 442 } 443 } 444 445 static inline void 446 task_complete(struct perf_task *task) 447 { 448 struct ns_worker_ctx *ns_ctx; 449 450 ns_ctx = task->ns_ctx; 451 ns_ctx->current_queue_depth--; 452 ns_ctx->io_completed++; 453 454 /* 455 * is_draining indicates when time has expired for the test run 456 * and we are just waiting for the previously submitted I/O 457 * to complete. In this case, do not submit a new I/O to replace 458 * the one just completed. 459 */ 460 if (spdk_unlikely(ns_ctx->is_draining)) { 461 spdk_dma_free(task->iov.iov_base); 462 free(task); 463 } else { 464 submit_single_io(task); 465 } 466 } 467 468 static void 469 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) 470 { 471 struct perf_task *task = ctx; 472 473 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { 474 fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n", 475 task->is_read ? "Read" : "Write", 476 cpl->status.sct, cpl->status.sc); 477 } 478 479 task_complete(task); 480 } 481 482 static void 483 check_io(struct ns_worker_ctx *ns_ctx) 484 { 485 nvme_check_io(ns_ctx); 486 } 487 488 static struct perf_task * 489 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth) 490 { 491 struct perf_task *task; 492 493 task = calloc(1, sizeof(*task)); 494 if (task == NULL) { 495 fprintf(stderr, "Out of memory allocating tasks\n"); 496 exit(1); 497 } 498 499 nvme_setup_payload(task); 500 501 task->ns_ctx = ns_ctx; 502 503 return task; 504 } 505 506 static void 507 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) 508 { 509 struct perf_task *task; 510 511 while (queue_depth-- > 0) { 512 task = allocate_task(ns_ctx, queue_depth); 513 submit_single_io(task); 514 } 515 } 516 517 static int 518 work_fn(void *arg) 519 { 520 uint64_t tsc_end; 521 struct worker_thread *worker = (struct worker_thread *)arg; 522 struct ns_worker_ctx *ns_ctx = NULL; 523 uint32_t unfinished_ns_ctx; 524 525 printf("Starting thread on core %u\n", worker->lcore); 526 527 /* Allocate queue pairs for each namespace. */ 528 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 529 if (nvme_init_ns_worker_ctx(ns_ctx) != 0) { 530 printf("ERROR: init_ns_worker_ctx() failed\n"); 531 return 1; 532 } 533 } 534 535 tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate; 536 537 /* Submit initial I/O for each namespace. */ 538 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 539 submit_io(ns_ctx, g_queue_depth); 540 } 541 542 while (1) { 543 /* 544 * Check for completed I/O for each controller. A new 545 * I/O will be submitted in the io_complete callback 546 * to replace each I/O that is completed. 547 */ 548 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 549 check_io(ns_ctx); 550 } 551 552 if (spdk_get_ticks() > tsc_end) { 553 break; 554 } 555 } 556 557 /* drain the io of each ns_ctx in round robin to make the fairness */ 558 do { 559 unfinished_ns_ctx = 0; 560 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 561 /* first time will enter into this if case */ 562 if (!ns_ctx->is_draining) { 563 ns_ctx->is_draining = true; 564 } 565 566 if (ns_ctx->current_queue_depth > 0) { 567 check_io(ns_ctx); 568 if (ns_ctx->current_queue_depth == 0) { 569 nvme_cleanup_ns_worker_ctx(ns_ctx); 570 } else { 571 unfinished_ns_ctx++; 572 } 573 } 574 } 575 } while (unfinished_ns_ctx > 0); 576 577 return 0; 578 } 579 580 static void usage(char *program_name) 581 { 582 printf("%s options", program_name); 583 printf("\n"); 584 printf("\t[-q io depth]\n"); 585 printf("\t[-o io size in bytes]\n"); 586 printf("\t[-w io pattern type, must be one of\n"); 587 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); 588 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); 589 printf("\t[-t time in seconds]\n"); 590 printf("\t[-c core mask for I/O submission/completion.]\n"); 591 printf("\t\t(default: 1)\n"); 592 printf("\t[-r Transport ID for NVMeoF]\n"); 593 printf("\t Format: 'key:value [key:value] ...'\n"); 594 printf("\t Keys:\n"); 595 printf("\t trtype Transport type (e.g. RDMA)\n"); 596 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n"); 597 printf("\t traddr Transport address (e.g. 192.168.100.8 for RDMA)\n"); 598 printf("\t trsvcid Transport service identifier (e.g. 4420)\n"); 599 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); 600 printf("\t alt_traddr (Optional) Alternative Transport address for failover.\n"); 601 printf("\t Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); 602 printf("\t[-k keep alive timeout period in millisecond]\n"); 603 printf("\t[-s DPDK huge memory size in MB.]\n"); 604 printf("\t[-m max completions per poll]\n"); 605 printf("\t\t(default: 0 - unlimited)\n"); 606 printf("\t[-i shared memory group ID]\n"); 607 printf("\t[-A transport ACK timeout]\n"); 608 printf("\t[-R transport retry count]\n"); 609 printf("\t"); 610 spdk_log_usage(stdout, "-T"); 611 #ifdef DEBUG 612 printf("\t[-G enable debug logging]\n"); 613 #else 614 printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); 615 #endif 616 } 617 618 static void 619 unregister_trids(void) 620 { 621 struct trid_entry *trid_entry, *tmp; 622 623 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { 624 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); 625 free(trid_entry); 626 } 627 } 628 629 static int 630 add_trid(const char *trid_str) 631 { 632 struct trid_entry *trid_entry; 633 struct spdk_nvme_transport_id *trid; 634 char *alt_traddr; 635 int len; 636 637 trid_entry = calloc(1, sizeof(*trid_entry)); 638 if (trid_entry == NULL) { 639 return -1; 640 } 641 642 trid = &trid_entry->trid; 643 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 644 645 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { 646 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); 647 free(trid_entry); 648 return 1; 649 } 650 651 trid_entry->failover_trid = trid_entry->trid; 652 653 alt_traddr = strcasestr(trid_str, "alt_traddr:"); 654 if (alt_traddr) { 655 alt_traddr += strlen("alt_traddr:"); 656 len = strcspn(alt_traddr, " \t\n"); 657 if (len > SPDK_NVMF_TRADDR_MAX_LEN) { 658 fprintf(stderr, "The failover traddr %s is too long.\n", alt_traddr); 659 free(trid_entry); 660 return -1; 661 } 662 snprintf(trid_entry->failover_trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1, "%s", alt_traddr); 663 } 664 665 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); 666 return 0; 667 } 668 669 static int 670 parse_args(int argc, char **argv) 671 { 672 struct trid_entry *trid_entry, *trid_entry_tmp; 673 const char *workload_type; 674 int op; 675 bool mix_specified = false; 676 long int val; 677 int rc; 678 679 /* default value */ 680 g_queue_depth = 0; 681 g_io_size_bytes = 0; 682 workload_type = NULL; 683 g_time_in_sec = 0; 684 g_rw_percentage = -1; 685 g_core_mask = NULL; 686 g_max_completions = 0; 687 688 while ((op = getopt(argc, argv, "c:gm:o:q:r:k:s:t:w:A:GM:R:T:")) != -1) { 689 switch (op) { 690 case 'm': 691 case 'o': 692 case 'q': 693 case 'k': 694 case 's': 695 case 't': 696 case 'A': 697 case 'M': 698 case 'R': 699 val = spdk_strtol(optarg, 10); 700 if (val < 0) { 701 fprintf(stderr, "Converting a string to integer failed\n"); 702 return val; 703 } 704 switch (op) { 705 case 'm': 706 g_max_completions = val; 707 break; 708 case 'o': 709 g_io_size_bytes = val; 710 break; 711 case 'q': 712 g_queue_depth = val; 713 break; 714 case 'k': 715 g_keep_alive_timeout_in_ms = val; 716 break; 717 case 's': 718 g_dpdk_mem = val; 719 break; 720 case 't': 721 g_time_in_sec = val; 722 break; 723 case 'A': 724 g_transport_ack_timeout = val; 725 break; 726 case 'M': 727 g_rw_percentage = val; 728 mix_specified = true; 729 break; 730 case 'R': 731 g_transport_retry_count = val; 732 break; 733 } 734 break; 735 case 'c': 736 g_core_mask = optarg; 737 break; 738 case 'g': 739 g_dpdk_mem_single_seg = true; 740 break; 741 case 'r': 742 if (add_trid(optarg)) { 743 usage(argv[0]); 744 return 1; 745 } 746 break; 747 case 'w': 748 workload_type = optarg; 749 break; 750 case 'G': 751 #ifndef DEBUG 752 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n", 753 argv[0]); 754 usage(argv[0]); 755 return 1; 756 #else 757 spdk_log_set_flag("nvme"); 758 spdk_log_set_print_level(SPDK_LOG_DEBUG); 759 break; 760 #endif 761 case 'T': 762 rc = spdk_log_set_flag(optarg); 763 if (rc < 0) { 764 fprintf(stderr, "unknown flag\n"); 765 usage(argv[0]); 766 exit(EXIT_FAILURE); 767 } 768 #ifdef DEBUG 769 spdk_log_set_print_level(SPDK_LOG_DEBUG); 770 #endif 771 break; 772 default: 773 usage(argv[0]); 774 return 1; 775 } 776 } 777 778 if (!g_queue_depth) { 779 usage(argv[0]); 780 return 1; 781 } 782 if (!g_io_size_bytes) { 783 usage(argv[0]); 784 return 1; 785 } 786 if (!workload_type) { 787 usage(argv[0]); 788 return 1; 789 } 790 if (!g_time_in_sec) { 791 usage(argv[0]); 792 return 1; 793 } 794 795 if (strcmp(workload_type, "read") && 796 strcmp(workload_type, "write") && 797 strcmp(workload_type, "randread") && 798 strcmp(workload_type, "randwrite") && 799 strcmp(workload_type, "rw") && 800 strcmp(workload_type, "randrw")) { 801 fprintf(stderr, 802 "io pattern type must be one of\n" 803 "(read, write, randread, randwrite, rw, randrw)\n"); 804 return 1; 805 } 806 807 if (!strcmp(workload_type, "read") || 808 !strcmp(workload_type, "randread")) { 809 g_rw_percentage = 100; 810 } 811 812 if (!strcmp(workload_type, "write") || 813 !strcmp(workload_type, "randwrite")) { 814 g_rw_percentage = 0; 815 } 816 817 if (!strcmp(workload_type, "read") || 818 !strcmp(workload_type, "randread") || 819 !strcmp(workload_type, "write") || 820 !strcmp(workload_type, "randwrite")) { 821 if (mix_specified) { 822 fprintf(stderr, "Ignoring -M option... Please use -M option" 823 " only when using rw or randrw.\n"); 824 } 825 } 826 827 if (!strcmp(workload_type, "rw") || 828 !strcmp(workload_type, "randrw")) { 829 if (g_rw_percentage < 0 || g_rw_percentage > 100) { 830 fprintf(stderr, 831 "-M must be specified to value from 0 to 100 " 832 "for rw or randrw.\n"); 833 return 1; 834 } 835 } 836 837 if (!strcmp(workload_type, "read") || 838 !strcmp(workload_type, "write") || 839 !strcmp(workload_type, "rw")) { 840 g_is_random = 0; 841 } else { 842 g_is_random = 1; 843 } 844 845 if (TAILQ_EMPTY(&g_trid_list)) { 846 fprintf(stderr, "You must specify at least one fabrics TRID.\n"); 847 return -1; 848 } 849 850 /* check whether there is local PCIe type and fail. */ 851 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { 852 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 853 fprintf(stderr, "This application was not intended to be run on PCIe controllers.\n"); 854 return 1; 855 } 856 } 857 858 return 0; 859 } 860 861 static int 862 register_workers(void) 863 { 864 uint32_t i; 865 struct worker_thread *worker; 866 867 SPDK_ENV_FOREACH_CORE(i) { 868 worker = calloc(1, sizeof(*worker)); 869 if (worker == NULL) { 870 fprintf(stderr, "Unable to allocate worker\n"); 871 return -1; 872 } 873 874 TAILQ_INIT(&worker->ns_ctx); 875 worker->lcore = i; 876 TAILQ_INSERT_TAIL(&g_workers, worker, link); 877 g_num_workers++; 878 } 879 880 return 0; 881 } 882 883 static void 884 unregister_workers(void) 885 { 886 struct worker_thread *worker, *tmp_worker; 887 struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx; 888 889 /* Free namespace context and worker thread */ 890 TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) { 891 TAILQ_REMOVE(&g_workers, worker, link); 892 TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) { 893 TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link); 894 free(ns_ctx); 895 } 896 897 free(worker); 898 } 899 } 900 901 static bool 902 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 903 struct spdk_nvme_ctrlr_opts *opts) 904 { 905 /* These should have been weeded out earlier. */ 906 assert(trid->trtype != SPDK_NVME_TRANSPORT_PCIE); 907 908 printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n", 909 trid->traddr, trid->trsvcid, 910 trid->subnqn); 911 912 /* Set io_queue_size to UINT16_MAX, NVMe driver 913 * will then reduce this to MQES to maximize 914 * the io_queue_size as much as possible. 915 */ 916 opts->io_queue_size = UINT16_MAX; 917 918 opts->keep_alive_timeout_ms = spdk_max(opts->keep_alive_timeout_ms, 919 g_keep_alive_timeout_in_ms); 920 921 opts->transport_retry_count = g_transport_retry_count; 922 opts->transport_ack_timeout = g_transport_ack_timeout; 923 924 return true; 925 } 926 927 static void 928 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 929 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 930 { 931 struct trid_entry *trid_entry = cb_ctx; 932 933 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", 934 trid->traddr, trid->trsvcid, 935 trid->subnqn); 936 937 register_ctrlr(ctrlr, trid_entry); 938 } 939 940 static int 941 register_controllers(void) 942 { 943 struct trid_entry *trid_entry; 944 945 printf("Initializing NVMe Controllers\n"); 946 947 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { 948 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { 949 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", 950 trid_entry->trid.traddr); 951 return -1; 952 } 953 } 954 955 return 0; 956 } 957 958 static void 959 unregister_controllers(void) 960 { 961 struct ctrlr_entry *entry, *tmp; 962 struct spdk_nvme_detach_ctx *detach_ctx = NULL; 963 964 TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) { 965 TAILQ_REMOVE(&g_controllers, entry, link); 966 spdk_nvme_detach_async(entry->ctrlr, &detach_ctx); 967 free(entry); 968 } 969 970 while (detach_ctx && spdk_nvme_detach_poll_async(detach_ctx) == -EAGAIN) { 971 ; 972 } 973 } 974 975 static int 976 associate_workers_with_ns(void) 977 { 978 struct ns_entry *entry = TAILQ_FIRST(&g_namespaces); 979 struct worker_thread *worker = TAILQ_FIRST(&g_workers); 980 struct ns_worker_ctx *ns_ctx; 981 int i, count; 982 983 count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers; 984 985 for (i = 0; i < count; i++) { 986 if (entry == NULL) { 987 break; 988 } 989 990 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); 991 if (!ns_ctx) { 992 return -1; 993 } 994 995 printf("Associating %s with lcore %d\n", entry->name, worker->lcore); 996 ns_ctx->entry = entry; 997 998 TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link); 999 1000 worker = TAILQ_NEXT(worker, link); 1001 if (worker == NULL) { 1002 worker = TAILQ_FIRST(&g_workers); 1003 } 1004 1005 entry = TAILQ_NEXT(entry, link); 1006 if (entry == NULL) { 1007 entry = TAILQ_FIRST(&g_namespaces); 1008 } 1009 1010 } 1011 1012 return 0; 1013 } 1014 1015 static void * 1016 nvme_poll_ctrlrs(void *arg) 1017 { 1018 struct ctrlr_entry *entry; 1019 const struct spdk_nvme_transport_id *old_trid; 1020 int oldstate; 1021 int rc; 1022 1023 1024 spdk_unaffinitize_thread(); 1025 1026 while (true) { 1027 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); 1028 1029 TAILQ_FOREACH(entry, &g_controllers, link) { 1030 rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); 1031 /* This controller has encountered a failure at the transport level. reset it. */ 1032 if (rc == -ENXIO) { 1033 if (entry->num_resets == 0) { 1034 old_trid = spdk_nvme_ctrlr_get_transport_id(entry->ctrlr); 1035 fprintf(stderr, "A controller has encountered a failure and is being reset.\n"); 1036 if (spdk_nvme_transport_id_compare(old_trid, &entry->failover_trid)) { 1037 fprintf(stderr, "Resorting to new failover address %s\n", entry->failover_trid.traddr); 1038 spdk_nvme_ctrlr_fail(entry->ctrlr); 1039 rc = spdk_nvme_ctrlr_set_trid(entry->ctrlr, &entry->failover_trid); 1040 if (rc != 0) { 1041 fprintf(stderr, "Unable to fail over to back up trid.\n"); 1042 } 1043 } 1044 } 1045 1046 rc = spdk_nvme_ctrlr_reset(entry->ctrlr); 1047 if (rc != 0) { 1048 entry->num_resets++; 1049 fprintf(stderr, "Unable to reset the controller.\n"); 1050 1051 if (entry->num_resets > g_max_ctrlr_resets) { 1052 fprintf(stderr, "Controller cannot be recovered. Exiting.\n"); 1053 exit(1); 1054 } 1055 } else { 1056 fprintf(stderr, "Controller properly reset.\n"); 1057 } 1058 } 1059 } 1060 1061 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); 1062 1063 /* This is a pthread cancellation point and cannot be removed. */ 1064 sleep(1); 1065 } 1066 1067 return NULL; 1068 } 1069 1070 int main(int argc, char **argv) 1071 { 1072 int rc; 1073 struct worker_thread *worker, *main_worker; 1074 unsigned main_core; 1075 struct spdk_env_opts opts; 1076 pthread_t thread_id = 0; 1077 1078 rc = parse_args(argc, argv); 1079 if (rc != 0) { 1080 return rc; 1081 } 1082 1083 spdk_env_opts_init(&opts); 1084 opts.name = "reconnect"; 1085 if (g_core_mask) { 1086 opts.core_mask = g_core_mask; 1087 } 1088 1089 if (g_dpdk_mem) { 1090 opts.mem_size = g_dpdk_mem; 1091 } 1092 opts.hugepage_single_segments = g_dpdk_mem_single_seg; 1093 if (spdk_env_init(&opts) < 0) { 1094 fprintf(stderr, "Unable to initialize SPDK env\n"); 1095 rc = 1; 1096 goto cleanup; 1097 } 1098 1099 g_tsc_rate = spdk_get_ticks_hz(); 1100 1101 if (register_workers() != 0) { 1102 rc = 1; 1103 goto cleanup; 1104 } 1105 1106 if (register_controllers() != 0) { 1107 rc = 1; 1108 goto cleanup; 1109 } 1110 1111 if (g_warn) { 1112 printf("WARNING: Some requested NVMe devices were skipped\n"); 1113 } 1114 1115 if (g_num_namespaces == 0) { 1116 fprintf(stderr, "No valid NVMe controllers found\n"); 1117 goto cleanup; 1118 } 1119 1120 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); 1121 if (rc != 0) { 1122 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); 1123 goto cleanup; 1124 } 1125 1126 if (associate_workers_with_ns() != 0) { 1127 rc = 1; 1128 goto cleanup; 1129 } 1130 1131 printf("Initialization complete. Launching workers.\n"); 1132 1133 /* Launch all of the secondary workers */ 1134 main_core = spdk_env_get_current_core(); 1135 main_worker = NULL; 1136 TAILQ_FOREACH(worker, &g_workers, link) { 1137 if (worker->lcore != main_core) { 1138 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); 1139 } else { 1140 assert(main_worker == NULL); 1141 main_worker = worker; 1142 } 1143 } 1144 1145 assert(main_worker != NULL); 1146 rc = work_fn(main_worker); 1147 1148 spdk_env_thread_wait_all(); 1149 1150 cleanup: 1151 if (thread_id && pthread_cancel(thread_id) == 0) { 1152 pthread_join(thread_id, NULL); 1153 } 1154 unregister_trids(); 1155 unregister_namespaces(); 1156 unregister_controllers(); 1157 unregister_workers(); 1158 1159 if (rc != 0) { 1160 fprintf(stderr, "%s: errors occured\n", argv[0]); 1161 /* 1162 * return a generic error to the caller. This allows us to 1163 * distinguish between a failure in the script and something 1164 * like a segfault or an invalid access which causes the program 1165 * to crash. 1166 */ 1167 rc = 1; 1168 } 1169 1170 return rc; 1171 } 1172